lybic-guiagents 0.1.0 (lybic_guiagents-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lybic-guiagents has been flagged as possibly problematic.
- desktop_env/__init__.py +1 -0
- desktop_env/actions.py +203 -0
- desktop_env/controllers/__init__.py +0 -0
- desktop_env/controllers/python.py +471 -0
- desktop_env/controllers/setup.py +882 -0
- desktop_env/desktop_env.py +509 -0
- desktop_env/evaluators/__init__.py +5 -0
- desktop_env/evaluators/getters/__init__.py +41 -0
- desktop_env/evaluators/getters/calc.py +15 -0
- desktop_env/evaluators/getters/chrome.py +1774 -0
- desktop_env/evaluators/getters/file.py +154 -0
- desktop_env/evaluators/getters/general.py +42 -0
- desktop_env/evaluators/getters/gimp.py +38 -0
- desktop_env/evaluators/getters/impress.py +126 -0
- desktop_env/evaluators/getters/info.py +24 -0
- desktop_env/evaluators/getters/misc.py +406 -0
- desktop_env/evaluators/getters/replay.py +20 -0
- desktop_env/evaluators/getters/vlc.py +86 -0
- desktop_env/evaluators/getters/vscode.py +35 -0
- desktop_env/evaluators/metrics/__init__.py +160 -0
- desktop_env/evaluators/metrics/basic_os.py +68 -0
- desktop_env/evaluators/metrics/chrome.py +493 -0
- desktop_env/evaluators/metrics/docs.py +1011 -0
- desktop_env/evaluators/metrics/general.py +665 -0
- desktop_env/evaluators/metrics/gimp.py +637 -0
- desktop_env/evaluators/metrics/libreoffice.py +28 -0
- desktop_env/evaluators/metrics/others.py +92 -0
- desktop_env/evaluators/metrics/pdf.py +31 -0
- desktop_env/evaluators/metrics/slides.py +957 -0
- desktop_env/evaluators/metrics/table.py +585 -0
- desktop_env/evaluators/metrics/thunderbird.py +176 -0
- desktop_env/evaluators/metrics/utils.py +719 -0
- desktop_env/evaluators/metrics/vlc.py +524 -0
- desktop_env/evaluators/metrics/vscode.py +283 -0
- desktop_env/providers/__init__.py +35 -0
- desktop_env/providers/aws/__init__.py +0 -0
- desktop_env/providers/aws/manager.py +278 -0
- desktop_env/providers/aws/provider.py +186 -0
- desktop_env/providers/aws/provider_with_proxy.py +315 -0
- desktop_env/providers/aws/proxy_pool.py +193 -0
- desktop_env/providers/azure/__init__.py +0 -0
- desktop_env/providers/azure/manager.py +87 -0
- desktop_env/providers/azure/provider.py +207 -0
- desktop_env/providers/base.py +97 -0
- desktop_env/providers/gcp/__init__.py +0 -0
- desktop_env/providers/gcp/manager.py +0 -0
- desktop_env/providers/gcp/provider.py +0 -0
- desktop_env/providers/virtualbox/__init__.py +0 -0
- desktop_env/providers/virtualbox/manager.py +463 -0
- desktop_env/providers/virtualbox/provider.py +124 -0
- desktop_env/providers/vmware/__init__.py +0 -0
- desktop_env/providers/vmware/manager.py +455 -0
- desktop_env/providers/vmware/provider.py +105 -0
- gui_agents/__init__.py +0 -0
- gui_agents/agents/Action.py +209 -0
- gui_agents/agents/__init__.py +0 -0
- gui_agents/agents/agent_s.py +832 -0
- gui_agents/agents/global_state.py +610 -0
- gui_agents/agents/grounding.py +651 -0
- gui_agents/agents/hardware_interface.py +129 -0
- gui_agents/agents/manager.py +568 -0
- gui_agents/agents/translator.py +132 -0
- gui_agents/agents/worker.py +355 -0
- gui_agents/cli_app.py +560 -0
- gui_agents/core/__init__.py +0 -0
- gui_agents/core/engine.py +1496 -0
- gui_agents/core/knowledge.py +449 -0
- gui_agents/core/mllm.py +555 -0
- gui_agents/tools/__init__.py +0 -0
- gui_agents/tools/tools.py +727 -0
- gui_agents/unit_test/__init__.py +0 -0
- gui_agents/unit_test/run_tests.py +65 -0
- gui_agents/unit_test/test_manager.py +330 -0
- gui_agents/unit_test/test_worker.py +269 -0
- gui_agents/utils/__init__.py +0 -0
- gui_agents/utils/analyze_display.py +301 -0
- gui_agents/utils/common_utils.py +263 -0
- gui_agents/utils/display_viewer.py +281 -0
- gui_agents/utils/embedding_manager.py +53 -0
- gui_agents/utils/image_axis_utils.py +27 -0
- lybic_guiagents-0.1.0.dist-info/METADATA +416 -0
- lybic_guiagents-0.1.0.dist-info/RECORD +85 -0
- lybic_guiagents-0.1.0.dist-info/WHEEL +5 -0
- lybic_guiagents-0.1.0.dist-info/licenses/LICENSE +201 -0
- lybic_guiagents-0.1.0.dist-info/top_level.txt +2 -0
gui_agents/tools/tools.py
@@ -0,0 +1,727 @@
"""
Tools module for GUI agents.

This module provides various tools for GUI agents to perform tasks such as web search,
context fusion, subtask planning, trajectory reflection, memory retrieval, grounding,
evaluation, and action generation.
"""

import os
import json
import base64
import requests
import time
from typing import Dict, Any, Optional, List, Union, Tuple
from abc import ABC, abstractmethod
import logging
from gui_agents.core.mllm import LLMAgent, WebSearchAgent, EmbeddingAgent
import threading
from gui_agents.prompts.prompts import system_prompts

logger = logging.getLogger("desktopenv.tools")

class BaseTool(ABC):
    """Base class for all tools."""
    _prompts_dict = None
    _prompts_dict_lock = threading.Lock()

    @classmethod
    def _load_prompts_dict(cls):
        if cls._prompts_dict is None:
            with cls._prompts_dict_lock:
                if cls._prompts_dict is None:
                    try:
                        # Import prompts from prompts.py module
                        cls._prompts_dict = system_prompts
                    except Exception as e:
                        logger.error(f"Failed to load prompts from prompts.py: {e}")
                        cls._prompts_dict = {}

    def __init__(self, provider: str, model_name: str, tool_name: str):
        """
        Initialize the base tool.
        Args:
            provider: API provider name (e.g., "gemini", "openai")
            model_name: Model name to use (e.g., "gemini-2.5-pro")
            tool_name: Name of the tool (used as key in prompts.py)
        """
        self.provider = provider
        self.model_name = model_name
        self.tool_name = tool_name
        self._load_prompts_dict()
        self._prompt_template = self._get_prompt_template()
        # Create LLMAgent instance for tool usage
        self.engine_params = {
            "engine_type": provider,
            "model": model_name
        }
        self.llm_agent = LLMAgent(engine_params=self.engine_params, system_prompt=self._prompt_template)

    def _get_prompt_template(self) -> str:
        if self.tool_name is None:
            return ""
        prompts = self.__class__._prompts_dict
        if prompts is None:
            return ""
        return prompts.get(self.tool_name, "")

    def _call_lmm(self, input_data: Dict[str, Any], temperature: float = 0.0):
        """
        Call the LMM model for inference using the prompt template, with a retry mechanism.

        Args:
            input_data: Dictionary containing input data to format the prompt template
            temperature: Temperature parameter to control randomness of output

        Returns:
            Model response as text
        """
        # self.llm_agent.reset()

        # Extract text and image inputs
        text_input = input_data.get('str_input', '')
        image_input = input_data.get('img_input', None)

        # Add the message with the formatted prompt
        self.llm_agent.add_message(text_input, image_content=image_input, role="user")

        # Implement safe retry mechanism
        max_retries = 3
        attempt = 0
        content, total_tokens, cost_string = "", [0, 0, 0], ""

        while attempt < max_retries:
            try:
                content, total_tokens, cost_string = self.llm_agent.get_response(temperature=temperature)
                break  # If successful, break out of the loop
            except Exception as e:
                attempt += 1
                logger.error(f"LLM call attempt {attempt} failed: {str(e)}")
                if attempt == max_retries:
                    logger.error("Max retries reached. Returning error message.")
                    return f"Error: LLM call failed after {max_retries} attempts: {str(e)}", [0, 0, 0], ""
                time.sleep(1.0)
        return content, total_tokens, cost_string

    @abstractmethod
    def execute(self, tool_input: Dict[str, Any]) -> Tuple[str, List[int], str]:
        """
        Execute the tool with the given input.

        Args:
            tool_input: Dictionary containing the input for the tool
                Expected to have 'str_input' and/or 'img_input' keys

        Returns:
            The output of the tool as a string
        """
        pass


class ToolFactory:
    """Factory class for creating tools."""

    @staticmethod
    def create_tool(tool_name: str, provider: str, model_name: str, **kwargs) -> 'BaseTool':
        """
        Create a tool instance based on the tool name.

        Args:
            tool_name: Name of the tool to create
            provider: API provider name
            model_name: Model name to use
            **kwargs: Additional parameters to pass to the tool

        Returns:
            An instance of the specified tool

        Raises:
            ValueError: If the tool name is not recognized
        """
        tool_map = {
            "websearch": (WebSearchTool, None),
            "context_fusion": (ContextFusionTool, "context_fusion"),
            "subtask_planner": (SubtaskPlannerTool, "subtask_planner"),
            "traj_reflector": (TrajReflectorTool, "traj_reflector"),
            "grounding": (GroundingTool, "grounding"),
            "evaluator": (EvaluatorTool, "evaluator"),
            "action_generator": (ActionGeneratorTool, "action_generator"),
            "action_generator_with_takeover": (ActionGeneratorTool, "action_generator_with_takeover"),
            "fast_action_generator": (FastActionGeneratorTool, "fast_action_generator"),
            "fast_action_generator_with_takeover": (FastActionGeneratorTool, "fast_action_generator_with_takeover"),
            "dag_translator": (DAGTranslatorTool, "dag_translator"),
            "embedding": (EmbeddingTool, None),
            "query_formulator": (QueryFormulatorTool, "query_formulator"),
            "text_span": (TextSpanTool, "text_span"),
            "narrative_summarization": (NarrativeSummarizationTool, "narrative_summarization"),
            "episode_summarization": (EpisodeSummarizationTool, "episode_summarization")
        }

        if tool_name not in tool_map:
            raise ValueError(f"Unknown tool name: {tool_name}")

        tool_class, prompt_key = tool_map[tool_name]

        # WebSearchTool and EmbeddingTool don't need a prompt
        if tool_name == "websearch":
            return tool_class(provider, model_name, None, **kwargs)
        if tool_name == "embedding":
            return tool_class(provider, model_name, None, **kwargs)

        return tool_class(provider, model_name, prompt_key, **kwargs)


class WebSearchTool(BaseTool):
    """Tool for performing web searches."""

    def __init__(self, provider: str, model_name: str, tool_name: str):
        """
        Initialize the web search tool.

        Args:
            provider: API provider name (e.g., "bocha", "exa")
            model_name: Model name to use (not used for WebSearchAgent)
            tool_name: Name of the tool (used as key in prompts.py)
        """
        self.provider = provider

        # Create WebSearchAgent instance for search
        self.engine_params = {
            "engine_type": provider,
            "model": model_name,
        }

        # Initialize WebSearchAgent
        self.search_agent = WebSearchAgent(engine_params=self.engine_params)

    def execute(self, tool_input: Dict[str, Any]) -> Tuple[str, List[int], str]:
        """
        Execute a web search with the given query.

        Args:
            tool_input: Dictionary containing the search query
                Expected to have 'str_input' key with the search query

        Returns:
            Search results as a string
        """
        query = tool_input.get('str_input', '')
        if not query:
            return "Error: No search query provided", [0, 0, 0], ""

        try:
            # Get the answer from the search results
            answer, total_tokens, cost = self.search_agent.get_answer(query)

            # Return just the answer
            return answer, total_tokens, cost  # type: ignore

        except Exception as e:
            logger.error(f"Error during web search: {str(e)}")
            return f"Error: Web search failed: {str(e)}", [0, 0, 0], ""


class ContextFusionTool(BaseTool):
    """Tool for fusing multiple contexts together."""

    def execute(self, tool_input: Dict[str, Any]):
        """
        Fuse multiple contexts together.

        Args:
            tool_input: Dictionary containing the contexts to fuse
                Expected to have 'str_input' key with JSON-formatted contexts

        Returns:
            Fused context as a string
        """
        contexts = tool_input.get('str_input', '')
        if not contexts:
            return "Error: No contexts provided"

        # Use the prompt template and LMM for context fusion
        return self._call_lmm(tool_input)


class SubtaskPlannerTool(BaseTool):
    """Tool for planning subtasks."""

    def execute(self, tool_input: Dict[str, Any]):
        """
        Plan subtasks for a given task.

        Args:
            tool_input: Dictionary containing the task description
                Expected to have 'str_input' key with the task description
                May also have 'img_input' key with a screenshot

        Returns:
            Subtask plan as a string
        """
        task = tool_input.get('str_input', '')
        if not task:
            return "Error: No task description provided"

        # Use the prompt template and LMM for subtask planning
        return self._call_lmm(tool_input)


class NarrativeSummarizationTool(BaseTool):
    """Tool for summarizing narrative memories."""

    def execute(self, tool_input: Dict[str, Any]):
        """
        Summarize narrative memories.

        Args:
            tool_input: Dictionary containing the narrative memory data
                Expected to have 'str_input' key with the narrative memory data
                May also have 'img_input' key with relevant images

        Returns:
            Summarized narrative as a string
        """
        narrative_data = tool_input.get('str_input', '')
        if not narrative_data:
            return "Error: No narrative memory data provided"

        # Use the prompt template and LMM for narrative summarization
        return self._call_lmm(tool_input)


class EpisodeSummarizationTool(BaseTool):
    """Tool for summarizing episodic memories."""

    def execute(self, tool_input: Dict[str, Any]):
        """
        Summarize episodic memories.

        Args:
            tool_input: Dictionary containing the episodic memory data
                Expected to have 'str_input' key with the episodic memory data
                May also have 'img_input' key with relevant images

        Returns:
            Summarized episode as a string
        """
        episode_data = tool_input.get('str_input', '')
        if not episode_data:
            return "Error: No episodic memory data provided"

        # Use the prompt template and LMM for episode summarization
        return self._call_lmm(tool_input)


class TextSpanTool(BaseTool):
    """Tool for processing text spans."""

    def execute(self, tool_input: Dict[str, Any]):
        """
        Process text spans for a given input.

        Args:
            tool_input: Dictionary containing the text input
                Expected to have 'str_input' key with the text content
                May also have 'img_input' key with a screenshot

        Returns:
            Processed text spans as a string
        """
        text = tool_input.get('str_input', '')
        if not text:
            return "Error: No text content provided"

        # Use the prompt template and LMM for text span processing
        return self._call_lmm(tool_input)


class DAGTranslatorTool(BaseTool):
    """Tool for translating task descriptions into a DAG (Directed Acyclic Graph) structure."""

    def execute(self, tool_input: Dict[str, Any]):
        """
        Translate task descriptions into a DAG structure.

        Args:
            tool_input: Dictionary containing the task description
                Expected to have 'str_input' key with the task description
                May also have 'img_input' key with a screenshot

        Returns:
            DAG representation as a string
        """
        task = tool_input.get('str_input', '')
        if not task:
            return "Error: No task description provided"

        # Use the prompt template and LMM for DAG translation
        return self._call_lmm(tool_input)


class TrajReflectorTool(BaseTool):
    """Tool for reflecting on execution trajectories."""

    def execute(self, tool_input: Dict[str, Any]):
        """
        Reflect on an execution trajectory.

        Args:
            tool_input: Dictionary containing the trajectory
                Expected to have 'str_input' key with the trajectory

        Returns:
            Reflection as a string
        """
        trajectory = tool_input.get('str_input', '')
        if not trajectory:
            return "Error: No trajectory provided"

        # Use the prompt template and LMM for trajectory reflection
        return self._call_lmm(tool_input)

class GroundingTool(BaseTool):
    """Tool for grounding agent actions in the environment."""

    def execute(self, tool_input: Dict[str, Any]):
        """
        Ground agent actions in the environment.

        Args:
            tool_input: Dictionary containing the action and environment state
                Expected to have 'str_input' key with the action
                Expected to have 'img_input' key with a screenshot

        Returns:
            Grounded action as a string
        """
        action = tool_input.get('str_input', '')
        screenshot = tool_input.get('img_input')

        if not action:
            return "Error: No action provided"
        if not screenshot:
            return "Error: No screenshot provided"

        # Use the prompt template and LMM for action grounding
        return self._call_lmm(tool_input)

    def get_grounding_wh(self):
        """
        Get grounding width and height based on provider and model name.

        Returns:
            If provider is doubao and model_name contains 'ui-tars', returns two values:
                grounding_width (int): Width value (1000)
                grounding_height (int): Height value (1000)
            Otherwise returns None, None
        """
        if self.provider == "doubao" and "ui-tars" in self.model_name:
            grounding_width = 1000
            grounding_height = 1000
            return grounding_width, grounding_height
        return None, None


class EvaluatorTool(BaseTool):
    """Tool for evaluating agent performance."""

    def execute(self, tool_input: Dict[str, Any]):
        """
        Evaluate agent performance.

        Args:
            tool_input: Dictionary containing the evaluation data
                Expected to have 'str_input' key with the evaluation data

        Returns:
            Evaluation result as a string
        """
        eval_data = tool_input.get('str_input', '')
        if not eval_data:
            return "Error: No evaluation data provided"

        # Use the prompt template and LMM for performance evaluation
        return self._call_lmm(tool_input)


class ActionGeneratorTool(BaseTool):
    """Tool for generating executable actions."""

    def __init__(self, provider: str, model_name: str, tool_name: str, **kwargs):
        """
        Initialize the action generator tool.

        Args:
            provider: API provider name
            model_name: Model name to use
            tool_name: Name of the tool (used as key in prompts.py)
            **kwargs: Additional parameters, including:
                enable_search: Whether to enable web search functionality
                search_provider: Provider for web search (defaults to "bocha")
                search_model: Model for web search (defaults to "")
        """
        super().__init__(provider, model_name, tool_name)

        # Extract search-related parameters
        self.enable_search = kwargs.get("enable_search", False)
        search_provider = kwargs.get("search_provider", "bocha")
        search_model = kwargs.get("search_model", "")

        # Initialize search tool if enabled
        self.search_tool = None
        if self.enable_search:
            self.search_tool = WebSearchTool(search_provider, search_model, "")
            logger.info(f"Web search enabled for {tool_name} using provider: {search_provider}")

    def execute(self, tool_input: Dict[str, Any]):
        """
        Generate executable actions.

        Args:
            tool_input: Dictionary containing the action request
                Expected to have 'str_input' key with the action request
                May also have 'img_input' key with a screenshot

        Returns:
            Generated action as a string
        """
        action_request = tool_input.get('str_input', '')
        if not action_request:
            return "Error: No action request provided", [0, 0, 0], ""

        # Check if search is enabled
        if self.enable_search and self.search_tool:
            try:
                # Use the input text directly as search query
                search_query = action_request
                logger.info(f"Performing web search for query: {search_query}")
                search_results, tokens, cost = self.search_tool.execute({"str_input": search_query})

                # Enhance the action request with search results
                enhanced_request = f"[Action Request]\n{action_request}\n[End of Action Request]\n\n[Web Search Results for '{action_request}']\n{search_results}\n\n[End of Web Search Results]"
                tool_input["str_input"] = enhanced_request

                logger.info(f"Search completed. Found information: {len(search_results)} characters")
            except Exception as e:
                logger.error(f"Error during web search: {e}")
                # Continue with original request if search fails

        # Use the prompt template and LMM for action generation
        return self._call_lmm(tool_input)


class FastActionGeneratorTool(BaseTool):
    """Tool for directly generating executable actions without intermediate planning."""

    def __init__(self, provider: str, model_name: str, tool_name: str, **kwargs):
        """
        Initialize the fast action generator tool.

        Args:
            provider: API provider name
            model_name: Model name to use
            tool_name: Name of the tool (used as key in prompts.py)
            **kwargs: Additional parameters, including:
                enable_search: Whether to enable web search functionality
                search_provider: Provider for web search (defaults to "bocha")
                search_model: Model for web search (defaults to "")
        """
        super().__init__(provider, model_name, tool_name)

        # Extract search-related parameters
        self.enable_search = kwargs.get("enable_search", False)
        search_provider = kwargs.get("search_provider", "bocha")
        search_model = kwargs.get("search_model", "")

        # Initialize search tool if enabled
        self.search_tool = None
        if self.enable_search:
            self.search_tool = WebSearchTool(search_provider, search_model, "")
            logger.info(f"Web search enabled for {tool_name} using provider: {search_provider}")

    def execute(self, tool_input: Dict[str, Any]):
        """
        Generate executable actions directly from the instruction and screenshot.

        Args:
            tool_input: Dictionary containing the action request
                Expected to have 'str_input' key with the instruction
                Expected to have 'img_input' key with a screenshot

        Returns:
            Generated action as a string, token count, and cost
        """
        action_request = tool_input.get('str_input', '')
        screenshot = tool_input.get('img_input')
        if not action_request:
            return "Error: No action request provided", [0, 0, 0], ""
        if not screenshot:
            return "Error: No screenshot provided", [0, 0, 0], ""
        # Check if search is enabled
        if self.enable_search and self.search_tool:
            try:
                # Use the input text directly as search query
                search_query = action_request
                logger.info(f"Performing web search for query: {search_query}")
                search_results, tokens, cost = self.search_tool.execute({"str_input": search_query})

                # Enhance the action request with search results
                enhanced_request = f"[Action Request]\n{action_request}\n[End of Action Request]\n\n[Web Search Results for '{action_request}']\n{search_results}\n\n[End of Web Search Results]"
                tool_input["str_input"] = enhanced_request

                logger.info(f"Search completed. Found information: {len(search_results)} characters")
            except Exception as e:
                logger.error(f"Error during web search: {e}")
                # Continue with original request if search fails

        # Use the prompt template and LMM for action generation
        return self._call_lmm(tool_input)

    def get_grounding_wh(self):
        """
        Get grounding width and height based on provider and model name.

        Returns:
            If provider is doubao and model_name contains 'ui-tars', returns two values:
                grounding_width (int): Width value (1000)
                grounding_height (int): Height value (1000)
            Otherwise returns None, None
        """
        if self.provider == "doubao" and "ui-tars" in self.model_name:
            grounding_width = 1000
            grounding_height = 1000
            return grounding_width, grounding_height
        return None, None

class EmbeddingTool(BaseTool):
    """Tool for generating text embeddings."""

    def __init__(self, provider: str, model_name: str, tool_name: str):
        """
        Initialize the embedding tool.

        Args:
            provider: API provider name (e.g., "openai", "gemini")
            model_name: Model name to use
            tool_name: Name of the tool (used as key in prompts.py)
        """
        self.provider = provider
        self.model_name = model_name
        self.tool_name = tool_name

        # Create EmbeddingAgent instance
        self.engine_params = {
            "engine_type": provider,
            "embedding_model": model_name
        }

        # Initialize EmbeddingAgent
        self.embedding_agent = EmbeddingAgent(engine_params=self.engine_params)

    def execute(self, tool_input: Dict[str, Any]):
        """
        Generate embeddings for the given text.

        Args:
            tool_input: Dictionary containing the text to embed
                Expected to have 'str_input' key with the text

        Returns:
            Embeddings as a JSON string
        """
        text = tool_input.get('str_input', '')

        if not text:
            return "Error: No text provided for embedding", [0, 0, 0], ""

        try:
            # Get embeddings for the text
            embeddings, total_tokens, cost_string = self.embedding_agent.get_embeddings(text)
            return embeddings, total_tokens, cost_string

        except Exception as e:
            logger.error(f"Error during embedding operation: {str(e)}")
            return f"Error: Embedding operation failed: {str(e)}", [0, 0, 0], ""

class QueryFormulatorTool(BaseTool):
    """Tool for formulating queries from tasks or contexts."""

    def execute(self, tool_input: Dict[str, Any]):
        """
        Formulate a query for a given task or context.

        Args:
            tool_input: Dictionary containing the task or context description
                Expected to have 'str_input' key with the description
                May also have 'img_input' key with a screenshot

        Returns:
            Formulated query as a string
        """
        task = tool_input.get('str_input', '')
        if not task:
            return "Error: No task or context description provided"

        # Use the prompt template and LMM for query formulation
        return self._call_lmm(tool_input)

class Tools:
    """Main Tools class that provides access to all available tools."""

    def __init__(self):
        """Initialize the Tools class."""
        self.tools = {}

    def register_tool(self, tool_name: str, provider: str, model_name: str, **kwargs):
        """
        Register a tool with the specified parameters.

        Args:
            tool_name: Name of the tool to register
            provider: API provider name
            model_name: Model name to use
            **kwargs: Additional parameters to pass to the tool
        """
        tool: BaseTool = ToolFactory.create_tool(tool_name, provider, model_name, **kwargs)
        self.tools[tool_name] = tool

    def execute_tool(self, tool_name: str, tool_input: Dict[str, Any]):
        """
        Execute a tool with the given input.

        Args:
            tool_name: Name of the tool to execute
            tool_input: Input for the tool

        Returns:
            The output of the tool as a string

        Raises:
            ValueError: If the tool is not registered
        """
        if tool_name not in self.tools:
            raise ValueError(f"Tool {tool_name} is not registered")

        return self.tools[tool_name].execute(tool_input)

    def reset(self, tool_name: Optional[str] = None):
        """
        Reset tools by resetting their llm_agent if available.

        Args:
            tool_name: Optional name of the specific tool to reset. If None, resets all tools.
        """
        if tool_name is not None:
            # Reset a specific tool
            if tool_name not in self.tools:
                raise ValueError(f"Tool {tool_name} is not registered")

            tool = self.tools[tool_name]
            if hasattr(tool, 'llm_agent') and tool.llm_agent is not None:
                tool.llm_agent.reset()
        else:
            # Reset all tools
            for tool in self.tools.values():
                # Only reset if the tool has an llm_agent attribute
                if hasattr(tool, 'llm_agent') and tool.llm_agent is not None:
                    tool.llm_agent.reset()