lybic-guiagents 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lybic-guiagents has been flagged as potentially problematic.

Files changed (85)
  1. desktop_env/__init__.py +1 -0
  2. desktop_env/actions.py +203 -0
  3. desktop_env/controllers/__init__.py +0 -0
  4. desktop_env/controllers/python.py +471 -0
  5. desktop_env/controllers/setup.py +882 -0
  6. desktop_env/desktop_env.py +509 -0
  7. desktop_env/evaluators/__init__.py +5 -0
  8. desktop_env/evaluators/getters/__init__.py +41 -0
  9. desktop_env/evaluators/getters/calc.py +15 -0
  10. desktop_env/evaluators/getters/chrome.py +1774 -0
  11. desktop_env/evaluators/getters/file.py +154 -0
  12. desktop_env/evaluators/getters/general.py +42 -0
  13. desktop_env/evaluators/getters/gimp.py +38 -0
  14. desktop_env/evaluators/getters/impress.py +126 -0
  15. desktop_env/evaluators/getters/info.py +24 -0
  16. desktop_env/evaluators/getters/misc.py +406 -0
  17. desktop_env/evaluators/getters/replay.py +20 -0
  18. desktop_env/evaluators/getters/vlc.py +86 -0
  19. desktop_env/evaluators/getters/vscode.py +35 -0
  20. desktop_env/evaluators/metrics/__init__.py +160 -0
  21. desktop_env/evaluators/metrics/basic_os.py +68 -0
  22. desktop_env/evaluators/metrics/chrome.py +493 -0
  23. desktop_env/evaluators/metrics/docs.py +1011 -0
  24. desktop_env/evaluators/metrics/general.py +665 -0
  25. desktop_env/evaluators/metrics/gimp.py +637 -0
  26. desktop_env/evaluators/metrics/libreoffice.py +28 -0
  27. desktop_env/evaluators/metrics/others.py +92 -0
  28. desktop_env/evaluators/metrics/pdf.py +31 -0
  29. desktop_env/evaluators/metrics/slides.py +957 -0
  30. desktop_env/evaluators/metrics/table.py +585 -0
  31. desktop_env/evaluators/metrics/thunderbird.py +176 -0
  32. desktop_env/evaluators/metrics/utils.py +719 -0
  33. desktop_env/evaluators/metrics/vlc.py +524 -0
  34. desktop_env/evaluators/metrics/vscode.py +283 -0
  35. desktop_env/providers/__init__.py +35 -0
  36. desktop_env/providers/aws/__init__.py +0 -0
  37. desktop_env/providers/aws/manager.py +278 -0
  38. desktop_env/providers/aws/provider.py +186 -0
  39. desktop_env/providers/aws/provider_with_proxy.py +315 -0
  40. desktop_env/providers/aws/proxy_pool.py +193 -0
  41. desktop_env/providers/azure/__init__.py +0 -0
  42. desktop_env/providers/azure/manager.py +87 -0
  43. desktop_env/providers/azure/provider.py +207 -0
  44. desktop_env/providers/base.py +97 -0
  45. desktop_env/providers/gcp/__init__.py +0 -0
  46. desktop_env/providers/gcp/manager.py +0 -0
  47. desktop_env/providers/gcp/provider.py +0 -0
  48. desktop_env/providers/virtualbox/__init__.py +0 -0
  49. desktop_env/providers/virtualbox/manager.py +463 -0
  50. desktop_env/providers/virtualbox/provider.py +124 -0
  51. desktop_env/providers/vmware/__init__.py +0 -0
  52. desktop_env/providers/vmware/manager.py +455 -0
  53. desktop_env/providers/vmware/provider.py +105 -0
  54. gui_agents/__init__.py +0 -0
  55. gui_agents/agents/Action.py +209 -0
  56. gui_agents/agents/__init__.py +0 -0
  57. gui_agents/agents/agent_s.py +832 -0
  58. gui_agents/agents/global_state.py +610 -0
  59. gui_agents/agents/grounding.py +651 -0
  60. gui_agents/agents/hardware_interface.py +129 -0
  61. gui_agents/agents/manager.py +568 -0
  62. gui_agents/agents/translator.py +132 -0
  63. gui_agents/agents/worker.py +355 -0
  64. gui_agents/cli_app.py +560 -0
  65. gui_agents/core/__init__.py +0 -0
  66. gui_agents/core/engine.py +1496 -0
  67. gui_agents/core/knowledge.py +449 -0
  68. gui_agents/core/mllm.py +555 -0
  69. gui_agents/tools/__init__.py +0 -0
  70. gui_agents/tools/tools.py +727 -0
  71. gui_agents/unit_test/__init__.py +0 -0
  72. gui_agents/unit_test/run_tests.py +65 -0
  73. gui_agents/unit_test/test_manager.py +330 -0
  74. gui_agents/unit_test/test_worker.py +269 -0
  75. gui_agents/utils/__init__.py +0 -0
  76. gui_agents/utils/analyze_display.py +301 -0
  77. gui_agents/utils/common_utils.py +263 -0
  78. gui_agents/utils/display_viewer.py +281 -0
  79. gui_agents/utils/embedding_manager.py +53 -0
  80. gui_agents/utils/image_axis_utils.py +27 -0
  81. lybic_guiagents-0.1.0.dist-info/METADATA +416 -0
  82. lybic_guiagents-0.1.0.dist-info/RECORD +85 -0
  83. lybic_guiagents-0.1.0.dist-info/WHEEL +5 -0
  84. lybic_guiagents-0.1.0.dist-info/licenses/LICENSE +201 -0
  85. lybic_guiagents-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,727 @@
+ """
+ Tools module for GUI agents.
+ 
+ This module provides various tools for GUI agents to perform tasks such as web search,
+ context fusion, subtask planning, trajectory reflection, memory retrieval, grounding,
+ evaluation, and action generation.
+ """
+ 
+ import os
+ import json
+ import base64
+ import requests
+ import time
+ from typing import Dict, Any, Optional, List, Union, Tuple
+ from abc import ABC, abstractmethod
+ import logging
+ from gui_agents.core.mllm import LLMAgent, WebSearchAgent, EmbeddingAgent
+ import threading
+ from gui_agents.prompts.prompts import system_prompts
+ 
+ logger = logging.getLogger("desktopenv.tools")
+ 
+ class BaseTool(ABC):
+     """Base class for all tools."""
+     _prompts_dict = None
+     _prompts_dict_lock = threading.Lock()
+ 
+     @classmethod
+     def _load_prompts_dict(cls):
+         if cls._prompts_dict is None:
+             with cls._prompts_dict_lock:
+                 if cls._prompts_dict is None:
+                     try:
+                         # Import prompts from prompts.py module
+                         cls._prompts_dict = system_prompts
+                     except Exception as e:
+                         logger.error(f"Failed to load prompts from prompts.py: {e}")
+                         cls._prompts_dict = {}
+ 
+     def __init__(self, provider: str, model_name: str, tool_name: str):
+         """
+         Initialize the base tool.
+ 
+         Args:
+             provider: API provider name (e.g., "gemini", "openai")
+             model_name: Model name to use (e.g., "gemini-2.5-pro")
+             tool_name: Name of the tool (used as a key into the prompts in prompts.py)
+         """
+         self.provider = provider
+         self.model_name = model_name
+         self.tool_name = tool_name
+         self._load_prompts_dict()
+         self._prompt_template = self._get_prompt_template()
+         # Create LLMAgent instance for tool usage
+         self.engine_params = {
+             "engine_type": provider,
+             "model": model_name
+         }
+         self.llm_agent = LLMAgent(engine_params=self.engine_params, system_prompt=self._prompt_template)
+ 
+     def _get_prompt_template(self) -> str:
+         if self.tool_name is None:
+             return ""
+         prompts = self.__class__._prompts_dict
+         if prompts is None:
+             return ""
+         return prompts.get(self.tool_name, "")
+ 
+     def _call_lmm(self, input_data: Dict[str, Any], temperature: float = 0.0):
+         """
+         Call the LMM for inference using the prompt template, with a retry mechanism.
+ 
+         Args:
+             input_data: Dictionary containing input data to format the prompt template
+             temperature: Temperature parameter to control randomness of output
+ 
+         Returns:
+             Tuple of (response text, token counts, cost string)
+         """
+         # self.llm_agent.reset()
+ 
+         # Extract text and image inputs
+         text_input = input_data.get('str_input', '')
+         image_input = input_data.get('img_input', None)
+ 
+         # Add the message with the formatted prompt
+         self.llm_agent.add_message(text_input, image_content=image_input, role="user")
+ 
+         # Implement safe retry mechanism
+         max_retries = 3
+         attempt = 0
+         content, total_tokens, cost_string = "", [0, 0, 0], ""
+ 
+         while attempt < max_retries:
+             try:
+                 content, total_tokens, cost_string = self.llm_agent.get_response(temperature=temperature)
+                 break  # If successful, break out of the loop
+             except Exception as e:
+                 attempt += 1
+                 logger.error(f"LLM call attempt {attempt} failed: {str(e)}")
+                 if attempt == max_retries:
+                     logger.error("Max retries reached. Returning error message.")
+                     return f"Error: LLM call failed after {max_retries} attempts: {str(e)}", [0, 0, 0], ""
+                 time.sleep(1.0)
+         return content, total_tokens, cost_string
+ 
+     @abstractmethod
+     def execute(self, tool_input: Dict[str, Any]) -> Tuple[str, List[int], str]:
+         """
+         Execute the tool with the given input.
+ 
+         Args:
+             tool_input: Dictionary containing the input for the tool
+                 Expected to have 'str_input' and/or 'img_input' keys
+ 
+         Returns:
+             Tuple of (tool output string, token counts, cost string)
+         """
+         pass
+ 
+ 
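Every concrete tool below follows the same pattern: subclass BaseTool, resolve a system prompt from prompts.py via the tool_name key, and funnel 'str_input'/'img_input' through _call_lmm. A minimal sketch of that pattern (editorial, not part of the package diff; the "example_tool" prompt key and the provider/model names are placeholders):

    from typing import Any, Dict
    from gui_agents.tools.tools import BaseTool

    class ExampleTool(BaseTool):
        """Hypothetical tool illustrating the BaseTool contract."""

        def execute(self, tool_input: Dict[str, Any]):
            # Validate the text input, then delegate to the shared LMM call
            if not tool_input.get('str_input'):
                return "Error: No input provided", [0, 0, 0], ""
            return self._call_lmm(tool_input, temperature=0.0)

    # Usage (placeholder provider/model/prompt key; requires a configured engine):
    # tool = ExampleTool("openai", "gpt-4o", "example_tool")
    # text, tokens, cost = tool.execute({"str_input": "Summarize the screen state."})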
+ class ToolFactory:
+     """Factory class for creating tools."""
+ 
+     @staticmethod
+     def create_tool(tool_name: str, provider: str, model_name: str, **kwargs) -> 'BaseTool':
+         """
+         Create a tool instance based on the tool name.
+ 
+         Args:
+             tool_name: Name of the tool to create
+             provider: API provider name
+             model_name: Model name to use
+             **kwargs: Additional parameters to pass to the tool
+ 
+         Returns:
+             An instance of the specified tool
+ 
+         Raises:
+             ValueError: If the tool name is not recognized
+         """
+         tool_map = {
+             "websearch": (WebSearchTool, None),
+             "context_fusion": (ContextFusionTool, "context_fusion"),
+             "subtask_planner": (SubtaskPlannerTool, "subtask_planner"),
+             "traj_reflector": (TrajReflectorTool, "traj_reflector"),
+             "grounding": (GroundingTool, "grounding"),
+             "evaluator": (EvaluatorTool, "evaluator"),
+             "action_generator": (ActionGeneratorTool, "action_generator"),
+             "action_generator_with_takeover": (ActionGeneratorTool, "action_generator_with_takeover"),
+             "fast_action_generator": (FastActionGeneratorTool, "fast_action_generator"),
+             "fast_action_generator_with_takeover": (FastActionGeneratorTool, "fast_action_generator_with_takeover"),
+             "dag_translator": (DAGTranslatorTool, "dag_translator"),
+             "embedding": (EmbeddingTool, None),
+             "query_formulator": (QueryFormulatorTool, "query_formulator"),
+             "text_span": (TextSpanTool, "text_span"),
+             "narrative_summarization": (NarrativeSummarizationTool, "narrative_summarization"),
+             "episode_summarization": (EpisodeSummarizationTool, "episode_summarization")
+         }
+ 
+         if tool_name not in tool_map:
+             raise ValueError(f"Unknown tool name: {tool_name}")
+ 
+         tool_class, prompt_key = tool_map[tool_name]
+ 
+         # WebSearchTool and EmbeddingTool don't need a prompt
+         if tool_name == "websearch":
+             return tool_class(provider, model_name, None, **kwargs)
+         if tool_name == "embedding":
+             return tool_class(provider, model_name, None, **kwargs)
+ 
+         return tool_class(provider, model_name, prompt_key, **kwargs)
+ 
+ 
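The factory maps each public tool name to a (class, prompt key) pair, raises ValueError for unknown names, and special-cases the two promptless tools (websearch, embedding). A short usage sketch (editorial, not part of the package diff; the provider and model names are placeholders):

    from gui_agents.tools.tools import ToolFactory

    # Build a DAG translator through the factory (placeholder engine settings)
    translator = ToolFactory.create_tool("dag_translator", "openai", "gpt-4o")

    # Unknown names fail fast
    try:
        ToolFactory.create_tool("no_such_tool", "openai", "gpt-4o")
    except ValueError as e:
        print(e)  # -> Unknown tool name: no_such_tool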
+ class WebSearchTool(BaseTool):
+     """Tool for performing web searches."""
+ 
+     def __init__(self, provider: str, model_name: str, tool_name: str):
+         """
+         Initialize the web search tool.
+ 
+         Args:
+             provider: API provider name (e.g., "bocha", "exa")
+             model_name: Model name to use (not used for WebSearchAgent)
+             tool_name: Name of the tool (not used by WebSearchTool; the factory passes None)
+         """
+         self.provider = provider
+ 
+         # Create WebSearchAgent instance for search
+         self.engine_params = {
+             "engine_type": provider,
+             "model": model_name,
+         }
+ 
+         # Initialize WebSearchAgent
+         self.search_agent = WebSearchAgent(engine_params=self.engine_params)
+ 
+     def execute(self, tool_input: Dict[str, Any]) -> Tuple[str, List[int], str]:
+         """
+         Execute a web search with the given query.
+ 
+         Args:
+             tool_input: Dictionary containing the search query
+                 Expected to have 'str_input' key with the search query
+ 
+         Returns:
+             Tuple of (answer string, token counts, cost string)
+         """
+         query = tool_input.get('str_input', '')
+         if not query:
+             return "Error: No search query provided", [0, 0, 0], ""
+ 
+         try:
+             # Get the answer from the search results
+             answer, total_tokens, cost = self.search_agent.get_answer(query)
+ 
+             # Return just the answer
+             return answer, total_tokens, cost  # type: ignore
+ 
+         except Exception as e:
+             logger.error(f"Error during web search: {str(e)}")
+             return f"Error: Web search failed: {str(e)}", [0, 0, 0], ""
+ 
+ 
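WebSearchTool wraps WebSearchAgent and ignores the prompt machinery entirely, which is why the factory passes None for tool_name. A usage sketch (editorial, not part of the package diff; "exa" and the empty model name are placeholders for whatever search backend is configured, and an API key for that backend is assumed):

    from gui_agents.tools.tools import WebSearchTool

    search = WebSearchTool(provider="exa", model_name="", tool_name=None)
    answer, tokens, cost = search.execute({"str_input": "latest LibreOffice stable version"})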
+ class ContextFusionTool(BaseTool):
+     """Tool for fusing multiple contexts together."""
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Fuse multiple contexts together.
+ 
+         Args:
+             tool_input: Dictionary containing the contexts to fuse
+                 Expected to have 'str_input' key with JSON-formatted contexts
+ 
+         Returns:
+             Fused context as a string
+         """
+         contexts = tool_input.get('str_input', '')
+         if not contexts:
+             return "Error: No contexts provided"
+ 
+         # Use the prompt template and LMM for context fusion
+         return self._call_lmm(tool_input)
+ 
+ 
+ class SubtaskPlannerTool(BaseTool):
+     """Tool for planning subtasks."""
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Plan subtasks for a given task.
+ 
+         Args:
+             tool_input: Dictionary containing the task description
+                 Expected to have 'str_input' key with the task description
+                 May also have 'img_input' key with a screenshot
+ 
+         Returns:
+             Subtask plan as a string
+         """
+         task = tool_input.get('str_input', '')
+         if not task:
+             return "Error: No task description provided"
+ 
+         # Use the prompt template and LMM for subtask planning
+         return self._call_lmm(tool_input)
+ 
+ 
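The planner-style tools above all share the same calling convention: a 'str_input' string plus an optional 'img_input' screenshot, forwarded unchanged to _call_lmm. A sketch of that convention (editorial, not part of the package diff; the engine settings are placeholders and the screenshot encoding is assumed to be whatever LLMAgent.add_message accepts, e.g. raw PNG bytes):

    from gui_agents.tools.tools import ToolFactory

    planner = ToolFactory.create_tool("subtask_planner", "openai", "gpt-4o")  # placeholder engine
    with open("screenshot.png", "rb") as f:  # hypothetical screenshot file
        screenshot = f.read()
    plan, tokens, cost = planner.execute({
        "str_input": "Task: export the open spreadsheet as PDF.",
        "img_input": screenshot,
    })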
+ class NarrativeSummarizationTool(BaseTool):
+     """Tool for summarizing narrative memories."""
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Summarize narrative memories.
+ 
+         Args:
+             tool_input: Dictionary containing the narrative memory data
+                 Expected to have 'str_input' key with the narrative memory data
+                 May also have 'img_input' key with relevant images
+ 
+         Returns:
+             Summarized narrative as a string
+         """
+         narrative_data = tool_input.get('str_input', '')
+         if not narrative_data:
+             return "Error: No narrative memory data provided"
+ 
+         # Use the prompt template and LMM for narrative summarization
+         return self._call_lmm(tool_input)
+ 
+ 
+ class EpisodeSummarizationTool(BaseTool):
+     """Tool for summarizing episodic memories."""
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Summarize episodic memories.
+ 
+         Args:
+             tool_input: Dictionary containing the episodic memory data
+                 Expected to have 'str_input' key with the episodic memory data
+                 May also have 'img_input' key with relevant images
+ 
+         Returns:
+             Summarized episode as a string
+         """
+         episode_data = tool_input.get('str_input', '')
+         if not episode_data:
+             return "Error: No episodic memory data provided"
+ 
+         # Use the prompt template and LMM for episode summarization
+         return self._call_lmm(tool_input)
+ 
+ 
+ class TextSpanTool(BaseTool):
+     """Tool for processing text spans."""
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Process text spans for a given input.
+ 
+         Args:
+             tool_input: Dictionary containing the text input
+                 Expected to have 'str_input' key with the text content
+                 May also have 'img_input' key with a screenshot
+ 
+         Returns:
+             Processed text spans as a string
+         """
+         text = tool_input.get('str_input', '')
+         if not text:
+             return "Error: No text content provided"
+ 
+         # Use the prompt template and LMM for text span processing
+         return self._call_lmm(tool_input)
+ 
+ 
+ class DAGTranslatorTool(BaseTool):
+     """Tool for translating task descriptions into a DAG (Directed Acyclic Graph) structure."""
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Translate task descriptions into a DAG structure.
+ 
+         Args:
+             tool_input: Dictionary containing the task description
+                 Expected to have 'str_input' key with the task description
+                 May also have 'img_input' key with a screenshot
+ 
+         Returns:
+             DAG representation as a string
+         """
+         task = tool_input.get('str_input', '')
+         if not task:
+             return "Error: No task description provided"
+ 
+         # Use the prompt template and LMM for DAG translation
+         return self._call_lmm(tool_input)
+ 
+ 
+ class TrajReflectorTool(BaseTool):
+     """Tool for reflecting on execution trajectories."""
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Reflect on an execution trajectory.
+ 
+         Args:
+             tool_input: Dictionary containing the trajectory
+                 Expected to have 'str_input' key with the trajectory
+ 
+         Returns:
+             Reflection as a string
+         """
+         trajectory = tool_input.get('str_input', '')
+         if not trajectory:
+             return "Error: No trajectory provided"
+ 
+         # Use the prompt template and LMM for trajectory reflection
+         return self._call_lmm(tool_input)
+ 
+ class GroundingTool(BaseTool):
+     """Tool for grounding agent actions in the environment."""
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Ground agent actions in the environment.
+ 
+         Args:
+             tool_input: Dictionary containing the action and environment state
+                 Expected to have 'str_input' key with the action
+                 Expected to have 'img_input' key with a screenshot
+ 
+         Returns:
+             Grounded action as a string
+         """
+         action = tool_input.get('str_input', '')
+         screenshot = tool_input.get('img_input')
+ 
+         if not action:
+             return "Error: No action provided"
+         if not screenshot:
+             return "Error: No screenshot provided"
+ 
+         # Use the prompt template and LMM for action grounding
+         return self._call_lmm(tool_input)
+ 
+     def get_grounding_wh(self):
+         """
+         Get grounding width and height based on provider and model name.
+ 
+         Returns:
+             If provider is doubao and model_name contains 'ui-tars', returns two values:
+                 grounding_width (int): Width value (1000)
+                 grounding_height (int): Height value (1000)
+             Otherwise returns None, None
+         """
+         if self.provider == "doubao" and "ui-tars" in self.model_name:
+             grounding_width = 1000
+             grounding_height = 1000
+             return grounding_width, grounding_height
+         return None, None
+ 
+ 
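get_grounding_wh exposes the grounder's coordinate space: for doubao "ui-tars" models it reports a 1000x1000 reference frame, otherwise (None, None), which a caller can use to rescale predicted points to real screen pixels. A hedged sketch of such rescaling (editorial, not part of the package diff; the linear rescaling and the screen resolution are assumptions, not something this file shows):

    def rescale_point(x, y, grounding_wh, screen_w=1920, screen_h=1080):
        """Map a predicted point from the grounder's reference frame to screen pixels."""
        gw, gh = grounding_wh
        if gw is None or gh is None:
            return x, y  # assume the model already emits screen coordinates
        return x * screen_w / gw, y * screen_h / gh

    # e.g. with a doubao ui-tars grounder: rescale_point(500, 500, (1000, 1000)) -> (960.0, 540.0)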
+ class EvaluatorTool(BaseTool):
+     """Tool for evaluating agent performance."""
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Evaluate agent performance.
+ 
+         Args:
+             tool_input: Dictionary containing the evaluation data
+                 Expected to have 'str_input' key with the evaluation data
+ 
+         Returns:
+             Evaluation result as a string
+         """
+         eval_data = tool_input.get('str_input', '')
+         if not eval_data:
+             return "Error: No evaluation data provided"
+ 
+         # Use the prompt template and LMM for performance evaluation
+         return self._call_lmm(tool_input)
+ 
+ 
+ class ActionGeneratorTool(BaseTool):
+     """Tool for generating executable actions."""
+ 
+     def __init__(self, provider: str, model_name: str, tool_name: str, **kwargs):
+         """
+         Initialize the action generator tool.
+ 
+         Args:
+             provider: API provider name
+             model_name: Model name to use
+             tool_name: Name of the tool (used as a key into the prompts in prompts.py)
+             **kwargs: Additional parameters, including:
+                 enable_search: Whether to enable web search functionality
+                 search_provider: Provider for web search (defaults to "bocha")
+                 search_model: Model for web search (defaults to "")
+         """
+         super().__init__(provider, model_name, tool_name)
+ 
+         # Extract search-related parameters
+         self.enable_search = kwargs.get("enable_search", False)
+         search_provider = kwargs.get("search_provider", "bocha")
+         search_model = kwargs.get("search_model", "")
+ 
+         # Initialize search tool if enabled
+         self.search_tool = None
+         if self.enable_search:
+             self.search_tool = WebSearchTool(search_provider, search_model, "")
+             logger.info(f"Web search enabled for {tool_name} using provider: {search_provider}")
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Generate executable actions.
+ 
+         Args:
+             tool_input: Dictionary containing the action request
+                 Expected to have 'str_input' key with the action request
+                 May also have 'img_input' key with a screenshot
+ 
+         Returns:
+             Generated action as a string, together with token counts and cost
+         """
+         action_request = tool_input.get('str_input', '')
+         if not action_request:
+             return "Error: No action request provided", [0, 0, 0], ""
+ 
+         # Check if search is enabled
+         if self.enable_search and self.search_tool:
+             try:
+                 # Use the input text directly as search query
+                 search_query = action_request
+                 logger.info(f"Performing web search for query: {search_query}")
+                 search_results, tokens, cost = self.search_tool.execute({"str_input": search_query})
+ 
+                 # Enhance the action request with search results
+                 enhanced_request = f"[Action Request]\n{action_request}\n[End of Action Request]\n\n[Web Search Results for '{action_request}']\n{search_results}\n\n[End of Web Search Results]"
+                 tool_input["str_input"] = enhanced_request
+ 
+                 logger.info(f"Search completed. Found information: {len(search_results)} characters")
+             except Exception as e:
+                 logger.error(f"Error during web search: {e}")
+                 # Continue with original request if search fails
+ 
+         # Use the prompt template and LMM for action generation
+         return self._call_lmm(tool_input)
+ 
+ 
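When enable_search is set, the generator first runs the raw instruction through a WebSearchTool and prepends the results to the prompt before calling the LMM; if the search fails it silently falls back to the original request. A registration sketch (editorial, not part of the package diff; the engine names are placeholders, and "bocha" is simply the default search provider shown above):

    from gui_agents.tools.tools import ToolFactory

    generator = ToolFactory.create_tool(
        "action_generator", "openai", "gpt-4o",        # placeholder engine
        enable_search=True,
        search_provider="bocha",                       # default search backend from the code above
        search_model="",
    )
    action, tokens, cost = generator.execute({"str_input": "Install the latest VLC on Ubuntu 22.04"})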
+ class FastActionGeneratorTool(BaseTool):
+     """Tool for directly generating executable actions without intermediate planning."""
+ 
+     def __init__(self, provider: str, model_name: str, tool_name: str, **kwargs):
+         """
+         Initialize the fast action generator tool.
+ 
+         Args:
+             provider: API provider name
+             model_name: Model name to use
+             tool_name: Name of the tool (used as a key into the prompts in prompts.py)
+             **kwargs: Additional parameters, including:
+                 enable_search: Whether to enable web search functionality
+                 search_provider: Provider for web search (defaults to "bocha")
+                 search_model: Model for web search (defaults to "")
+         """
+         super().__init__(provider, model_name, tool_name)
+ 
+         # Extract search-related parameters
+         self.enable_search = kwargs.get("enable_search", False)
+         search_provider = kwargs.get("search_provider", "bocha")
+         search_model = kwargs.get("search_model", "")
+ 
+         # Initialize search tool if enabled
+         self.search_tool = None
+         if self.enable_search:
+             self.search_tool = WebSearchTool(search_provider, search_model, "")
+             logger.info(f"Web search enabled for {tool_name} using provider: {search_provider}")
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Generate executable actions directly from the instruction and screenshot.
+ 
+         Args:
+             tool_input: Dictionary containing the action request
+                 Expected to have 'str_input' key with the instruction
+                 Expected to have 'img_input' key with a screenshot
+ 
+         Returns:
+             Generated action as a string, token count, and cost
+         """
+         action_request = tool_input.get('str_input', '')
+         screenshot = tool_input.get('img_input')
+         if not action_request:
+             return "Error: No action request provided", [0, 0, 0], ""
+         if not screenshot:
+             return "Error: No screenshot provided", [0, 0, 0], ""
+         # Check if search is enabled
+         if self.enable_search and self.search_tool:
+             try:
+                 # Use the input text directly as search query
+                 search_query = action_request
+                 logger.info(f"Performing web search for query: {search_query}")
+                 search_results, tokens, cost = self.search_tool.execute({"str_input": search_query})
+ 
+                 # Enhance the action request with search results
+                 enhanced_request = f"[Action Request]\n{action_request}\n[End of Action Request]\n\n[Web Search Results for '{action_request}']\n{search_results}\n\n[End of Web Search Results]"
+                 tool_input["str_input"] = enhanced_request
+ 
+                 logger.info(f"Search completed. Found information: {len(search_results)} characters")
+             except Exception as e:
+                 logger.error(f"Error during web search: {e}")
+                 # Continue with original request if search fails
+ 
+         # Use the prompt template and LMM for action generation
+         return self._call_lmm(tool_input)
+ 
+     def get_grounding_wh(self):
+         """
+         Get grounding width and height based on provider and model name.
+ 
+         Returns:
+             If provider is doubao and model_name contains 'ui-tars', returns two values:
+                 grounding_width (int): Width value (1000)
+                 grounding_height (int): Height value (1000)
+             Otherwise returns None, None
+         """
+         if self.provider == "doubao" and "ui-tars" in self.model_name:
+             grounding_width = 1000
+             grounding_height = 1000
+             return grounding_width, grounding_height
+         return None, None
+ 
+ class EmbeddingTool(BaseTool):
+     """Tool for generating text embeddings."""
+ 
+     def __init__(self, provider: str, model_name: str, tool_name: str):
+         """
+         Initialize the embedding tool.
+ 
+         Args:
+             provider: API provider name (e.g., "openai", "gemini")
+             model_name: Model name to use
+             tool_name: Name of the tool (not used by EmbeddingTool; the factory passes None)
+         """
+         self.provider = provider
+         self.model_name = model_name
+         self.tool_name = tool_name
+ 
+         # Create EmbeddingAgent instance
+         self.engine_params = {
+             "engine_type": provider,
+             "embedding_model": model_name
+         }
+ 
+         # Initialize EmbeddingAgent
+         self.embedding_agent = EmbeddingAgent(engine_params=self.engine_params)
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Generate embeddings for the given text.
+ 
+         Args:
+             tool_input: Dictionary containing the text to embed
+                 Expected to have 'str_input' key with the text
+ 
+         Returns:
+             Embeddings returned by the EmbeddingAgent, together with token counts and cost
+         """
+         text = tool_input.get('str_input', '')
+ 
+         if not text:
+             return "Error: No text provided for embedding", [0, 0, 0], ""
+ 
+         try:
+             # Get embeddings for the text
+             embeddings, total_tokens, cost_string = self.embedding_agent.get_embeddings(text)
+             return embeddings, total_tokens, cost_string
+ 
+         except Exception as e:
+             logger.error(f"Error during embedding operation: {str(e)}")
+             return f"Error: Embedding operation failed: {str(e)}", [0, 0, 0], ""
+ 
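Note that EmbeddingTool wires model_name into engine_params as "embedding_model" rather than "model", and, like WebSearchTool, takes no prompt. A usage sketch (editorial, not part of the package diff; the provider and embedding model names are placeholders):

    from gui_agents.tools.tools import ToolFactory

    embedder = ToolFactory.create_tool("embedding", "openai", "text-embedding-3-small")
    vector, tokens, cost = embedder.execute({"str_input": "click the Save button in the toolbar"})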
+ class QueryFormulatorTool(BaseTool):
+     """Tool for formulating queries from tasks or contexts."""
+ 
+     def execute(self, tool_input: Dict[str, Any]):
+         """
+         Formulate a query for a given task or context.
+ 
+         Args:
+             tool_input: Dictionary containing the task or context description
+                 Expected to have 'str_input' key with the description
+                 May also have 'img_input' key with a screenshot
+ 
+         Returns:
+             Formulated query as a string
+         """
+         task = tool_input.get('str_input', '')
+         if not task:
+             return "Error: No task or context description provided"
+ 
+         # Use the prompt template and LMM for query formulation
+         return self._call_lmm(tool_input)
+ 
+ class Tools:
+     """Main Tools class that provides access to all available tools."""
+ 
+     def __init__(self):
+         """Initialize the Tools class."""
+         self.tools = {}
+ 
+     def register_tool(self, tool_name: str, provider: str, model_name: str, **kwargs):
+         """
+         Register a tool with the specified parameters.
+ 
+         Args:
+             tool_name: Name of the tool to register
+             provider: API provider name
+             model_name: Model name to use
+             **kwargs: Additional parameters to pass to the tool
+         """
+         tool: BaseTool = ToolFactory.create_tool(tool_name, provider, model_name, **kwargs)
+         self.tools[tool_name] = tool
+ 
+     def execute_tool(self, tool_name: str, tool_input: Dict[str, Any]):
+         """
+         Execute a tool with the given input.
+ 
+         Args:
+             tool_name: Name of the tool to execute
+             tool_input: Input for the tool
+ 
+         Returns:
+             The output of the tool (typically the response text, token counts, and cost string)
+ 
+         Raises:
+             ValueError: If the tool is not registered
+         """
+         if tool_name not in self.tools:
+             raise ValueError(f"Tool {tool_name} is not registered")
+ 
+         return self.tools[tool_name].execute(tool_input)
+ 
+     def reset(self, tool_name: Optional[str] = None):
+         """
+         Reset tools by resetting their llm_agent if available.
+ 
+         Args:
+             tool_name: Optional name of the specific tool to reset. If None, resets all tools.
+         """
+         if tool_name is not None:
+             # Reset a specific tool
+             if tool_name not in self.tools:
+                 raise ValueError(f"Tool {tool_name} is not registered")
+ 
+             tool = self.tools[tool_name]
+             if hasattr(tool, 'llm_agent') and tool.llm_agent is not None:
+                 tool.llm_agent.reset()
+         else:
+             # Reset all tools
+             for tool in self.tools.values():
+                 # Only reset if the tool has an llm_agent attribute
+                 if hasattr(tool, 'llm_agent') and tool.llm_agent is not None:
+                     tool.llm_agent.reset()
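
The Tools facade ties the module together: register_tool builds instances through ToolFactory, execute_tool dispatches by name, and reset clears each tool's per-conversation LLMAgent state between tasks. An end-to-end sketch (editorial, not part of the package diff; the engine and search-backend names are placeholders):

    from gui_agents.tools.tools import Tools

    tools = Tools()
    tools.register_tool("query_formulator", "openai", "gpt-4o")  # placeholder engine
    tools.register_tool("websearch", "exa", "")                  # placeholder search backend

    query, tokens, cost = tools.execute_tool(
        "query_formulator",
        {"str_input": "Find the keyboard shortcut to merge cells in LibreOffice Calc"},
    )
    answer, tokens, cost = tools.execute_tool("websearch", {"str_input": query})

    tools.reset()  # clear conversation state on every registered tool's llm_agent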