lybic_guiagents-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lybic-guiagents might be problematic.

Files changed (85)
  1. desktop_env/__init__.py +1 -0
  2. desktop_env/actions.py +203 -0
  3. desktop_env/controllers/__init__.py +0 -0
  4. desktop_env/controllers/python.py +471 -0
  5. desktop_env/controllers/setup.py +882 -0
  6. desktop_env/desktop_env.py +509 -0
  7. desktop_env/evaluators/__init__.py +5 -0
  8. desktop_env/evaluators/getters/__init__.py +41 -0
  9. desktop_env/evaluators/getters/calc.py +15 -0
  10. desktop_env/evaluators/getters/chrome.py +1774 -0
  11. desktop_env/evaluators/getters/file.py +154 -0
  12. desktop_env/evaluators/getters/general.py +42 -0
  13. desktop_env/evaluators/getters/gimp.py +38 -0
  14. desktop_env/evaluators/getters/impress.py +126 -0
  15. desktop_env/evaluators/getters/info.py +24 -0
  16. desktop_env/evaluators/getters/misc.py +406 -0
  17. desktop_env/evaluators/getters/replay.py +20 -0
  18. desktop_env/evaluators/getters/vlc.py +86 -0
  19. desktop_env/evaluators/getters/vscode.py +35 -0
  20. desktop_env/evaluators/metrics/__init__.py +160 -0
  21. desktop_env/evaluators/metrics/basic_os.py +68 -0
  22. desktop_env/evaluators/metrics/chrome.py +493 -0
  23. desktop_env/evaluators/metrics/docs.py +1011 -0
  24. desktop_env/evaluators/metrics/general.py +665 -0
  25. desktop_env/evaluators/metrics/gimp.py +637 -0
  26. desktop_env/evaluators/metrics/libreoffice.py +28 -0
  27. desktop_env/evaluators/metrics/others.py +92 -0
  28. desktop_env/evaluators/metrics/pdf.py +31 -0
  29. desktop_env/evaluators/metrics/slides.py +957 -0
  30. desktop_env/evaluators/metrics/table.py +585 -0
  31. desktop_env/evaluators/metrics/thunderbird.py +176 -0
  32. desktop_env/evaluators/metrics/utils.py +719 -0
  33. desktop_env/evaluators/metrics/vlc.py +524 -0
  34. desktop_env/evaluators/metrics/vscode.py +283 -0
  35. desktop_env/providers/__init__.py +35 -0
  36. desktop_env/providers/aws/__init__.py +0 -0
  37. desktop_env/providers/aws/manager.py +278 -0
  38. desktop_env/providers/aws/provider.py +186 -0
  39. desktop_env/providers/aws/provider_with_proxy.py +315 -0
  40. desktop_env/providers/aws/proxy_pool.py +193 -0
  41. desktop_env/providers/azure/__init__.py +0 -0
  42. desktop_env/providers/azure/manager.py +87 -0
  43. desktop_env/providers/azure/provider.py +207 -0
  44. desktop_env/providers/base.py +97 -0
  45. desktop_env/providers/gcp/__init__.py +0 -0
  46. desktop_env/providers/gcp/manager.py +0 -0
  47. desktop_env/providers/gcp/provider.py +0 -0
  48. desktop_env/providers/virtualbox/__init__.py +0 -0
  49. desktop_env/providers/virtualbox/manager.py +463 -0
  50. desktop_env/providers/virtualbox/provider.py +124 -0
  51. desktop_env/providers/vmware/__init__.py +0 -0
  52. desktop_env/providers/vmware/manager.py +455 -0
  53. desktop_env/providers/vmware/provider.py +105 -0
  54. gui_agents/__init__.py +0 -0
  55. gui_agents/agents/Action.py +209 -0
  56. gui_agents/agents/__init__.py +0 -0
  57. gui_agents/agents/agent_s.py +832 -0
  58. gui_agents/agents/global_state.py +610 -0
  59. gui_agents/agents/grounding.py +651 -0
  60. gui_agents/agents/hardware_interface.py +129 -0
  61. gui_agents/agents/manager.py +568 -0
  62. gui_agents/agents/translator.py +132 -0
  63. gui_agents/agents/worker.py +355 -0
  64. gui_agents/cli_app.py +560 -0
  65. gui_agents/core/__init__.py +0 -0
  66. gui_agents/core/engine.py +1496 -0
  67. gui_agents/core/knowledge.py +449 -0
  68. gui_agents/core/mllm.py +555 -0
  69. gui_agents/tools/__init__.py +0 -0
  70. gui_agents/tools/tools.py +727 -0
  71. gui_agents/unit_test/__init__.py +0 -0
  72. gui_agents/unit_test/run_tests.py +65 -0
  73. gui_agents/unit_test/test_manager.py +330 -0
  74. gui_agents/unit_test/test_worker.py +269 -0
  75. gui_agents/utils/__init__.py +0 -0
  76. gui_agents/utils/analyze_display.py +301 -0
  77. gui_agents/utils/common_utils.py +263 -0
  78. gui_agents/utils/display_viewer.py +281 -0
  79. gui_agents/utils/embedding_manager.py +53 -0
  80. gui_agents/utils/image_axis_utils.py +27 -0
  81. lybic_guiagents-0.1.0.dist-info/METADATA +416 -0
  82. lybic_guiagents-0.1.0.dist-info/RECORD +85 -0
  83. lybic_guiagents-0.1.0.dist-info/WHEEL +5 -0
  84. lybic_guiagents-0.1.0.dist-info/licenses/LICENSE +201 -0
  85. lybic_guiagents-0.1.0.dist-info/top_level.txt +2 -0
gui_agents/agents/agent_s.py
@@ -0,0 +1,832 @@
+ import json
+ import logging
+ from math import log
+ import os
+ import platform
+ import textwrap
+ from typing import Dict, List, Optional, Tuple
+
+ from gui_agents.agents.grounding import ACI
+ from gui_agents.agents.worker import Worker
+ from gui_agents.agents.manager import Manager
+ from gui_agents.agents.grounding import Grounding, FastGrounding
+ from gui_agents.utils.common_utils import Node
+ from gui_agents.agents.global_state import GlobalState
+ from gui_agents.store.registry import Registry
+ from gui_agents.utils.common_utils import (
+     # call_llm_safe,
+     parse_single_code_from_string,
+     sanitize_code,
+     extract_first_agent_function,
+     agent_log_to_string,
+ )
+ from gui_agents.tools.tools import Tools
+
+ logger = logging.getLogger("desktopenv.agent")
+
+ class UIAgent:
+     """Base class for UI automation agents"""
+
+     def __init__(
+         self,
+         platform: str = platform.system().lower(),
+     ):
+         """Initialize UIAgent
+
+         Args:
+             platform: Operating system platform (macos, linux, windows)
+         """
+         self.platform = platform
+
+     def reset(self) -> None:
+         """Reset agent state"""
+         pass
+
+     def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]|None:
+         """Generate next action prediction
+
+         Args:
+             instruction: Natural language instruction
+             observation: Current UI state observation
+
+         Returns:
+             Tuple containing agent info dictionary and list of actions
+         """
+         pass
+
+     def update_narrative_memory(self, trajectory: str) -> None:
+         """Update narrative memory with task trajectory
+
+         Args:
+             trajectory: String containing task execution trajectory
+         """
+         pass
+
+     def update_episodic_memory(self, meta_data: Dict, subtask_trajectory: str) -> str|None:
+         """Update episodic memory with subtask trajectory
+
+         Args:
+             meta_data: Metadata about current subtask execution
+             subtask_trajectory: String containing subtask execution trajectory
+
+         Returns:
+             Updated subtask trajectory
+         """
+         pass
+
+ class AgentS2(UIAgent):
+     """Agent that uses hierarchical planning and directed acyclic graph modeling for UI automation"""
+
+     def __init__(
+         self,
+         platform: str = platform.system().lower(),
+         screen_size: List[int] = [1920, 1080],
+         memory_root_path: str = os.getcwd(),
+         memory_folder_name: str = "kb_s2",
+         kb_release_tag: str = "v0.2.2",
+         enable_takeover: bool = False,
+         enable_search: bool = True,
+     ):
+         """Initialize AgentS2
+
+         Args:
+             platform: Operating system platform (darwin, linux, windows)
+             memory_root_path: Path to memory directory. Defaults to current working directory.
+             memory_folder_name: Name of memory folder. Defaults to "kb_s2".
+             kb_release_tag: Release tag for knowledge base. Defaults to "v0.2.2".
+             enable_takeover: Whether to enable user takeover functionality. Defaults to False.
+             enable_search: Whether to enable web search functionality. Defaults to True.
+         """
+         super().__init__(
+             platform,
+         )
+
+         self.memory_root_path = memory_root_path
+         self.memory_folder_name = memory_folder_name
+         self.kb_release_tag = kb_release_tag
+         self.screen_size = screen_size
+         self.enable_takeover = enable_takeover
+         self.enable_search = enable_search
+
+         # Load tools configuration from tools_config.json
+         tools_config_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "tools", "tools_config.json")
+         with open(tools_config_path, "r") as f:
+             self.tools_config = json.load(f)
+         print(f"Loaded tools configuration from: {tools_config_path}")
+         self.Tools_dict = {}
+         for tool in self.tools_config["tools"]:
+             tool_name = tool["tool_name"]
+             self.Tools_dict[tool_name] = {
+                 "provider": tool["provider"],
+                 "model": tool["model_name"]
+             }
+         print(f"Tools configuration: {self.Tools_dict}")
+
+         # Initialize agent's knowledge base path
+         self.local_kb_path = os.path.join(
+             self.memory_root_path, self.memory_folder_name
+         )
+
+         # Check if knowledge base exists
+         kb_platform_path = os.path.join(self.local_kb_path, self.platform)
+         if not os.path.exists(kb_platform_path):
+             print(f"Warning: Knowledge base for {self.platform} platform not found in {self.local_kb_path}")
+             os.makedirs(kb_platform_path, exist_ok=True)
+             print(f"Created directory: {kb_platform_path}")
+             # raise FileNotFoundError(f"Knowledge base path does not exist: {kb_platform_path}")
+         else:
+             print(f"Found local knowledge base path: {kb_platform_path}")
+
+         self.reset()
+
+     def reset(self) -> None:
+         """Reset agent state and initialize components"""
+         # Initialize core components
+
+         self.manager = Manager(
+             Tools_dict=self.Tools_dict,
+             local_kb_path=self.local_kb_path,
+             platform=self.platform,
+             enable_search=self.enable_search,  # Pass global switch to Manager
+         )
+
+         self.worker = Worker(
+             Tools_dict=self.Tools_dict,
+             local_kb_path=self.local_kb_path,
+             platform=self.platform,
+             enable_takeover=self.enable_takeover,
+             enable_search=self.enable_search,  # Pass global switch to Worker
+             tools_config=self.tools_config,  # Pass complete tools configuration
+         )
+
+         self.grounding = Grounding(
+             Tools_dict=self.Tools_dict,
+             platform=self.platform,
+             width=self.screen_size[0],
+             height=self.screen_size[1]
+         )
+
+         # Reset state variables
+         self.requires_replan: bool = True
+         self.needs_next_subtask: bool = True
+         self.step_count: int = 0
+         self.turn_count: int = 0
+         self.failure_subtask: Optional[Node] = None
+         self.should_send_action: bool = False
+         self.completed_tasks: List[Node] = []
+         self.current_subtask: Optional[Node] = None
+         self.subtasks: List[Node] = []
+         self.search_query: str = ""
+         self.subtask_status: str = "Start"
+         self.global_state: GlobalState = Registry.get("GlobalStateStore")  # type: ignore
+
+     def reset_executor_state(self) -> None:
+         """Reset executor and step counter"""
+         self.worker.reset()
+         self.step_count = 0
+
+     def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:
+         # Initialize the three info dictionaries
+         planner_info = {}
+         executor_info = {}
+         evaluator_info = {
+             "obs_evaluator_response": "",
+             "num_input_tokens_evaluator": 0,
+             "num_output_tokens_evaluator": 0,
+             "evaluator_cost": 0.0,
+         }
+         actions = []
+
+         # Record the time at which this prediction starts
+         import time
+         predict_start_time = time.time()
+
+         # If the DONE response by the executor is for a subtask, then the agent should continue with the next subtask without sending the action to the environment
+         while not self.should_send_action:
+             time.sleep(5.0)
+             self.subtask_status = "In"
+             # Always time get_action_queue, even if not called
+             import time
+             manager_start = time.time()
+             # If replan is true, generate a new plan. True at start, after a failed plan, or after subtask completion
+             if self.requires_replan:
+                 logger.info("(RE)PLANNING...")
+                 Manager_info, self.subtasks = self.manager.get_action_queue(
+                     Tu=self.global_state.get_Tu(),
+                     observation=self.global_state.get_obs_for_manager(),
+                     running_state=self.global_state.get_running_state(),
+                     failed_subtask=self.failure_subtask,
+                     completed_subtasks_list=self.global_state.get_completed_subtasks(),
+                     remaining_subtasks_list=self.global_state.get_remaining_subtasks(),
+                 )
+                 self.global_state.set_remaining_subtasks(self.subtasks)  # type: ignore
+
+                 self.requires_replan = False
+                 if "search_query" in Manager_info:
+                     self.search_query = Manager_info["search_query"]
+                 else:
+                     self.search_query = ""
+             get_action_queue_time = time.time() - manager_start
+             logger.info(f"[Timing] manager.get_action_queue execution time: {get_action_queue_time:.2f} seconds")
+             self.global_state.log_operation(
+                 module="manager",
+                 operation="manager.get_action_queue",
+                 data={"duration": get_action_queue_time}
+             )
+
+             # use the executor to complete the topmost subtask
+             if self.needs_next_subtask:
+                 logger.info("GETTING NEXT SUBTASK...")
+
+                 # this can be empty if the DAG planner deems that all subtasks are completed
+                 if len(self.subtasks) <= 0:
+                     self.requires_replan = True
+                     self.needs_next_subtask = True
+                     self.failure_subtask = None
+                     if self.current_subtask is not None:
+                         self.global_state.add_completed_subtask(self.current_subtask)
+                     # reset executor state
+                     self.reset_executor_state()
+                     self.should_send_action = True
+                     self.subtask_status = "Done"
+                     executor_info = {
+                         "executor_plan": "agent.done()",
+                         "plan_code": "agent.done()",
+                         "reflection": "agent.done()",
+                     }
+                     actions = [{"type": "DONE"}]
+
+                     # Log task completion
+                     self.global_state.log_operation(
+                         module="agent",
+                         operation="task_complete",
+                         data={
+                             "content": "All subtasks completed, task finished",
+                             "status": "done"
+                         }
+                     )
+                     break
+
+                 self.current_subtask = self.subtasks.pop(0)
+                 self.global_state.set_remaining_subtasks(self.subtasks)
+                 logger.info(f"NEXT SUBTASK: {self.current_subtask}")
+                 logger.info(f"REMAINING SUBTASKS: {self.subtasks}")
+                 logger.info(f"REMAINING SUBTASKS FROM GLOBAL STATE: {self.global_state.get_remaining_subtasks()}")
+                 self.needs_next_subtask = False
+                 self.subtask_status = "Start"
+
+                 self.global_state.log_operation(
+                     module="agent",
+                     operation="current_subtask",
+                     data={
+                         "content": str(self.current_subtask),
+                         "status": "start"
+                     }
+                 )
+
+             worker_start_time = time.time()
+
+             # get the next action from the worker
+             executor_info = self.worker.generate_next_action(
+                 Tu=instruction,
+                 search_query=self.search_query,
+                 subtask=self.current_subtask.name,  # type: ignore
+                 subtask_info=self.current_subtask.info,  # type: ignore
+                 future_tasks=self.global_state.get_remaining_subtasks(),
+                 done_task=self.global_state.get_completed_subtasks(),
+                 obs=self.global_state.get_obs_for_manager(),
+             )
+
+             worker_execution_time = time.time() - worker_start_time
+
+             self.global_state.log_operation(
+                 module="agent",
+                 operation="worker_execution",
+                 data={
+                     "duration": worker_execution_time,
+                     "subtask": self.current_subtask.name  # type: ignore
+                 }
+             )
+
+             try:
+                 grounding_start_time = time.time()
+                 current_width, current_height = self.global_state.get_screen_size()
+                 self.grounding.reset_screen_size(current_width, current_height)
+                 self.grounding.assign_coordinates(executor_info["executor_plan"], observation)
+                 plan_code = parse_single_code_from_string(executor_info["executor_plan"].split("Grounded Action")[-1])
+                 plan_code = sanitize_code(plan_code)
+                 plan_code = extract_first_agent_function(plan_code)
+                 agent: Grounding = self.grounding  # type: ignore
+                 exec_code = eval(plan_code)  # type: ignore
+                 grounding_execution_time = time.time() - grounding_start_time
+
+                 # Log grounding execution time
+                 self.global_state.log_operation(
+                     module="agent",
+                     operation="grounding_execution",
+                     data={
+                         "duration": grounding_execution_time,
+                         "content": plan_code
+                     }
+                 )
+             except Exception as e:
+                 logger.error("Error in parsing plan code: %s", e)
+                 plan_code = "agent.wait(1.0)"
+                 agent: Grounding = self.grounding  # this agent will be used in next code
+                 exec_code = eval(plan_code)  # type: ignore
+
+                 # Log grounding error
+                 self.global_state.log_operation(
+                     module="agent",
+                     operation="grounding_error",
+                     data={
+                         "content": str(e),
+                         "fallback_action": plan_code
+                     }
+                 )
+
+             actions = [exec_code]
+
+             self.step_count += 1
+
+             # set the should_send_action flag to True if the executor returns an action
+             self.should_send_action = True
+
+             # replan on failure
+             if "fail" in actions[0]["type"].lower():
+                 self.requires_replan = True
+                 self.needs_next_subtask = True
+
+                 # assign the failed subtask
+                 self.global_state.add_failed_subtask(self.current_subtask)  # type: ignore
+                 self.failure_subtask = self.global_state.get_latest_failed_subtask()
+
+                 # Log the failed subtask
+                 self.global_state.log_operation(
+                     module="agent",
+                     operation="subtask_failed",
+                     data={
+                         "content": str(self.current_subtask),
+                         "status": "failed"
+                     }
+                 )
+
+                 # reset the step count, executor, and evaluator
+                 self.reset_executor_state()
+
+                 # if more subtasks are remaining, we don't want to send DONE to the environment but move on to the next subtask
+                 if self.subtasks:
+                     self.should_send_action = False
+
+             # replan on subtask completion
+             elif "done" in actions[0]["type"].lower():
+                 self.requires_replan = True
+                 self.needs_next_subtask = True
+                 self.failure_subtask = None
+                 self.global_state.add_completed_subtask(self.current_subtask)  # type: ignore
+
+                 # Log the completed subtask
+                 self.global_state.log_operation(
+                     module="agent",
+                     operation="subtask_completed",
+                     data={
+                         "content": str(self.current_subtask),
+                         "status": "completed"
+                     }
+                 )
+
+                 # reset the step count, executor, and evaluator
+                 self.reset_executor_state()
+
+                 # if more subtasks are remaining, we don't want to send DONE to the environment but move on to the next subtask
+                 if self.subtasks:
+                     self.should_send_action = False
+                 self.subtask_status = "Done"
+
+             self.turn_count += 1
+
+         # reset the should_send_action flag for next iteration
+         self.should_send_action = False
+
+         # concatenate the three info dictionaries
+         info = {
+             **{
+                 k: v
+                 for d in [planner_info or {}, executor_info or {}, evaluator_info or {}]
+                 for k, v in d.items()
+             }
+         }
+         info.update(
+             {
+                 "subtask": self.current_subtask.name,  # type: ignore
+                 "subtask_info": self.current_subtask.info,  # type: ignore
+                 "subtask_status": self.subtask_status,
+             }
+         )
+
+         # Log total execution time of predict()
+         predict_total_time = time.time() - predict_start_time
+         self.global_state.log_operation(
+             module="agent",
+             operation="predict_execution",
+             data={
+                 "duration": predict_total_time,
+                 "step_count": self.step_count,
+                 "turn_count": self.turn_count,
+                 "subtask_status": self.subtask_status
+             }
+         )
+
+         return info, actions  # type: ignore
+
+
+     def update_narrative_memory(self, trajectory: str) -> None:
+         """Update narrative memory from task trajectory
+
+         Args:
+             trajectory: String containing task execution trajectory
+         """
+         try:
+             reflection_path = os.path.join(
+                 self.local_kb_path, self.platform, "narrative_memory.json"
+             )
+             try:
+                 reflections = json.load(open(reflection_path))
+             except:
+                 reflections = {}
+
+             if self.search_query not in reflections:
+                 reflection = self.manager.summarize_narrative(trajectory)
+                 reflections[self.search_query] = reflection
+
+             with open(reflection_path, "w") as f:
+                 json.dump(reflections, f, indent=2)
+
+         except Exception as e:
+             logger.error(f"Failed to update narrative memory: {e}")
+
+     def update_episodic_memory(self, meta_data: Dict, subtask_trajectory: str) -> str:
+         """Update episodic memory from subtask trajectory
+
+         Args:
+             meta_data: Metadata about current subtask execution
+             subtask_trajectory: String containing subtask execution trajectory
+
+         Returns:
+             Updated subtask trajectory
+         """
+         subtask = meta_data["subtask"]
+         subtask_info = meta_data["subtask_info"]
+         subtask_status = meta_data["subtask_status"]
+         # Handle subtask trajectory
+         if subtask_status == "Start" or subtask_status == "Done":
+             # If it's a new subtask start, finalize the previous subtask trajectory if it exists
+             if subtask_trajectory:
+                 subtask_trajectory += "\nSubtask Completed.\n"
+                 subtask_key = subtask_trajectory.split(
+                     "\n----------------------\n\nPlan:\n"
+                 )[0]
+                 try:
+                     subtask_path = os.path.join(
+                         self.local_kb_path, self.platform, "episodic_memory.json"
+                     )
+                     kb = json.load(open(subtask_path))
+                 except:
+                     kb = {}
+                 if subtask_key not in kb.keys():
+                     subtask_summarization = self.manager.summarize_episode(
+                         subtask_trajectory
+                     )
+                     kb[subtask_key] = subtask_summarization
+                 else:
+                     subtask_summarization = kb[subtask_key]
+                 logger.info("subtask_key: %s", subtask_key)
+                 logger.info("subtask_summarization: %s", subtask_summarization)
+                 with open(subtask_path, "w") as fout:
+                     json.dump(kb, fout, indent=2)
+                 # Reset for the next subtask
+                 subtask_trajectory = ""
+             # Start a new subtask trajectory
+             subtask_trajectory = (
+                 "Task:\n"
+                 + self.search_query
+                 + "\n\nSubtask: "
+                 + subtask
+                 + "\nSubtask Instruction: "
+                 + subtask_info
+                 + "\n----------------------\n\nPlan:\n"
+                 + meta_data["executor_plan"]
+                 + "\n"
+             )
+         elif subtask_status == "In":
+             # Continue appending to the current subtask trajectory if it's still ongoing
+             subtask_trajectory += (
+                 "\n----------------------\n\nPlan:\n"
+                 + meta_data["executor_plan"]
+                 + "\n"
+             )
+
+         return subtask_trajectory
+
+ class AgentSFast(UIAgent):
+     """Fast version of AgentS2 that directly generates actions using the fast_action_generator tool"""
+
+     def __init__(
+         self,
+         platform: str = platform.system().lower(),
+         screen_size: List[int] = [1920, 1080],
+         memory_root_path: str = os.getcwd(),
+         memory_folder_name: str = "kb_s2",
+         kb_release_tag: str = "v0.2.2",
+         enable_takeover: bool = False,
+         enable_search: bool = True,
+         enable_reflection: bool = True,
+         # enable_reflection: bool = False,
+     ):
+         """Initialize AgentSFast
+
+         Args:
+             platform: Operating system platform (darwin, linux, windows)
+             memory_root_path: Path to memory directory. Defaults to current working directory.
+             memory_folder_name: Name of memory folder. Defaults to "kb_s2".
+             kb_release_tag: Release tag for knowledge base. Defaults to "v0.2.2".
+             enable_takeover: Whether to enable user takeover functionality. Defaults to False.
+             enable_search: Whether to enable web search functionality. Defaults to True.
+             enable_reflection: Whether to enable reflection functionality. Defaults to True.
+         """
+         super().__init__(
+             platform,
+         )
+
+         self.memory_root_path = memory_root_path
+         self.memory_folder_name = memory_folder_name
+         self.kb_release_tag = kb_release_tag
+         self.screen_size = screen_size
+         self.enable_takeover = enable_takeover
+         self.enable_search = enable_search
+         self.enable_reflection = enable_reflection
+
+         # Load tools configuration from tools_config.json
+         tools_config_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "tools", "tools_config.json")
+         with open(tools_config_path, "r") as f:
+             self.tools_config = json.load(f)
+         print(f"Loaded tools configuration from: {tools_config_path}")
+         self.Tools_dict = {}
+         for tool in self.tools_config["tools"]:
+             tool_name = tool["tool_name"]
+             self.Tools_dict[tool_name] = {
+                 "provider": tool["provider"],
+                 "model": tool["model_name"]
+             }
+         print(f"Tools configuration: {self.Tools_dict}")
+
+         # Initialize agent's knowledge base path
+         self.local_kb_path = os.path.join(
+             self.memory_root_path, self.memory_folder_name
+         )
+
+         # Check if knowledge base exists
+         kb_platform_path = os.path.join(self.local_kb_path, self.platform)
+         if not os.path.exists(kb_platform_path):
+             print(f"Warning: Knowledge base for {self.platform} platform not found in {self.local_kb_path}")
+             os.makedirs(kb_platform_path, exist_ok=True)
+             print(f"Created directory: {kb_platform_path}")
+         else:
+             print(f"Found local knowledge base path: {kb_platform_path}")
+
+         self.reset()
+
+     def reset(self) -> None:
+         """Reset agent state and initialize components"""
+         # Initialize the fast action generator tool
+         self.fast_action_generator = Tools()
+         self.fast_action_generator_tool = "fast_action_generator_with_takeover" if self.enable_takeover else "fast_action_generator"
+
+         # Get tool configuration from tools_config
+         tool_config = None
+         for tool in self.tools_config["tools"]:
+             if tool["tool_name"] == self.fast_action_generator_tool:
+                 tool_config = tool
+                 break
+
+         # Prepare tool parameters
+         tool_params = {}
+
+         # First check global search switch
+         if not self.enable_search:
+             # If global search is disabled, force disable search for this tool
+             tool_params["enable_search"] = False
+             logger.info(f"Configuring {self.fast_action_generator_tool} with search DISABLED (global switch off)")
+         else:
+             # If global search is enabled, check tool-specific config
+             if tool_config and "enable_search" in tool_config:
+                 # Use enable_search from config file
+                 enable_search = tool_config.get("enable_search", False)
+                 tool_params["enable_search"] = enable_search
+                 tool_params["search_provider"] = tool_config.get("search_provider", "bocha")
+                 tool_params["search_model"] = tool_config.get("search_model", "")
+
+                 logger.info(f"Configuring {self.fast_action_generator_tool} with search enabled: {enable_search} (from config)")
+
+         # Register the tool with parameters
+         self.fast_action_generator.register_tool(
+             self.fast_action_generator_tool,
+             self.Tools_dict[self.fast_action_generator_tool]["provider"],
+             self.Tools_dict[self.fast_action_generator_tool]["model"],
+             **tool_params
+         )
+
+         if self.enable_reflection:
+             self.reflection_agent = Tools()
+             self.reflection_agent.register_tool(
+                 "traj_reflector", self.Tools_dict["traj_reflector"]["provider"],
+                 self.Tools_dict["traj_reflector"]["model"])
+             self.reflections = []
+             self.planner_history = []
+
+         self.grounding_width, self.grounding_height = self.fast_action_generator.tools[self.fast_action_generator_tool].get_grounding_wh()
+         if self.grounding_width is None or self.grounding_height is None:
+             self.grounding_width = self.screen_size[0]
+             self.grounding_height = self.screen_size[1]
+         self.grounding = FastGrounding(
+             Tools_dict=self.Tools_dict,
+             platform=self.platform,
+             width=self.screen_size[0],
+             height=self.screen_size[1],
+             grounding_width=self.grounding_width,
+             grounding_height=self.grounding_height
+         )
+
+         # Reset state variables
+         self.step_count: int = 0
+         self.turn_count: int = 0
+         self.global_state: GlobalState = Registry.get("GlobalStateStore")  # type: ignore
+         self.latest_action = None
+
+     def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:
+         """Generate next action prediction using only the fast_action_generator tool
+
+         Args:
+             instruction: Natural language instruction
+             observation: Current UI state observation
+
+         Returns:
+             Tuple containing agent info dictionary and list of actions
+         """
+         import time
+         predict_start_time = time.time()
+
+         fast_action_start_time = time.time()
+
+         reflection = None
+         if self.enable_reflection:
+             if self.turn_count == 0:
+                 text_content = textwrap.dedent(f"""
+                 Task Description: {instruction}
+                 """)
+                 self.reflection_agent.tools["traj_reflector"].llm_agent.add_message(
+                     text_content + "\n\nThe initial screen is provided. No action has been taken yet.",
+                     image_content=observation["screenshot"],
+                     role="user")
+                 self.global_state.add_agent_log({
+                     "type": "passive",
+                     "content": "Reflection: " + text_content + "\n\nThe initial screen is provided. No action has been taken yet."
+                 })
+             else:
+                 agent_log = agent_log_to_string(self.global_state.get_agent_log())
+                 text_content = f"Please refer to the agent log to understand the progress and context of the task so far.\n{agent_log}"
+
+                 reflection_start = time.time()
+                 reflection, total_tokens, cost_string = self.reflection_agent.execute_tool(
+                     "traj_reflector", {
+                         "str_input": text_content,
+                         "img_input": observation["screenshot"]
+                     })
+                 reflection = str(reflection)
+                 self.reflection_agent.reset("traj_reflector")
+                 self.global_state.add_agent_log({
+                     "type": "passive",
+                     "content": "Reflection: " + reflection
+                 })
+                 logger.info(f"Trajectory reflector tokens: {total_tokens}, cost: {cost_string}")
+                 reflection_time = time.time() - reflection_start
+                 logger.info(f"[Timing] AgentSFast.traj_reflector execution time: {reflection_time:.2f} seconds")
+                 self.reflections.append(reflection)
+                 logger.info("REFLECTION: %s", reflection)
+                 self.global_state.log_operation(
+                     module="agent",
+                     operation="reflection",
+                     data={
+                         "tokens": total_tokens,
+                         "cost": cost_string,
+                         "content": reflection,
+                         "duration": reflection_time
+                     })
+
+         agent_log = agent_log_to_string(self.global_state.get_agent_log())
+
+         generator_message = textwrap.dedent(f"""
+         Task Description: {instruction}
+         """)
+
+         generator_message += f"\n\nPlease refer to the agent log to understand the progress and context of the task so far.\n{agent_log}"
+
+         fast_action_start_time = time.time()
+
+         plan, total_tokens, cost_string = self.fast_action_generator.execute_tool(
+             self.fast_action_generator_tool,
+             {
+                 "str_input": generator_message,
+                 "img_input": observation["screenshot"]
+             }
+         )
+         self.fast_action_generator.reset(self.fast_action_generator_tool)
+
+         fast_action_execution_time = time.time() - fast_action_start_time
+
+         self.global_state.log_operation(
+             module="agent",
+             operation="fast_action_execution",
+             data={
+                 "duration": fast_action_execution_time,
+                 "tokens": total_tokens,
+                 "cost": cost_string
+             }
+         )
+
+         logger.info("Fast Action Plan: %s", plan)
+
+         current_width, current_height = self.global_state.get_screen_size()
+         self.grounding.reset_screen_size(current_width, current_height)
+         try:
+             code_pattern = r"```python\s*(.*?)\s*```"
+             import re
+             match = re.search(code_pattern, plan, re.DOTALL)
+
+             if match:
+                 action_code = match.group(1).strip()
+                 logger.info("Extracted action code: %s", action_code)
+
+                 agent: FastGrounding = self.grounding  # type: ignore
+                 exec_code = eval(action_code)  # type: ignore
+                 actions = [exec_code]
+                 self.latest_action = action_code
+             else:
+                 logger.warning("No code block found, trying to parse the entire response")
+                 action_code = plan.strip()
+
+                 if action_code.startswith("agent."):
+                     agent: FastGrounding = self.grounding  # type: ignore
+                     exec_code = eval(action_code)  # type: ignore
+                     actions = [exec_code]
+                     self.latest_action = action_code
+                 else:
+                     logger.error("Could not parse action, using wait action")
+                     self.global_state.add_agent_log({
+                         "type": "Wrong action code format",
+                         "content": action_code
+                     })
+                     agent: FastGrounding = self.grounding  # type: ignore
+                     exec_code = eval("agent.wait(1000)")  # type: ignore
+                     actions = [exec_code]
+                     self.latest_action = "agent.wait(1000)"
+         except Exception as e:
+             logger.error("Error in parsing action code: %s", e)
+             self.global_state.add_agent_log({
+                 "type": "Error in parsing action code",
+                 "content": str(e)  # Convert Exception to string
+             })
+             agent: FastGrounding = self.grounding  # type: ignore
+             exec_code = eval("agent.wait(1000)")  # type: ignore
+             actions = [exec_code]
+             self.latest_action = "agent.wait(1000)"
+
+             self.global_state.log_operation(
+                 module="agent",
+                 operation="fast_action_error",
+                 data={
+                     "content": str(e),
+                     "fallback_action": "agent.wait(1000)"
+                 }
+             )
+
+         self.step_count += 1
+         self.turn_count += 1
+
+         executor_info = {
+             "executor_plan": plan,
+             "reflection": reflection or "",
+             "plan_code": self.latest_action
+         }
+
+         predict_total_time = time.time() - predict_start_time
+         self.global_state.log_operation(
+             module="agent",
+             operation="predict_execution_fast_direct",
+             data={
+                 "duration": predict_total_time,
+                 "step_count": self.step_count,
+                 "turn_count": self.turn_count
+             }
+         )
+
+         return executor_info, actions
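
For orientation, a minimal usage sketch of the AgentS2 interface added in this file follows. It is not part of the package: it only exercises the constructor and predict() signatures visible in the diff, and it assumes that a GlobalState instance has already been registered under the "GlobalStateStore" key (both agent classes fetch it via Registry.get("GlobalStateStore") during reset()) and that the observation dictionary carries a screenshot under the "screenshot" key; how that screenshot is captured and what exact format the grounding layer expects depend on the rest of the package.

# Hypothetical usage sketch (not part of the packaged code). Assumes a GlobalState
# has already been registered as "GlobalStateStore" in gui_agents.store.registry.Registry
# before the agent is constructed, since AgentS2.reset() looks it up there.
from gui_agents.agents.agent_s import AgentS2

agent = AgentS2(
    platform="linux",            # "darwin", "linux", or "windows"
    screen_size=[1920, 1080],
    enable_takeover=False,
    enable_search=False,         # disable web search for a local dry run
)

# predict() takes the user instruction and the current observation; per this diff the
# grounding step reads observation["screenshot"] (format depends on the capture layer).
observation = {"screenshot": open("screenshot.png", "rb").read()}
info, actions = agent.predict(
    instruction="Open a terminal and list the home directory",
    observation=observation,
)
print(info["subtask"], info["subtask_status"])
print(actions)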