lybic-guiagents 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lybic-guiagents might be problematic. Click here for more details.

Files changed (85) hide show
  1. desktop_env/__init__.py +1 -0
  2. desktop_env/actions.py +203 -0
  3. desktop_env/controllers/__init__.py +0 -0
  4. desktop_env/controllers/python.py +471 -0
  5. desktop_env/controllers/setup.py +882 -0
  6. desktop_env/desktop_env.py +509 -0
  7. desktop_env/evaluators/__init__.py +5 -0
  8. desktop_env/evaluators/getters/__init__.py +41 -0
  9. desktop_env/evaluators/getters/calc.py +15 -0
  10. desktop_env/evaluators/getters/chrome.py +1774 -0
  11. desktop_env/evaluators/getters/file.py +154 -0
  12. desktop_env/evaluators/getters/general.py +42 -0
  13. desktop_env/evaluators/getters/gimp.py +38 -0
  14. desktop_env/evaluators/getters/impress.py +126 -0
  15. desktop_env/evaluators/getters/info.py +24 -0
  16. desktop_env/evaluators/getters/misc.py +406 -0
  17. desktop_env/evaluators/getters/replay.py +20 -0
  18. desktop_env/evaluators/getters/vlc.py +86 -0
  19. desktop_env/evaluators/getters/vscode.py +35 -0
  20. desktop_env/evaluators/metrics/__init__.py +160 -0
  21. desktop_env/evaluators/metrics/basic_os.py +68 -0
  22. desktop_env/evaluators/metrics/chrome.py +493 -0
  23. desktop_env/evaluators/metrics/docs.py +1011 -0
  24. desktop_env/evaluators/metrics/general.py +665 -0
  25. desktop_env/evaluators/metrics/gimp.py +637 -0
  26. desktop_env/evaluators/metrics/libreoffice.py +28 -0
  27. desktop_env/evaluators/metrics/others.py +92 -0
  28. desktop_env/evaluators/metrics/pdf.py +31 -0
  29. desktop_env/evaluators/metrics/slides.py +957 -0
  30. desktop_env/evaluators/metrics/table.py +585 -0
  31. desktop_env/evaluators/metrics/thunderbird.py +176 -0
  32. desktop_env/evaluators/metrics/utils.py +719 -0
  33. desktop_env/evaluators/metrics/vlc.py +524 -0
  34. desktop_env/evaluators/metrics/vscode.py +283 -0
  35. desktop_env/providers/__init__.py +35 -0
  36. desktop_env/providers/aws/__init__.py +0 -0
  37. desktop_env/providers/aws/manager.py +278 -0
  38. desktop_env/providers/aws/provider.py +186 -0
  39. desktop_env/providers/aws/provider_with_proxy.py +315 -0
  40. desktop_env/providers/aws/proxy_pool.py +193 -0
  41. desktop_env/providers/azure/__init__.py +0 -0
  42. desktop_env/providers/azure/manager.py +87 -0
  43. desktop_env/providers/azure/provider.py +207 -0
  44. desktop_env/providers/base.py +97 -0
  45. desktop_env/providers/gcp/__init__.py +0 -0
  46. desktop_env/providers/gcp/manager.py +0 -0
  47. desktop_env/providers/gcp/provider.py +0 -0
  48. desktop_env/providers/virtualbox/__init__.py +0 -0
  49. desktop_env/providers/virtualbox/manager.py +463 -0
  50. desktop_env/providers/virtualbox/provider.py +124 -0
  51. desktop_env/providers/vmware/__init__.py +0 -0
  52. desktop_env/providers/vmware/manager.py +455 -0
  53. desktop_env/providers/vmware/provider.py +105 -0
  54. gui_agents/__init__.py +0 -0
  55. gui_agents/agents/Action.py +209 -0
  56. gui_agents/agents/__init__.py +0 -0
  57. gui_agents/agents/agent_s.py +832 -0
  58. gui_agents/agents/global_state.py +610 -0
  59. gui_agents/agents/grounding.py +651 -0
  60. gui_agents/agents/hardware_interface.py +129 -0
  61. gui_agents/agents/manager.py +568 -0
  62. gui_agents/agents/translator.py +132 -0
  63. gui_agents/agents/worker.py +355 -0
  64. gui_agents/cli_app.py +560 -0
  65. gui_agents/core/__init__.py +0 -0
  66. gui_agents/core/engine.py +1496 -0
  67. gui_agents/core/knowledge.py +449 -0
  68. gui_agents/core/mllm.py +555 -0
  69. gui_agents/tools/__init__.py +0 -0
  70. gui_agents/tools/tools.py +727 -0
  71. gui_agents/unit_test/__init__.py +0 -0
  72. gui_agents/unit_test/run_tests.py +65 -0
  73. gui_agents/unit_test/test_manager.py +330 -0
  74. gui_agents/unit_test/test_worker.py +269 -0
  75. gui_agents/utils/__init__.py +0 -0
  76. gui_agents/utils/analyze_display.py +301 -0
  77. gui_agents/utils/common_utils.py +263 -0
  78. gui_agents/utils/display_viewer.py +281 -0
  79. gui_agents/utils/embedding_manager.py +53 -0
  80. gui_agents/utils/image_axis_utils.py +27 -0
  81. lybic_guiagents-0.1.0.dist-info/METADATA +416 -0
  82. lybic_guiagents-0.1.0.dist-info/RECORD +85 -0
  83. lybic_guiagents-0.1.0.dist-info/WHEEL +5 -0
  84. lybic_guiagents-0.1.0.dist-info/licenses/LICENSE +201 -0
  85. lybic_guiagents-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,129 @@
1
+ from __future__ import annotations
2
+
3
+ import pyautogui
4
+ from gui_agents.agents.Backend.Backend import Backend
5
+ from gui_agents.agents.Backend.ADBBackend import ADBBackend
6
+ from gui_agents.agents.Backend.LybicBackend import LybicBackend
7
+ from gui_agents.agents.Backend.PyAutoGUIBackend import PyAutoGUIBackend
8
+ from gui_agents.agents.Backend.PyAutoGUIVMwareBackend import PyAutoGUIVMwareBackend
9
+ """hardware_interface.py ▸ Execute Action objects on real devices / emulators
10
+ ===============================================================================
11
+ This module is the *single entry point* that upper‑layer planners / executors
12
+ use to perform UI operations. It is deliberately thin:
13
+
14
+ * Accepts one `Action` **or** a `List[Action]` (defined in *actions.py*).
15
+ * Delegates to a concrete *Backend* which knows how to translate the `Action`
16
+ into platform‑specific calls (PyAutoGUI, ADB, Lybic cloud device, …).
17
+ * Performs minimal capability checks + error propagation.
18
+
19
+ The default backend implemented here is **PyAutoGUIBackend**. Stubs for
20
+ **ADBBackend** and **LybicBackend** show how to extend the system.
21
+
22
+ --------------------------------------------------------------------------
23
+ Quick usage
24
+ --------------------------------------------------------------------------
25
+ ```python
26
+ from actions import Click
27
+ from hardware_interface import HardwareInterface
28
+
29
+ hwi = HardwareInterface(backend="pyautogui")
30
+
31
+ # Single action
32
+ hwi.dispatch(Click(xy=(960, 540)))
33
+
34
+ # Batch
35
+ plan = [Click(xy=(100,200)), Click(xy=(300,400))]
36
+ hwi.dispatch(plan)
37
+
38
+ # actionDict
39
+ hwi.dispatchDict({"type": "Click", "xy": [200, 300]})
40
+
41
+ ```
42
+ """
43
+
44
+ from typing import List, Type, Dict, Set, Union
45
+
46
+ # Import your Action primitives
47
+ from gui_agents.agents.Action import (
48
+ Action,
49
+ Screenshot,
50
+ )
51
+
52
# Public API of this module: the facade plus the backend classes re-exported
# for convenience (`from hardware_interface import *`).
__all__ = [
    "HardwareInterface",
    "Backend",
    "PyAutoGUIBackend",
    "ADBBackend",
    "LybicBackend",
    "PyAutoGUIVMwareBackend",
]
60
+
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Facade – single entry point
65
+ # ---------------------------------------------------------------------------
66
# ---------------------------------------------------------------------------
# Facade – single entry point
# ---------------------------------------------------------------------------
class HardwareInterface:
    """High-level facade that routes Action objects to a chosen backend."""

    # Registry mapping backend aliases (lower-case) to concrete Backend classes.
    BACKEND_MAP: Dict[str, Type[Backend]] = {
        "pyautogui": PyAutoGUIBackend,
        "adb": ADBBackend,
        "lybic": LybicBackend,
        "pyautogui_vmware": PyAutoGUIVMwareBackend,
    }

    # ------------------------------------------------------------------
    def __init__(self, backend: str | Backend = "pyautogui", **backend_kwargs):
        """Create the facade.

        Args:
            backend: An already-constructed ``Backend`` instance, or one of the
                keys of ``BACKEND_MAP`` (matched case-insensitively).
            **backend_kwargs: Forwarded verbatim to the backend constructor
                when *backend* is given by name.

        Raises:
            ValueError: If *backend* is a string not present in ``BACKEND_MAP``.
        """
        if isinstance(backend, Backend):
            self.backend: Backend = backend
        else:
            key = backend.lower()
            if key not in self.BACKEND_MAP:
                raise ValueError(f"Unsupported backend '{backend}'. Available: {list(self.BACKEND_MAP)}")
            self.backend = self.BACKEND_MAP[key](**backend_kwargs)

    # ------------------------------------------------------------------
    def dispatch(self, actions: Action | List[Action]):
        """Execute one or multiple actions *in order*.

        Args:
            actions: `Action` instance or list thereof.

        Returns:
            The backend's return value when a single action is dispatched
            (e.g. a Screenshot result); ``None`` for batches.

        Raises:
            NotImplementedError: If the backend does not support an action type.
        """
        if isinstance(actions, Action):
            actions = [actions]

        for act in actions:
            # Memorize actions are consumed by the agent layer and must not be
            # forwarded to the backend.  BUGFIX: the original code used
            # `return None` here, which silently aborted the remainder of a
            # batch as soon as a Memorize action was encountered; `continue`
            # skips only the Memorize action itself.
            if type(act).__name__ == "Memorize":
                continue
            if not self.backend.supports(type(act)):
                raise NotImplementedError(
                    f"{type(act).__name__} is not supported by backend {self.backend.__class__.__name__}"
                )
            # Single-action calls propagate the backend's return value; batch
            # calls execute for side effects only (matching the original
            # contract).  `actions` is always a list at this point.
            if len(actions) == 1:
                return self.backend.execute(act)
            self.backend.execute(act)

    def dispatchDict(self, actionDict: Dict):
        """Convenience helper – accept JSON-style dict(s) instead of Action objects.

        Parameters
        ----------
        actionDict : Dict | List[Dict]
            - Dict: single action, e.g. {"type": "Click", "xy": [100,200], ...}
            - List: sequence of actions in the above format

        Returns
        -------
        Whatever :meth:`dispatch` returns for the converted action(s).
        """
        if isinstance(actionDict, list):
            actions = [Action.from_dict(item) for item in actionDict]
        else:
            actions = Action.from_dict(actionDict)

        return self.dispatch(actions)
@@ -0,0 +1,568 @@
1
+ import logging
2
+ import re
3
+ from collections import defaultdict
4
+ from typing import Dict, List, Optional, Tuple
5
+ import platform
6
+
7
+ from gui_agents.agents.grounding import ACI
8
+ from gui_agents.core.knowledge import KnowledgeBase
9
+ from gui_agents.agents.global_state import GlobalState
10
+ from gui_agents.store.registry import Registry
11
+ from gui_agents.utils.common_utils import (
12
+ Dag,
13
+ Node,
14
+ parse_dag,
15
+ agent_log_to_string,
16
+ )
17
+ from gui_agents.tools.tools import Tools
18
+ from PIL import Image
19
+ import io
20
+
21
# Module-level logger shared by the planning components in this file.
logger = logging.getLogger("desktopenv.agent")

NUM_IMAGE_TOKEN = 1105  # Value set of screen of size 1920x1080 for openai vision
24
+
25
class Manager:
    # High-level planner: retrieves knowledge, generates a step-by-step plan,
    # translates it into a DAG of subtasks and orders them for execution.
    def __init__(
        self,
        Tools_dict: Dict,
        # engine_params: Dict,
        local_kb_path: str,
        multi_round: bool = False,
        platform: str = platform.system().lower(),
        enable_search: bool = True,
    ):
        """Build the planning toolchain.

        Args:
            Tools_dict: Mapping of tool name -> {"provider": ..., "model": ...}
                used to register each LLM-backed tool.  Must contain at least
                the keys used below (subtask_planner, dag_translator,
                narrative_summarization, episode_summarization, embedding,
                query_formulator, context_fusion; websearch if enable_search).
            local_kb_path: Root directory of the local knowledge base.
            multi_round: Whether the agent runs multi-round interaction.
            platform: OS identifier.  NOTE: the default is evaluated once at
                class-definition time (import-time host OS).
            enable_search: If False, no web-search tool is registered and
                ``self.search_engine`` stays ``None``.
        """
        self.platform = platform
        self.Tools_dict = Tools_dict

        self.generator_agent = Tools()
        self.generator_agent.register_tool("subtask_planner", Tools_dict["subtask_planner"]["provider"], Tools_dict["subtask_planner"]["model"])

        self.dag_translator_agent = Tools()
        self.dag_translator_agent.register_tool("dag_translator", self.Tools_dict["dag_translator"]["provider"], self.Tools_dict["dag_translator"]["model"])

        self.narrative_summarization_agent = Tools()
        self.narrative_summarization_agent.register_tool("narrative_summarization", self.Tools_dict["narrative_summarization"]["provider"], self.Tools_dict["narrative_summarization"]["model"])

        self.episode_summarization_agent = Tools()
        self.episode_summarization_agent.register_tool("episode_summarization", self.Tools_dict["episode_summarization"]["provider"], self.Tools_dict["episode_summarization"]["model"])

        self.local_kb_path = local_kb_path

        self.embedding_engine = Tools()
        self.embedding_engine.register_tool("embedding", self.Tools_dict["embedding"]["provider"], self.Tools_dict["embedding"]["model"])
        # Subset of tool configs the knowledge base needs for retrieval/fusion.
        KB_Tools_dict = {
            "embedding": self.Tools_dict["embedding"],
            "query_formulator": self.Tools_dict["query_formulator"],
            "context_fusion": self.Tools_dict["context_fusion"],
            "narrative_summarization": self.Tools_dict["narrative_summarization"],
            "episode_summarization": self.Tools_dict["episode_summarization"],
        }

        self.knowledge_base = KnowledgeBase(
            embedding_engine=self.embedding_engine,
            local_kb_path=self.local_kb_path,
            platform=platform,
            Tools_dict=KB_Tools_dict,
        )

        # Shared global state store; presumably registered earlier in process
        # startup — TODO confirm registration order against the entry point.
        self.global_state: GlobalState = Registry.get("GlobalStateStore")  # type: ignore

        self.planner_history = []  # raw plans produced so far, in order

        self.turn_count = 0  # number of planning turns taken

        # Initialize search engine based on enable_search parameter
        if enable_search:
            self.search_engine = Tools()
            self.search_engine.register_tool("websearch", self.Tools_dict["websearch"]["provider"], self.Tools_dict["websearch"]["model"])
        else:
            self.search_engine = None

        self.multi_round = multi_round
85
+ def summarize_episode(self, trajectory):
86
+ """Summarize the episode experience for lifelong learning reflection
87
+ Args:
88
+ trajectory: str: The episode experience to be summarized
89
+ """
90
+
91
+ # Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars
92
+ subtask_summarization, total_tokens, cost_string = self.episode_summarization_agent.execute_tool("episode_summarization", {"str_input": trajectory})
93
+ logger.info(f"Episode summarization tokens: {total_tokens}, cost: {cost_string}")
94
+
95
+ self.global_state.log_operation(
96
+ module="manager",
97
+ operation="episode_summarization",
98
+ data={
99
+ "tokens": total_tokens,
100
+ "cost": cost_string,
101
+ "content": subtask_summarization
102
+ }
103
+ )
104
+
105
+ return subtask_summarization
106
+
107
+ def summarize_narrative(self, trajectory):
108
+ """Summarize the narrative experience for lifelong learning reflection
109
+ Args:
110
+ trajectory: str: The narrative experience to be summarized
111
+ """
112
+ # Create Reflection on whole trajectories for next round trial
113
+ lifelong_learning_reflection, total_tokens, cost_string = self.narrative_summarization_agent.execute_tool("narrative_summarization", {"str_input": trajectory})
114
+ logger.info(f"Narrative summarization tokens: {total_tokens}, cost: {cost_string}")
115
+
116
+ self.global_state.log_operation(
117
+ module="manager",
118
+ operation="narrative_summarization",
119
+ data={
120
+ "tokens": total_tokens,
121
+ "cost": cost_string,
122
+ "content": lifelong_learning_reflection
123
+ }
124
+ )
125
+
126
+ return lifelong_learning_reflection
127
+
128
    def _generate_step_by_step_plan(
        self,
        observation: Dict,
        instruction: str,
        failed_subtask: Optional[Node] = None,
        completed_subtasks_list: List[Node] = [],
        remaining_subtasks_list: List[Node] = [],
    ) -> Tuple[Dict, str]:
        """Produce a high-level natural-language plan for the task.

        On the first turn (``self.turn_count == 0``) it also performs knowledge
        retrieval (query formulation, narrative experience, optional web
        search + fusion) and prefixes the planner message with the augmented
        task description.  Later turns re-plan from the completed/remaining
        subtask lists and the agent log.

        Args:
            observation: Current observation; ``observation["screenshot"]`` is
                forwarded to the planner tool when present.
            instruction: The user task instruction.
            failed_subtask: The subtask that failed, if re-planning after a
                failure.
            completed_subtasks_list: Subtasks finished so far.
            remaining_subtasks_list: Subtasks still pending.

        Returns:
            Tuple of (planner_info dict with "search_query" and "goal_plan",
            the plan string).

        Raises:
            Exception: If the planner returns an empty plan.
        """

        import time
        step_start = time.time()
        # Converts a list of DAG Nodes into a natural langauge list
        def format_subtask_list(subtasks: List[Node]) -> str:
            res = ""
            for idx, node in enumerate(subtasks):
                res += f"{idx+1}. **{node.name}**:\n"
                # Split node.info into sentence-like bullets on .!?; boundaries.
                bullets = re.split(r"(?<=[.!?;]) +", node.info)
                for bullet in bullets:
                    res += f" - {bullet}\n"
                res += "\n"
            return res
        prefix_message = ""
        # Perform Retrieval only at the first planning step
        if self.turn_count == 0:
            formulate_query_start = time.time()
            self.search_query, total_tokens, cost_string = self.knowledge_base.formulate_query(
                instruction, observation
            )
            formulate_query_time = time.time() - formulate_query_start
            logger.info(f"Formulate query tokens: {total_tokens}, cost: {cost_string}")
            self.global_state.log_operation(
                module="manager",
                operation="formulate_query",
                data={
                    "tokens": total_tokens,
                    "cost": cost_string,
                    "content": self.search_query,
                    "duration": formulate_query_time
                }
            )
            self.global_state.set_search_query(self.search_query)

            most_similar_task = ""
            retrieved_experience = ""
            integrated_knowledge = ""
            # Retrieve most similar narrative (task) experience
            narrative_start = time.time()
            most_similar_task, retrieved_experience, total_tokens, cost_string = (
                self.knowledge_base.retrieve_narrative_experience(instruction)
            )
            logger.info(f"Retrieve narrative experience tokens: {total_tokens}, cost: {cost_string}")
            narrative_time = time.time() - narrative_start
            logger.info(f"[Timing] Manager.retrieve_narrative_experience execution time: {narrative_time:.2f} seconds")
            self.global_state.log_operation(
                module="manager",
                operation="retrieve_narrative_experience",
                data={
                    "tokens": total_tokens,
                    "cost": cost_string,
                    "content": "Most similar task: " + most_similar_task + "\n" + retrieved_experience.strip(),
                    "duration": narrative_time
                }
            )

            logger.info(
                "SIMILAR TASK EXPERIENCE: %s",
                most_similar_task + "\n" + retrieved_experience.strip(),
            )

            # Retrieve knowledge from the web if search_engine is provided
            if self.search_engine is not None:
                knowledge_start = time.time()
                retrieved_knowledge, total_tokens, cost_string = self.knowledge_base.retrieve_knowledge(
                    instruction=instruction,
                    search_query=self.search_query,
                    search_engine=self.search_engine,
                )
                logger.info(f"Retrieve knowledge tokens: {total_tokens}, cost: {cost_string}")

                knowledge_time = time.time() - knowledge_start
                logger.info(f"[Timing] Manager.retrieve_knowledge execution time: {knowledge_time:.2f} seconds")
                self.global_state.log_operation(
                    module="manager",
                    operation="retrieve_knowledge",
                    data={
                        "tokens": total_tokens,
                        "cost": cost_string,
                        "content": retrieved_knowledge,
                        "duration": knowledge_time
                    }
                )

                logger.info("RETRIEVED KNOWLEDGE: %s", retrieved_knowledge)

                if retrieved_knowledge is not None:
                    # Fuse the retrieved knowledge and experience
                    fusion_start = time.time()
                    integrated_knowledge, total_tokens, cost_string = self.knowledge_base.knowledge_fusion(
                        observation=observation,
                        instruction=instruction,
                        web_knowledge=retrieved_knowledge,
                        similar_task=most_similar_task,
                        experience=retrieved_experience,
                    )
                    logger.info(f"Knowledge fusion tokens: {total_tokens}, cost: {cost_string}")
                    fusion_time = time.time() - fusion_start
                    logger.info(f"[Timing] Manager.knowledge_fusion execution time: {fusion_time:.2f} seconds")
                    self.global_state.log_operation(
                        module="manager",
                        operation="knowledge_fusion",
                        data={
                            "tokens": total_tokens,
                            "cost": cost_string,
                            "content": integrated_knowledge,
                            "duration": fusion_time
                        }
                    )

                    logger.info("INTEGRATED KNOWLEDGE: %s", integrated_knowledge)

            # Fall back to the raw narrative experience when fusion produced
            # nothing (or web search was disabled).
            integrated_knowledge = integrated_knowledge or retrieved_experience

            # Add the integrated knowledge to the task instruction in the system prompt
            if integrated_knowledge:
                instruction += f"\nYou may refer to some retrieved knowledge if you think they are useful.{integrated_knowledge}"
            prefix_message = f"TASK_DESCRIPTION is {instruction}"

        # Re-plan on failure case
        if failed_subtask:
            agent_log = agent_log_to_string(self.global_state.get_agent_log())
            generator_message = (
                f"The subtask {failed_subtask} cannot be completed. Please generate a new plan for the remainder of the trajectory.\n\n"
                f"Successfully Completed Subtasks:\n{format_subtask_list(completed_subtasks_list)}\n"
                f"Please refer to the agent log to understand the progress and context of the task so far.\n{agent_log}"
            )
        # Re-plan on subtask completion case
        elif len(completed_subtasks_list) + len(remaining_subtasks_list) > 0:
            agent_log = agent_log_to_string(self.global_state.get_agent_log())
            generator_message = (
                "The current trajectory and desktop state is provided. Please revise the plan for the following trajectory.\n\n"
                f"Successfully Completed Subtasks:\n{format_subtask_list(completed_subtasks_list)}\n"
                f"Future Remaining Subtasks:\n{format_subtask_list(remaining_subtasks_list)}\n"
                f"Please refer to the agent log to understand the progress and context of the task so far.\n{agent_log}"
            )
        # Initial plan case
        else:
            generator_message = "Please generate the initial plan for the task.\n"

        # prefix_message is "" after the first turn, so later prompts start
        # with just a newline before the re-planning message.
        generator_message = prefix_message + "\n" + generator_message
        logger.info("GENERATOR MESSAGE: %s", generator_message)
        logger.info("GENERATING HIGH LEVEL PLAN")

        subtask_planner_start = time.time()
        plan, total_tokens, cost_string = self.generator_agent.execute_tool("subtask_planner", {"str_input": generator_message, "img_input": observation.get("screenshot", None)})
        logger.info(f"Subtask planner tokens: {total_tokens}, cost: {cost_string}")
        subtask_planner_time = time.time() - subtask_planner_start
        logger.info(f"[Timing] Manager.subtask_planner execution time: {subtask_planner_time:.2f} seconds")
        self.global_state.log_operation(
            module="manager",
            operation="subtask_planner",
            data={
                "tokens": total_tokens,
                "cost": cost_string,
                "content": plan,
                "duration": subtask_planner_time
            }
        )

        step_time = time.time() - step_start
        logger.info(f"[Timing] Manager._generate_step_by_step_plan execution time: {step_time:.2f} seconds")
        self.global_state.log_operation(
            module="manager",
            operation="Manager._generate_step_by_step_plan",
            data={"duration": step_time}
        )

        if plan == "":
            raise Exception("Plan Generation Failed - Fix the Prompt")

        logger.info("HIGH LEVEL STEP BY STEP PLAN: %s", plan)

        self.planner_history.append(plan)
        self.turn_count += 1

        # NOTE(review): self.search_query is only assigned on the first turn;
        # this assumes _generate_step_by_step_plan is always called with
        # turn_count starting at 0 — confirm against callers.
        planner_info = {
            "search_query": self.search_query,
            "goal_plan": plan,
        }

        assert type(plan) == str

        return planner_info, plan
320
+
321
    def _generate_dag(self, instruction: str, plan: str) -> Tuple[Dict, Dag]:
        """Translate a natural-language plan into a ``Dag`` of subtasks.

        Calls the dag_translator tool up to ``max_retries`` times until the
        output parses; if every attempt fails, falls back to a single-node
        default DAG so downstream execution can continue.

        Args:
            instruction: The user task instruction.
            plan: The high-level plan text to translate.

        Returns:
            Tuple of (dag_info dict holding the raw tool output, the parsed
            or fallback ``Dag``).
        """
        import time
        dag_start = time.time()

        logger.info("GENERATING DAG")

        # Add maximum retry count
        max_retries = 2
        retry_count = 0
        dag = None

        while retry_count < max_retries and dag is None:
            if retry_count > 0:
                logger.warning(f"Retrying DAG generation, attempt {retry_count}")
                self.global_state.log_operation(
                    module="manager",
                    operation="dag_retry",
                    data={"retry_count": retry_count}
                )

            # Generate DAG
            dag_raw, total_tokens, cost_string = self.dag_translator_agent.execute_tool("dag_translator", {"str_input": f"Instruction: {instruction}\nPlan: {plan}"})
            logger.info(f"DAG translator tokens: {total_tokens}, cost: {cost_string}")

            # Try to parse DAG
            dag = parse_dag(dag_raw)

            # If parsing fails, increment retry count
            if dag is None:
                retry_count += 1
                # If not the last attempt, wait a short time before retrying
                if retry_count < max_retries:
                    time.sleep(1)

        dag_time = time.time() - dag_start
        logger.info(f"[Timing] Manager._generate_dag execution time: {dag_time:.2f} seconds")

        # dag_raw/total_tokens/cost_string hold the values from the last loop
        # iteration; the loop always runs at least once because dag starts None.
        logger.info("Generated DAG: %s", dag_raw)
        self.global_state.log_operation(
            module="manager",
            operation="generated_dag",
            data={
                "tokens": total_tokens,
                "cost": cost_string,
                "content": dag_raw,
                "duration": dag_time,
                "retry_count": retry_count
            }
        )

        dag_info = {
            "dag": dag_raw,
        }

        # If all attempts fail, create a simple default DAG
        if dag is None:
            logger.error("Unable to generate valid DAG, using default DAG")
            # Create a simple default DAG with just one "Execute Task" node
            default_node = Node(name="Execute Task", info=f"Execute instruction: {instruction}")
            dag = Dag(nodes=[default_node], edges=[])

            self.global_state.log_operation(
                module="manager",
                operation="default_dag_created",
                data={"content": "Using default DAG because valid DAG could not be parsed from model output"}
            )

        return dag_info, dag
389
+
390
+ def _topological_sort(self, dag: Dag) -> List[Node]:
391
+ """Topological sort of the DAG using DFS
392
+ dag: Dag: Object representation of the DAG with nodes and edges
393
+ """
394
+ import logging
395
+ logger = logging.getLogger("desktopenv.agent")
396
+
397
+ # Check if DAG is empty
398
+ if not dag.nodes:
399
+ logger.warning("DAG has no nodes, returning empty list")
400
+ return []
401
+
402
+ # If there's only one node, return it directly
403
+ if len(dag.nodes) == 1:
404
+ logger.info("DAG has only one node, returning directly")
405
+ return dag.nodes
406
+
407
+ def dfs(node_name, visited, temp_visited, stack):
408
+ # If node is already in current path, we have a cycle
409
+ if node_name in temp_visited:
410
+ raise ValueError(f"Cycle detected in DAG involving node: {node_name}")
411
+
412
+ # If node has been visited, skip
413
+ if visited.get(node_name, False):
414
+ return
415
+
416
+ # Mark node as part of current path
417
+ temp_visited.add(node_name)
418
+ visited[node_name] = True
419
+
420
+ # Visit all neighbors
421
+ for neighbor in adj_list.get(node_name, []):
422
+ if not visited.get(neighbor, False):
423
+ dfs(neighbor, visited, temp_visited, stack)
424
+
425
+ # Remove node from current path
426
+ temp_visited.remove(node_name)
427
+ stack.append(node_name)
428
+
429
+ try:
430
+ # Build adjacency list
431
+ adj_list = defaultdict(list)
432
+ for u, v in dag.edges:
433
+ if not u or not v:
434
+ logger.warning(f"Skipping invalid edge: {u} -> {v}")
435
+ continue
436
+ adj_list[u.name].append(v.name)
437
+
438
+ visited = {node.name: False for node in dag.nodes}
439
+ temp_visited = set() # For cycle detection
440
+ stack = []
441
+
442
+ # Perform DFS for each unvisited node
443
+ for node in dag.nodes:
444
+ if not visited.get(node.name, False):
445
+ dfs(node.name, visited, temp_visited, stack)
446
+
447
+ # Return topologically sorted nodes
448
+ sorted_nodes = []
449
+ for name in stack[::-1]:
450
+ matching_nodes = [n for n in dag.nodes if n.name == name]
451
+ if matching_nodes:
452
+ sorted_nodes.append(matching_nodes[0])
453
+ else:
454
+ logger.warning(f"Could not find node named {name} in DAG node list")
455
+
456
+ # Check if all nodes are included in result
457
+ if len(sorted_nodes) != len(dag.nodes):
458
+ logger.warning(f"Number of nodes in topological sort result ({len(sorted_nodes)}) does not match number in DAG ({len(dag.nodes)})")
459
+
460
+ return sorted_nodes
461
+
462
+ except Exception as e:
463
+ logger.error(f"Error during topological sort: {e}")
464
+ # On error, return original node list
465
+ logger.info("Returning unsorted original node list")
466
+ return dag.nodes
467
+
468
    def get_action_queue(
        self,
        Tu: str,
        observation: Dict,
        running_state: str,
        failed_subtask: Optional[Node] = None,
        completed_subtasks_list: List[Node] = [],
        remaining_subtasks_list: List[Node] = [],
    ):
        """Generate the ordered subtask queue for an instruction.

        Pipeline: plan generation -> DAG translation -> topological sort.
        Every stage has a fallback so this method always returns a usable
        queue rather than raising.

        Args:
            Tu: Instruction for the task.
            observation: Current observation passed to the planner.
            running_state: Current run state (accepted but not read here —
                presumably consumed by callers/logging; verify before removal).
            failed_subtask: Failed subtask when re-planning after a failure.
            completed_subtasks_list: Subtasks finished so far.
            remaining_subtasks_list: Subtasks still pending.

        Returns:
            Tuple of (planner_info dict, list of ``Node`` subtasks in
            execution order).
        """
        import time
        action_queue_start = time.time()

        try:
            planner_info, plan = self._generate_step_by_step_plan(
                observation,
                Tu,
                failed_subtask,
                completed_subtasks_list,
                remaining_subtasks_list,
            )

            # Generate the DAG
            try:
                dag_info, dag = self._generate_dag(Tu, plan)
            except Exception as e:
                logger.error(f"Error generating DAG: {e}")
                # Create a simple default DAG with just one "Execute Task" node
                default_node = Node(name="Execute Task", info=f"Execute instruction: {Tu}")
                dag = Dag(nodes=[default_node], edges=[])
                dag_info = {"dag": "Failed to generate DAG, using default DAG"}

                self.global_state.log_operation(
                    module="manager",
                    operation="dag_generation_error",
                    data={"error": str(e), "content": "Using default DAG due to error in DAG generation"}
                )

            # Topological sort of the DAG
            try:
                action_queue = self._topological_sort(dag)
            except Exception as e:
                logger.error(f"Error during topological sort of DAG: {e}")
                # If topological sort fails, use node list directly
                action_queue = dag.nodes

                self.global_state.log_operation(
                    module="manager",
                    operation="topological_sort_error",
                    data={"error": str(e), "content": "Topological sort failed, using node list directly"}
                )

            planner_info.update(dag_info)

            if action_queue:
                logger.info(f"NEXT SUBTASK: {action_queue[0]}")
                self.global_state.log_operation(
                    module="manager",
                    operation="next_subtask",
                    data={"content": str(action_queue[0])}
                )

            if len(action_queue) > 1:
                logger.info(f"REMAINING SUBTASKS: {action_queue[1:]}")
                self.global_state.log_operation(
                    module="manager",
                    operation="remaining_subtasks",
                    data={"content": str(action_queue[1:])}
                )

            action_queue_time = time.time() - action_queue_start
            logger.info(f"[Timing] manager.get_action_queue execution time: {action_queue_time:.2f} seconds")
            self.global_state.log_operation(
                module="manager",
                operation="manager.get_action_queue",
                data={"duration": action_queue_time}
            )

            return planner_info, action_queue

        except Exception as e:
            # Handle any unhandled exceptions in the entire process
            logger.error(f"Unhandled exception in get_action_queue function: {e}")

            # Create a simple default task node
            default_node = Node(name="Execute Task", info=f"Execute instruction: {Tu}")
            action_queue = [default_node]
            planner_info = {"error": str(e), "fallback": "Using default task node"}

            self.global_state.log_operation(
                module="manager",
                operation="get_action_queue_error",
                data={"error": str(e), "content": "Unhandled exception occurred, using default task node"}
            )

            action_queue_time = time.time() - action_queue_start
            logger.info(f"[Timing] manager.get_action_queue (error path) execution time: {action_queue_time:.2f} seconds")

            return planner_info, action_queue