lybic-guiagents 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lybic-guiagents might be problematic. Click here for more details.

Files changed (85) hide show
  1. desktop_env/__init__.py +1 -0
  2. desktop_env/actions.py +203 -0
  3. desktop_env/controllers/__init__.py +0 -0
  4. desktop_env/controllers/python.py +471 -0
  5. desktop_env/controllers/setup.py +882 -0
  6. desktop_env/desktop_env.py +509 -0
  7. desktop_env/evaluators/__init__.py +5 -0
  8. desktop_env/evaluators/getters/__init__.py +41 -0
  9. desktop_env/evaluators/getters/calc.py +15 -0
  10. desktop_env/evaluators/getters/chrome.py +1774 -0
  11. desktop_env/evaluators/getters/file.py +154 -0
  12. desktop_env/evaluators/getters/general.py +42 -0
  13. desktop_env/evaluators/getters/gimp.py +38 -0
  14. desktop_env/evaluators/getters/impress.py +126 -0
  15. desktop_env/evaluators/getters/info.py +24 -0
  16. desktop_env/evaluators/getters/misc.py +406 -0
  17. desktop_env/evaluators/getters/replay.py +20 -0
  18. desktop_env/evaluators/getters/vlc.py +86 -0
  19. desktop_env/evaluators/getters/vscode.py +35 -0
  20. desktop_env/evaluators/metrics/__init__.py +160 -0
  21. desktop_env/evaluators/metrics/basic_os.py +68 -0
  22. desktop_env/evaluators/metrics/chrome.py +493 -0
  23. desktop_env/evaluators/metrics/docs.py +1011 -0
  24. desktop_env/evaluators/metrics/general.py +665 -0
  25. desktop_env/evaluators/metrics/gimp.py +637 -0
  26. desktop_env/evaluators/metrics/libreoffice.py +28 -0
  27. desktop_env/evaluators/metrics/others.py +92 -0
  28. desktop_env/evaluators/metrics/pdf.py +31 -0
  29. desktop_env/evaluators/metrics/slides.py +957 -0
  30. desktop_env/evaluators/metrics/table.py +585 -0
  31. desktop_env/evaluators/metrics/thunderbird.py +176 -0
  32. desktop_env/evaluators/metrics/utils.py +719 -0
  33. desktop_env/evaluators/metrics/vlc.py +524 -0
  34. desktop_env/evaluators/metrics/vscode.py +283 -0
  35. desktop_env/providers/__init__.py +35 -0
  36. desktop_env/providers/aws/__init__.py +0 -0
  37. desktop_env/providers/aws/manager.py +278 -0
  38. desktop_env/providers/aws/provider.py +186 -0
  39. desktop_env/providers/aws/provider_with_proxy.py +315 -0
  40. desktop_env/providers/aws/proxy_pool.py +193 -0
  41. desktop_env/providers/azure/__init__.py +0 -0
  42. desktop_env/providers/azure/manager.py +87 -0
  43. desktop_env/providers/azure/provider.py +207 -0
  44. desktop_env/providers/base.py +97 -0
  45. desktop_env/providers/gcp/__init__.py +0 -0
  46. desktop_env/providers/gcp/manager.py +0 -0
  47. desktop_env/providers/gcp/provider.py +0 -0
  48. desktop_env/providers/virtualbox/__init__.py +0 -0
  49. desktop_env/providers/virtualbox/manager.py +463 -0
  50. desktop_env/providers/virtualbox/provider.py +124 -0
  51. desktop_env/providers/vmware/__init__.py +0 -0
  52. desktop_env/providers/vmware/manager.py +455 -0
  53. desktop_env/providers/vmware/provider.py +105 -0
  54. gui_agents/__init__.py +0 -0
  55. gui_agents/agents/Action.py +209 -0
  56. gui_agents/agents/__init__.py +0 -0
  57. gui_agents/agents/agent_s.py +832 -0
  58. gui_agents/agents/global_state.py +610 -0
  59. gui_agents/agents/grounding.py +651 -0
  60. gui_agents/agents/hardware_interface.py +129 -0
  61. gui_agents/agents/manager.py +568 -0
  62. gui_agents/agents/translator.py +132 -0
  63. gui_agents/agents/worker.py +355 -0
  64. gui_agents/cli_app.py +560 -0
  65. gui_agents/core/__init__.py +0 -0
  66. gui_agents/core/engine.py +1496 -0
  67. gui_agents/core/knowledge.py +449 -0
  68. gui_agents/core/mllm.py +555 -0
  69. gui_agents/tools/__init__.py +0 -0
  70. gui_agents/tools/tools.py +727 -0
  71. gui_agents/unit_test/__init__.py +0 -0
  72. gui_agents/unit_test/run_tests.py +65 -0
  73. gui_agents/unit_test/test_manager.py +330 -0
  74. gui_agents/unit_test/test_worker.py +269 -0
  75. gui_agents/utils/__init__.py +0 -0
  76. gui_agents/utils/analyze_display.py +301 -0
  77. gui_agents/utils/common_utils.py +263 -0
  78. gui_agents/utils/display_viewer.py +281 -0
  79. gui_agents/utils/embedding_manager.py +53 -0
  80. gui_agents/utils/image_axis_utils.py +27 -0
  81. lybic_guiagents-0.1.0.dist-info/METADATA +416 -0
  82. lybic_guiagents-0.1.0.dist-info/RECORD +85 -0
  83. lybic_guiagents-0.1.0.dist-info/WHEEL +5 -0
  84. lybic_guiagents-0.1.0.dist-info/licenses/LICENSE +201 -0
  85. lybic_guiagents-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,132 @@
1
+ # translator.py
2
+ """
3
+ Translates pyautogui-style scripts into unified commands (JSON list),
4
+ format strictly follows computer-use schema.
5
+ """
6
+
7
+ from __future__ import annotations
8
+ import ast, json
9
+ from typing import List, Dict
10
+
11
+
12
class TranslateError(RuntimeError):
    """Raised when a pyautogui-style script cannot be mapped to unified commands."""
14
+
15
+
16
class _CommandBuilder(ast.NodeVisitor):
    """
    Walks a parsed pyautogui-style script and collects unified command dicts.

    Only handles the most common GUI atomic operations:
        click / moveTo / doubleClick / rightClick / middleClick /
        dragTo / scroll / typewrite (write) / press / hotkey / sleep

    Each supported ``pyautogui.<fn>(...)`` expression statement appends one
    dict to ``self.cmds``. Calls on any other object, unsupported functions
    and non-literal arguments raise ``TranslateError``.

    NOTE: control-flow statements are not rejected by this visitor; expression
    statements nested inside them are still visited, and statements that are
    not expression statements (imports, assignments) are ignored. The
    Grounding layer is expected to flatten scripts before translation.
    """

    # Button implied by a click helper when no ``button=`` keyword is given.
    _DEFAULT_BUTTON = {"rightClick": "right", "middleClick": "middle"}

    # (function name, clicks, button) -> unified action name.
    _CLICK_ACTIONS = {
        ("click", 1, "left"): "click",
        ("click", 2, "left"): "doubleClick",
        ("click", 1, "right"): "rightClick",
        ("click", 1, "middle"): "middleClick",
        ("doubleClick", 1, "left"): "doubleClick",
        ("rightClick", 1, "right"): "rightClick",
        ("middleClick", 1, "middle"): "middleClick",
    }

    def __init__(self) -> None:
        super().__init__()
        self.cmds: List[Dict] = []

    # ---------- Node Visiting ----------
    def visit_Expr(self, node):  # pyautogui.xxx(...)
        """Translate one expression statement; it must be a function call."""
        if not isinstance(node.value, ast.Call):
            raise TranslateError("Only function call level instructions allowed")
        self._handle_call(node.value)

    # ---------- Core: Map function calls to commands ----------
    def _handle_call(self, call: ast.Call):
        """Append the unified command for a single ``pyautogui.<fn>(...)`` call.

        Raises:
            TranslateError: if the call is not a plain ``pyautogui.<fn>`` call,
                uses non-literal arguments, or names an unsupported function.
        """
        if not isinstance(call.func, ast.Attribute):
            raise TranslateError("Complex expressions not supported")
        lib, fn = self._split_attr(call.func)
        if lib != "pyautogui":
            raise TranslateError("Only pyautogui calls allowed")

        # Get positional and keyword arguments (literals only).
        kw = {k.arg: self._literal(k.value) for k in call.keywords}
        pos = [self._literal(a) for a in call.args]

        # ---------- mapping ----------
        if fn in {"click", "doubleClick", "rightClick", "middleClick"}:
            x, y = self._xy(pos, kw)
            self._append_click(fn, x, y, kw)

        elif fn == "moveTo":
            x, y = self._xy(pos, kw)
            self.cmds.append({"action": "move", "coordinate": [x, y]})

        elif fn == "dragTo":
            x, y = self._xy(pos, kw)
            # startCoordinate needs to be supplemented by the caller;
            # None is used as a placeholder here.
            self.cmds.append({"action": "leftClickDrag",
                              "startCoordinate": None,
                              "coordinate": [x, y]})

        elif fn == "scroll":
            clicks = pos[0] if pos else kw.get("clicks")
            if isinstance(clicks, bool) or not isinstance(clicks, (int, float)):
                raise TranslateError("scroll requires a numeric clicks argument")
            # Positive amounts scroll up, everything else scrolls down.
            direction = "up" if clicks > 0 else "down"
            # x / y may follow clicks positionally or be passed by keyword.
            x = pos[1] if len(pos) > 1 else kw.get("x", 0)
            y = pos[2] if len(pos) > 2 else kw.get("y", 0)
            self.cmds.append({"action": "scroll",
                              "scrollAmount": abs(clicks),
                              "scrollDirection": direction,
                              "coordinate": [x, y]})

        elif fn in {"typewrite", "write"}:
            text = pos[0] if pos else kw.get("message")
            self.cmds.append({"action": "type", "text": text})

        elif fn in {"press", "hotkey"}:
            keys = pos
            if not keys and "keys" in kw:
                keys = [kw["keys"]]
            if not all(isinstance(k, str) for k in keys):
                raise TranslateError(f"{fn} only accepts string key names")
            self.cmds.append({"action": "keyPress", "text": "+".join(keys)})

        elif fn == "sleep":
            secs = pos[0] if pos else kw.get("seconds", 1)
            self.cmds.append({"action": "wait", "duration": secs})

        else:
            raise TranslateError(f"Function {fn} not yet supported")

    # ---------- Tools ----------
    @staticmethod
    def _xy(pos, kw):
        """Return ``(x, y)``, taking each from positionals first, then keywords."""
        x = pos[0] if len(pos) > 0 else kw.get("x")
        y = pos[1] if len(pos) > 1 else kw.get("y")
        return x, y

    def _append_click(self, fn, x, y, kw):
        """Append the click-type command selected by function, clicks and button."""
        clicks = kw.get("clicks", 1)
        # rightClick / middleClick imply their own button; defaulting them to
        # "left" would make the lookup below fail for every such call.
        button = kw.get("button", self._DEFAULT_BUTTON.get(fn, "left"))
        action = self._CLICK_ACTIONS.get((fn, clicks, button))
        if not action:
            raise TranslateError(f"Cannot map {fn} clicks={clicks} button={button}")
        self.cmds.append({"action": action, "coordinate": [x, y]})

    def _split_attr(self, attr: ast.Attribute):
        """Split ``<name>.<function>`` into ``(name, function)``.

        Raises:
            TranslateError: if the chain is not rooted at a plain name or is
                not exactly two parts long (e.g. ``a.b.c``).
        """
        parts = []
        while isinstance(attr, ast.Attribute):
            parts.insert(0, attr.attr)
            attr = attr.value  # type: ignore
        if not isinstance(attr, ast.Name):
            raise TranslateError("Complex expressions not supported")
        parts.insert(0, attr.id)
        if len(parts) != 2:
            raise TranslateError("Complex expressions not supported")
        return parts[0], parts[1]

    def _literal(self, node):
        """Return the Python value of a literal argument node.

        Signed numbers are accepted as well: ``-3`` is parsed as a unary minus
        applied to the constant ``3`` rather than as a single constant.
        """
        if isinstance(node, ast.Constant):
            return node.value
        if (isinstance(node, ast.UnaryOp)
                and isinstance(node.op, (ast.USub, ast.UAdd))
                and isinstance(node.operand, ast.Constant)
                and isinstance(node.operand.value, (int, float))
                and not isinstance(node.operand.value, bool)):
            value = node.operand.value
            return -value if isinstance(node.op, ast.USub) else value
        raise TranslateError("Only literal parameters allowed")
119
+
120
+ # ---------- External API ----------
121
def translate(py_code: str) -> List[Dict]:
    """Parse *py_code* and return the unified commands collected from it.

    The source is parsed with ``ast.parse`` and walked by a fresh
    ``_CommandBuilder``; the builder's ``cmds`` list is returned as-is.
    """
    visitor = _CommandBuilder()
    visitor.visit(ast.parse(py_code))
    return visitor.cmds
126
+
127
+
128
+ # ---------------- demo ----------------
129
+ # if __name__ == "__main__":
130
+ # sample = "import pyautogui; pyautogui.click(769, 1006, clicks=1, button='left');"
131
+ # cmds = translate(sample)
132
+ # print(json.dumps(cmds, indent=2, ensure_ascii=False))
@@ -0,0 +1,355 @@
1
+ import logging
2
+ import re
3
+ import textwrap
4
+ from typing import Dict, List, Tuple
5
+ import platform
6
+ import os
7
+ import json
8
+
9
+ from gui_agents.agents.grounding import ACI
10
+ from gui_agents.core.knowledge import KnowledgeBase
11
+ from gui_agents.utils.common_utils import (
12
+ Node,
13
+ extract_first_agent_function,
14
+ parse_single_code_from_string,
15
+ sanitize_code,
16
+ agent_log_to_string,
17
+ )
18
+ from gui_agents.tools.tools import Tools
19
+ from gui_agents.store.registry import Registry
20
+ from gui_agents.agents.global_state import GlobalState
21
+
22
+ logger = logging.getLogger("desktopenv.agent")
23
+
24
+
25
class Worker:
    """Generates the next GUI action for the currently active subtask.

    The worker owns three tool wrappers (action generator, trajectory
    reflector, embedding engine) plus a knowledge base, and keeps per-subtask
    history that is cleared by ``reset()``.
    """

    def __init__(
        self,
        Tools_dict: Dict,
        local_kb_path: str,
        platform: str = platform.system().lower(),
        enable_reflection: bool = True,
        use_subtask_experience: bool = True,
        enable_takeover: bool = False,
        enable_search: bool = True,
        tools_config: "Dict | None" = None,
    ):
        """
        Worker receives a subtask list and active subtask and generates the next action to execute.
        Args:
            Tools_dict: Dict
                Per-tool settings keyed by tool name; the "provider" and
                "model" entries of the "embedding", "traj_reflector" and
                action generator tools are read from it
            local_kb_path: str
                Path to knowledge base
            platform: str
                OS platform the agent runs on (darwin, linux, windows)
            enable_reflection: bool
                Whether to enable reflection
            use_subtask_experience: bool
                Whether to use subtask experience
            enable_takeover: bool
                Whether to enable user takeover functionality
            enable_search: bool
                Global switch for search functionality (overrides config)
            tools_config: Dict
                Complete tools configuration from tools_config.json; when
                omitted (or empty) it is loaded from
                ``<package>/tools/tools_config.json``
        """
        self.platform = platform

        self.local_kb_path = local_kb_path
        self.Tools_dict = Tools_dict
        self.enable_takeover = enable_takeover
        self.enable_search = enable_search  # Store global search switch

        # If tools_config is not provided, load it from file. The default is
        # None (not a shared mutable ``{}``) so that this fallback can run.
        if not tools_config:
            tools_config_path = os.path.join(
                os.path.dirname(os.path.dirname(__file__)), "tools",
                "tools_config.json")
            with open(tools_config_path, "r", encoding="utf-8") as f:
                self.tools_config = json.load(f)
        else:
            self.tools_config = tools_config

        self.enable_reflection = enable_reflection
        self.use_subtask_experience = use_subtask_experience
        self.global_state: GlobalState = Registry.get(
            "GlobalStateStore")  # type: ignore
        # reset() builds the tool wrappers (including the embedding engine)
        # and the knowledge base, so they are not created separately here.
        self.reset()

    def reset(self):
        """Recreate the tool wrappers and knowledge base and clear all history."""
        self.generator_agent = Tools()
        self.action_generator_tool = (
            "action_generator_with_takeover"
            if self.enable_takeover else "action_generator")

        # Get tool configuration from tools_config
        tool_config = next(
            (tool for tool in self.tools_config["tools"]
             if tool["tool_name"] == self.action_generator_tool),
            None,
        )

        # Prepare tool parameters
        tool_params = {}

        # First check global search switch
        if not self.enable_search:
            # If global search is disabled, force disable search for this tool
            tool_params["enable_search"] = False
            logger.info(
                f"Configuring {self.action_generator_tool} with search DISABLED (global switch off)"
            )
        elif tool_config and "enable_search" in tool_config:
            # Global search is enabled: use the tool-specific config
            enable_search = tool_config.get("enable_search", False)
            tool_params["enable_search"] = enable_search
            tool_params["search_provider"] = tool_config.get(
                "search_provider", "bocha")
            tool_params["search_model"] = tool_config.get("search_model", "")

            logger.info(
                f"Configuring {self.action_generator_tool} with search enabled: {enable_search} (from config)"
            )

        # Register the tool with parameters
        self.generator_agent.register_tool(
            self.action_generator_tool,
            self.Tools_dict[self.action_generator_tool]["provider"],
            self.Tools_dict[self.action_generator_tool]["model"],
            **tool_params)

        self.reflection_agent = Tools()
        self.reflection_agent.register_tool(
            "traj_reflector", self.Tools_dict["traj_reflector"]["provider"],
            self.Tools_dict["traj_reflector"]["model"])

        self.embedding_engine = Tools()
        self.embedding_engine.register_tool(
            "embedding", self.Tools_dict["embedding"]["provider"],
            self.Tools_dict["embedding"]["model"])
        self.knowledge_base = KnowledgeBase(
            embedding_engine=self.embedding_engine,
            Tools_dict=self.Tools_dict,
            local_kb_path=self.local_kb_path,
            platform=self.platform,
        )

        self.turn_count = 0
        self.worker_history = []
        self.reflections = []
        self.cost_this_turn = 0
        self.screenshot_inputs = []
        self.planner_history = []
        self.latest_action = None
        self.max_trajector_length = 8

    def generate_next_action(
        self,
        Tu: str,
        search_query: str,
        subtask: str,
        subtask_info: str,
        future_tasks: List["Node"],
        done_task: List["Node"],
        obs: Dict,
        running_state: str = "running",
    ) -> Dict:
        """
        Predict the next action(s) based on the current observation.

        Args:
            Tu: task description text placed in the first-turn prompt
            search_query: task text used in the experience retrieval key
            subtask: description of the active subtask
            subtask_info: extra instruction for the active subtask
            future_tasks: nodes whose ``name`` is listed as FUTURE_TASKS
            done_task: nodes whose ``name`` is listed as DONE_TASKS
            obs: observation dict; its "screenshot" entry is sent to the tools
            running_state: not used by this method
        Returns:
            Dict with "current_subtask", "current_subtask_info",
            "executor_plan" and "reflection" (None when no reflection ran).
        """
        import time
        action_start = time.time()

        # Log the result of the previous hardware action, which is the current observation.
        if self.turn_count > 0 and self.latest_action:
            self.global_state.add_agent_log({
                "type":
                    "passive",
                "content":
                    f"Hardware action `{self.latest_action}` has been executed. The result is reflected in the current screenshot."
            })

        # Get RAG knowledge, only update system message at t=0
        if self.turn_count == 0 and self.use_subtask_experience:
            subtask_query_key = ("Task:\n" + search_query +
                                 "\n\nSubtask: " + subtask +
                                 "\nSubtask Instruction: " + subtask_info)
            retrieve_start = time.time()
            retrieved_similar_subtask, retrieved_subtask_experience, total_tokens, cost_string = (
                self.knowledge_base.retrieve_episodic_experience(
                    subtask_query_key))
            logger.info(
                f"Retrieve episodic experience tokens: {total_tokens}, cost: {cost_string}"
            )
            retrieve_time = time.time() - retrieve_start
            logger.info(
                f"[Timing] Worker.retrieve_episodic_experience execution time: {retrieve_time:.2f} seconds"
            )

            # Dirty fix to replace id with element description during subtask retrieval
            pattern = r"\(\d+"
            retrieved_subtask_experience = re.sub(
                pattern, "(element_description",
                retrieved_subtask_experience)
            retrieved_subtask_experience = retrieved_subtask_experience.replace(
                "_id", "_description")

            logger.info(
                "SIMILAR SUBTASK EXPERIENCE: %s",
                retrieved_similar_subtask + "\n" +
                retrieved_subtask_experience.strip(),
            )
            self.global_state.log_operation(
                module="worker",
                operation="Worker.retrieve_episodic_experience",
                data={
                    "tokens":
                        total_tokens,
                    "cost":
                        cost_string,
                    "content":
                        "Retrieved similar subtask: " +
                        retrieved_similar_subtask + "\n" +
                        "Retrieved subtask experience: " +
                        retrieved_subtask_experience.strip(),
                    "duration":
                        retrieve_time
                })
            Tu += "\nYou may refer to some similar subtask experience if you think they are useful. {}".format(
                retrieved_similar_subtask + "\n" +
                retrieved_subtask_experience)

        prefix_message = f"SUBTASK_DESCRIPTION is {subtask}\n\nTASK_DESCRIPTION is {Tu}\n\nFUTURE_TASKS is {', '.join([f.name for f in future_tasks])}\n\nDONE_TASKS is {', '.join(d.name for d in done_task)}"

        # Reflection generation does not add its own response, it only gets the trajectory
        reflection = None
        if self.enable_reflection:
            if self.turn_count == 0:
                # Load the initial subtask info; no reflection is produced yet
                text_content = textwrap.dedent(f"""
                    Subtask Description: {subtask}
                    Subtask Information: {subtask_info}
                    Current Trajectory below:
                    """)
                self.reflection_agent.tools["traj_reflector"].llm_agent.add_message(
                    text_content +
                    "\n\nThe initial screen is provided. No action has been taken yet.",
                    image_content=obs["screenshot"],
                    role="user")
            else:
                if self.planner_history and self.planner_history[-1] is not None:
                    text_content = self.clean_worker_generation_for_reflection(
                        self.planner_history[-1])
                else:
                    text_content = "No previous action available for reflection"

                reflection_start = time.time()
                reflection, total_tokens, cost_string = self.reflection_agent.execute_tool(
                    "traj_reflector", {
                        "str_input": text_content,
                        "img_input": obs["screenshot"]
                    })
                logger.info(
                    f"Trajectory reflector tokens: {total_tokens}, cost: {cost_string}"
                )
                reflection_time = time.time() - reflection_start
                logger.info(
                    f"[Timing] Worker.traj_reflector execution time: {reflection_time:.2f} seconds"
                )
                self.reflections.append(reflection)
                logger.info("REFLECTION: %s", reflection)
                # NOTE(review): this entry is tagged module="manager" although
                # it is emitted by the worker - confirm whether log consumers
                # rely on that tag before changing it.
                self.global_state.log_operation(module="manager",
                                                operation="reflection",
                                                data={
                                                    "tokens": total_tokens,
                                                    "cost": cost_string,
                                                    "content": reflection,
                                                    "duration": reflection_time
                                                })

        generator_message = ""

        # Only provide subinfo in the very first message to avoid over influence and redundancy
        if self.turn_count == 0:
            # NOTE(review): prefix_message has no trailing newline, so the
            # DONE_TASKS list runs straight into the next sentence - confirm
            # whether a separator is wanted before altering the prompt.
            generator_message += prefix_message
            generator_message += f"Remember only complete the subtask: {subtask}\n"
            generator_message += f"You can use this extra information for completing the current subtask: {subtask_info}.\n"
        else:
            agent_log = agent_log_to_string(self.global_state.get_agent_log())
            generator_message += f"\nYour previous action was: {self.latest_action}\n"
            if reflection:
                generator_message += f"\nYou may use this reflection on the previous action and overall trajectory: {reflection}\n"
            generator_message += f"Please refer to the agent log to understand the progress and context of the task so far.\n{agent_log}"

        action_generator_start = time.time()
        # Call the tool under the name it was registered with in reset().
        plan, total_tokens, cost_string = self.generator_agent.execute_tool(
            self.action_generator_tool, {
                "str_input": generator_message,
                "img_input": obs["screenshot"]
            })
        logger.info(
            f"Action generator tokens: {total_tokens}, cost: {cost_string}")
        action_generator_time = time.time() - action_generator_start
        logger.info(
            f"[Timing] Worker.action_generator execution time: {action_generator_time:.2f} seconds"
        )

        self.planner_history.append(plan)
        logger.info("Action Plan: %s", plan)
        self.global_state.log_operation(module="worker",
                                        operation="action_plan",
                                        data={
                                            "tokens": total_tokens,
                                            "cost": cost_string,
                                            "content": plan,
                                            "duration": action_generator_time
                                        })

        # Add the generated plan to the agent log as passive memory
        self.global_state.add_agent_log({"type": "passive", "content": plan})

        try:
            action_code = parse_single_code_from_string(
                plan.split("Grounded Action")[-1])
            action_code = sanitize_code(action_code)
            self.latest_action = extract_first_agent_function(action_code)
        except Exception as e:
            # Best effort: an unparsable plan leaves no latest action recorded.
            logger.warning(f"Failed to parse action from plan: {e}")
            self.latest_action = None

        executor_info = {
            "current_subtask": subtask,
            "current_subtask_info": subtask_info,
            "executor_plan": plan,
            "reflection": reflection,
        }
        self.turn_count += 1

        self.screenshot_inputs.append(obs["screenshot"])

        return executor_info

    @staticmethod
    def _trim_generation_for_reflection(worker_generation: str) -> str:
        """Keep the text from "(Screenshot Analysis)" up to "(Grounded Action)".

        A marker that is absent leaves that side of the text untouched;
        ``str.find`` returns -1 in that case, which must not be used as a
        slice bound.
        """
        start = worker_generation.find("(Screenshot Analysis)")
        res = worker_generation if start == -1 else worker_generation[start:]
        end = res.find("(Grounded Action)")
        return res if end == -1 else res[:end]

    # Removes the previous action verification, and removes any extraneous grounded actions
    def clean_worker_generation_for_reflection(self,
                                               worker_generation: str) -> str:
        """Return the generation reduced to its analysis plus one grounded action."""
        # Remove the previous action verification and cut off extra grounded actions
        res = self._trim_generation_for_reflection(worker_generation)
        action = extract_first_agent_function(worker_generation)
        res += f"(Grounded Action)\n```python\n{action}\n```\n"
        return res