PyPI - lybic-guiagents - Versions diffs - 0.1.0__py3-none-any.whl - Mend

lybic-guiagents 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lybic-guiagents might be problematic. Click here for more details.

Files changed (85)

desktop_env/__init__.py +1 -0
desktop_env/actions.py +203 -0
desktop_env/controllers/__init__.py +0 -0
desktop_env/controllers/python.py +471 -0
desktop_env/controllers/setup.py +882 -0
desktop_env/desktop_env.py +509 -0
desktop_env/evaluators/__init__.py +5 -0
desktop_env/evaluators/getters/__init__.py +41 -0
desktop_env/evaluators/getters/calc.py +15 -0
desktop_env/evaluators/getters/chrome.py +1774 -0
desktop_env/evaluators/getters/file.py +154 -0
desktop_env/evaluators/getters/general.py +42 -0
desktop_env/evaluators/getters/gimp.py +38 -0
desktop_env/evaluators/getters/impress.py +126 -0
desktop_env/evaluators/getters/info.py +24 -0
desktop_env/evaluators/getters/misc.py +406 -0
desktop_env/evaluators/getters/replay.py +20 -0
desktop_env/evaluators/getters/vlc.py +86 -0
desktop_env/evaluators/getters/vscode.py +35 -0
desktop_env/evaluators/metrics/__init__.py +160 -0
desktop_env/evaluators/metrics/basic_os.py +68 -0
desktop_env/evaluators/metrics/chrome.py +493 -0
desktop_env/evaluators/metrics/docs.py +1011 -0
desktop_env/evaluators/metrics/general.py +665 -0
desktop_env/evaluators/metrics/gimp.py +637 -0
desktop_env/evaluators/metrics/libreoffice.py +28 -0
desktop_env/evaluators/metrics/others.py +92 -0
desktop_env/evaluators/metrics/pdf.py +31 -0
desktop_env/evaluators/metrics/slides.py +957 -0
desktop_env/evaluators/metrics/table.py +585 -0
desktop_env/evaluators/metrics/thunderbird.py +176 -0
desktop_env/evaluators/metrics/utils.py +719 -0
desktop_env/evaluators/metrics/vlc.py +524 -0
desktop_env/evaluators/metrics/vscode.py +283 -0
desktop_env/providers/__init__.py +35 -0
desktop_env/providers/aws/__init__.py +0 -0
desktop_env/providers/aws/manager.py +278 -0
desktop_env/providers/aws/provider.py +186 -0
desktop_env/providers/aws/provider_with_proxy.py +315 -0
desktop_env/providers/aws/proxy_pool.py +193 -0
desktop_env/providers/azure/__init__.py +0 -0
desktop_env/providers/azure/manager.py +87 -0
desktop_env/providers/azure/provider.py +207 -0
desktop_env/providers/base.py +97 -0
desktop_env/providers/gcp/__init__.py +0 -0
desktop_env/providers/gcp/manager.py +0 -0
desktop_env/providers/gcp/provider.py +0 -0
desktop_env/providers/virtualbox/__init__.py +0 -0
desktop_env/providers/virtualbox/manager.py +463 -0
desktop_env/providers/virtualbox/provider.py +124 -0
desktop_env/providers/vmware/__init__.py +0 -0
desktop_env/providers/vmware/manager.py +455 -0
desktop_env/providers/vmware/provider.py +105 -0
gui_agents/__init__.py +0 -0
gui_agents/agents/Action.py +209 -0
gui_agents/agents/__init__.py +0 -0
gui_agents/agents/agent_s.py +832 -0
gui_agents/agents/global_state.py +610 -0
gui_agents/agents/grounding.py +651 -0
gui_agents/agents/hardware_interface.py +129 -0
gui_agents/agents/manager.py +568 -0
gui_agents/agents/translator.py +132 -0
gui_agents/agents/worker.py +355 -0
gui_agents/cli_app.py +560 -0
gui_agents/core/__init__.py +0 -0
gui_agents/core/engine.py +1496 -0
gui_agents/core/knowledge.py +449 -0
gui_agents/core/mllm.py +555 -0
gui_agents/tools/__init__.py +0 -0
gui_agents/tools/tools.py +727 -0
gui_agents/unit_test/__init__.py +0 -0
gui_agents/unit_test/run_tests.py +65 -0
gui_agents/unit_test/test_manager.py +330 -0
gui_agents/unit_test/test_worker.py +269 -0
gui_agents/utils/__init__.py +0 -0
gui_agents/utils/analyze_display.py +301 -0
gui_agents/utils/common_utils.py +263 -0
gui_agents/utils/display_viewer.py +281 -0
gui_agents/utils/embedding_manager.py +53 -0
gui_agents/utils/image_axis_utils.py +27 -0
lybic_guiagents-0.1.0.dist-info/METADATA +416 -0
lybic_guiagents-0.1.0.dist-info/RECORD +85 -0
lybic_guiagents-0.1.0.dist-info/WHEEL +5 -0
lybic_guiagents-0.1.0.dist-info/licenses/LICENSE +201 -0
lybic_guiagents-0.1.0.dist-info/top_level.txt +2 -0

gui_agents/agents/translator.py ADDED Viewed

@@ -0,0 +1,132 @@
+# translator.py
+"""
+Translates pyautogui-style scripts into unified commands (JSON list),
+format strictly follows computer-use schema.
+"""
+from __future__ import annotations
+import ast, json
+from typing import List, Dict
+class TranslateError(RuntimeError):
+    """Raised when a pyautogui-style script cannot be translated into unified commands."""
+class _CommandBuilder(ast.NodeVisitor):
+    """
+    Collects unified commands from a parsed pyautogui-style script.
+    Only handles the most common GUI atomic operations:
+        click / doubleClick / rightClick / middleClick / moveTo /
+        dragTo / scroll / typewrite / write / press / hotkey / sleep
+    Conditions and loops raise TranslateError (the Grounding layer should
+    flatten these first). Statements that are not bare expressions, such as
+    ``import pyautogui``, produce no command.
+    Attributes:
+        cmds: Commands collected so far, in source order.
+    """
+    # Button implied by each click-style function when ``button`` is omitted.
+    _DEFAULT_BUTTON = {
+        "click": "left",
+        "doubleClick": "left",
+        "rightClick": "right",
+        "middleClick": "middle",
+    }
+    # (function, clicks, button) -> unified click action.
+    _CLICK_ACTIONS = {
+        ("click", 1, "left"): "click",
+        ("click", 2, "left"): "doubleClick",
+        ("click", 1, "right"): "rightClick",
+        ("click", 1, "middle"): "middleClick",
+        ("doubleClick", 1, "left"): "doubleClick",
+        ("rightClick", 1, "right"): "rightClick",
+        ("middleClick", 1, "middle"): "middleClick",
+    }
+    def __init__(self) -> None:
+        super().__init__()
+        self.cmds: List[Dict] = []
+    # ---------- Node Visiting ----------
+    def visit_Expr(self, node):         # pyautogui.xxx(...)
+        """Translate one expression statement; it must be a plain function call."""
+        if not isinstance(node.value, ast.Call):
+            raise TranslateError("Only function call level instructions allowed")
+        # Arguments are restricted to literals, so there is nothing nested to visit.
+        self._handle_call(node.value)
+    def _reject_control_flow(self, node):
+        """Refuse branches and loops instead of silently flattening their bodies."""
+        raise TranslateError(
+            f"{type(node).__name__} statements not supported; flatten control flow first")
+    visit_If = _reject_control_flow
+    visit_For = _reject_control_flow
+    visit_While = _reject_control_flow
+    # ---------- Core: Map function calls to commands ----------
+    def _handle_call(self, call: ast.Call):
+        """
+        Append the unified command for a single ``pyautogui.<fn>(...)`` call.
+        Raises:
+            TranslateError: if the call is not a direct pyautogui call, uses
+                non-literal arguments, or names an unsupported function.
+        """
+        if not isinstance(call.func, ast.Attribute):
+            raise TranslateError("Complex expressions not supported")
+        lib, fn = self._split_attr(call.func)
+        if lib != "pyautogui":
+            raise TranslateError("Only pyautogui calls allowed")
+        # Get positional and keyword arguments
+        kw = {k.arg: self._literal(k.value) for k in call.keywords}
+        pos = [self._literal(a) for a in call.args]
+        # ---------- mapping ----------
+        if fn in {"click", "doubleClick", "rightClick", "middleClick"}:
+            x, y = pos[:2] if len(pos) >= 2 else (kw.get("x"), kw.get("y"))
+            self._append_click(fn, x, y, kw)
+        elif fn == "moveTo":
+            x, y = pos[:2] if len(pos) >= 2 else (kw.get("x"), kw.get("y"))
+            self.cmds.append({"action": "move", "coordinate": [x, y]})
+        elif fn == "dragTo":
+            x, y = pos[:2] if len(pos) >= 2 else (kw.get("x"), kw.get("y"))
+            # startCoordinate needs to be supplemented by the caller; using None as a placeholder here
+            self.cmds.append({"action": "leftClickDrag",
+                              "startCoordinate": None,
+                              "coordinate": [x, y]})
+        elif fn == "scroll":
+            clicks = pos[0] if pos else kw.get("clicks")
+            if isinstance(clicks, bool) or not isinstance(clicks, (int, float)):
+                raise TranslateError("scroll requires a numeric clicks argument")
+            direction = "up" if clicks > 0 else "down"
+            # pyautogui.scroll(clicks, x, y): x / y may also be given positionally
+            x = pos[1] if len(pos) > 1 else kw.get("x", 0)
+            y = pos[2] if len(pos) > 2 else kw.get("y", 0)
+            self.cmds.append({"action": "scroll",
+                              "scrollAmount": abs(clicks),
+                              "scrollDirection": direction,
+                              "coordinate": [x, y]})
+        elif fn in {"typewrite", "write"}:
+            text = pos[0] if pos else kw.get("message")
+            self.cmds.append({"action": "type", "text": text})
+        elif fn in {"press", "hotkey"}:
+            keys = list(pos)
+            if not keys and "keys" in kw:
+                keys = [kw["keys"]]
+            if not keys or not all(isinstance(k, str) for k in keys):
+                raise TranslateError(f"{fn} requires one or more string key names")
+            self.cmds.append({"action": "keyPress", "text": "+".join(keys)})
+        elif fn == "sleep":
+            secs = pos[0] if pos else kw.get("seconds", 1)
+            self.cmds.append({"action": "wait", "duration": secs})
+        else:
+            raise TranslateError(f"Function {fn} not yet supported")
+    # ---------- Tools ----------
+    def _append_click(self, fn, x, y, kw):
+        """
+        Append the click command for ``fn`` at ``(x, y)``.
+        The button defaults to the one implied by ``fn`` (right for rightClick,
+        middle for middleClick, left otherwise) unless ``button`` is in ``kw``.
+        Raises:
+            TranslateError: if the function / clicks / button combination has
+                no unified equivalent.
+        """
+        clicks = kw.get("clicks", 1)
+        button = kw.get("button", self._DEFAULT_BUTTON.get(fn, "left"))
+        action = self._CLICK_ACTIONS.get((fn, clicks, button))
+        if not action:
+            raise TranslateError(f"Cannot map {fn} clicks={clicks} button={button}")
+        self.cmds.append({"action": action, "coordinate": [x, y]})
+    def _split_attr(self, attr: ast.Attribute):
+        """Return ``(module_name, function_name)`` for a ``name.attr`` node; anything deeper is rejected."""
+        if not isinstance(attr.value, ast.Name):
+            raise TranslateError("Complex expressions not supported")
+        return attr.value.id, attr.attr
+    def _literal(self, node):
+        """
+        Return the Python value of a literal argument node.
+        Signed numbers (``-3``, ``+2``) are parsed by ``ast`` as a unary
+        operator applied to a constant, so they are unwrapped here.
+        Raises:
+            TranslateError: if the node is not a literal.
+        """
+        if isinstance(node, ast.Constant):
+            return node.value
+        if (isinstance(node, ast.UnaryOp)
+                and isinstance(node.op, (ast.USub, ast.UAdd))
+                and isinstance(node.operand, ast.Constant)):
+            value = node.operand.value
+            if isinstance(value, (int, float)) and not isinstance(value, bool):
+                return -value if isinstance(node.op, ast.USub) else value
+        raise TranslateError("Only literal parameters allowed")
+# ---------- External API ----------
+def translate(py_code: str) -> List[Dict]:
+    """
+    Parse ``py_code`` and return the unified commands it maps to, in source order.
+    Any exception raised while parsing or visiting the script propagates to the caller.
+    """
+    syntax_tree = ast.parse(py_code)
+    visitor = _CommandBuilder()
+    visitor.visit(syntax_tree)
+    commands = visitor.cmds
+    return commands
+# ---------------- demo ----------------
+# if __name__ == "__main__":
+#     sample = "import pyautogui; pyautogui.click(769, 1006, clicks=1, button='left');"
+#     cmds = translate(sample)
+#     print(json.dumps(cmds, indent=2, ensure_ascii=False))

gui_agents/agents/worker.py ADDED Viewed

@@ -0,0 +1,355 @@
+import logging
+import re
+import textwrap
+from typing import Dict, List, Tuple
+import platform
+import os
+import json
+from gui_agents.agents.grounding import ACI
+from gui_agents.core.knowledge import KnowledgeBase
+from gui_agents.utils.common_utils import (
+    Node,
+    extract_first_agent_function,
+    parse_single_code_from_string,
+    sanitize_code,
+    agent_log_to_string,
+)
+from gui_agents.tools.tools import Tools
+from gui_agents.store.registry import Registry
+from gui_agents.agents.global_state import GlobalState
+logger = logging.getLogger("desktopenv.agent")
+class Worker:
+    """
+    Produces the next GUI action for the active subtask.
+    Each call to generate_next_action optionally retrieves similar subtask
+    experience (first turn only), optionally reflects on the trajectory so
+    far, then asks the action-generator tool for a plan and records the first
+    agent function found in that plan as the latest action.
+    """
+    def __init__(
+        self,
+        Tools_dict: Dict,
+        local_kb_path: str,
+        platform: str = platform.system().lower(),
+        enable_reflection: bool = True,
+        use_subtask_experience: bool = True,
+        enable_takeover: bool = False,
+        enable_search: bool = True,
+        tools_config: "Dict | None" = None,
+    ):
+        """
+        Worker receives a subtask list and the active subtask and generates the next action to execute.
+        Args:
+            Tools_dict: Dict
+                Per-tool settings keyed by tool name. The entries read by this
+                class are "embedding", "traj_reflector" and the action
+                generator tool, each with "provider" and "model" keys.
+            local_kb_path: str
+                Path to knowledge base
+            platform: str
+                OS platform the agent runs on (darwin, linux, windows)
+            enable_reflection: bool
+                Whether to enable reflection
+            use_subtask_experience: bool
+                Whether to use subtask experience
+            enable_takeover: bool
+                Whether to enable user takeover functionality
+            enable_search: bool
+                Global switch for search functionality (overrides config)
+            tools_config: Dict | None
+                Complete tools configuration from tools_config.json. When
+                omitted or empty it is loaded from the "tools" directory that
+                sits next to this file's parent directory.
+        """
+        self.platform = platform
+        self.local_kb_path = local_kb_path
+        self.Tools_dict = Tools_dict
+        self.enable_takeover = enable_takeover
+        self.enable_search = enable_search  # Store global search switch
+        self.enable_reflection = enable_reflection
+        self.use_subtask_experience = use_subtask_experience
+        # If tools_config is not provided (None or empty), load it from file
+        if not tools_config:
+            tools_config_path = os.path.join(
+                os.path.dirname(os.path.dirname(__file__)), "tools",
+                "tools_config.json")
+            with open(tools_config_path, "r", encoding="utf-8") as f:
+                tools_config = json.load(f)
+        self.tools_config = tools_config
+        self.global_state: GlobalState = Registry.get(
+            "GlobalStateStore")  # type: ignore
+        # reset() builds every tool wrapper, including the embedding engine.
+        self.reset()
+    def reset(self):
+        """Recreate the tool wrappers and knowledge base and clear all per-subtask state."""
+        self.generator_agent = Tools()
+        self.action_generator_tool = ("action_generator_with_takeover"
+                                      if self.enable_takeover else
+                                      "action_generator")
+        # Get tool configuration from tools_config (None when there is no entry)
+        tool_config = next(
+            (tool for tool in self.tools_config.get("tools", [])
+             if tool.get("tool_name") == self.action_generator_tool), None)
+        # Prepare tool parameters
+        tool_params = {}
+        if not self.enable_search:
+            # If global search is disabled, force disable search for this tool
+            tool_params["enable_search"] = False
+            logger.info(
+                "Configuring %s with search DISABLED (global switch off)",
+                self.action_generator_tool)
+        elif tool_config and "enable_search" in tool_config:
+            # Global search is enabled: use the tool-specific settings from config
+            enable_search = tool_config.get("enable_search", False)
+            tool_params["enable_search"] = enable_search
+            tool_params["search_provider"] = tool_config.get(
+                "search_provider", "bocha")
+            tool_params["search_model"] = tool_config.get("search_model", "")
+            logger.info(
+                "Configuring %s with search enabled: %s (from config)",
+                self.action_generator_tool, enable_search)
+        # Register the tool with parameters
+        self.generator_agent.register_tool(
+            self.action_generator_tool,
+            self.Tools_dict[self.action_generator_tool]["provider"],
+            self.Tools_dict[self.action_generator_tool]["model"], **tool_params)
+        self.reflection_agent = Tools()
+        self.reflection_agent.register_tool(
+            "traj_reflector", self.Tools_dict["traj_reflector"]["provider"],
+            self.Tools_dict["traj_reflector"]["model"])
+        self.embedding_engine = Tools()
+        self.embedding_engine.register_tool(
+            "embedding", self.Tools_dict["embedding"]["provider"],
+            self.Tools_dict["embedding"]["model"])
+        self.knowledge_base = KnowledgeBase(
+            embedding_engine=self.embedding_engine,
+            Tools_dict=self.Tools_dict,
+            local_kb_path=self.local_kb_path,
+            platform=self.platform,
+        )
+        self.turn_count = 0
+        self.worker_history = []
+        self.reflections = []
+        self.cost_this_turn = 0
+        self.screenshot_inputs = []
+        self.planner_history = []
+        self.latest_action = None
+        self.max_trajector_length = 8
+    def generate_next_action(
+        self,
+        Tu: str,
+        search_query: str,
+        subtask: str,
+        subtask_info: str,
+        future_tasks: List[Node],
+        done_task: List[Node],
+        obs: Dict,
+        running_state: str = "running",
+    ) -> Dict:
+        """
+        Predict the next action(s) based on the current observation.
+        Args:
+            Tu: Task description; on the first turn retrieved experience may be appended to it.
+            search_query: Task text used to build the experience retrieval key.
+            subtask: Name of the active subtask.
+            subtask_info: Extra instructions for the active subtask.
+            future_tasks: Subtasks still to do; only their ``name`` is read.
+            done_task: Subtasks already finished; only their ``name`` is read.
+            obs: Current observation; ``obs["screenshot"]`` is passed to the tools.
+            running_state: Not used by this method.
+        Returns:
+            Dict with keys "current_subtask", "current_subtask_info",
+            "executor_plan" (the raw generator output) and "reflection"
+            (None when no reflection was produced this turn).
+        """
+        import time
+        # Log the result of the previous hardware action, which is the current observation.
+        if self.turn_count > 0 and self.latest_action:
+            self.global_state.add_agent_log({
+                "type":
+                    "passive",
+                "content":
+                    f"Hardware action `{self.latest_action}` has been executed. The result is reflected in the current screenshot."
+            })
+        # Get RAG knowledge, only update system message at t=0
+        prefix_message = ""
+        if self.turn_count == 0:
+            if self.use_subtask_experience:
+                Tu += self._retrieve_subtask_experience(search_query, subtask,
+                                                        subtask_info)
+            prefix_message = f"SUBTASK_DESCRIPTION is {subtask}\n\nTASK_DESCRIPTION is {Tu}\n\nFUTURE_TASKS is {', '.join([f.name for f in future_tasks])}\n\nDONE_TASKS is {', '.join(d.name for d in done_task)}"
+        # Reflection generation does not add its own response, it only gets the trajectory
+        reflection = None
+        if self.enable_reflection:
+            reflection = self._reflect_on_trajectory(subtask, subtask_info, obs)
+        generator_message = self._build_generator_message(
+            prefix_message, subtask, subtask_info, reflection)
+        action_generator_start = time.time()
+        plan, total_tokens, cost_string = self.generator_agent.execute_tool(
+            self.action_generator_tool, {
+                "str_input": generator_message,
+                "img_input": obs["screenshot"]
+            })
+        action_generator_time = time.time() - action_generator_start
+        logger.info("Action generator tokens: %s, cost: %s", total_tokens,
+                    cost_string)
+        logger.info(
+            "[Timing] Worker.action_generator execution time: %.2f seconds",
+            action_generator_time)
+        self.planner_history.append(plan)
+        logger.info("Action Plan: %s", plan)
+        self.global_state.log_operation(module="worker",
+                                        operation="action_plan",
+                                        data={
+                                            "tokens": total_tokens,
+                                            "cost": cost_string,
+                                            "content": plan,
+                                            "duration": action_generator_time
+                                        })
+        # Add the generated plan to the agent log as passive memory
+        self.global_state.add_agent_log({"type": "passive", "content": plan})
+        # Best effort: an unparsable plan must not abort the turn.
+        try:
+            action_code = parse_single_code_from_string(
+                plan.split("Grounded Action")[-1])
+            action_code = sanitize_code(action_code)
+            self.latest_action = extract_first_agent_function(action_code)
+        except Exception as e:
+            logger.warning("Failed to parse action from plan: %s", e)
+            self.latest_action = None
+        executor_info = {
+            "current_subtask": subtask,
+            "current_subtask_info": subtask_info,
+            "executor_plan": plan,
+            "reflection": reflection,
+        }
+        self.turn_count += 1
+        self.screenshot_inputs.append(obs["screenshot"])
+        return executor_info
+    def _retrieve_subtask_experience(self, search_query: str, subtask: str,
+                                     subtask_info: str) -> str:
+        """Retrieve similar subtask experience and return it as text to append to the task description."""
+        import time
+        subtask_query_key = ("Task:\n" + search_query + "\n\nSubtask: " +
+                             subtask + "\nSubtask Instruction: " + subtask_info)
+        retrieve_start = time.time()
+        retrieved_similar_subtask, retrieved_subtask_experience, total_tokens, cost_string = (
+            self.knowledge_base.retrieve_episodic_experience(subtask_query_key))
+        retrieve_time = time.time() - retrieve_start
+        logger.info("Retrieve episodic experience tokens: %s, cost: %s",
+                    total_tokens, cost_string)
+        logger.info(
+            "[Timing] Worker.retrieve_episodic_experience execution time: %.2f seconds",
+            retrieve_time)
+        # Dirty fix to replace id with element description during subtask retrieval
+        retrieved_subtask_experience = re.sub(r"\(\d+", "(element_description",
+                                              retrieved_subtask_experience)
+        retrieved_subtask_experience = retrieved_subtask_experience.replace(
+            "_id", "_description")
+        logger.info(
+            "SIMILAR SUBTASK EXPERIENCE: %s",
+            retrieved_similar_subtask + "\n" +
+            retrieved_subtask_experience.strip(),
+        )
+        self.global_state.log_operation(
+            module="worker",
+            operation="Worker.retrieve_episodic_experience",
+            data={
+                "tokens":
+                    total_tokens,
+                "cost":
+                    cost_string,
+                "content":
+                    "Retrieved similar subtask: " + retrieved_similar_subtask +
+                    "\n" + "Retrieved subtask experience: " +
+                    retrieved_subtask_experience.strip(),
+                "duration":
+                    retrieve_time
+            })
+        return "\nYou may refer to some similar subtask experience if you think they are useful. {}".format(
+            retrieved_similar_subtask + "\n" + retrieved_subtask_experience)
+    def _reflect_on_trajectory(self, subtask: str, subtask_info: str,
+                               obs: Dict):
+        """
+        Feed the trajectory reflector for this turn.
+        On the first turn only the subtask context and initial screenshot are
+        added to the reflector's messages and None is returned. On later turns
+        the reflector is run on the previous plan and its reflection is
+        recorded and returned.
+        """
+        import time
+        if self.turn_count == 0:
+            # Load the initial subtask info
+            text_content = textwrap.dedent(f"""
+                    Subtask Description: {subtask}
+                    Subtask Information: {subtask_info}
+                    Current Trajectory below:
+                    """)
+            self.reflection_agent.tools["traj_reflector"].llm_agent.add_message(
+                text_content +
+                "\n\nThe initial screen is provided. No action has been taken yet.",
+                image_content=obs["screenshot"],
+                role="user")
+            return None
+        if self.planner_history and self.planner_history[-1] is not None:
+            text_content = self.clean_worker_generation_for_reflection(
+                self.planner_history[-1])
+        else:
+            text_content = "No previous action available for reflection"
+        reflection_start = time.time()
+        reflection, total_tokens, cost_string = self.reflection_agent.execute_tool(
+            "traj_reflector", {
+                "str_input": text_content,
+                "img_input": obs["screenshot"]
+            })
+        reflection_time = time.time() - reflection_start
+        logger.info("Trajectory reflector tokens: %s, cost: %s", total_tokens,
+                    cost_string)
+        logger.info(
+            "[Timing] Worker.traj_reflector execution time: %.2f seconds",
+            reflection_time)
+        self.reflections.append(reflection)
+        logger.info("REFLECTION: %s", reflection)
+        self.global_state.log_operation(module="manager",
+                                        operation="reflection",
+                                        data={
+                                            "tokens": total_tokens,
+                                            "cost": cost_string,
+                                            "content": reflection,
+                                            "duration": reflection_time
+                                        })
+        return reflection
+    def _build_generator_message(self, prefix_message: str, subtask: str,
+                                 subtask_info: str, reflection) -> str:
+        """Compose the text prompt sent to the action generator for the current turn."""
+        # Only provide subinfo in the very first message to avoid over influence and redundancy
+        if self.turn_count == 0:
+            return (
+                prefix_message +
+                f"Remember only complete the subtask: {subtask}\n" +
+                f"You can use this extra information for completing the current subtask: {subtask_info}.\n"
+            )
+        agent_log = agent_log_to_string(self.global_state.get_agent_log())
+        generator_message = f"\nYour previous action was: {self.latest_action}\n"
+        if reflection:
+            generator_message += f"\nYou may use this reflection on the previous action and overall trajectory: {reflection}\n"
+        generator_message += f"Please refer to the agent log to understand the progress and context of the task so far.\n{agent_log}"
+        return generator_message
+    # Removes the previous action verification, and removes any extraneous grounded actions
+    def clean_worker_generation_for_reflection(self,
+                                               worker_generation: str) -> str:
+        """
+        Reduce a worker generation to its screenshot analysis plus one grounded action.
+        Text before "(Screenshot Analysis)" and from "(Grounded Action)" onward
+        is dropped when those markers are present; the first agent function
+        extracted from the full generation is then appended as the single
+        grounded action.
+        """
+        # Remove the previous action verification. str.find returns -1 when the
+        # marker is absent, which must not be used as a slice bound.
+        analysis_start = worker_generation.find("(Screenshot Analysis)")
+        res = (worker_generation[analysis_start:]
+               if analysis_start != -1 else worker_generation)
+        action = extract_first_agent_function(worker_generation)
+        # Cut off extra grounded actions
+        grounded_start = res.find("(Grounded Action)")
+        if grounded_start != -1:
+            res = res[:grounded_start]
+        res += f"(Grounded Action)\n```python\n{action}\n```\n"
+        return res