lybic-guiagents 0.1.0 (lybic_guiagents-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lybic-guiagents has been flagged by the registry scanner as possibly problematic.

Files changed (85)
  1. desktop_env/__init__.py +1 -0
  2. desktop_env/actions.py +203 -0
  3. desktop_env/controllers/__init__.py +0 -0
  4. desktop_env/controllers/python.py +471 -0
  5. desktop_env/controllers/setup.py +882 -0
  6. desktop_env/desktop_env.py +509 -0
  7. desktop_env/evaluators/__init__.py +5 -0
  8. desktop_env/evaluators/getters/__init__.py +41 -0
  9. desktop_env/evaluators/getters/calc.py +15 -0
  10. desktop_env/evaluators/getters/chrome.py +1774 -0
  11. desktop_env/evaluators/getters/file.py +154 -0
  12. desktop_env/evaluators/getters/general.py +42 -0
  13. desktop_env/evaluators/getters/gimp.py +38 -0
  14. desktop_env/evaluators/getters/impress.py +126 -0
  15. desktop_env/evaluators/getters/info.py +24 -0
  16. desktop_env/evaluators/getters/misc.py +406 -0
  17. desktop_env/evaluators/getters/replay.py +20 -0
  18. desktop_env/evaluators/getters/vlc.py +86 -0
  19. desktop_env/evaluators/getters/vscode.py +35 -0
  20. desktop_env/evaluators/metrics/__init__.py +160 -0
  21. desktop_env/evaluators/metrics/basic_os.py +68 -0
  22. desktop_env/evaluators/metrics/chrome.py +493 -0
  23. desktop_env/evaluators/metrics/docs.py +1011 -0
  24. desktop_env/evaluators/metrics/general.py +665 -0
  25. desktop_env/evaluators/metrics/gimp.py +637 -0
  26. desktop_env/evaluators/metrics/libreoffice.py +28 -0
  27. desktop_env/evaluators/metrics/others.py +92 -0
  28. desktop_env/evaluators/metrics/pdf.py +31 -0
  29. desktop_env/evaluators/metrics/slides.py +957 -0
  30. desktop_env/evaluators/metrics/table.py +585 -0
  31. desktop_env/evaluators/metrics/thunderbird.py +176 -0
  32. desktop_env/evaluators/metrics/utils.py +719 -0
  33. desktop_env/evaluators/metrics/vlc.py +524 -0
  34. desktop_env/evaluators/metrics/vscode.py +283 -0
  35. desktop_env/providers/__init__.py +35 -0
  36. desktop_env/providers/aws/__init__.py +0 -0
  37. desktop_env/providers/aws/manager.py +278 -0
  38. desktop_env/providers/aws/provider.py +186 -0
  39. desktop_env/providers/aws/provider_with_proxy.py +315 -0
  40. desktop_env/providers/aws/proxy_pool.py +193 -0
  41. desktop_env/providers/azure/__init__.py +0 -0
  42. desktop_env/providers/azure/manager.py +87 -0
  43. desktop_env/providers/azure/provider.py +207 -0
  44. desktop_env/providers/base.py +97 -0
  45. desktop_env/providers/gcp/__init__.py +0 -0
  46. desktop_env/providers/gcp/manager.py +0 -0
  47. desktop_env/providers/gcp/provider.py +0 -0
  48. desktop_env/providers/virtualbox/__init__.py +0 -0
  49. desktop_env/providers/virtualbox/manager.py +463 -0
  50. desktop_env/providers/virtualbox/provider.py +124 -0
  51. desktop_env/providers/vmware/__init__.py +0 -0
  52. desktop_env/providers/vmware/manager.py +455 -0
  53. desktop_env/providers/vmware/provider.py +105 -0
  54. gui_agents/__init__.py +0 -0
  55. gui_agents/agents/Action.py +209 -0
  56. gui_agents/agents/__init__.py +0 -0
  57. gui_agents/agents/agent_s.py +832 -0
  58. gui_agents/agents/global_state.py +610 -0
  59. gui_agents/agents/grounding.py +651 -0
  60. gui_agents/agents/hardware_interface.py +129 -0
  61. gui_agents/agents/manager.py +568 -0
  62. gui_agents/agents/translator.py +132 -0
  63. gui_agents/agents/worker.py +355 -0
  64. gui_agents/cli_app.py +560 -0
  65. gui_agents/core/__init__.py +0 -0
  66. gui_agents/core/engine.py +1496 -0
  67. gui_agents/core/knowledge.py +449 -0
  68. gui_agents/core/mllm.py +555 -0
  69. gui_agents/tools/__init__.py +0 -0
  70. gui_agents/tools/tools.py +727 -0
  71. gui_agents/unit_test/__init__.py +0 -0
  72. gui_agents/unit_test/run_tests.py +65 -0
  73. gui_agents/unit_test/test_manager.py +330 -0
  74. gui_agents/unit_test/test_worker.py +269 -0
  75. gui_agents/utils/__init__.py +0 -0
  76. gui_agents/utils/analyze_display.py +301 -0
  77. gui_agents/utils/common_utils.py +263 -0
  78. gui_agents/utils/display_viewer.py +281 -0
  79. gui_agents/utils/embedding_manager.py +53 -0
  80. gui_agents/utils/image_axis_utils.py +27 -0
  81. lybic_guiagents-0.1.0.dist-info/METADATA +416 -0
  82. lybic_guiagents-0.1.0.dist-info/RECORD +85 -0
  83. lybic_guiagents-0.1.0.dist-info/WHEEL +5 -0
  84. lybic_guiagents-0.1.0.dist-info/licenses/LICENSE +201 -0
  85. lybic_guiagents-0.1.0.dist-info/top_level.txt +2 -0
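
The file diff below appears to correspond to item 67 in the list above, gui_agents/core/knowledge.py (+449 lines): the KnowledgeBase class that backs web-search query formulation plus episodic and narrative memory retrieval for the agents. A hedged usage sketch follows the diff.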
@@ -0,0 +1,449 @@
+ import json
+ import os
+ from typing import Dict, Tuple, List, Union
+ import numpy as np
+ from sklearn.metrics.pairwise import cosine_similarity
+ from gui_agents.utils.common_utils import (
+     load_embeddings,
+     load_knowledge_base,
+     save_embeddings,
+ )
+ from gui_agents.tools.tools import Tools
+ from gui_agents.agents.global_state import GlobalState
+ from gui_agents.store.registry import Registry
+ from gui_agents.core.mllm import CostManager
+
+ def get_embedding_dim(model_name):
+     if model_name == "doubao-embedding-large-text-250515":
+         return 2048
+     elif model_name == "doubao-embedding-text-240715":
+         return 2560
+     elif model_name == "text-embedding-ada-002":
+         return 1536
+     elif model_name == "text-embedding-3-small":
+         return 1536
+     elif model_name == "text-embedding-3-large":
+         return 3072
+     elif model_name == "gemini-embedding-001":
+         return 3072
+     elif model_name == "jina-embeddings-v4":
+         return 2048
+     elif model_name == "jina-embeddings-v3":
+         return 1024
+     elif model_name == "text-embedding-v4":
+         return 1024
+     elif model_name == "text-embedding-v3":
+         return 1024
+     elif model_name == "embedding-2" or model_name == "embedding-3":
+         return 2048
+     else:
+         return None
+
+ class KnowledgeBase:
+     def __init__(
+         self,
+         embedding_engine: Tools,
+         local_kb_path: str,
+         platform: str,
+         Tools_dict: Dict,
+         save_knowledge: bool = True,
+     ):
+         self.platform = platform
+
+         self.local_kb_path = local_kb_path
+
+         # initialize embedding engine
+         self.embedding_engine = embedding_engine
+
+         # Initialize paths for different memory types
+         self.episodic_memory_path = os.path.join(
+             self.local_kb_path, self.platform, "episodic_memory.json"
+         )
+         self.narrative_memory_path = os.path.join(
+             self.local_kb_path, self.platform, "narrative_memory.json"
+         )
+         embedding_model_name = ""
+         if hasattr(self.embedding_engine, "tools") and "embedding" in self.embedding_engine.tools:
+             embedding_model_name = self.embedding_engine.tools["embedding"].model_name
+         else:
+             embedding_model_name = "default"
+         embedding_dim = get_embedding_dim(embedding_model_name)
+         self.embeddings_path = os.path.join(
+             self.local_kb_path, self.platform, f"embeddings_{embedding_model_name}_{embedding_dim}.pkl"
+         )
+
+         # Initialize trajectory tracking
+         self.task_trajectory = ""
+         self.current_subtask_trajectory = ""
+         self.current_search_query = ""
+
+         self.query_formulator = Tools()
+         self.query_formulator.register_tool(
+             "query_formulator",
+             Tools_dict["query_formulator"]["provider"],
+             Tools_dict["query_formulator"]["model"],
+         )
+
+         self.knowledge_fusion_agent = Tools()
+         self.knowledge_fusion_agent.register_tool(
+             "context_fusion",
+             Tools_dict["context_fusion"]["provider"],
+             Tools_dict["context_fusion"]["model"],
+         )
+
+         self.narrative_summarization_agent = Tools()
+         self.narrative_summarization_agent.register_tool(
+             "narrative_summarization",
+             Tools_dict["narrative_summarization"]["provider"],
+             Tools_dict["narrative_summarization"]["model"],
+         )
+
+         self.episode_summarization_agent = Tools()
+         self.episode_summarization_agent.register_tool(
+             "episode_summarization",
+             Tools_dict["episode_summarization"]["provider"],
+             Tools_dict["episode_summarization"]["model"],
+         )
+
+         self.save_knowledge = save_knowledge
+
+     def retrieve_knowledge(
+         self, instruction: str, search_query: str, search_engine: Tools
+     ) -> Tuple[str, List[int], str]:
+         """Retrieve knowledge using the search engine.
+
+         Args:
+             instruction (str): task instruction
+             search_query (str): search query to use
+             search_engine (Tools): search engine tool to use
+
+         Returns:
+             Tuple[str, List[int], str]: the search results, token usage, and cost string
+         """
+         search_results, total_tokens, cost_string = search_engine.execute_tool(
+             "websearch", {"str_input": instruction + " " + search_query}
+         )
+
+         return search_results, total_tokens, cost_string
+
+     def formulate_query(self, instruction: str, observation: Dict) -> Tuple[str, List[int], str]:
+         """Formulate a search query based on the instruction and current state.
+
+         Args:
+             instruction (str): The task instruction
+             observation (Dict): Current observation including screenshot
+
+         Returns:
+             Tuple[str, List[int], str]: the formulated query, token usage, and cost string
+         """
+         query_path = os.path.join(
+             self.local_kb_path, self.platform, "formulate_query.json"
+         )
+         try:
+             with open(query_path, "r") as f:
+                 formulate_query = json.load(f)
+         except Exception:
+             formulate_query = {}
+
+         if instruction in formulate_query:
+             return formulate_query[instruction], [0, 0, 0], ""
+
+         self.query_formulator.tools["query_formulator"].llm_agent.reset()
+
+         content, total_tokens, cost_string = self.query_formulator.execute_tool("query_formulator", {
+             "str_input": f"The task is: {instruction}\n" +
+             "To use google search to get some useful information, first carefully analyze " +
+             "the screenshot of the current desktop UI state, then given the task " +
+             "instruction, formulate a question that can be used to search on the Internet " +
+             "for information in helping with the task execution.\n" +
+             "The question should not be too general or too specific. Please ONLY provide " +
+             "the question.\nQuestion:",
+             "img_input": observation["screenshot"] if "screenshot" in observation else None
+         })
+
+         search_query = content.strip().replace('"', "")
+
+         print("search query: ", search_query)
+         formulate_query[instruction] = search_query
+         with open(query_path, "w") as f:
+             json.dump(formulate_query, f, indent=2)
+
+         return search_query, total_tokens, cost_string
+
+     def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
+         """Retrieve narrative experience using embeddings.
+
+         Args:
+             instruction (str): The task instruction
+
+         Returns:
+             Tuple[str, str, List[int], str]: the similar task key, its narrative experience,
+             token usage, and cost string
+         """
+         knowledge_base = load_knowledge_base(self.narrative_memory_path)
+         if not knowledge_base:
+             return "None", "None", [0, 0, 0], ""
+
+         embeddings = load_embeddings(self.embeddings_path)
+
+         # Get or create instruction embedding
+         instruction_embedding = embeddings.get(instruction)
+         total_tokens, cost_string = [0, 0, 0], ""
+
+         if instruction_embedding is None:
+             instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
+                 "embedding", {"str_input": instruction}
+             )
+             embeddings[instruction] = instruction_embedding
+             for i in range(len(total_tokens)):
+                 total_tokens[i] += tokens[i]
+             cost_string = cost_string_now
+
+         # Get or create embeddings for knowledge base entries
+         candidate_embeddings = []
+         for key in knowledge_base:
+             candidate_embedding = embeddings.get(key)
+             if candidate_embedding is None:
+                 candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
+                     "embedding", {"str_input": key}
+                 )
+                 for i in range(len(tokens)):
+                     total_tokens[i] += tokens[i]
+                 cost_string = CostManager.add_costs(cost_string, cost_string_now)
+                 embeddings[key] = candidate_embedding
+
+             candidate_embeddings.append(candidate_embedding)
+
+         save_embeddings(self.embeddings_path, embeddings)
+
+         similarities = cosine_similarity(
+             instruction_embedding, np.vstack(candidate_embeddings)
+         )[0]
+         sorted_indices = np.argsort(similarities)[::-1]
+
+         keys = list(knowledge_base.keys())
+         # Skip the exact-match entry if the instruction itself is already in the KB
+         idx = 1 if keys[sorted_indices[0]] == instruction else 0
+         return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string
+
+     def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
+         """Retrieve similar task experience using embeddings.
+
+         Args:
+             instruction (str): The task instruction
+
+         Returns:
+             Tuple[str, str, List[int], str]: the similar task key, its episodic experience,
+             token usage, and cost string
+         """
+         knowledge_base = load_knowledge_base(self.episodic_memory_path)
+         if not knowledge_base:
+             return "None", "None", [0, 0, 0], ""
+
+         embeddings = load_embeddings(self.embeddings_path)
+
+         # Get or create instruction embedding
+         instruction_embedding = embeddings.get(instruction)
+         total_tokens, cost_string = [0, 0, 0], ""
+
+         if instruction_embedding is None:
+             instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
+                 "embedding", {"str_input": instruction}
+             )
+             embeddings[instruction] = instruction_embedding
+             for i in range(len(total_tokens)):
+                 total_tokens[i] += tokens[i]
+             cost_string = cost_string_now
+
+         # Get or create embeddings for knowledge base entries
+         candidate_embeddings = []
+         for key in knowledge_base:
+             candidate_embedding = embeddings.get(key)
+             if candidate_embedding is None:
+                 candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
+                     "embedding", {"str_input": key}
+                 )
+                 for i in range(len(total_tokens)):
+                     total_tokens[i] += tokens[i]
+                 cost_string = CostManager.add_costs(cost_string, cost_string_now)
+                 embeddings[key] = candidate_embedding
+
+             candidate_embeddings.append(candidate_embedding)
+
+         save_embeddings(self.embeddings_path, embeddings)
+
+         similarities = cosine_similarity(
+             instruction_embedding, np.vstack(candidate_embeddings)
+         )[0]
+         sorted_indices = np.argsort(similarities)[::-1]
+
+         keys = list(knowledge_base.keys())
+         # Skip the exact-match entry if the instruction itself is already in the KB
+         idx = 1 if keys[sorted_indices[0]] == instruction else 0
+         return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string
+
+     def knowledge_fusion(
+         self,
+         observation: Dict,
+         instruction: str,
+         web_knowledge: str,
+         similar_task: str,
+         experience: str,
+     ) -> Tuple[str, List[int], str]:
+         """Combine web knowledge with similar task experience."""
+         content, total_tokens, cost = self.knowledge_fusion_agent.execute_tool("context_fusion", {
+             "str_input": f"Task: {instruction}\n" +
+             f"**Web search result**:\n{web_knowledge}\n\n" +
+             f"**Retrieved similar task experience**:\n" +
+             f"Similar task:{similar_task}\n{experience}\n\n" +
+             "Based on the web search result and the retrieved similar task experience, " +
+             "if you think the similar task experience is indeed useful to the main task, " +
+             "integrate it with the web search result. Provide the final knowledge in a numbered list.",
+             "img_input": observation["screenshot"] if "screenshot" in observation else None
+         })
+
+         return content, total_tokens, cost
+
+     def save_episodic_memory(self, subtask_key: str, subtask_traj: str) -> Union[str, None]:
+         """Save episodic memory (subtask level knowledge).
+
+         Args:
+             subtask_key (str): Key identifying the subtask
+             subtask_traj (str): Trajectory/experience of the subtask
+
+         Returns:
+             The stored summary for the subtask, or None if saving is disabled.
+         """
+         if not self.save_knowledge:
+             return None
+
+         try:
+             kb = load_knowledge_base(self.episodic_memory_path)
+         except Exception:
+             kb = {}
+
+         if subtask_key not in kb:
+             # summarize_episode returns (summary, token usage, cost); store only the summary text
+             subtask_summarization, _, _ = self.summarize_episode(subtask_traj)
+             kb[subtask_key] = subtask_summarization
+
+             if self.save_knowledge:
+                 os.makedirs(os.path.dirname(self.episodic_memory_path), exist_ok=True)
+                 with open(self.episodic_memory_path, "w") as fout:
+                     json.dump(kb, fout, indent=2)
+
+         return kb.get(subtask_key)
+
+     def save_narrative_memory(self, task_key: str, task_traj: str) -> Union[str, None]:
+         """Save narrative memory (task level knowledge).
+
+         Args:
+             task_key (str): Key identifying the task
+             task_traj (str): Full trajectory/experience of the task
+
+         Returns:
+             The stored summary for the task, or None if saving is disabled.
+         """
+         if not self.save_knowledge:
+             return None
+
+         try:
+             kb = load_knowledge_base(self.narrative_memory_path)
+         except Exception:
+             kb = {}
+
+         if task_key not in kb:
+             # summarize_narrative returns (summary, token usage, cost); store only the summary text
+             task_summarization, _, _ = self.summarize_narrative(task_traj)
+             kb[task_key] = task_summarization
+
+             if self.save_knowledge:
+                 os.makedirs(os.path.dirname(self.narrative_memory_path), exist_ok=True)
+                 with open(self.narrative_memory_path, "w") as fout:
+                     json.dump(kb, fout, indent=2)
+
+         return kb.get(task_key)
+
+     def initialize_task_trajectory(self, instruction: str) -> None:
+         """Initialize a new task trajectory.
+
+         Args:
+             instruction (str): The task instruction
+         """
+         self.task_trajectory = f"Task:\n{instruction}"
+         self.current_search_query = ""
+         self.current_subtask_trajectory = ""
+
+     def update_task_trajectory(self, meta_data: Dict) -> None:
+         """Update the task trajectory with new metadata.
+
+         Args:
+             meta_data (Dict): Metadata from the agent's prediction
+         """
+         if not self.current_search_query and "search_query" in meta_data:
+             self.current_search_query = meta_data["search_query"]
+
+         self.task_trajectory += (
+             "\n\nReflection:\n"
+             + str(meta_data["reflection"])
+             + "\n\n----------------------\n\nPlan:\n"
+             + meta_data["executor_plan"]
+         )
+
+     def handle_subtask_trajectory(self, meta_data: Dict) -> bool:
+         """Handle subtask trajectory updates based on subtask status.
+
+         Args:
+             meta_data (Dict): Metadata containing subtask information
+
+         Returns:
+             bool: Whether the subtask was completed
+         """
+         subtask_status = meta_data["subtask_status"]
+         subtask = meta_data["subtask"]
+         subtask_info = meta_data["subtask_info"]
+
+         if subtask_status in ["Start", "Done"]:
+             # If there's an existing subtask trajectory, finalize it
+             if self.current_subtask_trajectory:
+                 self.current_subtask_trajectory += "\nSubtask Completed.\n"
+                 subtask_key = self.current_subtask_trajectory.split(
+                     "\n----------------------\n\nPlan:\n"
+                 )[0]
+                 self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
+                 self.current_subtask_trajectory = ""
+                 return True
+
+             # Start new subtask trajectory
+             self.current_subtask_trajectory = (
+                 f"Task:\n{self.current_search_query}\n\n"
+                 f"Subtask: {subtask}\n"
+                 f"Subtask Instruction: {subtask_info}\n"
+                 f"----------------------\n\n"
+                 f'Plan:\n{meta_data["executor_plan"]}\n'
+             )
+             return False
+
+         elif subtask_status == "In":
+             # Continue current subtask trajectory
+             self.current_subtask_trajectory += (
+                 f'\n----------------------\n\nPlan:\n{meta_data["executor_plan"]}\n'
+             )
+             return False
+
+     def finalize_task(self) -> None:
+         """Finalize the task by saving any remaining trajectories."""
+         # Save any remaining subtask trajectory
+         if self.current_subtask_trajectory:
+             self.current_subtask_trajectory += "\nSubtask Completed.\n"
+             subtask_key = self.current_subtask_trajectory.split(
+                 "\n----------------------\n\nPlan:\n"
+             )[0]
+             self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
+
+         # Save the complete task trajectory
+         if self.task_trajectory and self.current_search_query:
+             self.save_narrative_memory(self.current_search_query, self.task_trajectory)
+
+         # Reset trajectories
+         self.task_trajectory = ""
+         self.current_subtask_trajectory = ""
+         self.current_search_query = ""
+
+     def summarize_episode(self, trajectory: str) -> Tuple[str, List[int], str]:
+         """Summarize the episode experience for lifelong learning reflection.
+
+         Args:
+             trajectory (str): The episode experience to be summarized
+
+         Returns:
+             Tuple[str, List[int], str]: the summarized episode experience, token usage, and cost string
+         """
+         # Create reflection on the whole trajectory for the next trial, keeping earlier messages as exemplars
+         content, total_tokens, cost = self.episode_summarization_agent.execute_tool(
+             "episode_summarization", {"str_input": trajectory}
+         )
+
+         return content, total_tokens, cost
+
+     def summarize_narrative(self, trajectory: str) -> Tuple[str, List[int], str]:
+         """Summarize the narrative experience for lifelong learning reflection.
+
+         Args:
+             trajectory (str): The narrative experience to be summarized
+
+         Returns:
+             Tuple[str, List[int], str]: the summarized narrative experience, token usage, and cost string
+         """
+         # Create reflection on the whole trajectory for the next trial
+         content, total_tokens, cost = self.narrative_summarization_agent.execute_tool(
+             "narrative_summarization", {"str_input": trajectory}
+         )
+
+         return content, total_tokens, cost