PyPI - bioguider - Versions diffs - 0.2.3__py3-none-any.whl - Mend

bioguider 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of bioguider might be problematic. Click here for more details.

Files changed (47)

bioguider/__init__.py +0 -0
bioguider/agents/__init__.py +0 -0
bioguider/agents/agent_task.py +88 -0
bioguider/agents/agent_tools.py +147 -0
bioguider/agents/agent_utils.py +357 -0
bioguider/agents/collection_execute_step.py +180 -0
bioguider/agents/collection_observe_step.py +113 -0
bioguider/agents/collection_plan_step.py +154 -0
bioguider/agents/collection_task.py +179 -0
bioguider/agents/collection_task_utils.py +109 -0
bioguider/agents/common_agent.py +159 -0
bioguider/agents/common_agent_2step.py +126 -0
bioguider/agents/common_step.py +85 -0
bioguider/agents/dockergeneration_execute_step.py +186 -0
bioguider/agents/dockergeneration_observe_step.py +153 -0
bioguider/agents/dockergeneration_plan_step.py +158 -0
bioguider/agents/dockergeneration_task.py +158 -0
bioguider/agents/dockergeneration_task_utils.py +220 -0
bioguider/agents/evaluation_task.py +269 -0
bioguider/agents/identification_execute_step.py +179 -0
bioguider/agents/identification_observe_step.py +92 -0
bioguider/agents/identification_plan_step.py +135 -0
bioguider/agents/identification_task.py +220 -0
bioguider/agents/identification_task_utils.py +18 -0
bioguider/agents/peo_common_step.py +64 -0
bioguider/agents/prompt_utils.py +190 -0
bioguider/agents/python_ast_repl_tool.py +69 -0
bioguider/agents/rag_collection_task.py +130 -0
bioguider/conversation.py +67 -0
bioguider/database/summarized_file_db.py +140 -0
bioguider/managers/evaluation_manager.py +108 -0
bioguider/rag/__init__.py +0 -0
bioguider/rag/config.py +117 -0
bioguider/rag/data_pipeline.py +648 -0
bioguider/rag/embedder.py +24 -0
bioguider/rag/rag.py +134 -0
bioguider/settings.py +103 -0
bioguider/utils/constants.py +40 -0
bioguider/utils/default.gitignore +140 -0
bioguider/utils/file_utils.py +126 -0
bioguider/utils/gitignore_checker.py +175 -0
bioguider/utils/pyphen_utils.py +73 -0
bioguider/utils/utils.py +27 -0
bioguider-0.2.3.dist-info/LICENSE +21 -0
bioguider-0.2.3.dist-info/METADATA +44 -0
bioguider-0.2.3.dist-info/RECORD +47 -0
bioguider-0.2.3.dist-info/WHEEL +4 -0

bioguider/__init__.py ADDED Viewed

File without changes

bioguider/agents/__init__.py ADDED Viewed

File without changes

bioguider/agents/agent_task.py ADDED Viewed

@@ -0,0 +1,88 @@
+from typing import Callable
+from abc import ABC, abstractmethod
+from langchain_openai.chat_models.base import BaseChatOpenAI
+from langgraph.graph.graph import CompiledGraph
+from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
+from bioguider.database.summarized_file_db import SummarizedFilesDb
class AgentTask(ABC):
    """
    Abstract base for a task that is executed by streaming a compiled graph.

    Subclasses implement ``_compile``; ``_go_graph`` then streams
    ``self.graph`` (presumably assigned by ``_compile`` -- verify against the
    concrete subclasses) and returns the final state.
    """

    def __init__(self, llm: BaseChatOpenAI, step_callback: Callable | None = None):
        """
        Initialize the task with a language model and an optional callback.

        Args:
            llm (BaseChatOpenAI): The language model to use.
            step_callback (Callable | None): Invoked with the keyword
                arguments ``step_name``, ``step_output`` and ``token_usage``
                whenever a step is reported. ``None`` disables reporting.
        """
        self.llm = llm
        self.step_callback = step_callback
        self.summary_file_db = None
        self.graph: CompiledGraph | None = None

    def _print_step(
        self,
        step_name: str | None = None,
        step_output: str | None = None,
        token_usage: dict | object | None = None,
    ):
        """
        Report one step to ``step_callback`` (no-op when no callback is set).

        A non-dict ``token_usage`` object is converted with ``vars()`` and
        layered over ``DEFAULT_TOKEN_USAGE`` so missing counters get defaults.
        """
        if self.step_callback is None:
            return
        if token_usage is not None and not isinstance(token_usage, dict):
            token_usage = {**DEFAULT_TOKEN_USAGE, **vars(token_usage)}
        self.step_callback(
            step_name=step_name,
            step_output=step_output,
            token_usage=token_usage,
        )

    def compile(self, repo_path: str, gitignore_path: str, db: SummarizedFilesDb | None = None, **kwargs):
        """
        Store the summary database and delegate to the subclass ``_compile``.

        Args:
            repo_path (str): The path to the repository.
            gitignore_path (str): The path to the .gitignore file.
            db (SummarizedFilesDb | None): Optional summary cache, stored on
                ``self.summary_file_db`` before ``_compile`` runs.
            **kwargs: Forwarded unchanged to ``_compile``; their meaning is
                defined by the derived class.
        """
        self.summary_file_db = db
        self._compile(repo_path, gitignore_path, **kwargs)

    @abstractmethod
    def _compile(self, repo_path: str, gitignore_path: str, **kwargs):
        """
        Build the task for the given repository (implemented by subclasses).

        Args:
            repo_path (str): The path to the repository.
            gitignore_path (str): The path to the .gitignore file.
        """
        pass

    def _go_graph(self, input: dict) -> dict:
        """
        Stream the compiled graph and return the last state it emitted.

        Args:
            input (dict): Initial graph state. ``llm`` and
                ``step_output_callback`` entries are added on top of it.

        Returns:
            dict: The final streamed state, or the prepared initial state if
            the stream produced nothing.

        Raises:
            RuntimeError: If no graph has been assigned to ``self.graph``.
        """
        if self.graph is None:
            raise RuntimeError(
                "AgentTask graph is not compiled; call compile() before running the task"
            )
        graph_input = {
            **input,
            "llm": self.llm,
            "step_output_callback": self.step_callback,
        }
        # Start from the input so an empty stream cannot leave the result
        # unbound (the original returned the bare loop variable).
        last_state = graph_input
        for last_state in self.graph.stream(
            input=graph_input,
            stream_mode="values",
            config={"recursion_limit": 500},
        ):
            print(last_state)
        return last_state

bioguider/agents/agent_tools.py ADDED Viewed

@@ -0,0 +1,147 @@
+import os
+from typing import Callable
+from langchain_openai.chat_models.base import BaseChatOpenAI
+from bioguider.database.summarized_file_db import SummarizedFilesDb
+from bioguider.utils.file_utils import get_file_type
+from bioguider.agents.agent_utils import read_directory, read_file, summarize_file
class agent_tool:
    # Common base for tools that hold an LLM handle and optionally report
    # progress through a keyword-argument callback.
    def __init__(
        self,
        llm: BaseChatOpenAI | None = None,
        output_callback:Callable[[dict], None] = None,
    ):
        self.output_callback = output_callback
        self.llm = llm

    def _print_token_usage(self, token_usage: dict):
        """Pass ``token_usage`` to the callback, if one was supplied."""
        callback = self.output_callback
        if callback is None:
            return
        callback(token_usage=token_usage)

    def _print_step_output(self, step_output: str):
        """Pass ``step_output`` to the callback, if one was supplied."""
        callback = self.output_callback
        if callback is None:
            return
        callback(step_output=step_output)
+class read_file_tool:
+    """ read file
+Args:
+    file_path str: file path
+Returns:
+    A string of file content, if the file does not exist, return None.
+        """
+    def __init__(self, repo_path: str | None = None):
+        self.repo_path = repo_path if repo_path is not None else ""
+    def run(self, file_path: str) -> str | None:
+        if file_path is None:
+            return None
+        file_path = file_path.strip()
+        if self.repo_path is not None and self.repo_path not in file_path:
+            file_path = os.path.join(self.repo_path, file_path)
+        if not os.path.isfile(file_path):
+            return None
+        return read_file(file_path)
class summarize_file_tool(agent_tool):
    """ read and summarize the file
Args:
    file_path str: file path
Returns:
    A string of summarized file content, if the file does not exist, return None.
        """
    # NOTE(review): the class docstring is kept verbatim because it may be
    # surfaced to the LLM as the tool description -- confirm before rewording.

    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str | None = None,
        output_callback: Callable | None = None,
        detailed_level: int | None = 6,
        db: SummarizedFilesDb | None = None,
        summaize_instruction: str = "",
    ):
        """
        Args:
            llm: Model used to produce summaries.
            repo_path: Repository root used to resolve relative file paths.
            output_callback: Optional callback that receives token usage.
            detailed_level: Summary detail level; ``None`` means 6 and the
                value is clamped to the range 1..10.
            db: Optional cache of previously summarized files.
            summaize_instruction: Extra instruction text for the summary
                (parameter name kept as-is for backward compatibility).
        """
        super().__init__(llm=llm, output_callback=output_callback)
        self.repo_path = repo_path
        level = 6 if detailed_level is None else detailed_level
        self.detailed_level = min(max(level, 1), 10)
        self.summary_file_db = db
        self.summarize_instruction = summaize_instruction

    @staticmethod
    def _resolve_path(repo_path: str | None, path: str) -> str:
        """
        Return ``path`` anchored at ``repo_path`` unless it is already there.

        Replaces the original substring test (``repo_path not in path``),
        which skipped the join for any path that merely contained the repo
        name somewhere inside it.
        """
        if not repo_path or os.path.isabs(path):
            return path
        repo_norm = os.path.normpath(repo_path)
        path_norm = os.path.normpath(path)
        if path_norm == repo_norm or path_norm.startswith(repo_norm + os.sep):
            return path
        return os.path.join(repo_path, path)

    def _retrive_from_summary_file_db(self, file_path: str) -> str | None:
        """Look up a cached summary; returns ``None`` when no DB is configured."""
        if self.summary_file_db is None:
            return None
        return self.summary_file_db.select_summarized_text(
            file_path=file_path,
            instruction=self.summarize_instruction,
            summarize_level=self.detailed_level,
        )

    def _save_to_summary_file_db(self, file_path: str, summarized_text: str, token_usage: dict):
        """Store a freshly generated summary; no-op when no DB is configured."""
        if self.summary_file_db is None:
            return
        self.summary_file_db.upsert_summarized_file(
            file_path=file_path,
            instruction=self.summarize_instruction,
            summarize_level=self.detailed_level,
            summarized_text=summarized_text,
            token_usage=token_usage,
        )

    def run(self, file_path: str) -> str | None:
        """
        Summarize a file, using the cache when a summary already exists.

        Args:
            file_path (str): Path supplied by the caller; surrounding
                whitespace is stripped. The stripped (unresolved) path is the
                cache key and is what appears in the returned text.

        Returns:
            str | None: ``None`` for a ``None`` path, a "... is not a file."
            message when the resolved path is not a regular file, otherwise
            the summary prefixed with the file name.
        """
        if file_path is None:
            return None
        file_path = file_path.strip()
        abs_file_path = self._resolve_path(self.repo_path, file_path)
        if not os.path.isfile(abs_file_path):
            return f"{file_path} is not a file."

        cached = self._retrive_from_summary_file_db(file_path=file_path)
        if cached is not None:
            return f"summarized content of file {file_path}: " + cached

        file_content = read_file(abs_file_path)
        # Braces are doubled before the text is handed to the prompt builder.
        file_content = file_content.replace("{", "{{").replace("}", "}}")
        summarized_content, token_usage = summarize_file(
            self.llm, abs_file_path, file_content, self.detailed_level,
            summary_instructions=self.summarize_instruction,
        )
        self._save_to_summary_file_db(
            file_path=file_path,
            summarized_text=summarized_content,
            token_usage=token_usage,
        )
        self._print_token_usage(token_usage)
        return f"summarized content of file {file_path}: " + summarized_content
+class read_directory_tool:
+    """Reads the contents of a directory, including files and subdirectories in it..
+Args:
+    dir_path (str): Path to the directory.
+Returns:
+    a string containing file and subdirectory paths found within the specified depth.
+    """
+    def __init__(
+        self,
+        repo_path: str | None = None,
+        gitignore_path: str | None = None,
+    ):
+        self.repo_path = repo_path
+        self.gitignore_path = gitignore_path if gitignore_path is not None else ""
+    def run(self, dir_path):
+        dir_path = dir_path.strip()
+        full_path = dir_path
+        if full_path == "." or full_path == "..":
+            return f"Please skip this folder {dir_path}"
+        if self.repo_path not in full_path:
+            full_path = os.path.join(self.repo_path, full_path)
+        files = read_directory(full_path, gitignore_path=self.gitignore_path, level=1)
+        if files is None:
+            return "N/A"
+        file_pairs = [(f, get_file_type(os.path.join(full_path, f)).value) for f in files]
+        dir_structure = ""
+        for f, f_type in file_pairs:
+            dir_structure += f"{os.path.join(dir_path, f)} - {f_type}\n"
+        return f"The 2-level content of directory {dir_path}: \n" + \
+            f"{dir_structure if len(dir_structure) > 0 else 'No files and sub-directories in it'}"

bioguider/agents/agent_utils.py ADDED Viewed

@@ -0,0 +1,357 @@
+import json
+import os
+import re
+import subprocess
+from typing import List, Optional, Tuple, Union
+from langchain_openai import AzureChatOpenAI
+from langchain_deepseek import ChatDeepSeek
+from langchain_core.utils.interactive_env import is_interactive_env
+from langchain_core.messages.base import get_msg_title_repr
+from langchain_core.prompts import ChatPromptTemplate, StringPromptTemplate
+from langchain_core.messages import AIMessage
+from langchain_openai.chat_models.base import BaseChatOpenAI
+from langchain.tools import BaseTool
+from langchain.schema import AgentAction, AgentFinish
+from langchain.agents import AgentOutputParser
+from langgraph.prebuilt import create_react_agent
+import logging
+from pydantic import BaseModel, Field
+from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
+from bioguider.utils.file_utils import get_file_type
+from ..utils.gitignore_checker import GitignoreChecker
+from ..database.summarized_file_db import SummarizedFilesDb
+logger = logging.getLogger(__name__)
class PlanAgentResult(BaseModel):
    """ Identification Plan Result """
    # The docstring above is left untouched: pydantic exposes it as the
    # model description in the generated schema.
    # Each entry is a dict naming an action and the input to give it.
    actions: list[dict] = Field(
        description="a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]",
    )
# JSON-schema counterpart of PlanAgentResult: an object with one required
# "actions" array whose items are free-form objects.
PlanAgentResultJsonSchema = dict(
    title="identification_plan_result",
    description="plan result",
    type="object",
    properties=dict(
        actions=dict(
            type="array",
            description="a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]",
            title="Actions",
            items=dict(type="object"),
        ),
    ),
    required=["actions"],
)
def get_openai():
    """
    Build a chat model from environment variables via ``get_llm``.

    Reads ``OPENAI_API_KEY``, ``OPENAI_MODEL``, ``AZURE_OPENAI_ENDPOINT``,
    ``OPENAI_API_VERSION``, ``OPENAI_DEPLOYMENT_NAME`` and
    ``OPENAI_MAX_OUTPUT_TOKEN``.

    Returns:
        Whatever ``get_llm`` returns for those settings.

    Raises:
        ValueError: If ``OPENAI_MAX_OUTPUT_TOKEN`` is set but not an integer.
    """
    overrides = {}
    model_name = os.environ.get("OPENAI_MODEL")
    if model_name:
        # Only override when set: passing None would replace get_llm's
        # default model and crash on ``model_name.startswith``.
        overrides["model_name"] = model_name

    # Environment values are strings; convert explicitly. An unset variable
    # is still forwarded as None, exactly as before.
    raw_max_tokens = os.environ.get("OPENAI_MAX_OUTPUT_TOKEN")
    max_tokens = int(raw_max_tokens) if raw_max_tokens else None

    return get_llm(
        api_key=os.environ.get("OPENAI_API_KEY"),
        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
        api_version=os.environ.get("OPENAI_API_VERSION"),
        azure_deployment=os.environ.get("OPENAI_DEPLOYMENT_NAME"),
        max_tokens=max_tokens,
        **overrides,
    )
+def get_llm(
+    api_key: str,
+    model_name: str="gpt-4o",
+    azure_endpoint: str=None,
+    api_version: str=None,
+    azure_deployment: str=None,
+    temperature: float = 0.0,
+    max_tokens: int = 4096,
+):
+    if model_name.startswith("deepseek"):
+        chat = ChatDeepSeek(
+            api_key=api_key,
+            model=model_name,
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+    elif model_name.startswith("gpt"):
+        chat = AzureChatOpenAI(
+            api_key=api_key,
+            azure_endpoint=azure_endpoint,
+            api_version=api_version,
+            azure_deployment=azure_deployment,
+            model=model_name,
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+    else:
+        raise ValueError("Invalid model name")
+    # validate chat
+    try:
+        chat.invoke("Hi")
+    except Exception as e:
+        print(e)
+        return None
+    return chat
def pretty_print(message, printout = True):
    """
    Render a chat message as a titled text block and optionally print it.

    Tuples are returned unchanged and are never printed. For message objects
    the title line comes from ``get_msg_title_repr``; list content is
    rendered item by item ("text" and "tool_use" entries), any other content
    is appended verbatim after a blank line.

    Returns:
        The tuple that was passed in, or the rendered string.
    """
    if isinstance(message, tuple):
        return message

    body = message.content
    if isinstance(body, list):
        rendered = get_msg_title_repr(message.type.title().upper() + " Message", bold=is_interactive_env())
        if message.name is not None:
            rendered += f"\nName: {message.name}"
        for part in body:
            if part['type'] == 'text':
                rendered += f"\n{part['text']}\n"
            elif part['type'] == 'tool_use':
                rendered += f"\nTool: {part['name']}"
                rendered += f"\nInput: {part['input']}"
    else:
        rendered = get_msg_title_repr(message.type.title() + " Message", bold=is_interactive_env())
        if message.name is not None:
            rendered += f"\nName: {message.name}"
        rendered += f"\n\n{body}"

    if printout:
        print(f"{rendered}")
    return rendered
+HUGE_FILE_LENGTH = 10 * 1024 # 10K
+def read_file(
+    file_path: str,
+) -> str | None:
+    if not os.path.isfile(file_path):
+        return None
+    with open(file_path, 'r') as f:
+        content = f.read()
+        return content
def write_file(file_path: str, content: str) -> bool:
    """
    Write ``content`` to ``file_path`` as UTF-8, replacing any existing file.

    UTF-8 is used explicitly so the result does not depend on the platform
    locale (non-ASCII text could otherwise fail to encode on some systems).

    Args:
        file_path: Destination path.
        content: Text to write.

    Returns:
        ``True`` on success; ``False`` if anything went wrong (the error is
        logged, not raised).
    """
    try:
        with open(file_path, "w", encoding="utf-8") as fobj:
            fobj.write(content)
    except Exception as e:
        # Best-effort write: callers only see the boolean outcome.
        logger.error(e)
        return False
    return True
+def read_directory(
+    dir_path: str,
+    gitignore_path: str,
+    level: int=1,
+) -> list[str] | None:
+    if not os.path.isdir(dir_path):
+        return None
+    gitignore_checker = GitignoreChecker(
+        directory=dir_path,
+        gitignore_path=gitignore_path
+    )
+    files = gitignore_checker.check_files_and_folders(level=level)
+    return files
# Prompt used by summarize_file(); the template text is runtime data and is
# reproduced unchanged.
EVALUATION_SUMMARIZE_FILE_PROMPT = ChatPromptTemplate.from_template("""
You are provided with the content of the file **{file_name}**:
```
{file_content}
```
### **Summary Instructions**
{summary_instructions}
The content is lengthy. Please generate a concise summary ({sentence_num1}-{sentence_num2} sentences).
""")
MAX_FILE_LENGTH=20 *1024 # 20K
MAX_SENTENCE_NUM=20
def summarize_file(
    llm: BaseChatOpenAI,
    name: str,
    content: str | None = None,
    level: int = 3,
    summary_instructions: str | None = None,
    db: SummarizedFilesDb | None = None,
) -> Tuple[str, dict]:
    """
    Summarize a file with the LLM, optionally caching the result in ``db``.

    Args:
        llm: Chat model used to produce the summary.
        name: File name; also the path that is read when ``content`` is None.
        content: File text. Read from ``name`` when omitted.
        level: Requested sentence count, clamped to 1..MAX_SENTENCE_NUM. The
            prompt asks for ``level`` to ``level + 1`` sentences.
        summary_instructions: Extra instructions; "N/A" is sent when empty.
        db: Optional cache consulted before, and updated after, the LLM call.

    Returns:
        ``(summary, token_usage)``. On a read failure the summary is ``""``
        and the usage is a copy of ``DEFAULT_TOKEN_USAGE``.
    """
    if content is None:
        try:
            with open(name, "r", encoding="utf-8", errors="replace") as fobj:
                content = fobj.read()
        except Exception as e:
            logger.error(e)
            # Keep the documented tuple shape so callers can always unpack.
            return "", {**DEFAULT_TOKEN_USAGE}

    # Clamp before touching the cache so lookups and stores use the same key.
    level = level if level > 0 else 1
    level = level if level < MAX_SENTENCE_NUM+1 else MAX_SENTENCE_NUM

    if db is not None:
        cached = db.select_summarized_text(
            file_path=name,
            instruction=summary_instructions,
            summarize_level=level,
        )
        if cached is not None:
            return cached, {**DEFAULT_TOKEN_USAGE}

    file_content = content
    if len(file_content) > MAX_FILE_LENGTH:
        # Bound the prompt size; the ellipsis marks the truncation.
        file_content = content[:MAX_FILE_LENGTH] + " ..."
    prompt = EVALUATION_SUMMARIZE_FILE_PROMPT.format(
        file_name=name,
        file_content=file_content,
        sentence_num1=level,
        sentence_num2=level+1,
        summary_instructions=summary_instructions \
            if summary_instructions is not None and len(summary_instructions) > 0 \
            else "N/A",
    )
    config = {"recursion_limit": 500}
    res: AIMessage = llm.invoke([("human", prompt)], config=config)
    out = res.content
    # usage_metadata may be absent for some providers; report zeros then.
    usage = getattr(res, "usage_metadata", None) or {}
    token_usage = {
        "prompt_tokens": usage.get("input_tokens", 0),
        "completion_tokens": usage.get("output_tokens", 0),
        "total_tokens": usage.get("total_tokens", 0),
    }
    if db is not None:
        # The summary text was previously omitted here, which shifted
        # token_usage into the summarized_text position.
        db.upsert_summarized_file(
            file_path=name,
            instruction=summary_instructions,
            summarize_level=level,
            summarized_text=out,
            token_usage=token_usage,
        )
    return out, token_usage
def increase_token_usage(
    token_usage: Optional[dict] = None,
    incremental: Optional[dict] = None,
):
    """
    Add the counters in ``incremental`` to ``token_usage`` in place.

    Args:
        token_usage: Running totals; a fresh copy of ``DEFAULT_TOKEN_USAGE``
            is used when omitted.
        incremental: Counters to add; ``DEFAULT_TOKEN_USAGE`` is used when
            omitted. (The default is now ``None`` rather than a dict built
            at import time.)

    Returns:
        The updated ``token_usage`` dict (the same object that was passed in).
        Counters missing from either dict are treated as 0.
    """
    if token_usage is None:
        token_usage = {**DEFAULT_TOKEN_USAGE}
    if incremental is None:
        incremental = DEFAULT_TOKEN_USAGE
    for key in ("total_tokens", "completion_tokens", "prompt_tokens"):
        token_usage[key] = token_usage.get(key, 0) + incremental.get(key, 0)
    return token_usage
+  # Set up a prompt template
class CustomPromptTemplate(StringPromptTemplate):
    # Raw template text that is filled in by format().
    template: str
    # Tools whose names and descriptions are injected into the prompt.
    tools: List[BaseTool]
    # Plan text injected as the "plan_actions" variable.
    plan_actions: str

    def format(self, **kwargs) -> str:
        """
        Fill the template from ``kwargs`` plus values derived from this object.

        ``intermediate_steps`` (pairs of action and observation) is removed
        from ``kwargs`` and rendered into ``agent_scratchpad``; ``plan_actions``,
        ``tools`` and ``tool_names`` are set from the instance fields.
        """
        steps = kwargs.pop("intermediate_steps")
        scratchpad = "".join(
            action.log + f"\nObservation: {observation}\n"
            for action, observation in steps
        )
        tool_lines = [f"{tool.name}: {tool.description}" for tool in self.tools]
        names = [tool.name for tool in self.tools]
        kwargs.update(
            plan_actions=self.plan_actions,
            agent_scratchpad=scratchpad,
            tools="\n".join(tool_lines),
            tool_names=", ".join(names),
        )
        return self.template.format(**kwargs)
class CustomOutputParser(AgentOutputParser):
    # Turns raw LLM text into either a finish signal or a tool invocation.

    @staticmethod
    def _parse_action_input(raw: str):
        """
        Convert the text after "Action Input:" into a dict when possible.

        Tries JSON first (with single quotes swapped for double quotes), then
        a Python literal. Only dict results are accepted from either path;
        anything else returns the cleaned string. The JSON path previously
        let ints, lists and booleans through as the tool input.
        """
        import ast

        cleaned = raw.strip(" ").strip('"')
        try:
            parsed = json.loads(cleaned.replace("'", '"'))
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        try:
            parsed = ast.literal_eval(cleaned)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal either: fall back to the plain string.
            parsed = None
        return parsed if isinstance(parsed, dict) else cleaned

    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        """
        Parse one LLM response.

        Returns:
            ``AgentFinish`` when the text contains "Final Answer:" or no
            action can be extracted (a warning is printed in the latter
            case); otherwise an ``AgentAction`` whose ``tool_input`` is a
            dict if the input parses as one, else the cleaned input string.
        """
        if "Final Answer:" in llm_output:
            return AgentFinish(
                return_values={"output": llm_output},
                log=llm_output,
            )
        # "Action[ N]: <tool>" followed by "Action[ N] Input[ N]: <input>".
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            # Unparseable output ends the chain instead of raising.
            print(f"Warning: could not parse LLM output: `{llm_output}`, finishing chain...")
            return AgentFinish(
                return_values={"output": llm_output},
                log=llm_output,
            )
        action = match.group(1).strip()
        return AgentAction(
            tool=action,
            tool_input=self._parse_action_input(match.group(2)),
            log=llm_output
        )
def get_tool_names_and_descriptions(tools: List[BaseTool]) -> Tuple[str, str]:
    """
    Describe the given tools for use in a prompt.

    Returns:
        A pair ``(names, descriptions)``: ``names`` is the ``str()`` of the
        list of tool names, ``descriptions`` has one
        "name: ..., description: ..." line per tool. (The annotation
        previously claimed a single ``str``.)
    """
    tool_names = [tool.name for tool in tools]
    tools_descriptions = "".join(
        f"name: {tool.name}, description: {tool.description}\n" for tool in tools
    )
    return str(tool_names), tools_descriptions
def generate_repo_structure_prompt(
    files: List[str],
    dir_path: str="",
) -> str:
    """
    Render a file listing as "<path> - <type>" lines.

    Args:
        files: Paths to list, exactly as they should appear in the output.
        dir_path: Prefix joined to each path before its type is determined.

    Returns:
        One newline-terminated line per file; "" for an empty list.
    """
    typed_entries = [
        (path, get_file_type(os.path.join(dir_path, path)).value)
        for path in files
    ]
    return "".join(f"{path} - {kind}\n" for path, kind in typed_entries)
class ObservationResult(BaseModel):
    # Structured result of an observation step. No class docstring is added
    # on purpose: pydantic would expose it as the schema description.
    Analysis: Optional[str] = Field(
        description="Analyzing the goal, repository file structure and intermediate output.",
    )
    FinalAnswer: Optional[str] = Field(
        description="the final answer for the goal",
    )
    Thoughts: Optional[str] = Field(
        description="If the information is insufficient, the thoughts will be given and be taken into consideration in next round.",
    )
def convert_plan_to_string(plan: PlanAgentResult) -> str:
    """
    Render a plan as text: a "Step: <name>" line followed by a
    "Step Input: <input>" line for every action, in order.
    """
    return "".join(
        f"Step: {action['name']}\n" + f"Step Input: {action['input']}\n"
        for action in plan.actions
    )
def _decode_captured(value) -> str:
    """Return captured process output as text; ``None`` becomes ""."""
    if value is None:
        return ""
    if isinstance(value, bytes):
        import locale

        return value.decode(locale.getpreferredencoding(False), errors="replace")
    return value


def run_command(command: list, cwd: str = None, timeout: int = None):
    """
    Run a command with optional timeout and return stdout, stderr, and return code.

    Args:
        command: Program and arguments, passed to ``subprocess.run`` as a list.
        cwd: Working directory for the child process.
        timeout: Seconds to wait before the child is killed; ``None`` waits
            indefinitely.

    Returns:
        ``(stdout, stderr, returncode)`` with both streams as ``str``. On
        timeout the return code is ``-1`` and stderr falls back to a
        "Command timed out ..." message when nothing was captured.
    """
    try:
        result = subprocess.run(
            command,
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout
        )
    except subprocess.TimeoutExpired as e:
        # TimeoutExpired carries raw bytes even when text=True was requested,
        # so decode here to keep the return types consistent.
        stdout = _decode_captured(e.stdout)
        stderr = _decode_captured(e.stderr)
        return stdout, stderr or f"Command timed out after {timeout} seconds", -1
    return result.stdout, result.stderr, result.returncode
def escape_braces(text: str) -> str:
    """
    Double every brace that stands alone so the text is safe in a
    format-style template.

    A brace counts as alone when it has no identical brace directly before
    or after it; runs of two or more identical braces are left unchanged.
    Closing braces are processed first, then opening braces.
    """
    lone_close = r'(?<!})}(?!})'
    lone_open = r'(?<!{){(?!{)'
    closed = re.sub(lone_close, '}}', text)
    return re.sub(lone_open, '{{', closed)