PyPI - bioguider - Versions diffs - 0.2.3__py3-none-any.whl - Mend

bioguider 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of bioguider might be problematic. Click here for more details.

Files changed (47)

bioguider/__init__.py +0 -0
bioguider/agents/__init__.py +0 -0
bioguider/agents/agent_task.py +88 -0
bioguider/agents/agent_tools.py +147 -0
bioguider/agents/agent_utils.py +357 -0
bioguider/agents/collection_execute_step.py +180 -0
bioguider/agents/collection_observe_step.py +113 -0
bioguider/agents/collection_plan_step.py +154 -0
bioguider/agents/collection_task.py +179 -0
bioguider/agents/collection_task_utils.py +109 -0
bioguider/agents/common_agent.py +159 -0
bioguider/agents/common_agent_2step.py +126 -0
bioguider/agents/common_step.py +85 -0
bioguider/agents/dockergeneration_execute_step.py +186 -0
bioguider/agents/dockergeneration_observe_step.py +153 -0
bioguider/agents/dockergeneration_plan_step.py +158 -0
bioguider/agents/dockergeneration_task.py +158 -0
bioguider/agents/dockergeneration_task_utils.py +220 -0
bioguider/agents/evaluation_task.py +269 -0
bioguider/agents/identification_execute_step.py +179 -0
bioguider/agents/identification_observe_step.py +92 -0
bioguider/agents/identification_plan_step.py +135 -0
bioguider/agents/identification_task.py +220 -0
bioguider/agents/identification_task_utils.py +18 -0
bioguider/agents/peo_common_step.py +64 -0
bioguider/agents/prompt_utils.py +190 -0
bioguider/agents/python_ast_repl_tool.py +69 -0
bioguider/agents/rag_collection_task.py +130 -0
bioguider/conversation.py +67 -0
bioguider/database/summarized_file_db.py +140 -0
bioguider/managers/evaluation_manager.py +108 -0
bioguider/rag/__init__.py +0 -0
bioguider/rag/config.py +117 -0
bioguider/rag/data_pipeline.py +648 -0
bioguider/rag/embedder.py +24 -0
bioguider/rag/rag.py +134 -0
bioguider/settings.py +103 -0
bioguider/utils/constants.py +40 -0
bioguider/utils/default.gitignore +140 -0
bioguider/utils/file_utils.py +126 -0
bioguider/utils/gitignore_checker.py +175 -0
bioguider/utils/pyphen_utils.py +73 -0
bioguider/utils/utils.py +27 -0
bioguider-0.2.3.dist-info/LICENSE +21 -0
bioguider-0.2.3.dist-info/METADATA +44 -0
bioguider-0.2.3.dist-info/RECORD +47 -0
bioguider-0.2.3.dist-info/WHEEL +4 -0

bioguider/agents/rag_collection_task.py ADDED Viewed

@@ -0,0 +1,130 @@
+import os
+from adalflow import Document
+from langchain_core.prompts import ChatPromptTemplate
+from pydantic import BaseModel, Field
+from .common_agent_2step import CommonAgentTwoSteps
+from ..rag.rag import RAG
+# Prompt template filled in by RAGCollectionTaskItem.collect(): `{query}` receives
+# the user's query and `{documents}` a newline-joined " - <text>" list of the
+# documents in the current batch. The model is asked for one boolean per document.
+RAG_COLLECT_SYSTEM_PROMPT = ChatPromptTemplate.from_template("""
+You are an expert in repository documents retrieval and collection.
+Your task is to collect relevant documents based on the user's query using the RAG system.
+Here is the user's query:
+{query}
+The following are the documents extracted from the RAG system:
+{documents}
+Please analyze the documents one by one and determine which ones are relevant to the user's query.
+Return a list of boolean values indicating the relevance of each document. Output example:
+[True, False, True, ...]  # True if the document is relevant, False otherwise
+""")
+# Pydantic model used by RAGCollectionTaskItem.collect() to validate the dict
+# returned by the agent (`RAGCollectResult(**res)`).
+# NOTE: RAGCollectResultSchema repeats this docstring and the field descriptions
+# verbatim; keep the two in sync when editing either.
+class RAGCollectResult(BaseModel):
+    """
+    Represents the result of a RAG collection task.
+    Attributes:
+        query (str): The user's query.
+        documents (list): List of documents retrieved from the RAG system.
+        relevance (list): List of boolean values indicating the relevance of each document.
+    """
+    query: str = Field(..., description="The user's query")
+    documents: list[str] = Field(..., description="List of documents retrieved from the RAG system")
+    relevance: list[bool] = Field(..., description="List of boolean values indicating the relevance of each document")
+# JSON-schema style description of RAGCollectResult, passed as `schema=` to
+# CommonAgentTwoSteps.go() in RAGCollectionTaskItem.collect().
+# NOTE(review): presumably a hand-copied dump of the model's JSON schema — confirm
+# and consider generating it from the model so the two cannot drift.
+RAGCollectResultSchema = {
+  'description': "Represents the result of a RAG collection task.\n\nAttributes:\n    query (str): The user's query.\n    documents (list): List of documents retrieved from the RAG system.\n    relevance (list): List of boolean values indicating the relevance of each document.",
+  'properties': {
+    'query': {'description': "The user's query", 'title': 'Query', 'type': 'string'},
+    'documents': {'description': 'List of documents retrieved from the RAG system', 'items': {'type': 'string'}, 'title': 'Documents', 'type': 'array'},
+    'relevance': {'description': 'List of boolean values indicating the relevance of each document', 'items': {'type': 'boolean'}, 'title': 'Relevance', 'type': 'array'}
+  },
+  # All three properties are mandatory in the agent's structured answer.
+  'required': [
+    'query', 'documents', 'relevance'
+  ],
+  'title': 'RAGCollectResult',
+  'type': 'object'
+}
+class RAGCollectionTaskItem:
+    def __init__(self, llm, rag: RAG, step_callback, batch_size: int = 5):
+        """
+        Initialize the RAGCollectionTaskItem with a repository URL or local path.
+        Args:
+            rag: An instance of the RAG class
+        """
+        self.llm = llm
+        self.rag = rag
+        self.batch_size = batch_size
+        self.step_callback = step_callback
+    def collect(self, query: str, rag_documents: list[Document]) -> list[Document]:
+        relevant_documents = []
+        for i in range(0, len(rag_documents), self.batch_size):
+            contents = [' - ' + doc.text for doc in rag_documents[i:i + self.batch_size]]
+            documents_text = "\n".join(contents)
+            prompt = RAG_COLLECT_SYSTEM_PROMPT.format(query=query, documents=documents_text)
+            prompt = prompt.replace("{", "{{").replace("}", "}}")  # Escape curly braces for LangChain
+            agent = CommonAgentTwoSteps(llm=self.llm)
+            res, _, token_usage, reasoning = agent.go(
+                system_prompt=prompt,
+                instruction_prompt="Please analyze the documents and determine their relevance to the query.",
+                schema=RAGCollectResultSchema,
+            )
+            self.step_callback(
+                step_output=f"**Reasoning Process**: {reasoning}\n",
+            )
+            self.step_callback(
+                step_output=f"**RAG Collection Result**: {res}",
+            )
+            self.step_callback(
+                token_usage=token_usage,
+            )
+            res = RAGCollectResult(**res)
+            relevants = self._collect_documents(
+                rag_documents[i:i + self.batch_size],
+                res.relevance
+            )
+            relevant_documents.extend(relevants)
+        return relevant_documents
+    def _collect_documents(self, docs: list[Document], relevants: list[bool]) -> list[Document]:
+        """
+        Collect documents based on relevance.
+        Args:
+            docs: List of documents to filter
+            relevants: List of boolean values indicating relevance
+        Returns:
+            List of relevant documents
+        """
+        return [doc for doc, relevant in zip(docs, relevants) if relevant]
+class RAGCollectionTask:
+    def __init__(self, rag: RAG):
+        """
+        Initialize the RAGCollectionTask with a repository URL or local path.
+        Args:
+            repo_url_or_path: URL or local path to the repository
+            access_token: Optional access token for private repositories
+        """
+        self.rag = rag
+    def query(self, query: str) -> list:
+        """
+        Process a query using RAG.
+        Args:
+            query: The user's query
+        Returns:
+            retrieved_documents: List of documents retrieved based on the query
+        """
+        return self.rag.query_doc(query)

bioguider/conversation.py ADDED Viewed

@@ -0,0 +1,67 @@
+from abc import ABC, abstractmethod
+from langchain_core.messages import BaseMessage
+from langchain_deepseek import ChatDeepSeek
+from openai import AuthenticationError
+from pydantic import PositiveFloat, PositiveInt
+class Conversation(ABC):
+    def __init__(self):
+        super().__init__()
+    @abstractmethod
+    def chat(
+        question: str,
+        messages: list[BaseMessage] = None
+    ):
+        """ chat with LLM """
+class DeepSeekConversation(Conversation):
+    chatter: ChatDeepSeek | None = None
+    model: str = "deepseek-chat"
+    temperature: PositiveFloat = 0.1
+    request_timeout: PositiveInt = 60
+    base_url: str = "https://api.deepseek.com/v1"
+    max_retries: PositiveInt = 3
+    api_key: str | None = None
+    def __init__(
+        self,
+    ):
+        super().__init__()
+    def set_api_key(self, key: str):
+        try:
+            self.chatter = ChatDeepSeek(
+                model=self.model,
+                api_key=key,
+                temperature=self.temperature,
+                max_retries=self.max_retries,
+                timeout=self.request_timeout,
+                base_url=self.base_url,
+            )
+            # verify chat
+            ai_msg = self.chatter.invoke(
+                 [("system", "Hi")]
+            )
+            return True
+        except AuthenticationError as e:
+            self.chatter = None
+            return False
+    def chat(
+        self,
+        question: str,
+        messages: list[BaseMessage] = None
+    ):
+        msgs = messages + [("user", question)] if messages is not None else \
+               [("user", question)]
+        try:
+            res_msg = self.chatter.invoke(msgs)
+            return res_msg
+        except Exception as e:
+            return str(e)

bioguider/database/summarized_file_db.py ADDED Viewed

@@ -0,0 +1,140 @@
+import sqlite3
+from sqlite3 import Connection
+import os
+from time import strftime
+from typing import Optional
+import logging
+from string import Template
+import json
+from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
+logging = logging.getLogger(__name__)
+SUMMARIZED_FILES_TABLE_NAME = "SummarizedFiles"
+summarized_files_create_table_query = f"""
+CREATE TABLE IF NOT EXISTS {SUMMARIZED_FILES_TABLE_NAME} (
+    file_path VARCHAR(512),
+    instruction TEXT,
+    summarize_level INTEGER,
+    summarized_text TEXT,
+    token_usage  VARCHAR(512),
+    datetime TEXT NOT NULL DEFAULT (strftime('%Y-%m-%d %H:%M:%f', 'now')),
+    UNIQUE (file_path, instruction, summarize_level)
+);
+"""
+summarized_files_upsert_query = f"""
+INSERT INTO {SUMMARIZED_FILES_TABLE_NAME}(file_path, instruction, summarize_level, summarized_text, token_usage, datetime)
+VALUES (?, ?, ?, ?, ?, strftime('%Y-%m-%d %H:%M:%f', 'now'))
+ON CONFLICT(file_path, instruction, summarize_level) DO UPDATE SET summarized_text=excluded.summarized_text,
+datetime=strftime('%Y-%m-%d %H:%M:%f', 'now');
+"""
+summarized_files_select_query = f"""
+SELECT summarized_text, datetime FROM {SUMMARIZED_FILES_TABLE_NAME}
+where file_path = ? and instruction = ? and summarize_level = ?;
+"""
+class SummarizedFilesDb:
+    def __init__(self, author: str, repo_name: str):
+        self.author = author
+        self.repo_name = repo_name
+        self.connection: Connection | None = None
+    def _ensure_tables(self) -> bool:
+        if self.connection is None:
+            return False
+        try:
+            cursor = self.connection.cursor()
+            cursor.execute(
+                summarized_files_create_table_query
+            )
+            self.connection.commit()
+            return True
+        except Exception as e:
+            logging.error(e)
+            return False
+    def _connect_to_db(self) -> bool:
+        if self.connection is not None:
+            return True
+        db_path = os.environ.get("DATA_FOLDER", "./data")
+        db_path = os.path.join(db_path, "databases")
+        # Ensure the local path exists
+        try:
+            os.makedirs(db_path, exist_ok=True)
+        except Exception as e:
+            logging.error(e)
+            return False
+        db_path = os.path.join(db_path, f"{self.author}_{self.repo_name}.db")
+        if not os.path.exists(db_path):
+            try:
+                with open(db_path, "w"):
+                    pass
+            except Exception as e:
+                logging.error(e)
+                return False
+        self.connection = sqlite3.connect(db_path)
+        return True
+    def upsert_summarized_file(
+        self,
+        file_path: str,
+        instruction: str,
+        summarize_level: int,
+        summarized_text: str,
+        token_usage: dict | None = None
+    ):
+        token_usage = token_usage if token_usage is not None else {**DEFAULT_TOKEN_USAGE}
+        token_usage = json.dumps(token_usage)
+        res = self._connect_to_db()
+        assert res
+        res = self._ensure_tables()
+        assert res
+        try:
+            cursor = self.connection.cursor()
+            cursor.execute(
+                summarized_files_upsert_query,
+                (file_path, instruction, summarize_level, summarized_text, token_usage, )
+            )
+            self.connection.commit()
+            return True
+        except Exception as e:
+            logging.error(e)
+            return False
+        finally:
+            self.connection.close()
+            self.connection = None
+    def select_summarized_text(
+        self,
+        file_path: str,
+        instruction: str,
+        summarize_level: int,
+    ) -> str | None:
+        self._connect_to_db()
+        self._ensure_tables()
+        try:
+            cursor = self.connection.cursor()
+            cursor.execute(
+                summarized_files_select_query,
+                (file_path, instruction, summarize_level,)
+            )
+            row = cursor.fetchone()
+            if row is None:
+                return None
+            return row[0]
+        except Exception as e:
+            logging.error(e)
+            return None
+        finally:
+            self.connection.close()
+            self.connection = None
+    def get_db_file(self):
+        db_path = os.environ.get("DATA_FOLDER", "./data")
+        db_path = os.path.join(db_path, f"{self.author}_{self.repo_name}.db")
+        return db_path

bioguider/managers/evaluation_manager.py ADDED Viewed

@@ -0,0 +1,108 @@
+import os
+from pathlib import Path
+from bioguider.agents.prompt_utils import CollectionGoalItemEnum
+from bioguider.utils.constants import ProjectMetadata
+from bioguider.utils.gitignore_checker import GitignoreChecker
+from ..agents.identification_task import IdentificationTask
+from ..rag.rag import RAG
+from ..utils.file_utils import parse_repo_url
+from ..database.summarized_file_db import SummarizedFilesDb
+from ..agents.evaluation_task import EvaluationREADMETask
+from ..agents.collection_task import CollectionTask
+class EvaluationManager:
+    def __init__(self, llm, step_callback):
+        self.rag = None
+        self.llm = llm
+        self.step_callback = step_callback
+        self.repo_url: str | None = None
+        self.project_metadata: ProjectMetadata | None = None
+    def prepare_repo(self, repo_url: str):
+        self.repo_url = repo_url
+        self.rag = RAG()
+        self.rag.initialize_db_manager()
+        self.rag.prepare_retriever(repo_url_or_path=repo_url)
+        author, repo_name = parse_repo_url(repo_url)
+        self.summary_file_db = SummarizedFilesDb(author, repo_name)
+    def identify_project(self) -> ProjectMetadata:
+        repo_path = self.rag.repo_dir
+        gitignore_path = Path(repo_path, ".gitignore")
+        identfication_task = IdentificationTask(
+            llm=self.llm,
+            step_callback=self.step_callback,
+        )
+        identfication_task.compile(
+            repo_path=repo_path,
+            gitignore_path=gitignore_path,
+            db=self.summary_file_db,
+        )
+        language = identfication_task.identify_primary_language()
+        project_type = identfication_task.identify_project_type()
+        meta_data = identfication_task.identify_meta_data()
+        self.project_metadata = ProjectMetadata(
+            url=self.repo_url,
+            project_type=project_type,
+            primary_language=language,
+            repo_name=meta_data["name"] if "name" in meta_data else "",
+            description=meta_data["description"] if "description" in meta_data else "",
+            owner=meta_data["owner"] if "owner" in meta_data else "",
+            license=meta_data["license"] if "license" in meta_data else "",
+        )
+        return self.project_metadata
+    def evaluate_readme(self):
+        task = EvaluationREADMETask(
+            llm=self.llm,
+            repo_path=self.rag.repo_dir,
+            gitignore_path=Path(self.rag.repo_dir, ".gitignore"),
+            meta_data=self.project_metadata,
+            step_callback=self.step_callback,
+        )
+        readme_files = self._find_readme_files()
+        results = task.evaluate(readme_files)
+        return results
+    def evaluate_tutorial(self):
+        task = CollectionTask(
+            llm=self.llm,
+            step_callback=self.step_callback,
+        )
+        task.compile(
+            repo_path=self.rag.repo_dir,
+            gitignore_path=Path(self.rag.repo_dir, ".gitignore"),
+            db=self.summary_file_db,
+            goal_item=CollectionGoalItemEnum.Tutorial.name,
+        )
+        s = task.collect()
+        if s is None or 'final_answer' not in s:
+            return None
+    def _find_readme_files(self) -> list[str]:
+        """
+        Search for a README file in the repository directory.
+        """
+        possible_readme_files = [
+            "readme.md",
+            "readme.rst",
+            "readme.txt",
+            "readme",
+        ]
+        repo_path = self.rag.repo_dir
+        gitignore_path = Path(repo_path, ".gitignore")
+        gitignore_checker = GitignoreChecker(
+            directory=self.repo_path, gitignore_path=gitignore_path
+        )
+        found_readme_files = gitignore_checker.check_files_and_folders(
+            check_file_cb=lambda root_dir, relative_path: Path(relative_path).name.lower() in possible_readme_files,
+        )
+        return found_readme_files

bioguider/rag/__init__.py ADDED Viewed

File without changes

bioguider/rag/config.py ADDED Viewed

@@ -0,0 +1,117 @@
+import os
+from typing import List
+from adalflow import GoogleGenAIClient
+from adalflow.components.model_client.openai_client import OpenAIClient
+from adalflow.components.model_client.azureai_client import AzureAIClient
+# Default directory patterns to exclude, written relative to the repository root.
+# Differs from configs["file_filters"]["excluded_dirs"] only by also listing "./docs/".
+DEFAULT_EXCLUDED_DIRS: List[str] = [
+    # Virtual environments and package managers
+    "./.venv/", "./venv/", "./env/", "./virtualenv/",
+    "./node_modules/", "./bower_components/", "./jspm_packages/",
+    # Version control
+    "./.git/", "./.svn/", "./.hg/", "./.bzr/",
+    # Cache and compiled files
+    "./__pycache__/", "./.pytest_cache/", "./.mypy_cache/", "./.ruff_cache/", "./.coverage/",
+    # Build and distribution
+    "./dist/", "./build/", "./out/", "./target/", "./bin/", "./obj/",
+    # Documentation
+    "./docs/", "./_docs/", "./site-docs/", "./_site/",
+    # IDE specific
+    "./.idea/", "./.vscode/", "./.vs/", "./.eclipse/", "./.settings/",
+    # Logs and temporary files
+    "./logs/", "./log/", "./tmp/", "./temp/",
+]
+# Default file exclusions: intentionally empty here.
+DEFAULT_EXCLUDED_FILES: List[str] = [
+]
+# Static configuration for the RAG pipeline, grouped by component.
+# get_embedder_config() returns the "embedder" entry of this mapping.
+configs = {
+    "embedder": {
+        "batch_size": 500,
+        # The client class itself, not an instance.
+        "model_client": OpenAIClient,
+        # Same values create_model_kwargs() returns for the non-Azure case.
+        "model_kwargs": {
+            "model": "text-embedding-3-small",
+            "dimensions": 256,
+            "encoding_format": "float",
+        },
+    },
+    "retriever": {
+        "top_k": 20,
+    },
+    "generator": {
+        # The client class itself, not an instance.
+        "model_client": GoogleGenAIClient,
+        "model_kwargs": {
+            "model": "gemini-2.5-flash-preview-04-17",
+            "temperature": 0.7,
+            "top_p": 0.8,
+        },
+    },
+    "text_splitter": {
+        "split_by": "word",
+        "chunk_size": 350,
+        "chunk_overlap": 100,
+    },
+    "file_filters": {
+        # Same entries as DEFAULT_EXCLUDED_DIRS except that "./docs/" is not listed.
+        "excluded_dirs": [
+            "./.venv/", "./venv/", "./env/", "./virtualenv/",
+            "./node_modules/", "./bower_components/", "./jspm_packages/",
+            "./.git/", "./.svn/", "./.hg/", "./.bzr/",
+            "./__pycache__/", "./.pytest_cache/", "./.mypy_cache/", "./.ruff_cache/", "./.coverage/",
+            "./dist/", "./build/", "./out/", "./target/", "./bin/", "./obj/",
+            "./_docs/", "./site-docs/", "./_site/",
+            "./.idea/", "./.vscode/", "./.vs/", "./.eclipse/", "./.settings/",
+            "./logs/", "./log/", "./tmp/", "./temp/",
+        ],
+        # Mix of exact file names and glob-style patterns ("*.ext").
+        "excluded_files": [
+            "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "npm-shrinkwrap.json",
+            "poetry.lock", "Pipfile.lock", "requirements.txt.lock", "Cargo.lock", "composer.lock",
+            ".lock", ".DS_Store", "Thumbs.db", "desktop.ini", "*.lnk",
+            ".env", ".env.*", "*.env", "*.cfg", "*.ini", ".flaskenv",
+            ".gitignore", ".gitattributes", ".gitmodules", ".github", ".gitlab-ci.yml",
+            ".prettierrc", ".eslintrc", ".eslintignore", ".stylelintrc", ".editorconfig",
+            ".jshintrc", ".pylintrc", ".flake8", "mypy.ini", "pyproject.toml",
+            "tsconfig.json", "webpack.config.js", "babel.config.js", "rollup.config.js",
+            "jest.config.js", "karma.conf.js", "vite.config.js", "next.config.js",
+            "*.min.js", "*.min.css", "*.bundle.js", "*.bundle.css",
+            "*.map", "*.gz", "*.zip", "*.tar", "*.tgz", "*.rar",
+            "*.pyc", "*.pyo", "*.pyd", "*.so", "*.dll", "*.class", "*.exe", "*.o", "*.a",
+            "*.jpg", "*.jpeg", "*.png", "*.gif", "*.ico", "*.svg", "*.webp",
+            "*.mp3", "*.mp4", "*.wav", "*.avi", "*.mov", "*.webm",
+            "*.csv", "*.tsv", "*.xls", "*.xlsx", "*.db", "*.sqlite", "*.sqlite3",
+            "*.pdf", "*.docx", "*.pptx",
+        ],
+    },
+    "repository": {
+        # Maximum repository size in MB
+        "size_limit_mb": 50000,
+    },
+}
+def get_embedder_config():
+    return configs["embedder"]
+def create_model_client():
+    openai_type = os.environ.get("OPENAI_API_TYPE")
+    is_azure = openai_type == "azure" if openai_type is not None else False
+    if not is_azure:
+        return OpenAIClient()
+    return AzureAIClient(
+        api_key=os.environ.get("OPENAI_API_KEY"),
+        api_version=os.environ.get("OPENAI_API_VERSION"),
+        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
+    )
+def create_model_kwargs():
+    openai_type = os.environ.get("OPENAI_API_TYPE")
+    is_azure = openai_type == "azure" if openai_type is not None else False
+    if not is_azure:
+        return {
+            "model": "text-embedding-3-small",
+            "dimensions": 256,
+            "encoding_format": "float",
+        }
+    return {
+        "model": os.environ.get("OPENAI_TEXT_EMBEDDING_DEPLOYMENT_NAME"),
+        "dimensions": 256,
+        "encoding_format": "float",
+    }