bioguider 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bioguider might be problematic. Click here for more details.
- bioguider/agents/consistency_collection_step.py +9 -7
- bioguider/agents/consistency_evaluation_task.py +3 -2
- bioguider/agents/consistency_evaluation_task_utils.py +2 -1
- bioguider/agents/consistency_observe_step.py +15 -13
- bioguider/agents/evaluation_task.py +0 -110
- bioguider/agents/evaluation_tutorial_task.py +157 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +114 -0
- bioguider/agents/evaluation_userguide_task.py +4 -1
- bioguider/agents/prompt_utils.py +9 -0
- bioguider/database/code_structure_db.py +20 -9
- bioguider/database/summarized_file_db.py +6 -3
- bioguider/managers/evaluation_manager.py +14 -16
- bioguider/rag/data_pipeline.py +1 -1
- bioguider/utils/code_structure_builder.py +6 -4
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/r_file_handler.py +528 -347
- {bioguider-0.2.21.dist-info → bioguider-0.2.23.dist-info}/METADATA +1 -1
- {bioguider-0.2.21.dist-info → bioguider-0.2.23.dist-info}/RECORD +20 -17
- {bioguider-0.2.21.dist-info → bioguider-0.2.23.dist-info}/LICENSE +0 -0
- {bioguider-0.2.21.dist-info → bioguider-0.2.23.dist-info}/WHEEL +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
|
|
4
|
+
from bioguider.agents.evaluation_tutorial_task import EvaluationTutorialTask
|
|
4
5
|
from bioguider.agents.evaluation_userguide_task import EvaluationUserGuideTask
|
|
5
6
|
from bioguider.agents.prompt_utils import CollectionGoalItemEnum
|
|
6
7
|
from bioguider.database.code_structure_db import CodeStructureDb
|
|
@@ -82,22 +83,6 @@ class EvaluationManager:
|
|
|
82
83
|
results, readme_files = task.evaluate()
|
|
83
84
|
return results, readme_files
|
|
84
85
|
|
|
85
|
-
def evaluate_tutorial(self):
|
|
86
|
-
pass
|
|
87
|
-
# task = CollectionTask(
|
|
88
|
-
# llm=self.llm,
|
|
89
|
-
# step_callback=self.step_callback,
|
|
90
|
-
# )
|
|
91
|
-
# task.compile(
|
|
92
|
-
# repo_path=self.rag.repo_dir,
|
|
93
|
-
# gitignore_path=Path(self.rag.repo_dir, ".gitignore"),
|
|
94
|
-
# db=self.summary_file_db,
|
|
95
|
-
# goal_item=CollectionGoalItemEnum.Tutorial.name,
|
|
96
|
-
# )
|
|
97
|
-
# s = task.collect()
|
|
98
|
-
# if s is None or 'final_answer' not in s:
|
|
99
|
-
# return None
|
|
100
|
-
|
|
101
86
|
def evaluate_installation(self):
|
|
102
87
|
evaluation_task = EvaluationInstallationTask(
|
|
103
88
|
llm=self.llm,
|
|
@@ -142,6 +127,19 @@ class EvaluationManager:
|
|
|
142
127
|
)
|
|
143
128
|
evaluation, files = evaluation_task.evaluate()
|
|
144
129
|
return evaluation, files
|
|
130
|
+
|
|
131
|
+
def evaluate_tutorial(self):
    """Evaluate the repository's tutorial documentation.

    Builds an EvaluationTutorialTask from this manager's shared state
    (LLM, repository location, project metadata, progress callback, and
    the summarized-file / code-structure databases) and runs it.

    Returns:
        tuple: (evaluation result, tutorial files examined), as produced
        by ``EvaluationTutorialTask.evaluate()``.
    """
    repo_dir = self.rag.repo_dir
    task = EvaluationTutorialTask(
        llm=self.llm,
        repo_path=repo_dir,
        gitignore_path=Path(repo_dir, ".gitignore"),
        meta_data=self.project_metadata,
        step_callback=self.step_callback,
        summarized_files_db=self.summary_file_db,
        code_structure_db=self.code_structure_db,
    )
    # evaluate() already yields the (evaluation, files) pair callers expect.
    return task.evaluate()
|
|
145
143
|
|
|
146
144
|
|
|
147
145
|
|
bioguider/rag/data_pipeline.py
CHANGED
|
@@ -91,7 +91,7 @@ def download_repo(repo_url: str, local_path: str, access_token: str = None):
|
|
|
91
91
|
logger.info(f"Cloning repository from {repo_url} to {local_path}")
|
|
92
92
|
# We use repo_url in the log to avoid exposing the token in logs
|
|
93
93
|
result = subprocess.run(
|
|
94
|
-
["git", "clone", clone_url, local_path],
|
|
94
|
+
["git", "clone", "--recurse-submodules", clone_url, local_path],
|
|
95
95
|
check=True,
|
|
96
96
|
stdout=subprocess.PIPE,
|
|
97
97
|
stderr=subprocess.PIPE,
|
|
@@ -12,16 +12,18 @@ logger = logging.getLogger(__name__)
|
|
|
12
12
|
class CodeStructureBuilder:
|
|
13
13
|
def __init__(
|
|
14
14
|
self,
|
|
15
|
-
repo_path: str,
|
|
16
|
-
gitignore_path: str,
|
|
15
|
+
repo_path: str | Path,
|
|
16
|
+
gitignore_path: str | Path,
|
|
17
17
|
code_structure_db: CodeStructureDb,
|
|
18
18
|
):
|
|
19
|
-
self.repo_path = repo_path
|
|
20
|
-
self.gitignore_checker = GitignoreChecker(repo_path, gitignore_path)
|
|
19
|
+
self.repo_path = str(repo_path)
|
|
20
|
+
self.gitignore_checker = GitignoreChecker(repo_path, str(gitignore_path))
|
|
21
21
|
self.file_handler = PythonFileHandler(repo_path)
|
|
22
22
|
self.code_structure_db = code_structure_db
|
|
23
23
|
|
|
24
24
|
def build_code_structure(self):
|
|
25
|
+
if self.code_structure_db.is_database_built():
|
|
26
|
+
return
|
|
25
27
|
files = self.gitignore_checker.check_files_and_folders()
|
|
26
28
|
for file in files:
|
|
27
29
|
if not file.endswith(".py") and not file.endswith(".R"):
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Union, Dict, Any, List
|
|
4
|
+
import json
|
|
5
|
+
|
|
6
|
+
def extract_markdown_from_notebook(
    ipynb_path: Union[str, Path],
    out_path: Union[str, Path, None] = None,
) -> str:
    """
    Extract all markdown cells from a Jupyter notebook as a single text blob.

    Parameters
    ----------
    ipynb_path : str | Path
        Path to the input .ipynb file.
    out_path : str | Path | None, default None
        If provided, also write the extracted markdown to this path.

    Returns
    -------
    str
        The markdown cell sources, joined with newlines.

    Raises
    ------
    FileNotFoundError
        If *ipynb_path* does not exist.
    ValueError
        If the file is not valid JSON.
    """
    ipynb_path = Path(ipynb_path)
    if not ipynb_path.exists():
        raise FileNotFoundError(f"File {ipynb_path} does not exist")
    try:
        with ipynb_path.open("r", encoding="utf-8") as f:
            nb = json.load(f)
    except json.JSONDecodeError:
        raise ValueError(f"File {ipynb_path} is not a valid JSON file")

    markdown_txts: List[str] = []
    for cell in nb.get("cells", []):
        if cell.get("cell_type") != "markdown":
            continue
        # nbformat allows 'source' to be a string or a list of line strings;
        # tolerate a missing/None source instead of crashing in join().
        src = cell.get("source") or ""
        markdown_txts.append("\n".join(src) if isinstance(src, list) else src)
    text = "\n".join(markdown_txts)
    if out_path is not None:
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(text)
    return text
|
|
31
|
+
|
|
32
|
+
def strip_notebook_to_code_and_markdown(
    ipynb_path: Union[str, Path],
    out_path: Union[str, Path, None] = None,
    keep_top_metadata: bool = True,
) -> Dict[str, Any]:
    """
    Load a .ipynb and return a cleaned copy containing only code and
    markdown cells.

    The cleaned notebook:
    - keeps ONLY 'code' and 'markdown' cells
    - empties outputs and execution_count for code cells
    - drops all other cell types (e.g., 'raw')
    - preserves attachments on markdown cells
    - optionally preserves top-level metadata (kernelspec, language_info, etc.)

    Parameters
    ----------
    ipynb_path : str | Path
        Path to the input .ipynb file.
    out_path : str | Path | None, default None
        If provided, write the cleaned notebook to this path.
    keep_top_metadata : bool, default True
        If True, copy top-level metadata as-is (useful for re-running).
        If False, keep only minimal metadata.

    Returns
    -------
    dict
        The cleaned notebook (nbformat v4-style dict).

    Raises
    ------
    FileNotFoundError
        If *ipynb_path* does not exist.
    ValueError
        If the file is not valid JSON.
    """
    ipynb_path = Path(ipynb_path)
    if not ipynb_path.exists():
        raise FileNotFoundError(f"File {ipynb_path} does not exist")
    try:
        with ipynb_path.open("r", encoding="utf-8") as f:
            notebook = json.load(f)
    except json.JSONDecodeError:
        raise ValueError(f"File {ipynb_path} is not a valid JSON file")

    def _flatten(source) -> str:
        # nbformat permits 'source' as a single string or a list of lines.
        if isinstance(source, list):
            return "".join(source)
        return source or ""

    kept: List[Dict[str, Any]] = []
    for cell in notebook.get("cells", []):
        kind = cell.get("cell_type")
        if kind == "markdown":
            cleaned: Dict[str, Any] = {
                "cell_type": "markdown",
                "metadata": cell.get("metadata", {}),
                "source": _flatten(cell.get("source", "")),
            }
            if "attachments" in cell:
                cleaned["attachments"] = cell["attachments"]
            kept.append(cleaned)
        elif kind == "code":
            kept.append({
                "cell_type": "code",
                "metadata": cell.get("metadata", {}),
                "source": _flatten(cell.get("source", "")),
                "execution_count": None,  # clear execution count
                "outputs": [],            # strip ALL outputs
            })
        # Any other cell type ('raw', unknown) is intentionally dropped.

    stripped: Dict[str, Any] = {
        "nbformat": notebook.get("nbformat", 4),
        "nbformat_minor": notebook.get("nbformat_minor", 5),
        "metadata": notebook.get("metadata", {}) if keep_top_metadata else {},
        "cells": kept,
    }

    if out_path is not None:
        out_path = Path(out_path)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with out_path.open("w", encoding="utf-8") as f:
            json.dump(stripped, f, ensure_ascii=False, indent=1)

    return stripped
|
|
117
|
+
|