bioguider-0.2.21-py3-none-any.whl → bioguider-0.2.22-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bioguider might be problematic. Click here for more details.

@@ -1,6 +1,7 @@
1
1
  import os
2
2
  from pathlib import Path
3
3
 
4
+ from bioguider.agents.evaluation_tutorial_task import EvaluationTutorialTask
4
5
  from bioguider.agents.evaluation_userguide_task import EvaluationUserGuideTask
5
6
  from bioguider.agents.prompt_utils import CollectionGoalItemEnum
6
7
  from bioguider.database.code_structure_db import CodeStructureDb
@@ -142,6 +143,19 @@ class EvaluationManager:
142
143
  )
143
144
  evaluation, files = evaluation_task.evaluate()
144
145
  return evaluation, files
146
+
147
def evaluate_tutorial(self):
    """Run the tutorial evaluation task and return its result.

    An ``EvaluationTutorialTask`` is configured from this manager's
    attributes (LLM, repository directory taken from ``self.rag.repo_dir``,
    the ``.gitignore`` inside that directory, project metadata, step
    callback, and the two databases) and its ``evaluate()`` method is
    invoked.

    Returns
    -------
    tuple
        The ``(evaluation, files)`` pair produced by the task's
        ``evaluate()`` call.
    """
    # Collect the constructor arguments first; attributes are read in the
    # same order in which they are handed to the task.
    task_kwargs = dict(
        llm=self.llm,
        repo_path=self.rag.repo_dir,
        gitignore_path=Path(self.rag.repo_dir, ".gitignore"),
        meta_data=self.project_metadata,
        step_callback=self.step_callback,
        summarized_files_db=self.summary_file_db,
        code_structure_db=self.code_structure_db,
    )
    tutorial_task = EvaluationTutorialTask(**task_kwargs)

    # Unpack explicitly so that the result is always a two-element tuple.
    evaluation, files = tutorial_task.evaluate()
    return evaluation, files
145
159
 
146
160
 
147
161
 
@@ -91,7 +91,7 @@ def download_repo(repo_url: str, local_path: str, access_token: str = None):
91
91
  logger.info(f"Cloning repository from {repo_url} to {local_path}")
92
92
  # We use repo_url in the log to avoid exposing the token in logs
93
93
  result = subprocess.run(
94
- ["git", "clone", clone_url, local_path],
94
+ ["git", "clone", "--recurse-submodules", clone_url, local_path],
95
95
  check=True,
96
96
  stdout=subprocess.PIPE,
97
97
  stderr=subprocess.PIPE,
@@ -12,16 +12,18 @@ logger = logging.getLogger(__name__)
12
12
  class CodeStructureBuilder:
13
13
  def __init__(
14
14
  self,
15
- repo_path: str,
16
- gitignore_path: str,
15
+ repo_path: str | Path,
16
+ gitignore_path: str | Path,
17
17
  code_structure_db: CodeStructureDb,
18
18
  ):
19
- self.repo_path = repo_path
20
- self.gitignore_checker = GitignoreChecker(repo_path, gitignore_path)
19
+ self.repo_path = str(repo_path)
20
+ self.gitignore_checker = GitignoreChecker(repo_path, str(gitignore_path))
21
21
  self.file_handler = PythonFileHandler(repo_path)
22
22
  self.code_structure_db = code_structure_db
23
23
 
24
24
  def build_code_structure(self):
25
+ if self.code_structure_db.is_database_built():
26
+ return
25
27
  files = self.gitignore_checker.check_files_and_folders()
26
28
  for file in files:
27
29
  if not file.endswith(".py") and not file.endswith(".R"):
@@ -0,0 +1,117 @@
1
+ from __future__ import annotations
2
+ from pathlib import Path
3
+ from typing import Union, Dict, Any, List
4
+ import json
5
+
6
def extract_markdown_from_notebook(
    ipynb_path: Union[str, Path],
    out_path: Union[str, Path, None] = None,
) -> str:
    """
    Extract the text of every markdown cell from a Jupyter notebook.

    Parameters
    ----------
    ipynb_path : str | Path
        Path to the input .ipynb file.
    out_path : str | Path | None, default None
        If provided, the extracted text is also written to this path
        (UTF-8). Missing parent directories are created.

    Returns
    -------
    str
        The sources of all markdown cells, in notebook order, separated by
        a single newline.

    Raises
    ------
    FileNotFoundError
        If ``ipynb_path`` does not exist.
    ValueError
        If the file is not valid JSON, or its top-level value is not a
        JSON object.
    """
    ipynb_path = Path(ipynb_path)
    if not ipynb_path.exists():
        raise FileNotFoundError(f"File {ipynb_path} does not exist")
    try:
        with ipynb_path.open("r", encoding="utf-8") as f:
            nb = json.load(f)
    except json.JSONDecodeError as err:
        # Chain the decoder error so the offending position stays visible.
        raise ValueError(f"File {ipynb_path} is not a valid JSON file") from err
    if not isinstance(nb, dict):
        # Valid JSON such as `[]` would otherwise fail with AttributeError.
        raise ValueError(f"File {ipynb_path} is not a valid notebook file")

    def _to_text(src) -> str:
        # nbformat allows str or list of lines; list items already carry
        # their own trailing newline, so they are concatenated as-is.
        if isinstance(src, list):
            return "".join(src)
        return src or ""

    markdown_txts = [
        _to_text(cell.get("source"))
        for cell in nb.get("cells") or []
        if cell.get("cell_type") == "markdown"
    ]
    text = "\n".join(markdown_txts)

    if out_path is not None:
        out_path = Path(out_path)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with out_path.open("w", encoding="utf-8") as f:
            f.write(text)
    return text
31
+
32
def strip_notebook_to_code_and_markdown(
    ipynb_path: Union[str, Path],
    out_path: Union[str, Path, None] = None,
    keep_top_metadata: bool = True,
) -> Dict[str, Any]:
    """
    Load a .ipynb and return a new notebook that:
    - keeps ONLY 'code' and 'markdown' cells
    - empties outputs and execution_count for code cells
    - drops all other cell types (e.g., 'raw')
    - preserves attachments on markdown cells
    - optionally preserves top-level metadata (kernelspec, language_info, etc.)

    Parameters
    ----------
    ipynb_path : str | Path
        Path to the input .ipynb file.
    out_path : str | Path | None, default None
        If provided, write the cleaned notebook to this path. Missing
        parent directories are created.
    keep_top_metadata : bool, default True
        If True, copy top-level metadata as-is (useful for re-running).
        If False, the top-level metadata is an empty dict.

    Returns
    -------
    dict
        The cleaned notebook (nbformat v4-style dict). Cell sources are
        always single strings, even if the input stored them as lists.

    Raises
    ------
    FileNotFoundError
        If ``ipynb_path`` does not exist.
    ValueError
        If the file is not valid JSON, or its top-level value is not a
        JSON object.
    """
    ipynb_path = Path(ipynb_path)
    if not ipynb_path.exists():
        raise FileNotFoundError(f"File {ipynb_path} does not exist")
    try:
        with ipynb_path.open("r", encoding="utf-8") as f:
            nb = json.load(f)
    except json.JSONDecodeError as err:
        # Chain the decoder error so the offending position stays visible.
        raise ValueError(f"File {ipynb_path} is not a valid JSON file") from err
    if not isinstance(nb, dict):
        # Valid JSON such as `[]` would otherwise fail with AttributeError.
        raise ValueError(f"File {ipynb_path} is not a valid notebook file")

    # Fall back to 4.5 when the input does not declare its format version.
    nbformat = nb.get("nbformat", 4)
    nbformat_minor = nb.get("nbformat_minor", 5)

    def _to_text(src) -> str:
        # nbformat allows str or list of lines
        if isinstance(src, list):
            return "".join(src)
        return src or ""

    new_cells: List[Dict[str, Any]] = []
    # `or []` also covers an explicit `"cells": null` in the input.
    for cell in nb.get("cells") or []:
        ctype = cell.get("cell_type")
        if ctype == "markdown":
            new_cell = {
                "cell_type": "markdown",
                "metadata": cell.get("metadata", {}),
                "source": _to_text(cell.get("source", "")),
            }
            # Attachments hold embedded images referenced by the markdown.
            if "attachments" in cell:
                new_cell["attachments"] = cell["attachments"]
            new_cells.append(new_cell)

        elif ctype == "code":
            new_cells.append({
                "cell_type": "code",
                "metadata": cell.get("metadata", {}),
                "source": _to_text(cell.get("source", "")),
                "execution_count": None,  # clear execution count
                "outputs": [],            # strip ALL outputs
            })

        # else: drop 'raw' and any other unknown cell types

    # Build new notebook object
    new_nb: Dict[str, Any] = {
        "nbformat": nbformat,
        "nbformat_minor": nbformat_minor,
        "metadata": nb.get("metadata", {}) if keep_top_metadata else {},
        "cells": new_cells,
    }

    if out_path is not None:
        out_path = Path(out_path)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with out_path.open("w", encoding="utf-8") as f:
            json.dump(new_nb, f, ensure_ascii=False, indent=1)

    return new_nb
117
+