bioguider 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bioguider might be problematic. Click here for more details.
- bioguider/agents/consistency_collection_step.py +9 -7
- bioguider/agents/consistency_evaluation_task.py +3 -2
- bioguider/agents/consistency_evaluation_task_utils.py +2 -1
- bioguider/agents/consistency_observe_step.py +15 -13
- bioguider/agents/evaluation_task.py +0 -110
- bioguider/agents/evaluation_tutorial_task.py +157 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +114 -0
- bioguider/agents/evaluation_userguide_task.py +4 -1
- bioguider/agents/prompt_utils.py +9 -0
- bioguider/database/code_structure_db.py +20 -9
- bioguider/database/summarized_file_db.py +6 -3
- bioguider/managers/evaluation_manager.py +14 -16
- bioguider/rag/data_pipeline.py +1 -1
- bioguider/utils/code_structure_builder.py +6 -4
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/r_file_handler.py +528 -347
- {bioguider-0.2.21.dist-info → bioguider-0.2.23.dist-info}/METADATA +1 -1
- {bioguider-0.2.21.dist-info → bioguider-0.2.23.dist-info}/RECORD +20 -17
- {bioguider-0.2.21.dist-info → bioguider-0.2.23.dist-info}/LICENSE +0 -0
- {bioguider-0.2.21.dist-info → bioguider-0.2.23.dist-info}/WHEEL +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
|
|
4
|
+
from bioguider.agents.evaluation_tutorial_task import EvaluationTutorialTask
|
|
4
5
|
from bioguider.agents.evaluation_userguide_task import EvaluationUserGuideTask
|
|
5
6
|
from bioguider.agents.prompt_utils import CollectionGoalItemEnum
|
|
6
7
|
from bioguider.database.code_structure_db import CodeStructureDb
|
|
@@ -82,22 +83,6 @@ class EvaluationManager:
|
|
|
82
83
|
results, readme_files = task.evaluate()
|
|
83
84
|
return results, readme_files
|
|
84
85
|
|
|
85
|
-
def evaluate_tutorial(self):
|
|
86
|
-
pass
|
|
87
|
-
# task = CollectionTask(
|
|
88
|
-
# llm=self.llm,
|
|
89
|
-
# step_callback=self.step_callback,
|
|
90
|
-
# )
|
|
91
|
-
# task.compile(
|
|
92
|
-
# repo_path=self.rag.repo_dir,
|
|
93
|
-
# gitignore_path=Path(self.rag.repo_dir, ".gitignore"),
|
|
94
|
-
# db=self.summary_file_db,
|
|
95
|
-
# goal_item=CollectionGoalItemEnum.Tutorial.name,
|
|
96
|
-
# )
|
|
97
|
-
# s = task.collect()
|
|
98
|
-
# if s is None or 'final_answer' not in s:
|
|
99
|
-
# return None
|
|
100
|
-
|
|
101
86
|
def evaluate_installation(self):
|
|
102
87
|
evaluation_task = EvaluationInstallationTask(
|
|
103
88
|
llm=self.llm,
|
|
@@ -142,6 +127,19 @@ class EvaluationManager:
|
|
|
142
127
|
)
|
|
143
128
|
evaluation, files = evaluation_task.evaluate()
|
|
144
129
|
return evaluation, files
|
|
130
|
+
|
|
131
|
+
def evaluate_tutorial(self):
    """Evaluate the repository's tutorial documentation.

    Builds an EvaluationTutorialTask from this manager's shared state
    (LLM, repository location, project metadata, progress callback, and
    the summarized-file / code-structure databases) and runs it.

    Returns:
        tuple: (evaluation result, tutorial files examined), as produced
        by ``EvaluationTutorialTask.evaluate()``.
    """
    repo_dir = self.rag.repo_dir
    task = EvaluationTutorialTask(
        llm=self.llm,
        repo_path=repo_dir,
        gitignore_path=Path(repo_dir, ".gitignore"),
        meta_data=self.project_metadata,
        step_callback=self.step_callback,
        summarized_files_db=self.summary_file_db,
        code_structure_db=self.code_structure_db,
    )
    # evaluate() already yields the (evaluation, files) pair callers expect.
    return task.evaluate()
|
|
145
143
|
|
|
146
144
|
|
|
147
145
|
|
bioguider/rag/data_pipeline.py
CHANGED
|
@@ -91,7 +91,7 @@ def download_repo(repo_url: str, local_path: str, access_token: str = None):
|
|
|
91
91
|
logger.info(f"Cloning repository from {repo_url} to {local_path}")
|
|
92
92
|
# We use repo_url in the log to avoid exposing the token in logs
|
|
93
93
|
result = subprocess.run(
|
|
94
|
-
["git", "clone", clone_url, local_path],
|
|
94
|
+
["git", "clone", "--recurse-submodules", clone_url, local_path],
|
|
95
95
|
check=True,
|
|
96
96
|
stdout=subprocess.PIPE,
|
|
97
97
|
stderr=subprocess.PIPE,
|
|
@@ -12,16 +12,18 @@ logger = logging.getLogger(__name__)
|
|
|
12
12
|
class CodeStructureBuilder:
|
|
13
13
|
def __init__(
|
|
14
14
|
self,
|
|
15
|
-
repo_path: str,
|
|
16
|
-
gitignore_path: str,
|
|
15
|
+
repo_path: str | Path,
|
|
16
|
+
gitignore_path: str | Path,
|
|
17
17
|
code_structure_db: CodeStructureDb,
|
|
18
18
|
):
|
|
19
|
-
self.repo_path = repo_path
|
|
20
|
-
self.gitignore_checker = GitignoreChecker(repo_path, gitignore_path)
|
|
19
|
+
self.repo_path = str(repo_path)
|
|
20
|
+
self.gitignore_checker = GitignoreChecker(repo_path, str(gitignore_path))
|
|
21
21
|
self.file_handler = PythonFileHandler(repo_path)
|
|
22
22
|
self.code_structure_db = code_structure_db
|
|
23
23
|
|
|
24
24
|
def build_code_structure(self):
|
|
25
|
+
if self.code_structure_db.is_database_built():
|
|
26
|
+
return
|
|
25
27
|
files = self.gitignore_checker.check_files_and_folders()
|
|
26
28
|
for file in files:
|
|
27
29
|
if not file.endswith(".py") and not file.endswith(".R"):
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Union, Dict, Any, List
|
|
4
|
+
import json
|
|
5
|
+
|
|
6
|
+
def extract_markdown_from_notebook(
    ipynb_path: Union[str, Path],
    out_path: Union[str, Path, None] = None,
) -> str:
    """
    Extract all markdown cells from a Jupyter notebook as a single text blob.

    Parameters
    ----------
    ipynb_path : str | Path
        Path to the input .ipynb file.
    out_path : str | Path | None, default None
        If provided, also write the extracted markdown to this path.

    Returns
    -------
    str
        The markdown cell sources, joined with newlines.

    Raises
    ------
    FileNotFoundError
        If *ipynb_path* does not exist.
    ValueError
        If the file is not valid JSON.
    """
    ipynb_path = Path(ipynb_path)
    if not ipynb_path.exists():
        raise FileNotFoundError(f"File {ipynb_path} does not exist")
    try:
        with ipynb_path.open("r", encoding="utf-8") as f:
            nb = json.load(f)
    except json.JSONDecodeError:
        raise ValueError(f"File {ipynb_path} is not a valid JSON file")

    markdown_txts: List[str] = []
    for cell in nb.get("cells", []):
        if cell.get("cell_type") != "markdown":
            continue
        # nbformat allows 'source' to be a string or a list of line strings;
        # tolerate a missing/None source instead of crashing in join().
        src = cell.get("source") or ""
        markdown_txts.append("\n".join(src) if isinstance(src, list) else src)
    text = "\n".join(markdown_txts)
    if out_path is not None:
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(text)
    return text
|
|
31
|
+
|
|
32
|
+
def strip_notebook_to_code_and_markdown(
    ipynb_path: Union[str, Path],
    out_path: Union[str, Path, None] = None,
    keep_top_metadata: bool = True,
) -> Dict[str, Any]:
    """
    Load a .ipynb and return a cleaned copy containing only code and
    markdown cells.

    The cleaned notebook:
    - keeps ONLY 'code' and 'markdown' cells
    - empties outputs and execution_count for code cells
    - drops all other cell types (e.g., 'raw')
    - preserves attachments on markdown cells
    - optionally preserves top-level metadata (kernelspec, language_info, etc.)

    Parameters
    ----------
    ipynb_path : str | Path
        Path to the input .ipynb file.
    out_path : str | Path | None, default None
        If provided, write the cleaned notebook to this path.
    keep_top_metadata : bool, default True
        If True, copy top-level metadata as-is (useful for re-running).
        If False, keep only minimal metadata.

    Returns
    -------
    dict
        The cleaned notebook (nbformat v4-style dict).

    Raises
    ------
    FileNotFoundError
        If *ipynb_path* does not exist.
    ValueError
        If the file is not valid JSON.
    """
    ipynb_path = Path(ipynb_path)
    if not ipynb_path.exists():
        raise FileNotFoundError(f"File {ipynb_path} does not exist")
    try:
        with ipynb_path.open("r", encoding="utf-8") as f:
            notebook = json.load(f)
    except json.JSONDecodeError:
        raise ValueError(f"File {ipynb_path} is not a valid JSON file")

    def _flatten(source) -> str:
        # nbformat permits 'source' as a single string or a list of lines.
        if isinstance(source, list):
            return "".join(source)
        return source or ""

    kept: List[Dict[str, Any]] = []
    for cell in notebook.get("cells", []):
        kind = cell.get("cell_type")
        if kind == "markdown":
            cleaned: Dict[str, Any] = {
                "cell_type": "markdown",
                "metadata": cell.get("metadata", {}),
                "source": _flatten(cell.get("source", "")),
            }
            if "attachments" in cell:
                cleaned["attachments"] = cell["attachments"]
            kept.append(cleaned)
        elif kind == "code":
            kept.append({
                "cell_type": "code",
                "metadata": cell.get("metadata", {}),
                "source": _flatten(cell.get("source", "")),
                "execution_count": None,  # clear execution count
                "outputs": [],            # strip ALL outputs
            })
        # Any other cell type ('raw', unknown) is intentionally dropped.

    stripped: Dict[str, Any] = {
        "nbformat": notebook.get("nbformat", 4),
        "nbformat_minor": notebook.get("nbformat_minor", 5),
        "metadata": notebook.get("metadata", {}) if keep_top_metadata else {},
        "cells": kept,
    }

    if out_path is not None:
        out_path = Path(out_path)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with out_path.open("w", encoding="utf-8") as f:
            json.dump(stripped, f, ensure_ascii=False, indent=1)

    return stripped
|
|
117
|
+
|