bioguider 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bioguider might be problematic. Click here for more details.
- bioguider/agents/agent_utils.py +16 -10
- bioguider/agents/collection_observe_step.py +7 -2
- bioguider/agents/collection_task_utils.py +1 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +109 -0
- bioguider/agents/consistency_query_step.py +74 -0
- bioguider/agents/evaluation_task.py +0 -110
- bioguider/agents/evaluation_tutorial_task.py +156 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +114 -0
- bioguider/agents/evaluation_userguide_task.py +13 -43
- bioguider/agents/prompt_utils.py +15 -2
- bioguider/database/code_structure_db.py +20 -9
- bioguider/database/summarized_file_db.py +6 -3
- bioguider/managers/evaluation_manager.py +16 -2
- bioguider/rag/data_pipeline.py +1 -1
- bioguider/utils/code_structure_builder.py +15 -8
- bioguider/utils/constants.py +12 -12
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/{file_handler.py → python_file_handler.py} +1 -1
- bioguider/utils/r_file_handler.py +549 -0
- bioguider/utils/utils.py +34 -1
- {bioguider-0.2.20.dist-info → bioguider-0.2.22.dist-info}/METADATA +1 -1
- {bioguider-0.2.20.dist-info → bioguider-0.2.22.dist-info}/RECORD +27 -23
- bioguider/agents/consistency_collection_execute_step.py +0 -152
- bioguider/agents/consistency_collection_observe_step.py +0 -128
- bioguider/agents/consistency_collection_plan_step.py +0 -128
- bioguider/agents/consistency_collection_task.py +0 -109
- bioguider/agents/consistency_collection_task_utils.py +0 -137
- {bioguider-0.2.20.dist-info → bioguider-0.2.22.dist-info}/LICENSE +0 -0
- {bioguider-0.2.20.dist-info → bioguider-0.2.22.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Callable
|
|
6
|
+
from langchain.prompts import ChatPromptTemplate
|
|
7
|
+
from langchain_openai.chat_models.base import BaseChatOpenAI
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
from bioguider.agents.agent_utils import read_file
|
|
12
|
+
from bioguider.agents.common_agent_2step import CommonAgentTwoSteps
|
|
13
|
+
from bioguider.agents.consistency_evaluation_task import ConsistencyEvaluationResult, ConsistencyEvaluationTask
|
|
14
|
+
from bioguider.agents.evaluation_task import EvaluationTask
|
|
15
|
+
from bioguider.agents.collection_task import CollectionTask
|
|
16
|
+
from bioguider.agents.evaluation_tutorial_task_prompts import INDIVIDUAL_TUTORIAL_EVALUATION_SYSTEM_PROMPT
|
|
17
|
+
from bioguider.agents.prompt_utils import CollectionGoalItemEnum
|
|
18
|
+
from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
|
|
19
|
+
from bioguider.utils.notebook_utils import extract_markdown_from_notebook, strip_notebook_to_code_and_markdown
|
|
20
|
+
from bioguider.utils.pyphen_utils import PyphenReadability
|
|
21
|
+
from bioguider.utils.utils import increase_token_usage
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
class TutorialEvaluationResult(BaseModel):
    # Structured-output schema for the evaluation of a single tutorial.
    # Every "*_score" field is a free-form string that is expected to hold one
    # of Poor / Fair / Good / Excellent; every "*_suggestions" field holds the
    # matching improvement advice. Comments are used here instead of a class
    # docstring so that the schema generated from this model stays unchanged.

    # --- overall verdict ---
    overall_score: str = Field(
        description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`"
    )
    overall_key_strengths: str = Field(
        description="A string value, the key strengths of the tutorial"
    )
    overall_improvement_suggestions: str = Field(
        description="Suggestions to improve the overall score if necessary"
    )

    # --- readability ---
    readability_score: str = Field(
        description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'"
    )
    readability_suggestions: str = Field(
        description="Suggestions to improve readability if necessary"
    )

    # --- setup and dependencies ---
    setup_and_dependencies_score: str = Field(
        description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'"
    )
    setup_and_dependencies_suggestions: str = Field(
        description="Suggestions to improve setup and dependencies if necessary"
    )

    # --- reproducibility ---
    reproducibility_score: str = Field(
        description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'"
    )
    reproducibility_suggestions: str = Field(
        description="Suggestions to improve reproducibility if necessary"
    )

    # --- structure and navigation ---
    structure_and_navigation_score: str = Field(
        description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'"
    )
    structure_and_navigation_suggestions: str = Field(
        description="Suggestions to improve structure and navigation if necessary"
    )

    # --- executable code quality ---
    executable_code_quality_score: str = Field(
        description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'"
    )
    executable_code_quality_suggestions: str = Field(
        description="Suggestions to improve executable code quality if necessary"
    )

    # --- result verification ---
    result_verification_score: str = Field(
        description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'"
    )
    result_verification_suggestions: str = Field(
        description="Suggestions to improve result verification if necessary"
    )

    # --- performance and resource notes ---
    performance_and_resource_notes_score: str = Field(
        description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'"
    )
    performance_and_resource_notes_suggestions: str = Field(
        description="Suggestions to improve performance and resource notes if necessary"
    )
|
|
43
|
+
|
|
44
|
+
class IndividualTutorialEvaluationResult(BaseModel):
    # Bundles the two evaluations produced for one tutorial file: the quality
    # assessment and the documentation/code consistency assessment. Both fields
    # accept None but carry no default value.
    tutorial_evaluation: TutorialEvaluationResult | None = Field(
        description="The evaluation result of the tutorial"
    )
    consistency_evaluation: ConsistencyEvaluationResult | None = Field(
        description="The evaluation result of the consistency of the tutorial"
    )
|
|
47
|
+
|
|
48
|
+
class EvaluationTutorialTask(EvaluationTask):
    """Evaluate the tutorial/vignette files of a repository.

    For every tutorial file the task computes readability metrics, asks the
    LLM for a structured quality assessment, and runs a consistency
    evaluation of the tutorial against the code structure database.
    """

    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str,
        gitignore_path: str,
        meta_data: ProjectMetadata | None = None,
        step_callback: Callable | None = None,
        summarized_files_db=None,
        code_structure_db=None,
    ):
        """Store the collaborators needed by the evaluation.

        Args:
            llm: Chat model used for every LLM call made by this task.
            repo_path: Root directory of the repository being evaluated.
            gitignore_path: Forwarded to the base class.
            meta_data: Optional project metadata, forwarded to the base class.
            step_callback: Optional progress callback, forwarded to the base
                class and to the sub-tasks created here.
            summarized_files_db: Forwarded to the base class and used by the
                file collection task.
            code_structure_db: Passed to the consistency evaluation task.
        """
        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback, summarized_files_db)
        self.evaluation_name = "Tutorial Evaluation"
        self.code_structure_db = code_structure_db

    def _collect_files(self):
        """Collect the repository files classified as tutorials/vignettes."""
        task = CollectionTask(
            llm=self.llm,
            step_callback=self.step_callback,
            summarized_files_db=self.summarized_files_db,
        )
        # NOTE(review): the .gitignore at the repository root is used here, not
        # the gitignore_path given to __init__ -- confirm this is intended.
        task.compile(
            repo_path=self.repo_path,
            gitignore_path=Path(self.repo_path, ".gitignore"),
            goal_item=CollectionGoalItemEnum.Tutorial.name,
        )
        files = task.collect()
        return files

    def _evaluate_consistency(self, file: str) -> ConsistencyEvaluationResult | None:
        """Run the consistency evaluation on a tutorial file read from disk.

        Args:
            file: Path of the tutorial relative to the repository root;
                surrounding whitespace is stripped before use.

        Returns:
            Whatever the consistency evaluation task returns for the file
            content (callers in this class treat ``None`` as "not enough
            information").
        """
        file = file.strip()
        # Explicit encoding so the result does not depend on the platform default.
        with open(Path(self.repo_path, file), "r", encoding="utf-8") as f:
            tutorial_content = f.read()
        result, _ = self._evaluate_consistency_on_content(tutorial_content)
        return result

    def _evaluate_consistency_on_content(
        self, content: str
    ) -> tuple[ConsistencyEvaluationResult | None, dict]:
        """Run the consistency evaluation on already-loaded tutorial content.

        Returns:
            A ``(result, token_usage)`` pair. The token usage is always a fresh
            copy of ``DEFAULT_TOKEN_USAGE``; the usage of the underlying task is
            not reported here.
        """
        consistency_evaluation_task = ConsistencyEvaluationTask(
            llm=self.llm,
            code_structure_db=self.code_structure_db,
            step_callback=self.step_callback,
        )
        result = consistency_evaluation_task.evaluate(
            domain="tutorial/vignette",
            documentation=content,
        )
        return result, {**DEFAULT_TOKEN_USAGE}

    def _evaluate_individual_tutorial(self, file: str) -> tuple[IndividualTutorialEvaluationResult | None, dict]:
        """Evaluate one tutorial file.

        Args:
            file: Path of the tutorial relative to the repository root.

        Returns:
            ``(result, token_usage)``. ``result`` is ``None`` (with a default
            token usage) when the file cannot be read.
        """
        tutorial_path = Path(self.repo_path, file)
        content = read_file(tutorial_path)
        if content is None:
            logger.error(f"Error in reading file {file}")
            return None, {**DEFAULT_TOKEN_USAGE}

        if file.endswith(".ipynb"):
            # Notebooks: readability is measured on the markdown cells only,
            # while the consistency check gets the notebook reduced to its code
            # and markdown cells, serialized as JSON.
            readability_content = extract_markdown_from_notebook(tutorial_path)
            content = json.dumps(strip_notebook_to_code_and_markdown(tutorial_path))
        else:
            readability_content = content

        readability = PyphenReadability()
        # Only the first four of the nine returned metrics are used.
        flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, \
            _, _, _, _, _ = readability.readability_metrics(readability_content)

        # NOTE(review): the prompt receives readability_content, so for
        # notebooks the LLM sees markdown cells only -- confirm this is intended
        # given that the prompt also asks about executable code quality.
        system_prompt = ChatPromptTemplate.from_template(
            INDIVIDUAL_TUTORIAL_EVALUATION_SYSTEM_PROMPT
        ).format(
            flesch_reading_ease=flesch_reading_ease,
            flesch_kincaid_grade=flesch_kincaid_grade,
            gunning_fog_index=gunning_fog_index,
            smog_index=smog_index,
            tutorial_file_content=readability_content,
        )
        agent = CommonAgentTwoSteps(llm=self.llm)
        tutorial_evaluation, _, token_usage, _reasoning_process = agent.go(
            system_prompt=system_prompt,
            instruction_prompt="Now, let's begin the tutorial evaluation.",
            schema=TutorialEvaluationResult,
        )

        consistency_evaluation_result, _ = self._evaluate_consistency_on_content(content)
        if consistency_evaluation_result is None:
            # No sufficient information to evaluate the consistency of the tutorial
            consistency_evaluation_result = ConsistencyEvaluationResult(
                consistency_score="N/A",
                consistency_assessment="No sufficient information to evaluate the consistency of the tutorial",
                consistency_development=[],
                consistency_strengths=[],
            )
        return IndividualTutorialEvaluationResult(
            tutorial_evaluation=tutorial_evaluation,
            consistency_evaluation=consistency_evaluation_result,
        ), token_usage

    def _evaluate(self, files: list[str] | None = None) -> tuple[dict[str, IndividualTutorialEvaluationResult] | None, dict, list[str]]:
        """Evaluate every given tutorial file and aggregate the token usage.

        Args:
            files: Tutorial paths relative to the repository root. ``None`` is
                treated as "no files" instead of raising ``TypeError``.

        Returns:
            ``(results_by_file, total_token_usage, files)`` where ``files`` is
            the argument passed in, unchanged.
        """
        total_token_usage = {**DEFAULT_TOKEN_USAGE}
        tutorial_evaluation_results = {}
        for file in files or []:
            tutorial_evaluation_result, token_usage = self._evaluate_individual_tutorial(file)
            total_token_usage = increase_token_usage(total_token_usage, token_usage)
            tutorial_evaluation_results[file] = tutorial_evaluation_result
        return tutorial_evaluation_results, total_token_usage, files
|
|
155
|
+
|
|
156
|
+
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
INDIVIDUAL_TUTORIAL_EVALUATION_SYSTEM_PROMPT = """
|
|
2
|
+
|
|
3
|
+
You are an expert in evaluating the quality of tutorials in software repositories.
|
|
4
|
+
Your task is to analyze the provided tutorial file and generate a structured quality assessment based on the following criteria.
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
### **Evaluation Criteria**
|
|
8
|
+
|
|
9
|
+
1. **Readability**:
|
|
10
|
+
* **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
|
|
11
|
+
* **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
|
|
12
|
+
* **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
|
|
13
|
+
* **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
|
|
14
|
+
* **Assessment**: Based on these scores, evaluate the overall readability and technical complexity of the language used.
|
|
15
|
+
|
|
16
|
+
2. **Coverage**:
|
|
17
|
+
* **Assessment**: [Your evaluation of whether it covers all major steps needed to get started, and dependencies, prerequisites, setup steps, and example usage.]
|
|
18
|
+
* **Improvement Suggestions**:
|
|
19
|
+
* **Original text:** [Quote a specific line/section from the tutorial.]
|
|
20
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
21
|
+
|
|
22
|
+
3. **Reproducibility**:
|
|
23
|
+
* **Assessment**: [Your evaluation of whether it provides a clear **description** of reproducibility]
|
|
24
|
+
* **Improvement Suggestions**:
|
|
25
|
+
* **Original text:** [Quote a specific line/section from the tutorial.]
|
|
26
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
27
|
+
|
|
28
|
+
4. **Structure & Navigation**:
|
|
29
|
+
* **Assessment**: [Your evaluation of whether it provides logical sections (e.g., intro -> setup -> steps -> results -> next), TOC/anchors, estimated time, etc.]
|
|
30
|
+
* **Improvement Suggestions**:
|
|
31
|
+
* **Original text:** [Quote a specific line/section from the tutorial.]
|
|
32
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
33
|
+
|
|
34
|
+
5. **Executable Code Quality**:
|
|
35
|
+
* **Assessment**: [Your evaluation on whether the code snippets are executable and functional, idiomatic, no hard-coded paths, etc.]
|
|
36
|
+
* **Improvement Suggestions**:
|
|
37
|
+
* **Original text:** [Quote a specific line/section from the tutorial.]
|
|
38
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
39
|
+
|
|
40
|
+
6. **Result Verification**:
|
|
41
|
+
* **Assessment**: [Your evaluation on expected outputs shown (figures/tables/metrics), acceptance criteria, etc.]
|
|
42
|
+
* **Improvement Suggestions**:
|
|
43
|
+
* **Original text:** [Quote a specific line/section from the tutorial.]
|
|
44
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
45
|
+
|
|
46
|
+
7. **Performance & Resource Notes**:
|
|
47
|
+
* **Assessment**: [Your evaluation on performance and resource notes, e.g., CPU/GPU usage, memory usage, runtime estimates, small "lite" path provided.]
|
|
48
|
+
* **Improvement Suggestions**:
|
|
49
|
+
* **Original text:** [Quote a specific line/section from the tutorial.]
|
|
50
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
### **Final Report Output**
|
|
55
|
+
Your final report must **exactly match** the following format. Do not add or omit any sections.
|
|
56
|
+
|
|
57
|
+
**FinalAnswer**
|
|
58
|
+
* **Overall Score:** [Poor / Fair / Good / Excellent]
|
|
59
|
+
* **Overall Key Strengths**: <brief summary of the Tutorial's strongest points in 2-3 sentences>
|
|
60
|
+
* **Overall Improvement Suggestions:**
|
|
61
|
+
- "Original text snippet 1" - Improving comment 1
|
|
62
|
+
- "Original text snippet 2" - Improving comment 2
|
|
63
|
+
- ...
|
|
64
|
+
* **Readability Score:** [Poor / Fair / Good / Excellent]
|
|
65
|
+
* **Readability Key Strengths**: <brief summary of the Tutorial's strongest points in 2-3 sentences>
|
|
66
|
+
* **Readability Improvement Suggestions:**
|
|
67
|
+
- "Original text snippet 1" - Improving comment 1
|
|
68
|
+
- "Original text snippet 2" - Improving comment 2
|
|
69
|
+
- ...
|
|
70
|
+
* **Coverage Score:** [Poor / Fair / Good / Excellent]
|
|
71
|
+
* **Coverage Key Strengths**: <brief summary of the Tutorial's strongest points in 2-3 sentences>
|
|
72
|
+
* **Coverage Improvement Suggestions:**
|
|
73
|
+
- "Original text snippet 1" - Improving comment 1
|
|
74
|
+
- "Original text snippet 2" - Improving comment 2
|
|
75
|
+
- ...
|
|
76
|
+
* **Reproducibility Score:** [Poor / Fair / Good / Excellent]
|
|
77
|
+
* **Reproducibility Key Strengths**: <brief summary of the Tutorial's strongest points in 2-3 sentences>
|
|
78
|
+
* **Reproducibility Improvement Suggestions:**
|
|
79
|
+
- "Original text snippet 1" - Improving comment 1
|
|
80
|
+
- "Original text snippet 2" - Improving comment 2
|
|
81
|
+
- ...
|
|
82
|
+
* **Structure & Navigation Score:** [Poor / Fair / Good / Excellent]
|
|
83
|
+
* **Structure & Navigation Key Strengths**: <brief summary of the Tutorial's strongest points in 2-3 sentences>
|
|
84
|
+
* **Structure & Navigation Improvement Suggestions:**
|
|
85
|
+
- "Original text snippet 1" - Improving comment 1
|
|
86
|
+
- "Original text snippet 2" - Improving comment 2
|
|
87
|
+
- ...
|
|
88
|
+
* **Executable Code Quality Score:** [Poor / Fair / Good / Excellent]
|
|
89
|
+
* **Executable Code Quality Key Strengths**: <brief summary of the Tutorial's strongest points in 2-3 sentences>
|
|
90
|
+
* **Executable Code Quality Improvement Suggestions:**
|
|
91
|
+
- "Original text snippet 1" - Improving comment 1
|
|
92
|
+
- "Original text snippet 2" - Improving comment 2
|
|
93
|
+
- ...
|
|
94
|
+
* **Result Verification Score:** [Poor / Fair / Good / Excellent]
|
|
95
|
+
* **Result Verification Key Strengths**: <brief summary of the Tutorial's strongest points in 2-3 sentences>
|
|
96
|
+
* **Result Verification Improvement Suggestions:**
|
|
97
|
+
- "Original text snippet 1" - Improving comment 1
|
|
98
|
+
- "Original text snippet 2" - Improving comment 2
|
|
99
|
+
- ...
|
|
100
|
+
* **Performance & Resource Notes Score:** [Poor / Fair / Good / Excellent]
|
|
101
|
+
* **Performance & Resource Notes Key Strengths**: <brief summary of the Tutorial's strongest points in 2-3 sentences>
|
|
102
|
+
* **Performance & Resource Notes Improvement Suggestions:**
|
|
103
|
+
- "Original text snippet 1" - Improving comment 1
|
|
104
|
+
- "Original text snippet 2" - Improving comment 2
|
|
105
|
+
- ...
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
### **Tutorial File Content:**
|
|
110
|
+
{tutorial_file_content}
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
"""
|
|
@@ -1,36 +1,24 @@
|
|
|
1
1
|
|
|
2
|
-
import os
|
|
3
2
|
from pathlib import Path
|
|
4
3
|
import logging
|
|
5
4
|
from langchain.prompts import ChatPromptTemplate
|
|
6
|
-
from markdownify import markdownify as md
|
|
7
5
|
from pydantic import BaseModel, Field
|
|
8
6
|
|
|
9
7
|
from bioguider.agents.agent_utils import read_file
|
|
10
8
|
from bioguider.agents.collection_task import CollectionTask
|
|
11
|
-
from bioguider.agents.
|
|
9
|
+
from bioguider.agents.common_agent_2step import CommonAgentTwoSteps
|
|
10
|
+
from bioguider.agents.consistency_evaluation_task import ConsistencyEvaluationTask, ConsistencyEvaluationResult
|
|
11
|
+
from bioguider.agents.prompt_utils import CollectionGoalItemEnum
|
|
12
12
|
from bioguider.utils.constants import (
|
|
13
13
|
DEFAULT_TOKEN_USAGE,
|
|
14
|
-
ProjectMetadata,
|
|
15
|
-
StructuredEvaluationInstallationResult,
|
|
16
|
-
FreeEvaluationInstallationResult,
|
|
17
|
-
EvaluationInstallationResult,
|
|
18
14
|
)
|
|
19
|
-
from bioguider.rag.data_pipeline import count_tokens
|
|
20
|
-
from .common_agent_2step import CommonAgentTwoSteps, CommonAgentTwoChainSteps
|
|
21
15
|
from ..utils.pyphen_utils import PyphenReadability
|
|
22
16
|
|
|
23
17
|
from .evaluation_task import EvaluationTask
|
|
24
18
|
from .agent_utils import read_file
|
|
25
19
|
from bioguider.utils.utils import increase_token_usage
|
|
26
|
-
from .evaluation_userguide_prompts import
|
|
27
|
-
from .consistency_collection_task import ConsistencyCollectionTask
|
|
20
|
+
from .evaluation_userguide_prompts import INDIVIDUAL_USERGUIDE_EVALUATION_SYSTEM_PROMPT
|
|
28
21
|
|
|
29
|
-
class ConsistencyEvaluationResult(BaseModel):
|
|
30
|
-
consistency_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
|
|
31
|
-
consistency_assessment: str=Field(description="Your evaluation of whether the user guide/API documentation is consistent with the code definitions")
|
|
32
|
-
consistency_development: list[str]=Field(description="A list of inconsistent function/class/method name and inconsistent docstring")
|
|
33
|
-
consistency_strengths: list[str]=Field(description="A list of strengths of the user guide/API documentation on consistency")
|
|
34
22
|
|
|
35
23
|
class UserGuideEvaluationResult(BaseModel):
|
|
36
24
|
overall_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
|
|
@@ -78,39 +66,19 @@ class EvaluationUserGuideTask(EvaluationTask):
|
|
|
78
66
|
files = task.collect()
|
|
79
67
|
return files
|
|
80
68
|
|
|
81
|
-
def _evaluate_consistency(self, file: str) ->
|
|
82
|
-
|
|
69
|
+
def _evaluate_consistency(self, file: str) -> ConsistencyEvaluationResult:
|
|
70
|
+
consistency_evaluation_task = ConsistencyEvaluationTask(
|
|
83
71
|
llm=self.llm,
|
|
84
72
|
code_structure_db=self.code_structure_db,
|
|
85
73
|
step_callback=self.step_callback,
|
|
86
74
|
)
|
|
87
|
-
|
|
75
|
+
file = file.strip()
|
|
88
76
|
with open(Path(self.repo_path, file), "r") as f:
|
|
89
77
|
user_guide_api_documentation = f.read()
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
return None, {**DEFAULT_TOKEN_USAGE}
|
|
95
|
-
|
|
96
|
-
system_prompt = ChatPromptTemplate.from_template(
|
|
97
|
-
CONSISTENCY_EVAL_SYSTEM_PROMPT
|
|
98
|
-
).format(
|
|
99
|
-
user_guide_api_documentation=user_guide_api_documentation,
|
|
100
|
-
code_definitions=code_definitions,
|
|
101
|
-
)
|
|
102
|
-
agent = CommonAgentTwoSteps(llm=self.llm)
|
|
103
|
-
res, _, token_usage, reasoning_process = agent.go(
|
|
104
|
-
system_prompt=system_prompt,
|
|
105
|
-
instruction_prompt="Now, let's begin the consistency evaluation step.",
|
|
106
|
-
schema=ConsistencyEvaluationResult,
|
|
107
|
-
)
|
|
108
|
-
res: ConsistencyEvaluationResult = res
|
|
109
|
-
self.print_step(step_output=f"Consistency Evaluation Result: {res}")
|
|
110
|
-
self.print_step(step_output=f"Consistency Evaluation Reasoning Process: {reasoning_process}")
|
|
111
|
-
self.print_step(token_usage=token_usage)
|
|
112
|
-
|
|
113
|
-
return res, token_usage
|
|
78
|
+
return consistency_evaluation_task.evaluate(
|
|
79
|
+
domain="user guide/API",
|
|
80
|
+
documentation=user_guide_api_documentation,
|
|
81
|
+
), {**DEFAULT_TOKEN_USAGE}
|
|
114
82
|
|
|
115
83
|
def _evaluate_individual_userguide(self, file: str) -> tuple[IndividualUserGuideEvaluationResult | None, dict]:
|
|
116
84
|
content = read_file(Path(self.repo_path, file))
|
|
@@ -157,6 +125,8 @@ class EvaluationUserGuideTask(EvaluationTask):
|
|
|
157
125
|
total_token_usage = {**DEFAULT_TOKEN_USAGE}
|
|
158
126
|
user_guide_evaluation_results = {}
|
|
159
127
|
for file in files:
|
|
128
|
+
if file.endswith(".py") or file.endswith(".R"):
|
|
129
|
+
continue
|
|
160
130
|
user_guide_evaluation_result, token_usage = self._evaluate_individual_userguide(file)
|
|
161
131
|
total_token_usage = increase_token_usage(total_token_usage, token_usage)
|
|
162
132
|
user_guide_evaluation_results[file] = user_guide_evaluation_result
|
bioguider/agents/prompt_utils.py
CHANGED
|
@@ -104,6 +104,7 @@ COLLECTION_PROMPTS = {
|
|
|
104
104
|
"goal_item": "User Guide",
|
|
105
105
|
"related_file_description": """A document qualifies as a **User Guide** if it includes **at least one** of the following elements.
|
|
106
106
|
If **any one** of these is present, the document should be classified as a User Guide — full coverage is **not required**:
|
|
107
|
+
- **Not source code or a script** (*.py, *.R) or notebook (*.ipynb, *.Rmd) that is not intended for end-user interaction.
|
|
107
108
|
- Document **functions, methods, or classes**
|
|
108
109
|
- Describe **input parameters, return values**, and **usage syntax**
|
|
109
110
|
- Include **technical guidance** for using specific APIs
|
|
@@ -117,9 +118,12 @@ If **any one** of these is present, the document should be classified as a User
|
|
|
117
118
|
- Code Walkthroughs: Detailed explanations of code snippets in a tutorial format.
|
|
118
119
|
**Do not** classify the document as a User Guide if it is source code or a script (*.py, *.R) that is not intended for end-user interaction.
|
|
119
120
|
- You can include directory names if all files in the directory are relevant to the goal item.""",
|
|
120
|
-
"plan_important_instructions": """ - **Do not**
|
|
121
|
+
"plan_important_instructions": """ - **Do not** try to summarize or read the content of any source code or script (*.py, *.R) or notebook (*.ipynb, *.Rmd) that is not intended for end-user interaction.
|
|
122
|
+
- **Do not** classify the document as a User Guide if it is source code or a script (*.py, *.R) that is not intended for end-user interaction.
|
|
121
123
|
- **Do not** classify the document as a User Guide if it is a notebook (*.ipynb, *.Rmd) that is not intended for end-user interaction.
|
|
122
|
-
- You plan **must not** include any source code or script (*.py, *.R) or notebook (*.ipynb, *.Rmd) that is not intended for end-user interaction."""
|
|
124
|
+
- Your plan **must not** include any source code or script (*.py, *.R) or notebook (*.ipynb, *.Rmd) that is not intended for end-user interaction.""",
|
|
125
|
+
"observe_important_instructions": """ - **Do not** classify the document as a User Guide if it is source code or a script (*.py, *.R) that is not intended for end-user interaction.
|
|
126
|
+
- **Do not** include any source code or script (*.py, *.R) or notebook (*.ipynb, *.Rmd) in the final answer that is not intended for end-user interaction."""
|
|
123
127
|
},
|
|
124
128
|
"Tutorial": {
|
|
125
129
|
"goal_item": "Tutorials & Vignettes",
|
|
@@ -131,6 +135,15 @@ If **any one** of these is present, the document should be classified as a User
|
|
|
131
135
|
- Interactive Elements: Features that allow users to experiment with the code in real-time, such as Jupyter notebooks or R Markdown files.
|
|
132
136
|
- Use Cases: Real-world applications or scenarios where the software can be applied effectively.
|
|
133
137
|
- You can include directory names if all files in the directory are relevant to the goal item.
|
|
138
|
+
**Important instructions**:
|
|
139
|
+
- **Do not** use **read_file_tool, summarize_file_tool, check_file_related_tool** on the python/R notebook files **(.ipynb, .Rmd)**, as they are too big to read.
|
|
140
|
+
""",
|
|
141
|
+
"plan_important_instructions": """ - **Do not** use **read_file_tool, summarize_file_tool, check_file_related_tool** on the python/R notebook files **(.ipynb, .Rmd)**, as they are too big to read.
|
|
142
|
+
- For python/R notebook files **(.ipynb, .Rmd)**, **only infer** if it is the tutorial/vignette from the file name and avoid reading the content of the file.
|
|
143
|
+
""",
|
|
144
|
+
"observe_important_instructions": """ - **Do not** use **read_file_tool, summarize_file_tool, check_file_related_tool** on the python/R notebook files **(.ipynb, .Rmd)**, as they are too big to read.
|
|
145
|
+
- For python/R notebook files **(.ipynb, .Rmd)**, **only infer** if it is the tutorial/vignette from the file name and avoid reading the content of the file.
|
|
146
|
+
- **Do not** include any binary files (e.g., `.png`, `.jpg`, `.jpeg`, `.gif`, `.svg`) in the final answer.
|
|
134
147
|
""",
|
|
135
148
|
},
|
|
136
149
|
"DockerGeneration": {
|
|
@@ -123,15 +123,8 @@ class CodeStructureDb:
|
|
|
123
123
|
os.makedirs(db_path, exist_ok=True)
|
|
124
124
|
except Exception as e:
|
|
125
125
|
logging.error(e)
|
|
126
|
-
return False
|
|
127
|
-
db_path = os.path.join(db_path, "
|
|
128
|
-
# Ensure the local path exists
|
|
129
|
-
try:
|
|
130
|
-
os.makedirs(db_path, exist_ok=True)
|
|
131
|
-
except Exception as e:
|
|
132
|
-
logging.error(e)
|
|
133
|
-
return False
|
|
134
|
-
db_path = os.path.join(db_path, f"{self.author}_{self.repo_name}.db")
|
|
126
|
+
return False
|
|
127
|
+
db_path = os.path.join(db_path, f"{self.author}_{self.repo_name}_code_structure.db")
|
|
135
128
|
if not os.path.exists(db_path):
|
|
136
129
|
try:
|
|
137
130
|
with open(db_path, "w"):
|
|
@@ -142,6 +135,24 @@ class CodeStructureDb:
|
|
|
142
135
|
self.connection = sqlite3.connect(db_path)
|
|
143
136
|
return True
|
|
144
137
|
|
|
138
|
+
def is_database_built(self) -> bool:
    """Report whether the code structure table holds at least one row.

    Returns False when the database connection or the table setup fails, or
    when the query raises. Once the query has been attempted, the connection
    is closed and reset to None whatever the outcome.
    """
    if not self._connect_to_db():
        return False
    if not self._ensure_tables():
        return False
    try:
        cur = self.connection.cursor()
        cur.execute(f"SELECT * FROM {CODE_STRUCTURE_TABLE_NAME}")
        first_row = cur.fetchone()
        return first_row is not None
    except Exception as e:
        logging.error(e)
        return False
    finally:
        # Runs for both the success and the failure path of the query above.
        self.connection.close()
        self.connection = None
|
|
155
|
+
|
|
145
156
|
def insert_code_structure(
|
|
146
157
|
self,
|
|
147
158
|
name: str,
|
|
@@ -38,10 +38,11 @@ where file_path = ? and instruction = ? and summarize_level = ? and summarize_pr
|
|
|
38
38
|
"""
|
|
39
39
|
|
|
40
40
|
class SummarizedFilesDb:
|
|
41
|
-
def __init__(self, author: str, repo_name: str):
|
|
41
|
+
def __init__(self, author: str, repo_name: str, data_folder: str = None):
|
|
42
42
|
self.author = author
|
|
43
43
|
self.repo_name = repo_name
|
|
44
44
|
self.connection: Connection | None = None
|
|
45
|
+
self.data_folder = data_folder
|
|
45
46
|
|
|
46
47
|
def _ensure_tables(self) -> bool:
|
|
47
48
|
if self.connection is None:
|
|
@@ -60,7 +61,9 @@ class SummarizedFilesDb:
|
|
|
60
61
|
def _connect_to_db(self) -> bool:
|
|
61
62
|
if self.connection is not None:
|
|
62
63
|
return True
|
|
63
|
-
db_path =
|
|
64
|
+
db_path = self.data_folder
|
|
65
|
+
if db_path is None:
|
|
66
|
+
db_path = os.environ.get("DATA_FOLDER", "./data")
|
|
64
67
|
db_path = os.path.join(db_path, "databases")
|
|
65
68
|
# Ensure the local path exists
|
|
66
69
|
try:
|
|
@@ -68,7 +71,7 @@ class SummarizedFilesDb:
|
|
|
68
71
|
except Exception as e:
|
|
69
72
|
logging.error(e)
|
|
70
73
|
return False
|
|
71
|
-
db_path = os.path.join(db_path, f"{self.author}_{self.repo_name}.db")
|
|
74
|
+
db_path = os.path.join(db_path, f"{self.author}_{self.repo_name}_summarized_file.db")
|
|
72
75
|
if not os.path.exists(db_path):
|
|
73
76
|
try:
|
|
74
77
|
with open(db_path, "w"):
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
|
|
4
|
+
from bioguider.agents.evaluation_tutorial_task import EvaluationTutorialTask
|
|
4
5
|
from bioguider.agents.evaluation_userguide_task import EvaluationUserGuideTask
|
|
5
6
|
from bioguider.agents.prompt_utils import CollectionGoalItemEnum
|
|
6
7
|
from bioguider.database.code_structure_db import CodeStructureDb
|
|
@@ -35,8 +36,8 @@ class EvaluationManager:
|
|
|
35
36
|
self.summary_file_db = SummarizedFilesDb(author, repo_name)
|
|
36
37
|
self.code_structure_db = CodeStructureDb(author, repo_name)
|
|
37
38
|
code_structure_builder = CodeStructureBuilder(
|
|
38
|
-
repo_path=
|
|
39
|
-
gitignore_path=Path(
|
|
39
|
+
repo_path=self.rag.repo_dir,
|
|
40
|
+
gitignore_path=Path(self.rag.repo_dir, ".gitignore"),
|
|
40
41
|
code_structure_db=self.code_structure_db
|
|
41
42
|
)
|
|
42
43
|
code_structure_builder.build_code_structure()
|
|
@@ -142,6 +143,19 @@ class EvaluationManager:
|
|
|
142
143
|
)
|
|
143
144
|
evaluation, files = evaluation_task.evaluate()
|
|
144
145
|
return evaluation, files
|
|
146
|
+
|
|
147
|
+
def evaluate_tutorial(self):
|
|
148
|
+
evaluation_task = EvaluationTutorialTask(
|
|
149
|
+
llm=self.llm,
|
|
150
|
+
repo_path=self.rag.repo_dir,
|
|
151
|
+
gitignore_path=Path(self.rag.repo_dir, ".gitignore"),
|
|
152
|
+
meta_data=self.project_metadata,
|
|
153
|
+
step_callback=self.step_callback,
|
|
154
|
+
summarized_files_db=self.summary_file_db,
|
|
155
|
+
code_structure_db=self.code_structure_db,
|
|
156
|
+
)
|
|
157
|
+
evaluation, files = evaluation_task.evaluate()
|
|
158
|
+
return evaluation, files
|
|
145
159
|
|
|
146
160
|
|
|
147
161
|
|
bioguider/rag/data_pipeline.py
CHANGED
|
@@ -91,7 +91,7 @@ def download_repo(repo_url: str, local_path: str, access_token: str = None):
|
|
|
91
91
|
logger.info(f"Cloning repository from {repo_url} to {local_path}")
|
|
92
92
|
# We use repo_url in the log to avoid exposing the token in logs
|
|
93
93
|
result = subprocess.run(
|
|
94
|
-
["git", "clone", clone_url, local_path],
|
|
94
|
+
["git", "clone", "--recurse-submodules", clone_url, local_path],
|
|
95
95
|
check=True,
|
|
96
96
|
stdout=subprocess.PIPE,
|
|
97
97
|
stderr=subprocess.PIPE,
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
import logging
|
|
3
3
|
|
|
4
|
+
from bioguider.utils.r_file_handler import RFileHandler
|
|
5
|
+
|
|
4
6
|
from .gitignore_checker import GitignoreChecker
|
|
5
|
-
from .
|
|
7
|
+
from .python_file_handler import PythonFileHandler
|
|
6
8
|
from ..database.code_structure_db import CodeStructureDb
|
|
7
9
|
|
|
8
10
|
logger = logging.getLogger(__name__)
|
|
@@ -10,22 +12,27 @@ logger = logging.getLogger(__name__)
|
|
|
10
12
|
class CodeStructureBuilder:
|
|
11
13
|
def __init__(
|
|
12
14
|
self,
|
|
13
|
-
repo_path: str,
|
|
14
|
-
gitignore_path: str,
|
|
15
|
+
repo_path: str | Path,
|
|
16
|
+
gitignore_path: str | Path,
|
|
15
17
|
code_structure_db: CodeStructureDb,
|
|
16
18
|
):
|
|
17
|
-
self.repo_path = repo_path
|
|
18
|
-
self.gitignore_checker = GitignoreChecker(repo_path, gitignore_path)
|
|
19
|
-
self.file_handler =
|
|
19
|
+
self.repo_path = str(repo_path)
|
|
20
|
+
self.gitignore_checker = GitignoreChecker(repo_path, str(gitignore_path))
|
|
21
|
+
self.file_handler = PythonFileHandler(repo_path)
|
|
20
22
|
self.code_structure_db = code_structure_db
|
|
21
23
|
|
|
22
24
|
def build_code_structure(self):
|
|
25
|
+
if self.code_structure_db.is_database_built():
|
|
26
|
+
return
|
|
23
27
|
files = self.gitignore_checker.check_files_and_folders()
|
|
24
28
|
for file in files:
|
|
25
|
-
if not file.endswith(".py"):
|
|
29
|
+
if not file.endswith(".py") and not file.endswith(".R"):
|
|
26
30
|
continue
|
|
27
31
|
logger.info(f"Building code structure for {file}")
|
|
28
|
-
|
|
32
|
+
if file.endswith(".py"):
|
|
33
|
+
file_handler = PythonFileHandler(Path(self.repo_path) / file)
|
|
34
|
+
else:
|
|
35
|
+
file_handler = RFileHandler(Path(self.repo_path) / file)
|
|
29
36
|
functions_and_classes = file_handler.get_functions_and_classes()
|
|
30
37
|
# fixme: currently, we don't extract reference graph for each function or class
|
|
31
38
|
for function_or_class in functions_and_classes:
|
bioguider/utils/constants.py
CHANGED
|
@@ -119,15 +119,15 @@ class DemoInstructionsResult(BaseModel):
|
|
|
119
119
|
expected_output_description: Optional[bool] = Field(description="A boolean value. Does it provide the description of expected output?")
|
|
120
120
|
|
|
121
121
|
class EvaluationSubmissionRequirementsResult(BaseModel):
|
|
122
|
-
compiled_standalone_software: bool
|
|
123
|
-
source_code: bool
|
|
124
|
-
demo_dataset: bool
|
|
125
|
-
run_on_data_instruction: bool
|
|
126
|
-
run_on_custom_instruction: bool
|
|
127
|
-
expected_output_description: bool
|
|
128
|
-
complete_readme: bool
|
|
129
|
-
software_dependency: bool
|
|
130
|
-
install_tutorial: bool
|
|
131
|
-
license: bool
|
|
132
|
-
hardware_requirements: bool
|
|
133
|
-
compatible_os: bool
|
|
122
|
+
compiled_standalone_software: bool | None
|
|
123
|
+
source_code: bool | None
|
|
124
|
+
demo_dataset: bool | None
|
|
125
|
+
run_on_data_instruction: bool | None
|
|
126
|
+
run_on_custom_instruction: bool | None
|
|
127
|
+
expected_output_description: bool | None
|
|
128
|
+
complete_readme: bool | None
|
|
129
|
+
software_dependency: bool | None
|
|
130
|
+
install_tutorial: bool | None
|
|
131
|
+
license: bool | None
|
|
132
|
+
hardware_requirements: bool | None
|
|
133
|
+
compatible_os: bool | None
|