bioguider 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of bioguider might be problematic.
- bioguider/agents/agent_utils.py +5 -3
- bioguider/agents/collection_execute_step.py +1 -1
- bioguider/agents/common_conversation.py +20 -2
- bioguider/agents/consistency_collection_execute_step.py +152 -0
- bioguider/agents/consistency_collection_observe_step.py +128 -0
- bioguider/agents/consistency_collection_plan_step.py +128 -0
- bioguider/agents/consistency_collection_task.py +109 -0
- bioguider/agents/consistency_collection_task_utils.py +137 -0
- bioguider/agents/evaluation_readme_task.py +29 -24
- bioguider/agents/evaluation_task.py +2 -2
- bioguider/agents/evaluation_userguide_prompts.py +162 -0
- bioguider/agents/evaluation_userguide_task.py +164 -0
- bioguider/agents/prompt_utils.py +11 -8
- bioguider/database/code_structure_db.py +489 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/change_planner.py +140 -0
- bioguider/generation/document_renderer.py +47 -0
- bioguider/generation/llm_cleaner.py +43 -0
- bioguider/generation/llm_content_generator.py +69 -0
- bioguider/generation/llm_injector.py +270 -0
- bioguider/generation/models.py +77 -0
- bioguider/generation/output_manager.py +54 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +151 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +136 -0
- bioguider/generation/test_metrics.py +104 -0
- bioguider/managers/evaluation_manager.py +24 -0
- bioguider/managers/generation_manager.py +160 -0
- bioguider/managers/generation_test_manager.py +74 -0
- bioguider/utils/code_structure_builder.py +42 -0
- bioguider/utils/file_handler.py +65 -0
- {bioguider-0.2.18.dist-info → bioguider-0.2.20.dist-info}/METADATA +1 -1
- {bioguider-0.2.18.dist-info → bioguider-0.2.20.dist-info}/RECORD +36 -11
- {bioguider-0.2.18.dist-info → bioguider-0.2.20.dist-info}/LICENSE +0 -0
- {bioguider-0.2.18.dist-info → bioguider-0.2.20.dist-info}/WHEEL +0 -0
bioguider/agents/consistency_collection_task_utils.py ADDED

@@ -0,0 +1,137 @@
+import os
+from pathlib import Path
+from typing import Callable, Optional, TypedDict
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai.chat_models.base import BaseChatOpenAI
+from langchain_core.messages import AIMessage
+from pydantic import BaseModel, Field
+import logging
+
+from bioguider.agents.agent_tools import agent_tool
+from bioguider.database.code_structure_db import CodeStructureDb
+
+logger = logging.getLogger(__name__)
+
+class ConsistencyCollectionWorkflowState(TypedDict):
+    user_guide_api_documentation: str
+    step_output_callback: Optional[Callable]
+    intermediate_steps: Optional[str]
+    step_output: Optional[str]
+    step_analysis: Optional[str]
+    step_thoughts: Optional[str]
+    plan_actions: Optional[list[dict]]
+
+    final_answer: Optional[str]
+    final_assembly_result: Optional[str]
+    step_count: Optional[int]
+
+class retrieve_method_definition_and_docstring_tool:
+    """ Retrieve the method definition and docstring.
+    If the method is a method of a class, you **must** put the class name as the parent name and better to put the file path as the file path of the class.
+    Args:
+        method_name str: the name of the method
+        class_name str: the name of the class that the method is in.
+        file_path str: the path of the file that the method is in. If not sure, just put "N/A"
+    Returns:
+        str: the method definition and docstring
+    """
+    def __init__(self, llm: BaseChatOpenAI, code_structure_db: CodeStructureDb):
+        self.llm = llm
+        self.code_structure_db = code_structure_db
+
+    def run(self, method_name: str, class_name: str, file_path: str) -> str:
+        if file_path != "N/A":
+            row = self.code_structure_db.select_by_name_and_parent_and_path(method_name, class_name, file_path)
+            if row is None:
+                return "Can't retrieve method definition and docstring"
+            return f"Method: {row['name']}\nDocstring: {row['doc_string']}\nParams: {row['params']}"
+        else:
+            rows = self.code_structure_db.select_by_name_and_parent(method_name, class_name)
+            if rows is None or len(rows) == 0:
+                return "Can't retrieve method definition and docstring"
+            return f"Method: {rows[0]['name']}\nDocstring: {rows[0]['doc_string']}\nParams: {rows[0]['params']}"
+
+class retrieve_function_definition_and_docstring_tool:
+    """ Retrieve the function definition and docstring
+    Args:
+        function_name str: the name of the function
+        file_path str: the path of the file that the function is in. If not sure, just put "N/A"
+    Returns:
+        str: the function definition and docstring
+    """
+    def __init__(
+        self,
+        llm: BaseChatOpenAI,
+        code_structure_db: CodeStructureDb,
+    ):
+        self.llm = llm
+        self.code_structure_db = code_structure_db
+
+    def run(self, function_name: str, file_path: str) -> str:
+        if file_path != "N/A":
+            row = self.code_structure_db.select_by_name_and_path(function_name, file_path)
+            if row is None:
+                return f"No such function {function_name}"
+            return f"Function: {row['name']}\nDocstring: {row['doc_string']}\nParams: {row['params']}"
+        else:
+            rows = self.code_structure_db.select_by_name(function_name)
+            if rows is None or len(rows) == 0:
+                return f"No such function {function_name}"
+            return f"Function: {rows[0]['name']}\nDocstring: {rows[0]['doc_string']}\nParams: {rows[0]['params']}"
+
+class retrieve_class_definition_and_docstring_tool:
+    """ Retrieve the class definition and docstring
+    Args:
+        class_name str: the name of the class
+        file_path str: the path of the file that the class is in. If not sure, just put "N/A"
+    Returns:
+        str: the class definition and docstring
+    """
+    def __init__(self, llm: BaseChatOpenAI, code_structure_db: CodeStructureDb):
+        self.llm = llm
+        self.code_structure_db = code_structure_db
+
+    def run(self, class_name: str, file_path: str) -> str:
+        if file_path != "N/A":
+            row = self.code_structure_db.select_by_name_and_path(class_name, file_path)
+            if row is None:
+                return f"No such class {class_name}"
+            return f"Class: {row['name']}\nDocstring: {row['doc_string']}\nParams: {row['params']}"
+        else:
+            rows = self.code_structure_db.select_by_name(class_name)
+            if rows is None or len(rows) == 0:
+                return f"No such class {class_name}"
+            return f"Class: {rows[0]['name']}\nDocstring: {rows[0]['doc_string']}\nParams: {rows[0]['params']}"
+
+class retrieve_class_and_method_definition_and_docstring_tool:
+    """ Retrieve the class and all methods definition and docstring
+    Args:
+        class_name str: the name of the class
+        file_path str: the path of the file that the class is in. If not sure, just put "N/A"
+    Returns:
+        str: the class and method definition and docstring
+    """
+    def __init__(self, llm: BaseChatOpenAI, code_structure_db: CodeStructureDb):
+        self.llm = llm
+        self.code_structure_db = code_structure_db
+
+    def run(self, class_name: str, file_path: str) -> str:
+        if file_path != "N/A":
+            row = self.code_structure_db.select_by_name_and_path(class_name, file_path)
+            if row is None:
+                return f"No such class {class_name}"
+        else:
+            rows = self.code_structure_db.select_by_name(class_name)
+            if rows is None or len(rows) == 0:
+                return f"No such class {class_name}"
+            row = rows[0]
+
+        parent_path = file_path if file_path is not None and file_path.lower() != "n/a" else row["path"]
+        methods = self.code_structure_db.select_by_parent(
+            class_name,
+            parent_path
+        )
+        method_definitions = []
+        for method in methods:
+            method_definitions.append(f"Method: {method['name']}\nDocstring: {method['doc_string']}\nParams: {method['params']}\n\n")
+        return f"Class: {row['name']}\nDocstring: {row['doc_string']}\nParams: {row['params']}\nMethods: {method_definitions}"
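All four tools above follow the same lookup pattern: an exact query when a concrete file path is supplied, otherwise a name-based search that falls back to the first match. A minimal, self-contained sketch of that pattern follows; the `DummyDb` class and its sample row are illustrative stand-ins, not part of bioguider:

```python
# Illustrative stand-in for CodeStructureDb; returns dict rows shaped like
# the ones the new tools read ('name', 'doc_string', 'params', ...).
class DummyDb:
    def __init__(self, rows):
        self.rows = rows

    def select_by_name_and_parent_and_path(self, name, parent, path):
        hits = [r for r in self.rows
                if (r["name"], r["parent"], r["path"]) == (name, parent, path)]
        return hits[0] if hits else None

    def select_by_name_and_parent(self, name, parent):
        return [r for r in self.rows if (r["name"], r["parent"]) == (name, parent)]

db = DummyDb([{"name": "run", "parent": "CollectionTask", "path": "tasks.py",
               "doc_string": "Collect user-guide files.", "params": "self"}])

# Mirrors retrieve_method_definition_and_docstring_tool.run with file_path="N/A":
rows = db.select_by_name_and_parent("run", "CollectionTask")
if not rows:
    print("Can't retrieve method definition and docstring")
else:
    row = rows[0]
    print(f"Method: {row['name']}\nDocstring: {row['doc_string']}\nParams: {row['params']}")
```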
bioguider/agents/evaluation_readme_task.py CHANGED

@@ -13,7 +13,10 @@ from bioguider.agents.agent_utils import (
     read_file, read_license_file,
     summarize_file
 )
-from bioguider.agents.common_agent_2step import
+from bioguider.agents.common_agent_2step import (
+    CommonAgentTwoChainSteps,
+    CommonAgentTwoSteps,
+)
 from bioguider.agents.evaluation_task import EvaluationTask
 from bioguider.utils.constants import (
     DEFAULT_TOKEN_USAGE,
@@ -168,6 +171,28 @@ You will be given:

 ---

+### **Output Format**
+Your output must **exactly match** the following format. Do not add or omit any sections.
+
+**FinalAnswer**
+**Available:**
+<Your assessment and suggestion here>
+**Readability:**
+<Your assessment and suggestion here>
+**Project Purpose:**
+<Your assessment and suggestion here>
+**Hardware and software spec and compatibility description:**
+<Your assessment and suggestion here>
+**Dependencies clearly stated:**
+<Your assessment and suggestion here>
+**License Information Included:**
+<Your assessment and suggestion here>
+**Code contributor / Author information included
+<Your assessment and suggestion here>
+**Overall Score:**
+<Your assessment and suggestion here>
+---
+
 ### **Instructions**
 1. Based on the provided structured evaluation and its reasoning process, generate a free evaluation of the README file.
 2. Focus on the explanation of assessment in structured evaluation and how to improve the README file based on the structured evaluation and its reasoning process.
@@ -175,8 +200,9 @@ You will be given:
 3. For each item in the structured evaluation, provide a detailed assessment followed by specific, actionable comments for improvement.
 4. Your improvement suggestions must also include the original text snippet and the improving comments.
 5. Your improvement suggestions must also include suggestions to improve readability.
-6. In each section output, please first give a detailed explanation of the assessment, and then provide the detailed suggestion for improvement. If you think the it is good enough, you can say so.
+6. In the **FinalAnswer** of output, in each section output, please first give a detailed explanation of the assessment, and then provide the detailed suggestion for improvement. If you think the it is good enough, you can say so.
 The following is an example of the output format:
+**FinalAnswer**
 **Available:**
 Detailed explanation of the assessment. Such as: The README file is present in the repository. The content of the file has been shared completely and is accessible. This confirms the availability of the README documentation for evaluation. There's no issue with availability.
 Detailed suggestion for improvement. Such as: Add a brief introductory section summarizing the project and its main purpose would help orient readers.
@@ -222,28 +248,7 @@ You will be given:
 - <original text snippet> - <improving comments>
 - ...
 - Break down long instructions into smaller bullet points.
----
-
-### **Output Format**
-Your output must **exactly match** the following format. Do not add or omit any sections.

-**FinalAnswer**
-**Available:**
-<Your detailed assessment and detailed suggestion here, including the original text snippet and the improving comments>
-**Readability:**
-<Your detailed assessment and detailed suggestion here, including the original text snippet and the improving comments>
-**Project Purpose:**
-<Your detailed assessment and detailed suggestion here, including the original text snippet and the improving comments>
-**Hardware and software spec and compatibility description:**
-<Your detailed assessment and detailed suggestion here, including the original text snippet and the improving comments>
-**Dependencies clearly stated:**
-<Your detailed assessment and detailed suggestion here, including the original text snippet and the improving comments>
-**License Information Included:**
-<Your detailed assessment and detailed suggestion here, including the original text snippet and the improving comments>
-**Code contributor / Author information included
-<Your detailed assessment and detailed suggestion here, including the original text snippet and the improving comments>
-**Overall Score:**
-<Your detailed assessment and detailed suggestion here, including the original text snippet and the improving comments>
 ---

 ### **Structured Evaluation and Reasoning Process**
@@ -492,7 +497,7 @@ class EvaluationREADMETask(EvaluationTask):
             readme_content=readme_content,
             structured_evaluation=structured_reasoning_process,
         )
-        agent =
+        agent = CommonAgentTwoSteps(llm=self.llm)
         response, _, token_usage, reasoning_process = agent.go(
             system_prompt=system_prompt,
             instruction_prompt=EVALUATION_INSTRUCTION,
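In 0.2.18 this line was the truncated statement `agent =`; 0.2.20 restores the construction of the two-step agent, whose `go(...)` call unpacks a four-tuple as the hunk shows. A stub sketch of that call shape; `StubTwoStepAgent` is hypothetical, and the real `CommonAgentTwoSteps` is not shown in this diff:

```python
# Hypothetical stub matching the call shape in the hunk above; the real
# CommonAgentTwoSteps implementation lives in common_agent_2step.
class StubTwoStepAgent:
    def __init__(self, llm=None):
        self.llm = llm

    def go(self, system_prompt: str, instruction_prompt: str, schema=None):
        # Returns (response, extra, token_usage, reasoning_process).
        return "free-text evaluation", None, {"total_tokens": 0}, "reasoning trace"

agent = StubTwoStepAgent(llm=None)
response, _, token_usage, reasoning_process = agent.go(
    system_prompt="...", instruction_prompt="...")
```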
bioguider/agents/evaluation_task.py CHANGED

@@ -184,7 +184,7 @@ class EvaluationTask(ABC):
             token_usage=token_usage,
         )

-    def evaluate(self) -> dict:
+    def evaluate(self) -> tuple[dict, list[str]]:
         self._enter_evaluation()
         files = self._collect_files()
         evaluations, token_usage, files = self._evaluate(files)

@@ -198,7 +198,7 @@ class EvaluationTask(ABC):
         self.print_step(token_usage=token_usage)

     @abstractmethod
-    def _evaluate(self, files: list[str]) -> tuple[dict, dict]:
+    def _evaluate(self, files: list[str]) -> tuple[dict, dict, list[str]]:
         pass

     @abstractmethod
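These two signature changes mean `_evaluate` implementations now return the evaluated file list as a third element, which `evaluate()` surfaces to the caller. A sketch of a conforming subclass under that contract; `EvaluationTaskLike` and `ReadmeTaskSketch` are hypothetical stand-ins for illustration:

```python
from abc import ABC, abstractmethod

class EvaluationTaskLike(ABC):
    """Illustrative stand-in for bioguider's EvaluationTask base class."""
    @abstractmethod
    def _evaluate(self, files: list[str]) -> tuple[dict, dict, list[str]]: ...

class ReadmeTaskSketch(EvaluationTaskLike):
    def _evaluate(self, files: list[str]) -> tuple[dict, dict, list[str]]:
        evaluations = {f: "placeholder evaluation" for f in files}
        token_usage = {"total_tokens": 0}
        # The evaluated files now travel back as the third tuple element.
        return evaluations, token_usage, files

evals, usage, evaluated_files = ReadmeTaskSketch()._evaluate(["README.md"])
print(evaluated_files)  # ['README.md']
```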
bioguider/agents/evaluation_userguide_prompts.py ADDED

@@ -0,0 +1,162 @@
+
+INDIVIDUAL_USERGUIDE_EVALUATION_SYSTEM_PROMPT = """
+You are an expert in evaluating the quality of user guide in software repositories.
+Your task is to analyze the provided files related to user guide and generate a structured quality assessment based on the following criteria.
+---
+
+### **Evaluation Criteria**
+
+1. **Readability**:
+   * **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
+   * **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
+   * **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
+   * **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
+   * **Assessment**: Based on these scores, evaluate the overall readability and technical complexity of the language used.
+
+2. **Arguments and Clarity**:
+   * **Assessment**: [Your evaluation of whether it provides a clear **description** of arguments and their usage]
+   * **Improvement Suggestions**:
+     * **Original text:** [Quote a specific line/section from the user guide.]
+     * **Improving comments:** [Provide your suggestions to improve clarity.]
+
+3. **Return Value and Clarity**:
+   * **Assessment**: [Your evaluation of whether it provides a clear **description** of return value and its meaning]
+   * **Improvement Suggestions**:
+     * **Original text:** [Quote a specific line/section from the user guide.]
+     * **Improving comments:** [Provide your suggestions to improve clarity.]
+
+4. **Context and Purpose**:
+   * **Assessment**: [Your evaluation of whether it provides a clear **description** of the context and purpose of the module]
+   * **Improvement Suggestions**:
+     * **Original text:** [Quote a specific line/section from the user guide.]
+     * **Improving comments:** [Provide your suggestions to improve clarity.]
+
+5. **Error Handling**:
+   * **Assessment**: [Your evaluation of whether it provides a clear **description** of error handling]
+   * **Improvement Suggestions**:
+     * **Original text:** [Quote a specific line/section from the user guide.]
+     * **Improving comments:** [Provide your suggestions to improve clarity.]
+
+6. **Usage Examples**:
+   * **Assessment**: [Your evaluation of whether it provides a clear **description** of usage examples]
+   * **Improvement Suggestions**:
+     * **Original text:** [Quote a specific line/section from the user guide.]
+     * **Improving comments:** [Provide your suggestions to improve clarity.]
+
+7. **Overall Score**: Give an overall quality rating of the User Guide information.
+   * Output: `Poor`, `Fair`, `Good`, or `Excellent`
+
+---
+
+### **Final Report Ouput**
+Your final report must **exactly match** the following format. Do not add or omit any sections.
+
+**FinalAnswer**
+* **Overall Score:** [Poor / Fair / Good / Excellent]
+* **Overall Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
+* **Overall Improvement Suggestions:**
+  - "Original text snippet 1" - Improving comment 1
+  - "Original text snippet 2" - Improving comment 2
+  - ...
+* **Readability Analysis Score:** [Poor / Fair / Good / Excellent]
+* **Readability Analysis Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
+* **Readability Analysis Improvement Suggestions:**
+  - "Original text snippet 1" - Improving comment 1
+  - "Original text snippet 2" - Improving comment 2
+  - ...
+* **Arguments and Clarity Score:** [Poor / Fair / Good / Excellent]
+* **Arguments and Clarity Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
+* **Arguments and Clarity Improvement Suggestions:**
+  - "Original text snippet 1" - Improving comment 1
+  - "Original text snippet 2" - Improving comment 2
+  - ...
+* **Return Value and Clarity Score:** [Poor / Fair / Good / Excellent]
+* **Return Value and Clarity Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
+* **Return Value and Clarity Improvement Suggestions:**
+  - "Original text snippet 1" - Improving comment 1
+  - "Original text snippet 2" - Improving comment 2
+  - ...
+* **Context and Purpose Score:** [Poor / Fair / Good / Excellent]
+* **Context and Purpose Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
+* **Context and Purpose Improvement Suggestions:**
+  - "Original text snippet 1" - Improving comment 1
+  - "Original text snippet 2" - Improving comment 2
+  - ...
+* **Error Handling Score:** [Poor / Fair / Good / Excellent]
+* **Error Handling Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
+* **Error Handling Improvement Suggestions:**
+  - "Original text snippet 1" - Improving comment 1
+  - "Original text snippet 2" - Improving comment 2
+  - ...
+* **Usage Examples Score:** [Poor / Fair / Good / Excellent]
+* **Usage Examples Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
+* **Usage Examples Improvement Suggestions:**
+  - "Original text snippet 1" - Improving comment 1
+  - "Original text snippet 2" - Improving comment 2
+  - ...
+...
+
+---
+
+### **User Guide Content:**
+{userguide_content}
+
+---
+
+"""
+
+CONSISTENCY_EVAL_SYSTEM_PROMPT = """
+You are an expert in evaluating the consistency of user guide in software repositories.
+Your task is to analyze both:
+1. the provided file related to user guide/API documentation,
+2. the code definitions related to the user guide/API documentation
+and generate a structured consistency assessment based on the following criteria.
+
+---
+
+### **Evaluation Criteria**
+
+**Consistency**:
+* **Score**: [Poor / Fair / Good / Excellent]
+* **Assessment**: [Your evaluation of whether the user guide/API documentation is consistent with the code definitions]
+* **Development**: [A list of inconsistent function/class/method name and inconsistent docstring]
+* **Strengths**: [A list of strengths of the user guide/API documentation on consistency]
+
+### **Output Format**
+Your output **must exactly match** the following format:
+```
+**Consistency**:
+* **Score**: [Poor / Fair / Good / Excellent]
+* **Assessment**: [Your evaluation of whether the user guide/API documentation is consistent with the code definitions]
+* **Development**: [A list of inconsistent function/class/method name and inconsistent docstring]
+* **Strengths**: [A list of strengths of the user guide/API documentation on consistency]
+```
+
+### **Output Example**
+
+```
+**Consistency**:
+* **Assessment**: [Your evaluation of whether the user guide/API documentation is consistent with the code definitions]
+* **Development**:
+  - Inconsistent function/class/method name 1
+  - Inconsistent docstring 1
+  - Inconsistent function/class/method name 2
+  - Inconsistent docstring 2
+  - ...
+* **Strengths**:
+  - Strengths 1
+  - Strengths 2
+  - ...
+```
+
+---
+
+### **Input User Guide/API Documentation**
+{user_guide_api_documentation}
+
+### **Code Definitions**
+{code_definitions}
+
+---
+
+"""
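Both prompt constants are plain templates with `{placeholder}` fields; the task module below fills them via `ChatPromptTemplate.from_template(...).format(...)`. A minimal sketch of the same substitution, using a shortened stand-in template rather than the full prompt (the snippet text is illustrative):

```python
# Shortened stand-in for CONSISTENCY_EVAL_SYSTEM_PROMPT; the real template is
# the full string above, formatted the same way in evaluation_userguide_task.py.
TEMPLATE = (
    "### **Input User Guide/API Documentation**\n"
    "{user_guide_api_documentation}\n\n"
    "### **Code Definitions**\n"
    "{code_definitions}\n"
)

prompt = TEMPLATE.format(
    user_guide_api_documentation="`collect(x)` returns a list of file paths.",
    code_definitions="Function: collect\nDocstring: Collect files.\nParams: x",
)
print(prompt)
```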
bioguider/agents/evaluation_userguide_task.py ADDED

@@ -0,0 +1,164 @@
+
+import os
+from pathlib import Path
+import logging
+from langchain.prompts import ChatPromptTemplate
+from markdownify import markdownify as md
+from pydantic import BaseModel, Field
+
+from bioguider.agents.agent_utils import read_file
+from bioguider.agents.collection_task import CollectionTask
+from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION, CollectionGoalItemEnum
+from bioguider.utils.constants import (
+    DEFAULT_TOKEN_USAGE,
+    ProjectMetadata,
+    StructuredEvaluationInstallationResult,
+    FreeEvaluationInstallationResult,
+    EvaluationInstallationResult,
+)
+from bioguider.rag.data_pipeline import count_tokens
+from .common_agent_2step import CommonAgentTwoSteps, CommonAgentTwoChainSteps
+from ..utils.pyphen_utils import PyphenReadability
+
+from .evaluation_task import EvaluationTask
+from .agent_utils import read_file
+from bioguider.utils.utils import increase_token_usage
+from .evaluation_userguide_prompts import CONSISTENCY_EVAL_SYSTEM_PROMPT, INDIVIDUAL_USERGUIDE_EVALUATION_SYSTEM_PROMPT
+from .consistency_collection_task import ConsistencyCollectionTask
+
+class ConsistencyEvaluationResult(BaseModel):
+    consistency_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
+    consistency_assessment: str=Field(description="Your evaluation of whether the user guide/API documentation is consistent with the code definitions")
+    consistency_development: list[str]=Field(description="A list of inconsistent function/class/method name and inconsistent docstring")
+    consistency_strengths: list[str]=Field(description="A list of strengths of the user guide/API documentation on consistency")
+
+class UserGuideEvaluationResult(BaseModel):
+    overall_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
+    overall_key_strengths: str=Field(description="A string value, the key strengths of the user guide")
+    overall_improvement_suggestions: str=Field(description="Suggestions to improve the overall score if necessary")
+    readability_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
+    readability_suggestions: str=Field(description="Suggestions to improve readability if necessary")
+    context_and_purpose_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
+    context_and_purpose_suggestions: str=Field(description="Suggestions to improve context and purpose if necessary")
+    error_handling_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
+    error_handling_suggestions: str=Field(description="Suggestions to improve error handling if necessary")
+
+class IndividualUserGuideEvaluationResult(BaseModel):
+    user_guide_evaluation: UserGuideEvaluationResult | None=Field(description="The evaluation result of the user guide")
+    consistency_evaluation: ConsistencyEvaluationResult | None=Field(description="The evaluation result of the consistency of the user guide")
+
+logger = logging.getLogger(__name__)
+
+class EvaluationUserGuideTask(EvaluationTask):
+    def __init__(
+        self,
+        llm,
+        repo_path,
+        gitignore_path,
+        meta_data = None,
+        step_callback = None,
+        summarized_files_db = None,
+        code_structure_db = None,
+    ):
+        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback, summarized_files_db)
+        self.evaluation_name = "User Guide Evaluation"
+        self.code_structure_db = code_structure_db
+
+    def _collect_files(self):
+        task = CollectionTask(
+            llm=self.llm,
+            step_callback=self.step_callback,
+            summarized_files_db=self.summarized_files_db,
+        )
+        task.compile(
+            repo_path=self.repo_path,
+            gitignore_path=Path(self.repo_path, ".gitignore"),
+            goal_item=CollectionGoalItemEnum.UserGuide.name,
+        )
+        files = task.collect()
+        return files
+
+    def _evaluate_consistency(self, file: str) -> tuple[EvaluationInstallationResult | None, dict, list[str]]:
+        consistency_collect_task = ConsistencyCollectionTask(
+            llm=self.llm,
+            code_structure_db=self.code_structure_db,
+            step_callback=self.step_callback,
+        )
+        consistency_collect_task.compile(repo_path=self.repo_path, gitignore_path=Path(self.repo_path, ".gitignore"))
+        with open(Path(self.repo_path, file), "r") as f:
+            user_guide_api_documentation = f.read()
+        res, code_definitions = consistency_collect_task.collect(user_guide_api_documentation)
+
+        if not res:
+            # No sufficient information to evaluate the consistency of the user guide/API documentation
+            return None, {**DEFAULT_TOKEN_USAGE}
+
+        system_prompt = ChatPromptTemplate.from_template(
+            CONSISTENCY_EVAL_SYSTEM_PROMPT
+        ).format(
+            user_guide_api_documentation=user_guide_api_documentation,
+            code_definitions=code_definitions,
+        )
+        agent = CommonAgentTwoSteps(llm=self.llm)
+        res, _, token_usage, reasoning_process = agent.go(
+            system_prompt=system_prompt,
+            instruction_prompt="Now, let's begin the consistency evaluation step.",
+            schema=ConsistencyEvaluationResult,
+        )
+        res: ConsistencyEvaluationResult = res
+        self.print_step(step_output=f"Consistency Evaluation Result: {res}")
+        self.print_step(step_output=f"Consistency Evaluation Reasoning Process: {reasoning_process}")
+        self.print_step(token_usage=token_usage)
+
+        return res, token_usage
+
+    def _evaluate_individual_userguide(self, file: str) -> tuple[IndividualUserGuideEvaluationResult | None, dict]:
+        content = read_file(Path(self.repo_path, file))
+
+        if content is None:
+            logger.error(f"Error in reading file {file}")
+            return None, {**DEFAULT_TOKEN_USAGE}
+
+        readability = PyphenReadability()
+        flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, \
+            _, _, _, _, _ = readability.readability_metrics(content)
+        system_prompt = ChatPromptTemplate.from_template(
+            INDIVIDUAL_USERGUIDE_EVALUATION_SYSTEM_PROMPT
+        ).format(
+            flesch_reading_ease=flesch_reading_ease,
+            flesch_kincaid_grade=flesch_kincaid_grade,
+            gunning_fog_index=gunning_fog_index,
+            smog_index=smog_index,
+            userguide_content=content,
+        )
+        agent = CommonAgentTwoSteps(llm=self.llm)
+        res, _, token_usage, reasoning_process = agent.go(
+            system_prompt=system_prompt,
+            instruction_prompt="Now, let's begin the user guide/API documentation evaluation.",
+            schema=UserGuideEvaluationResult,
+        )
+        res: UserGuideEvaluationResult = res
+
+        consistency_evaluation_result, _temp_token_usage = self._evaluate_consistency(file)
+        if consistency_evaluation_result is None:
+            # No sufficient information to evaluate the consistency of the user guide/API documentation
+            consistency_evaluation_result = ConsistencyEvaluationResult(
+                consistency_score="N/A",
+                consistency_assessment="No sufficient information to evaluate the consistency of the user guide/API documentation",
+                consistency_development=[],
+                consistency_strengths=[],
+            )
+        return IndividualUserGuideEvaluationResult(
+            user_guide_evaluation=res,
+            consistency_evaluation=consistency_evaluation_result,
+        ), token_usage
+
+    def _evaluate(self, files: list[str] | None = None) -> tuple[dict[str, IndividualUserGuideEvaluationResult] | None, dict, list[str]]:
+        total_token_usage = {**DEFAULT_TOKEN_USAGE}
+        user_guide_evaluation_results = {}
+        for file in files:
+            user_guide_evaluation_result, token_usage = self._evaluate_individual_userguide(file)
+            total_token_usage = increase_token_usage(total_token_usage, token_usage)
+            user_guide_evaluation_results[file] = user_guide_evaluation_result
+
+        return user_guide_evaluation_results, total_token_usage, files
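The `_evaluate` loop above accumulates per-file token usage through `increase_token_usage`, whose implementation is not part of this diff. A plausible sketch, assuming it simply sums the counters in the usage dict:

```python
# Assumed shape of DEFAULT_TOKEN_USAGE and increase_token_usage; neither
# definition appears in this diff, so this is an illustrative reconstruction.
DEFAULT_TOKEN_USAGE = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}

def increase_token_usage(total: dict, usage: dict) -> dict:
    # Sum each counter, tolerating missing keys in either dict.
    return {k: total.get(k, 0) + usage.get(k, 0) for k in total}

total = {**DEFAULT_TOKEN_USAGE}
for usage in ({"prompt_tokens": 120, "completion_tokens": 40, "total_tokens": 160},
              {"prompt_tokens": 90, "completion_tokens": 30, "total_tokens": 120}):
    total = increase_token_usage(total, usage)
print(total)  # {'prompt_tokens': 210, 'completion_tokens': 70, 'total_tokens': 280}
```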
bioguider/agents/prompt_utils.py CHANGED

@@ -104,19 +104,22 @@ COLLECTION_PROMPTS = {
         "goal_item": "User Guide",
         "related_file_description": """A document qualifies as a **User Guide** if it includes **at least one** of the following elements.
 If **any one** of these is present, the document should be classified as a User Guide — full coverage is **not required**:
--
--
--
--
-
-
-
-
+- Document **functions, methods, or classes**
+- Describe **input parameters, return values**, and **usage syntax**
+- Include **technical guidance** for using specific APIs
+- Are often found in folders such as
+  * `man/` (for `.Rd` files in R)
+  * `docs/reference/`, `docs/api/`, `docs/dev/` (for Python) or similar
+  * Standalone files with names like `api.md`, `reference.md`, `user_guide.md`
+**Do not** classify the document as a User Guide if it primarily serves as a Tutorial or Example. Such documents typically include:
 - Sample Datasets: Example data used to illustrate functionality.
 - Narrative Explanations: Story-like descriptions guiding the user through examples.
 - Code Walkthroughs: Detailed explanations of code snippets in a tutorial format.
 **Do not** classify the document as a User Guide if it is souce code or a script (*.py, *.R) that is not intended for end-user interaction.
 - You can include directory names if all files in the directory are relevant to the goal item.""",
+        "plan_important_instructions": """ - **Do not** classify the document as a User Guide if it is source code or a script (*.py, *.R) that is not intended for end-user interaction.
+ - **Do not** classify the document as a User Guide if it is a notebook (*.ipynb, *.Rmd) that is not intended for end-user interaction.
+ - You plan **must not** include any source code or script (*.py, *.R) or notebook (*.ipynb, *.Rmd) that is not intended for end-user interaction."""
     },
     "Tutorial": {
         "goal_item": "Tutorials & Vignettes",