bioguider 0.2.10__tar.gz → 0.2.11__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of bioguider might be problematic.
- {bioguider-0.2.10 → bioguider-0.2.11}/PKG-INFO +1 -1
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/agent_task.py +1 -1
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/agent_utils.py +31 -1
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/collection_task.py +12 -24
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/evaluation_installation_task.py +101 -12
- bioguider-0.2.11/bioguider/agents/evaluation_readme_task.py +473 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/evaluation_task.py +4 -109
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/identification_task.py +37 -25
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/prompt_utils.py +3 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/managers/evaluation_manager.py +1 -1
- {bioguider-0.2.10 → bioguider-0.2.11}/pyproject.toml +1 -1
- {bioguider-0.2.10 → bioguider-0.2.11}/LICENSE +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/README.md +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/__init__.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/__init__.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/agent_tools.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/collection_execute_step.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/collection_observe_step.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/collection_plan_step.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/collection_task_utils.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/common_agent.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/common_agent_2step.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/common_step.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/dockergeneration_execute_step.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/dockergeneration_observe_step.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/dockergeneration_plan_step.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/dockergeneration_task.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/dockergeneration_task_utils.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/identification_execute_step.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/identification_observe_step.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/identification_plan_step.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/identification_task_utils.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/peo_common_step.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/python_ast_repl_tool.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/agents/rag_collection_task.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/conversation.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/database/summarized_file_db.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/rag/__init__.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/rag/config.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/rag/data_pipeline.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/rag/embedder.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/rag/rag.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/settings.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/utils/constants.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/utils/default.gitignore +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/utils/file_utils.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/utils/gitignore_checker.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/utils/pyphen_utils.py +0 -0
- {bioguider-0.2.10 → bioguider-0.2.11}/bioguider/utils/utils.py +0 -0
bioguider/agents/agent_utils.py:

```diff
@@ -1,5 +1,6 @@
 
 import json
+from json import JSONDecodeError
 import os
 import re
 import subprocess
```
bioguider/agents/agent_utils.py (continued):

```diff
@@ -376,4 +377,33 @@ def escape_braces(text: str) -> str:
     text = re.sub(r'(?<!})}(?!})', '}}', text)
     # Then replace single { not part of {{
     text = re.sub(r'(?<!{){(?!{)', '{{', text)
-    return text
+    return text
+
+def try_parse_json_object(json_obj: str) -> dict | None:
+    json_obj = json_obj.strip()
+
+    # First, try to parse
+    try:
+        obj = json.loads(json_obj)
+        return obj
+    except JSONDecodeError as e:
+        logger.error(e)
+
+    # Second, let's handle some common errors
+    if not json_obj.startswith("{") and not json_obj.endswith("}") and ":" in json_obj:
+        json_obj = "{" + json_obj + "}"
+    if json_obj.startswith("{{"):
+        json_obj = json_obj[1:]
+    if json_obj.endswith("}}"):
+        json_obj = json_obj[:-1]
+
+    # Finally, let's try to parse again
+    try:
+        obj = json.loads(json_obj)
+        return obj
+    except JSONDecodeError as e:
+        logger.error(e)
+        return None
+    except Exception as e:
+        logger.error(e)
+        return None
```
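The helper added here first tries a strict `json.loads`, then repairs two failure modes that are common in LLM output (a missing outer brace pair, and doubled braces left over from `{{...}}` prompt-template escaping) before retrying. A minimal sketch of the resulting behavior, assuming bioguider is installed; the payloads are invented:

```python
from bioguider.agents.agent_utils import try_parse_json_object

# Well-formed JSON parses on the first attempt.
print(try_parse_json_object('{"final_answer": ["README.md"]}'))

# A missing outer brace pair is repaired before the retry.
print(try_parse_json_object('"final_answer": ["README.md"]'))

# Doubled braces from {{...}} prompt escaping are trimmed.
print(try_parse_json_object('{{"final_answer": ["README.md"]}}'))

# Unrecoverable input returns None instead of raising.
print(try_parse_json_object('not json at all'))
```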
bioguider/agents/collection_task.py:

```diff
@@ -24,7 +24,7 @@ from langgraph.graph import StateGraph, START, END
 
 from bioguider.database.summarized_file_db import SummarizedFilesDb
 from bioguider.utils.file_utils import get_file_type
-from bioguider.agents.agent_utils import read_directory
+from bioguider.agents.agent_utils import read_directory, try_parse_json_object
 from bioguider.agents.collection_task_utils import (
     RELATED_FILE_GOAL_ITEM,
     CollectionWorkflowState,
```
bioguider/agents/collection_task.py (continued):

```diff
@@ -172,28 +172,16 @@ class CollectionTask(AgentTask):
         if s["final_answer"] is None:
             return None
         result = s["final_answer"].strip()
-        try:
-            json_obj = json.loads(result)
-            result = json_obj["final_answer"]
-            if isinstance(result, str):
-                result = result.strip()
-                return [result]
-            elif isinstance(result, list):
-                return result
-            else:
-                logger.error(f"Final answer is not a valid JSON list or string: {result}")
-                return None
-        except json.JSONDecodeError:
+        the_obj = try_parse_json_object(result)
+        if the_obj is None or "final_answer" not in the_obj:
             logger.error(f"Final answer is not a valid JSON: {result}")
             return None
-
-
-
-
-
-
-
-
-
-
-
+        final_result = the_obj["final_answer"]
+        if isinstance(final_result, str):
+            final_result = final_result.strip()
+            return [final_result]
+        elif isinstance(final_result, list):
+            return final_result
+        else:
+            logger.error(f"Final answer is not a valid JSON list or string: {result}")
+            return None
```
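The rewritten parser above delegates the error-tolerant parsing to `try_parse_json_object` and then normalizes the answer shape: a bare string becomes a one-element list, a list passes through unchanged, and anything else is rejected. A small sketch of that contract, with an invented payload:

```python
from bioguider.agents.agent_utils import try_parse_json_object

raw = '{"final_answer": "docs/INSTALL.md"}'
the_obj = try_parse_json_object(raw)
answer = the_obj["final_answer"]
# A string answer is wrapped in a list; a list answer is returned as-is.
files = [answer.strip()] if isinstance(answer, str) else answer
print(files)  # ['docs/INSTALL.md']
```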
bioguider/agents/evaluation_installation_task.py:

```diff
@@ -9,6 +9,7 @@ from pydantic import BaseModel, Field
 from markdownify import markdownify as md
 
 from bioguider.agents.agent_utils import read_file
+from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION
 from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
 from bioguider.rag.data_pipeline import count_tokens
 from .common_agent_2step import CommonAgentTwoSteps, CommonAgentTwoChainSteps
```
bioguider/agents/evaluation_installation_task.py (continued):

```diff
@@ -16,11 +17,52 @@ from .common_agent import CommonConversation
 from ..utils.pyphen_utils import PyphenReadability
 from ..utils.gitignore_checker import GitignoreChecker
 from .evaluation_task import EvaluationTask
-from .agent_utils import read_file
+from .agent_utils import increase_token_usage, read_file
 
 
 logger = logging.getLogger(__name__)
 
+STRUCTURED_EVALUATION_INSTALLATION_SYSTEM_PROMPT = """
+You are an expert in evaluating the quality of installation information in software repositories.
+Your task is to analyze the provided files related to installation and generate a structured quality assessment based on the following criteria.
+---
+
+### **Evaluation Criteria**
+
+1. **Installation Available**: Is the installation documents accessible and present?
+* Output: `Yes` or `No`
+
+2. **Installation Tutorial**: Is the installation tutorial provided?
+* Ouput: `Yes` or `No`
+
+3. **Number of required Dependencies Installation**: The number of dependencies that are required to install
+* Output: Number
+* Suggest specific improvements if necessary, such as missing dependencies
+
+4. **Overall Score**: Give an overall quality rating of the Installation information.
+* Output: `Poor`, `Fair`, `Good`, or `Excellent`
+
+---
+
+### **Final Report Ouput**
+Your final report must **exactly match** the following format. Do not add or omit any sections.
+
+**FinalAnswer**
+**Install Available:** [Yes / No]
+**Install Tutorial:** [Yes / No]
+**Dependency:**
+* number: [Number]
+* suggestions: <suggestion to improve **dependency information** like missing dependencies
+**Overall Score:** [Poor / Fair / Good / Excellent]
+
+---
+
+### Installation Files Provided:
+{installation_files_content}
+
+"""
+
+
 EVALUATION_INSTALLATION_SYSTEM_PROMPT = """
 You are an expert in evaluating the quality of **installation instructions** in software repositories.
 Your task is to analyze the provided content of installation-related files and generate a **comprehensive, structured quality report**.
```
bioguider/agents/evaluation_installation_task.py (continued):

```diff
@@ -62,10 +104,17 @@ Your response **must exactly follow** the structure below:
 ---
 
 ### Installation Files Provided:
-{
+{installation_files_content}
 
 """
 
+class StructuredEvaluationInstallationResult(BaseModel):
+    install_available: Optional[bool]=Field(description="A boolean value. Is the installation documents accessible and present?")
+    install_tutorial: Optional[bool]=Field(description="A boolean value. Is the installation tutorial provided?")
+    dependency_number: Optional[int]=Field(description="A number. It is the number of dependencies that are required to install.")
+    dependency_suggestions: Optional[str]=Field(description="A string value. It is the specific improvements if necessary, such as missing dependencies")
+    overall_score: Optional[str]=Field(description="A overall scroll for the installation quality, could be `Poor`, `Fair`, `Good`, or `Excellent`")
+
 class EvaluationInstallationResult(BaseModel):
     ease_of_access: Optional[str]=Field(description="Is the installation information easy to access")
     score: Optional[str]=Field(description="An overall score, could be Poor, Fair, Good or Excellent")
```
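The new `StructuredEvaluationInstallationResult` model gives the installation evaluation a machine-readable counterpart to the free-form report. What a populated instance might look like, with invented field values:

```python
from bioguider.agents.evaluation_installation_task import (
    StructuredEvaluationInstallationResult,
)

result = StructuredEvaluationInstallationResult(
    install_available=True,
    install_tutorial=True,
    dependency_number=4,
    dependency_suggestions="List optional extras such as GPU wheels explicitly.",
    overall_score="Good",
)
print(result)
```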
bioguider/agents/evaluation_installation_task.py (continued):

```diff
@@ -118,10 +167,10 @@ class EvaluationInstallationTask(EvaluationTask):
         super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback)
         self.evaluation_name = "Installation Evaluation"
 
-
+
+    def _collect_install_files_content(self, files: list[str] | None=None) -> str:
         if files is None or len(files) == 0:
-            return
-
+            return "N/A"
         files_content = ""
         MAX_TOKENS = os.environ.get("OPENAI_MAX_INPUT_TOKENS", 102400)
         for f in files:
```
bioguider/agents/evaluation_installation_task.py (continued):

```diff
@@ -137,24 +186,64 @@ class EvaluationInstallationTask(EvaluationTask):
 {content}
 
 """
+        return files_content
+
+    def _structured_evaluate(self, files: list[str] | None = None) -> tuple[dict|None, dict]:
+        if files is None or len(files) == 0:
+            return None, {**DEFAULT_TOKEN_USAGE}
+
+        files_content = self._collect_install_files_content(files)
+        system_prompt = ChatPromptTemplate.from_template(
+            STRUCTURED_EVALUATION_INSTALLATION_SYSTEM_PROMPT,
+        ).format(
+            installation_files_content=files_content,
+        )
+        agent = CommonAgentTwoChainSteps(llm=self.llm)
+        res, _, token_usage, reasoning_process = agent.go(
+            system_prompt=system_prompt,
+            instruction_prompt=EVALUATION_INSTRUCTION,
+            schema=StructuredEvaluationInstallationResult,
+        )
+        self.print_step(step_output=reasoning_process)
+        self.print_step(token_usage=token_usage)
+
+        return {
+            "structured_evaluation": res,
+            "structured_reasoning_process": reasoning_process,
+        }, token_usage
+
+    def _free_evaluate(self, files: list[str] | None=None) -> tuple[dict|None, dict]:
+        if files is None or len(files) == 0:
+            return None, {**DEFAULT_TOKEN_USAGE}
+
+        files_content = self._collect_install_files_content(files)
         system_prompt = ChatPromptTemplate.from_template(EVALUATION_INSTALLATION_SYSTEM_PROMPT).format(
-
+            installation_files_content=files_content
         )
         agent = CommonAgentTwoChainSteps(llm=self.llm)
         res, _, token_usage, reasoning_process = agent.go(
             system_prompt=system_prompt,
-            instruction_prompt=
+            instruction_prompt=EVALUATION_INSTRUCTION,
             schema=EvaluationInstallationResultSchema,
         )
         res = EvaluationInstallationResult(**res)
         self.print_step(step_output=reasoning_process)
+        self.print_step(token_usage=token_usage)
         evaluation = {
-            "
-            "ease_of_access": res.ease_of_access,
-            "hardware_requirements": res.hardware_requirements,
-            "clarity_of_dependency": res.clarity_of_dependency,
-            "installation_guide": res.installation_guide,
+            "evaluation": res,
             "reasoning_process": reasoning_process,
         }
         return evaluation, token_usage
+
+    def _evaluate(self, files: list[str] | None = None) -> tuple[dict | None, dict]:
+        evaluation, token_usage = self._free_evaluate(files)
+        structured_evaluation, structured_token_usage = self._structured_evaluate(files)
+
+        combined_evaluation = {
+            **evaluation,
+            **structured_evaluation,
+        }
+        total_token_usage = increase_token_usage(token_usage, structured_token_usage)
+
+        return combined_evaluation, total_token_usage
 
```
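The new `_evaluate` runs both passes and merges their result dictionaries, so one call now returns the free-form report alongside the structured assessment, with token usage summed via `increase_token_usage`. A hedged driver sketch, assuming an OpenAI-backed chat model and a locally cloned repository; the model name and paths are illustrative:

```python
from langchain_openai import ChatOpenAI
from bioguider.agents.evaluation_installation_task import EvaluationInstallationTask

task = EvaluationInstallationTask(
    llm=ChatOpenAI(model="gpt-4o"),
    repo_path="/tmp/example-repo",
    gitignore_path="/tmp/example-repo/.gitignore",
)
evaluation, token_usage = task._evaluate(["INSTALL.md"])
# The merged dict carries "evaluation" / "reasoning_process" from the free
# pass plus "structured_evaluation" / "structured_reasoning_process".
print(sorted(evaluation.keys()), token_usage)
```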
bioguider/agents/evaluation_readme_task.py (new file):

```diff
@@ -0,0 +1,473 @@
+
+import logging
+from pathlib import Path
+from typing import Callable, Optional
+from langchain.prompts import ChatPromptTemplate
+from langchain_openai.chat_models.base import BaseChatOpenAI
+from pydantic import BaseModel, Field
+
+from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION
+
+from ..utils.pyphen_utils import PyphenReadability
+from bioguider.agents.agent_utils import increase_token_usage, read_file, summarize_file
+from bioguider.agents.common_agent_2step import CommonAgentTwoChainSteps
+from bioguider.agents.evaluation_task import EvaluationTask
+from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
+
+logger = logging.getLogger(__name__)
+
+STRUCTURED_EVALUATION_README_SYSTEM_PROMPT = """
+You are an expert in evaluating the quality of README files in software repositories.
+Your task is to analyze the provided README file and generate a structured quality assessment based on the following criteria.
+If a LICENSE file is present in the repository, its content will also be provided to support your evaluation of license-related criteria.
+---
+
+### **Evaluation Criteria**
+
+1. **Available**: Is the README accessible and present?
+* Output: `Yes` or `No`
+
+2. **Readability**: Evaluate based on readability metrics such as Flesch-Kincaid Grade Level, SMOG Index, etc.
+* Output: `Poor`, `Fair`, `Good`, or `Excellent`
+* Suggest specific improvements if necessary
+
+3. **Project Purpose**: Is the project's goal or function clearly stated?
+* Output: `Yes` or `No`
+* Provide suggestions if unclear
+
+4. **Hardware and Software Requirements**: Are hardware/software specs and compatibility details included?
+* Output: `Poor`, `Fair`, `Good`, or `Excellent`
+* Suggest how to improve the section if needed
+
+5. **Dependencies**: Are all necessary software libraries and dependencies clearly listed?
+* Output: `Poor`, `Fair`, `Good`, or `Excellent`
+* Suggest improvements if applicable
+
+6. **License Information**: Is license type clearly indicated?
+* Output: `Yes` or `No`
+* Suggest improvement if missing or unclear
+
+7. **Author / Contributor Info**: Are contributor or maintainer details provided?
+* Output: `Yes` or `No`
+* Suggest improvement if missing
+
+8. **Overall Score**: Give an overall quality rating of the README.
+* Output: `Poor`, `Fair`, `Good`, or `Excellent`
+
+---
+
+### **Readability Metrics**
+* **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
+* **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
+* **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
+* **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
+
+---
+
+### **Final Report Ouput**
+Your final report must **exactly match** the following format. Do not add or omit any sections.
+
+**FinalAnswer**
+**Available:** [Yes / No]
+**Readability:**
+* score: [Poor / Fair / Good / Excellent]
+* suggestions: <suggestions to improve README readability>
+**Project Purpose:**
+* score: [Yes / No]
+* suggestions: <suggestions to improve project purpose.>
+**Hardware and software spec and compatibility description:**
+* score: [Poor / Fair / Good / Excellent]
+* suggestions: <suggestions to improve **hardware and software** description>
+**Dependencies clearly stated:**
+* score: [Poor / Fair / Good / Excellent]
+* suggestions: <suggestions to improve **Dependencies** description>
+**License Information Included:**
+* score: [Yes / No]
+* suggestions: <suggestions to improve **License Information**>
+**Overall Score:** [Poor / Fair / Good / Excellent]
+
+---
+
+### **README Path**
+{readme_path}
+
+---
+
+### **README content**
+{readme_content}
+
+---
+
+### **LICENSE Path**
+{license_path}
+
+---
+
+### **LICENSE Summarized Content**
+{license_summarized_content}
+
+"""
+
+EVALUATION_README_SYSTEM_PROMPT = """
+You are an expert in evaluating the quality of README files in software repositories.
+Your task is to analyze the provided README file and generate a comprehensive quality report.
+
+---
+
+### **Step 1: Identify README type
+
+First, determine whether the provided README is a **project-level README** (typically at the root of a repository) or a **folder-level README** (typically inside subdirectories).
+
+---
+
+### **Evaluation Criteria**
+
+#### If the README is a **project-level** file, evaluate it using the following criteria.
+
+For each criterion below, provide a brief assessment followed by specific, actionable comments for improvement.
+
+**1. Project Clarity & Purpose**
+* **Assessment**: [Your evaluation of whether the project's purpose is clear.]
+* **Improvement Suggestions**:
+* **Original text:** [Quote a specific line/section from the README.]
+* **Improving comments:** [Provide your suggestions to improve clarity.]
+* **Original text:** [Quote a specific line/section from the README.]
+* **Improving comments:** [Provide your suggestions to improve clarity.]
+...
+
+**2. Installation Instructions**
+* **Assessment**: [Your evaluation of the installation instructions.]
+* **Improvement Suggestions**:
+* **Original text:** [Quote text related to installation.]
+* **Improving comments:** [Provide your suggestions.]
+* **Original text:** [Quote text related to installation.]
+* **Improving comments:** [Provide your suggestions.]
+...
+
+**3. Usage Instructions**
+* **Assessment**: [Your evaluation of the usage instructions.]
+* **Improvement Suggestions**:
+* **Original text:** [Quote text related to usage.]
+* **Improving comments:** [Provide your suggestions.]
+* **Original text:** [Quote text related to usage.]
+* **Improving comments:** [Provide your suggestions.]
+...
+
+**4. Contributing Guidelines**
+* **Assessment**: [Your evaluation of the contributing guidelines.]
+* **Improvement Suggestions**:
+* **Original text:** [Quote text related to contributions.]
+* **Improving comments:** [Provide your suggestions.]
+* **Original text:** [Quote text related to contributions.]
+* **Improving comments:** [Provide your suggestions.]
+...
+
+**5. License Information**
+* **Assessment**: [Your evaluation of the license information.]
+* **Improvement Suggestions**:
+* **Original text:** [Quote text related to the license.]
+* **Improving comments:** [Provide your suggestions.]
+* **Original text:** [Quote text related to the license.]
+* **Improving comments:** [Provide your suggestions.]
+...
+
+**6. Readability Analysis**
+* **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
+* **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
+* **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
+* **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
+* **Assessment**: Based on these scores, evaluate the overall readability and technical complexity of the language used.
+
+---
+
+#### If if is a **folder-level** file, use the following criteria instead.
+
+For each criterion below, provide a brief assessment followed by specific, actionable comments for improvement.
+
+**1. Folder Description**
+* **Assessment**: [Your evaluation of whether it Provides a clear **description** of what the folder contains (e.g., modules, scripts, data).]
+* **Improvement Suggestions**:
+* **Original text:** [Quote a specific line/section from the README.]
+* **Improving comments:** [Provide your suggestions to improve clarity.]
+
+**2. Folder Purpose**
+* **Assessment**: [Your evaluation of whether it explains the **purpose** or **role** of the components inside this subfolder.]
+* **Improvement Suggestions**:
+* **Original text:** [Quote text related to purpose.]
+* **Improving comments:** [Provide your suggestions.]
+
+**3. Usage**
+* **Assessment**: [Your evaluation of whether it includes **usage instructions** specific to this folder (e.g., commands, import paths, input/output files).]
+* **Improvement Suggestions**:
+* **Original text:** [Quote text related to usage.]
+* **Improving comments:** [Provide your suggestions.]
+
+**4. Readability Analysis**
+* **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
+* **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
+* **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
+* **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
+* **Assessment**: Based on these scores, evaluate the overall readability and technical complexity of the language used.
+
+---
+
+### Final Report Format
+
+#### Your output **must exactly match** the following template:
+
+**FinalAnswer**
+
+* Project-Level README: Yes / No
+* **Score:** [Poor / Fair / Good / Excellent]
+* **Key Strengths**: <brief summary of the README's strongest points in 2-3 sentences>
+* **Overall Improvement Suggestions:**
+- "Original text snippet 1" - Improving comment 1
+- "Original text snippet 2" - Improving comment 2
+- ...
+
+#### Notes
+
+* **Project-Level README**: "Yes" if root-level; "No" if folder-level.
+* **Score**: Overall quality rating, could be Poor / Fair / Good / Excellent.
+* **Key Strengths**: Briefly highlight the README's strongest aspects.
+* **Improvement Suggestions**: Provide concrete snippets and suggested improvements.
+
+
+---
+
+### **README path:**
+{readme_path}
+
+---
+
+### **README Content:**
+{readme_content}
+"""
+
+
+class StructuredEvaluationREADMEResult(BaseModel):
+    available_score: Optional[bool]=Field(description="A boolean value, Is the README accessible and present?")
+    readability_score: Optional[str]=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
+    readability_suggestions: Optional[str]=Field(description="Suggestions to improve readability if necessary")
+    project_purpose_score: Optional[bool]=Field(description="A boolean value. Is the project's goal or function clearly stated?")
+    project_purpose_suggestions: Optional[str]=Field(description="Suggestions if not clear")
+    hardware_and_software_spec_score: Optional[str]=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
+    hardware_and_software_spec_suggestions: Optional[str]=Field(description="Suggestions if not clear")
+    dependency_score: Optional[str]=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
+    dependency_suggestions: Optional[str]=Field(description="Suggestions if dependencies are not clearly stated")
+    license_score: Optional[bool]=Field(description="A boolean value, Are contributor or maintainer details provided?")
+    license_suggestions: Optional[str]=Field(description="Suggestions to improve license information")
+    overall_score: str=Field(description="A overall scroll for the README quality, could be `Poor`, `Fair`, `Good`, or `Excellent`")
+
+class EvaluationREADMEResult(BaseModel):
+    project_level: Optional[bool]=Field(description="A boolean value specifying if the README file is **project-level** README. TRUE: project-level, FALSE, folder-level")
+    score: Optional[str]=Field(description="An overall score")
+    key_strengths: Optional[str]=Field(description="A string specifying the key strengths of README file.")
+    overall_improvement_suggestions: Optional[list[str]]=Field(description="A list of overall improvement suggestions")
+
+EvaluationREADMEResultSchema = {
+    "title": "EvaluationREADMEResult",
+    "type": "object",
+    "properties": {
+        "project_level": {
+            "anyOf": [{"type": "boolean"}, {"type": "null"}],
+            "description": "A boolean value specifying if the README file is **project-level** README. TRUE: project-level, FALSE: folder-level.",
+            "title": "Project Level"
+        },
+        "score": {
+            "anyOf": [{"type": "string"}, {"type": "null"}],
+            "description": "An overall score",
+            "title": "Score"
+        },
+        "key_strengths": {
+            "anyOf": [{"type": "string"}, {"type": "null"}],
+            "description": "A string specifying the key strengths of README file.",
+            "title": "Key Strengths",
+        },
+        "overall_improvement_suggestions": {
+            "anyOf": [{"items": {"type": "string"}, "type": "array"}, {"type": "null"}],
+            "description": "A list of improvement suggestions",
+            "title": "Overall Improvement Suggestions"
+        }
+    },
+    "required": ["project_level", "score", "key_strengths", "overall_improvement_suggestions"]
+}
+
+class EvaluationREADMETask(EvaluationTask):
+    def __init__(
+        self,
+        llm: BaseChatOpenAI,
+        repo_path: str,
+        gitignore_path: str,
+        meta_data: ProjectMetadata | None = None,
+        step_callback: Callable | None = None
+    ):
+        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback)
+        self.evaluation_name = "README Evaluation"
+
+    def _structured_evaluate(self, free_readme_evaluations: dict[str, dict]):
+        """ Evaluate README in structure:
+        available: bool
+        readability: score and suggestion
+        project purpose: bool, suggestion
+        hardware and software spec and compatibility description: score and suggestion
+        dependencies clearly stated: score and suggestion
+        license information included: bool and suggestion
+        Code contributor / author information included: bool and suggestion
+        overall score:
+        """
+        total_token_usage = {**DEFAULT_TOKEN_USAGE}
+        if free_readme_evaluations is None:
+            return None, total_token_usage
+
+        license_path = "LICENSE"
+        license_content = read_file(Path(self.repo_path, license_path))
+        license_summarized_content = summarize_file(
+            llm=self.llm,
+            name=license_path,
+            content=license_content,
+            level=6,
+            summary_instructions="What license is the repository using?",
+        ) if license_content is not None else "N/A"
+        license_path = license_path if license_content is not None else "N/A"
+        structured_readme_evaluations = {}
+        for readme_file in free_readme_evaluations.keys():
+            evaluation = free_readme_evaluations[readme_file]["evaluation"]
+            if not evaluation["project_level"]:
+                continue
+            full_path = Path(self.repo_path, readme_file)
+            readme_content = read_file(full_path)
+            if readme_content is None:
+                logger.error(f"Error in reading file {readme_file}")
+                continue
+            if len(readme_content.strip()) == 0:
+                structured_readme_evaluations[readme_file] = {
+                    "structured_evaluation": StructuredEvaluationREADMEResult(
+                        available_score=False,
+                        readability_score="Poor",
+                        readability_suggestions="No readability provided",
+                        project_purpose_score=False,
+                        project_purpose_suggestions="No project purpose provided",
+                        hardware_and_software_spec_score="Poor",
+                        hardware_and_software_spec_suggestions="No hardware and software spec provided",
+                        dependency_score="Poor",
+                        dependency_suggestions="No dependency provided",
+                        license_score=False,
+                        license_suggestions="No license information",
+                        overall_score="Poor",
+                    ),
+                    "structured_reasoning_process": f"{readme_file} is an empty file.",
+                }
+                continue
+            readability = PyphenReadability()
+            flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, \
+                _, _, _, _, _ = readability.readability_metrics(readme_content)
+            system_prompt = ChatPromptTemplate.from_template(
+                STRUCTURED_EVALUATION_README_SYSTEM_PROMPT
+            ).format(
+                readme_path=readme_file,
+                readme_content=readme_content,
+                license_path=license_path,
+                license_summarized_content=license_summarized_content,
+                flesch_reading_ease=flesch_reading_ease,
+                flesch_kincaid_grade=flesch_kincaid_grade,
+                gunning_fog_index=gunning_fog_index,
+                smog_index=smog_index,
+            )
+            agent = CommonAgentTwoChainSteps(llm=self.llm)
+            response, _, token_usage, reasoning_process = agent.go(
+                system_prompt=system_prompt,
+                instruction_prompt=EVALUATION_INSTRUCTION,
+                schema=StructuredEvaluationREADMEResult,
+            )
+            self.print_step(step_output=f"README: {readme_file} structured evaluation")
+            self.print_step(step_output=reasoning_process)
+            structured_readme_evaluations[readme_file] = {
+                "structured_evaluation": response,
+                "structured_reasoning_process": reasoning_process,
+            }
+            total_token_usage = increase_token_usage(total_token_usage, token_usage)
+
+        return structured_readme_evaluations, total_token_usage
+
+
+    def _free_evaluate(self, files: list[str]):
+        readme_files = files
+        if readme_files is None or len(readme_files) == 0:
+            return None, {**DEFAULT_TOKEN_USAGE}
+
+        readme_evaluations = {}
+        total_token_usage = {**DEFAULT_TOKEN_USAGE}
+        for readme_file in readme_files:
+            readme_path = Path(self.repo_path, readme_file)
+            readme_content = read_file(readme_path)
+            if readme_content is None:
+                logger.error(f"Error in reading file {readme_file}")
+                continue
+            if len(readme_content.strip()) == 0:
+                readme_evaluations[readme_file] = {
+                    "evaluation": {
+                        "project_level": not "/" in readme_file,
+                        "score": "Poor",
+                        "key_strengths": f"{readme_file} is an empty file.",
+                        "overall_improvement_suggestions": f"{readme_file} is an empty file.",
+                    },
+                    "reasoning_process": f"{readme_file} is an empty file.",
+                }
+                continue
+
+            readability = PyphenReadability()
+            flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, \
+                _, _, _, _, _ = readability.readability_metrics(readme_content)
+            system_prompt = ChatPromptTemplate.from_template(
+                EVALUATION_README_SYSTEM_PROMPT
+            ).format(
+                readme_content=readme_content,
+                readme_path=readme_file,
+                flesch_reading_ease=flesch_reading_ease,
+                flesch_kincaid_grade=flesch_kincaid_grade,
+                gunning_fog_index=gunning_fog_index,
+                smog_index=smog_index,
+            )
+            # conversation = CommonConversation(llm=self.llm)
+            agent = CommonAgentTwoChainSteps(llm=self.llm)
+            response, _, token_usage, reasoning_process = agent.go(
+                system_prompt=system_prompt,
+                instruction_prompt=EVALUATION_INSTRUCTION,
+                schema=EvaluationREADMEResultSchema,
+            )
+            response = EvaluationREADMEResult(**response)
+            self.print_step(step_output=f"README: {readme_file} free evaluation")
+            self.print_step(step_output=reasoning_process)
+            readme_evaluations[readme_file] = {
+                "evaluation": {
+                    "project_level": response.project_level,
+                    "score": response.score,
+                    "key_strengths": response.key_strengths,
+                    "overall_improvement_suggestions": response.overall_improvement_suggestions,
+                },
+                "reasoning_process": reasoning_process
+            }
+            total_token_usage = increase_token_usage(total_token_usage, token_usage)
+        return readme_evaluations, total_token_usage
+
+    def _evaluate(self, files: list[str]) -> tuple[dict, dict]:
+        free_readme_evaluations, free_token_usage = self._free_evaluate(files)
+        structured_readme_evaluations, structured_token_usage = self._structured_evaluate(free_readme_evaluations)
+
+        # combine result
+        combined_evaluations = {}
+        for f in files:
+            if not f in structured_readme_evaluations:
+                combined_evaluations = {**free_readme_evaluations[f]}
+            else:
+                combined_evaluations[f] = {
+                    **free_readme_evaluations[f],
+                    **structured_readme_evaluations[f],
+                }
+
+        total_token_usage = increase_token_usage(free_token_usage, structured_token_usage)
+
+        return combined_evaluations, total_token_usage
+
+
```
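Both README prompts in the new file are parameterized with the same four readability scores, unpacked from `PyphenReadability.readability_metrics`, which (per the code above) returns nine values of which only the first four are kept. A minimal sketch, assuming bioguider is installed; the sample text is made up:

```python
from bioguider.utils.pyphen_utils import PyphenReadability

readability = PyphenReadability()
flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, \
    _, _, _, _, _ = readability.readability_metrics(
        "Install the package with pip, then run the demo notebook."
    )
print(flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index)
```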
bioguider/agents/evaluation_task.py:

```diff
@@ -2,18 +2,16 @@
 import os
 from pathlib import Path
 import logging
-from typing import Callable
+from typing import Callable
 from abc import ABC, abstractmethod
 from langchain.prompts import ChatPromptTemplate
 from langchain_openai.chat_models.base import BaseChatOpenAI
-from pydantic import BaseModel, Field
 
 from bioguider.agents.agent_utils import read_file
+from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION
 from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
-from .common_agent_2step import CommonAgentTwoSteps, CommonAgentTwoChainSteps
 from .common_agent import CommonConversation
 from ..utils.pyphen_utils import PyphenReadability
-from ..utils.gitignore_checker import GitignoreChecker
 
 logger = logging.getLogger(__name__)
 
```
bioguider/agents/evaluation_task.py (continued):

```diff
@@ -198,110 +196,7 @@ class EvaluationTask(ABC):
     def _evaluate(self, files: list[str]) -> tuple[dict, dict]:
         pass
 
-class EvaluationREADMEResult(BaseModel):
-    project_level: Optional[bool]=Field(description="A boolean value specifying if the README file is **project-level** README. TRUE: project-level, FALSE, folder-level")
-    score: Optional[str]=Field(description="An overall score")
-    key_strengths: Optional[str]=Field(description="A string specifying the key strengths of README file.")
-    overall_improvement_suggestions: Optional[list[str]]=Field(description="A list of overall improvement suggestions")
-
-EvaluationREADMEResultSchema = {
-    "title": "EvaluationREADMEResult",
-    "type": "object",
-    "properties": {
-        "project_level": {
-            "anyOf": [{"type": "boolean"}, {"type": "null"}],
-            "description": "A boolean value specifying if the README file is **project-level** README. TRUE: project-level, FALSE: folder-level.",
-            "title": "Project Level"
-        },
-        "score": {
-            "anyOf": [{"type": "string"}, {"type": "null"}],
-            "description": "An overall score",
-            "title": "Score"
-        },
-        "key_strengths": {
-            "anyOf": [{"type": "string"}, {"type": "null"}],
-            "description": "A string specifying the key strengths of README file.",
-            "title": "Key Strengths",
-        },
-        "overall_improvement_suggestions": {
-            "anyOf": [{"items": {"type": "string"}, "type": "array"}, {"type": "null"}],
-            "description": "A list of improvement suggestions",
-            "title": "Overall Improvement Suggestions"
-        }
-    },
-    "required": ["project_level", "score", "key_strengths", "overall_improvement_suggestions"]
-}
-
-class EvaluationREADMETask(EvaluationTask):
-    def __init__(
-        self,
-        llm: BaseChatOpenAI,
-        repo_path: str,
-        gitignore_path: str,
-        meta_data: ProjectMetadata | None = None,
-        step_callback: Callable | None = None
-    ):
-        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback)
-        self.evaluation_name = "README Evaluation"
-
-    def _evaluate(self, files: list[str]) -> tuple[dict, dict]:
-        readme_files = files
-        if readme_files is None or len(readme_files) == 0:
-            return None
-
-        readme_evaluations = {}
-        for readme_file in readme_files:
-            readme_path = Path(self.repo_path, readme_file)
-            readme_content = read_file(readme_path)
-            if readme_content is None:
-                logger.error(f"Error in reading file {readme_file}")
-                continue
-            if len(readme_content.strip()) == 0:
-                readme_evaluations[readme_file] = {
-                    "evaluation": {
-                        "project_level": "/" in readme_file,
-                        "score": "Poor",
-                        "key_strengths": f"{readme_file} is an empty file.",
-                        "overall_improvement_suggestions": f"{readme_file} is an empty file.",
-                    },
-                    "reasoning_process": f"{readme_file} is an empty file.",
-                }
-                continue
-
-            readability = PyphenReadability()
-            flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, \
-                _, _, _, _, _ = readability.readability_metrics(readme_content)
-            system_prompt = ChatPromptTemplate.from_template(
-                EVALUATION_README_SYSTEM_PROMPT
-            ).format(
-                readme_content=readme_content,
-                readme_path=readme_file,
-                flesch_reading_ease=flesch_reading_ease,
-                flesch_kincaid_grade=flesch_kincaid_grade,
-                gunning_fog_index=gunning_fog_index,
-                smog_index=smog_index,
-            )
-            # conversation = CommonConversation(llm=self.llm)
-            agent = CommonAgentTwoChainSteps(llm=self.llm)
-            response, _, token_usage, reasoning_process = agent.go(
-                system_prompt=system_prompt,
-                instruction_prompt="Before arriving at the conclusion, clearly explain your reasoning step by step. Now, let's begin the evaluation.",
-                schema=EvaluationREADMEResultSchema,
-            )
-            response = EvaluationREADMEResult(**response)
-            self.print_step(step_output=f"README: {readme_file}")
-            self.print_step(step_output=reasoning_process)
-            readme_evaluations[readme_file] = {
-                "evaluation": {
-                    "project_level": response.project_level,
-                    "score": response.score,
-                    "key_strengths": response.key_strengths,
-                    "overall_improvement_suggestions": response.overall_improvement_suggestions,
-                },
-                "reasoning_process": reasoning_process
-            }
-        return readme_evaluations, token_usage
-
+
 EVALUATION_TUTORIAL_SYSTEM_PROMPT="""
 You are an expert in software documentation and developer education.
 You are given the content of a tutorial file from a GitHub repository. Your task is to **critically evaluate** the quality of this tutorial based on best practices in technical writing and developer onboarding.
```
bioguider/agents/evaluation_task.py (continued):

```diff
@@ -399,7 +294,7 @@ class EvaluationTutorialTask(EvaluationTask):
         conversation = CommonConversation(llm=self.llm)
         response, token_usage = conversation.generate(
             system_prompt=system_prompt,
-            instruction_prompt=
+            instruction_prompt=EVALUATION_INSTRUCTION,
         )
         self.print_step(step_output=f"Tutorial: {file}")
         self.print_step(step_output=response)
```
bioguider/agents/identification_task.py:

```diff
@@ -18,6 +18,7 @@ from bioguider.agents.agent_tools import (
 )
 from bioguider.agents.agent_utils import (
     read_directory,
+    try_parse_json_object,
 )
 from bioguider.agents.identification_execute_step import IdentificationExecuteStep
 from bioguider.agents.identification_observe_step import IdentificationObserveStep
```
bioguider/agents/identification_task.py (continued):

```diff
@@ -189,13 +190,18 @@ class IdentificationTask(AgentTask):
 
 
     def _parse_project_type(self, proj_type_obj: str) -> ProjectTypeEnum:
-
-
-
-
-
-
-
+        proj_type_obj = proj_type_obj.strip()
+        the_obj = try_parse_json_object(proj_type_obj)
+        if not the_obj is None and "project_type" in the_obj:
+            proj_type = the_obj["project_type"]
+        elif proj_type_obj in [
+            ProjectTypeEnum.application.value,
+            ProjectTypeEnum.package.value,
+            ProjectTypeEnum.pipeline.value
+        ]:
+            return ProjectTypeEnum(proj_type_obj)
+        else:
+            proj_type = "unknown"
         if proj_type == "application":
             return ProjectTypeEnum.application
         elif proj_type == "package":
```
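The hardened `_parse_project_type` accepts either a JSON object carrying a `project_type` key or a bare enum value, and only then falls back to `unknown`. A self-contained sketch of that fallback pattern; the enum below is a stand-in for bioguider's own `ProjectTypeEnum`:

```python
from enum import Enum

from bioguider.agents.agent_utils import try_parse_json_object

class ProjectType(Enum):  # stand-in for bioguider's ProjectTypeEnum
    application = "application"
    package = "package"
    pipeline = "pipeline"
    unknown = "unknown"

def parse_project_type(raw: str) -> ProjectType:
    raw = raw.strip()
    the_obj = try_parse_json_object(raw)
    if the_obj is not None and "project_type" in the_obj:
        raw = str(the_obj["project_type"]).strip()
    try:
        return ProjectType(raw)
    except ValueError:
        return ProjectType.unknown

print(parse_project_type('{"project_type": "package"}'))  # ProjectType.package
print(parse_project_type("pipeline"))                     # ProjectType.pipeline
print(parse_project_type("garbage"))                      # ProjectType.unknown
```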
bioguider/agents/identification_task.py (continued):

```diff
@@ -206,12 +212,19 @@ class IdentificationTask(AgentTask):
         return ProjectTypeEnum.unknown
 
     def _parse_primary_language(self, language_obj: str) -> PrimaryLanguageEnum:
-        try
-
-
-
-
-
+        # try to handle some common errors
+        language_obj = language_obj.strip()
+        the_obj = try_parse_json_object(language_obj)
+        if not the_obj is None and "primary_language" in the_obj:
+            language = the_obj["primary_language"]
+        elif language_obj in [
+            PrimaryLanguageEnum.python.value,
+            PrimaryLanguageEnum.R.value,
+        ]:
+            return PrimaryLanguageEnum(language_obj)
+        else:
+            language = "unknown"
+
         language = language.strip()
         if language == "python":
             return PrimaryLanguageEnum.python
```
bioguider/agents/identification_task.py (continued):

```diff
@@ -221,15 +234,14 @@ class IdentificationTask(AgentTask):
         return PrimaryLanguageEnum.unknown
 
     def _parse_meta_data(self, meta_data_obj: str) -> dict:
-
-
-
-
-
-
-
-
-
-
-        }
+        meta_data_obj = meta_data_obj.strip()
+        the_obj = try_parse_json_object(meta_data_obj)
+
+        return the_obj if the_obj is not None else {
+            "name": "unknown",
+            "description": "unknown",
+            "license": "unknown",
+            "owner": "unknown",
+        }
+
+
```
bioguider/agents/prompt_utils.py:

```diff
@@ -82,6 +82,7 @@ IDENTIFICATION_GOAL_META_DATA = """Identify the following meta data of the repos
 """
 
 COT_USER_INSTRUCTION = "Do not give the answer immediately. First, explain your reasoning process step by step, then provide the answer."
+EVALUATION_INSTRUCTION="Before arriving at the conclusion, clearly explain your reasoning step by step. Now, let's begin the evaluation."
 
 class CollectionGoalItemEnum(Enum):
     UserGuide = "User Guide"
```
bioguider/agents/prompt_utils.py (continued):

```diff
@@ -188,3 +189,5 @@ If **any one** of these is present, the document should be classified as Contrib
     },
 }
 
+
+
```
bioguider/managers/evaluation_manager.py:

```diff
@@ -9,7 +9,7 @@ from ..agents.identification_task import IdentificationTask
 from ..rag.rag import RAG
 from ..utils.file_utils import parse_repo_url
 from ..database.summarized_file_db import SummarizedFilesDb
-from ..agents.
+from ..agents.evaluation_readme_task import EvaluationREADMETask
 from ..agents.evaluation_installation_task import EvaluationInstallationTask
 from ..agents.collection_task import CollectionTask
 
```
All remaining files are unchanged between 0.2.10 and 0.2.11.