bioguider 0.2.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +92 -0
- bioguider/agents/agent_tools.py +176 -0
- bioguider/agents/agent_utils.py +504 -0
- bioguider/agents/collection_execute_step.py +182 -0
- bioguider/agents/collection_observe_step.py +125 -0
- bioguider/agents/collection_plan_step.py +156 -0
- bioguider/agents/collection_task.py +184 -0
- bioguider/agents/collection_task_utils.py +142 -0
- bioguider/agents/common_agent.py +137 -0
- bioguider/agents/common_agent_2step.py +215 -0
- bioguider/agents/common_conversation.py +61 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +110 -0
- bioguider/agents/consistency_query_step.py +77 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +154 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_installation_task.py +270 -0
- bioguider/agents/evaluation_readme_task.py +767 -0
- bioguider/agents/evaluation_submission_requirements_task.py +172 -0
- bioguider/agents/evaluation_task.py +206 -0
- bioguider/agents/evaluation_tutorial_task.py +169 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
- bioguider/agents/evaluation_userguide_prompts.py +179 -0
- bioguider/agents/evaluation_userguide_task.py +154 -0
- bioguider/agents/evaluation_utils.py +127 -0
- bioguider/agents/identification_execute_step.py +181 -0
- bioguider/agents/identification_observe_step.py +104 -0
- bioguider/agents/identification_plan_step.py +140 -0
- bioguider/agents/identification_task.py +270 -0
- bioguider/agents/identification_task_utils.py +22 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +253 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/code_structure_db.py +500 -0
- bioguider/database/summarized_file_db.py +146 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/benchmark_metrics.py +610 -0
- bioguider/generation/change_planner.py +189 -0
- bioguider/generation/document_renderer.py +157 -0
- bioguider/generation/llm_cleaner.py +67 -0
- bioguider/generation/llm_content_generator.py +1128 -0
- bioguider/generation/llm_injector.py +809 -0
- bioguider/generation/models.py +85 -0
- bioguider/generation/output_manager.py +74 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +166 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +436 -0
- bioguider/generation/test_metrics.py +189 -0
- bioguider/managers/benchmark_manager.py +785 -0
- bioguider/managers/evaluation_manager.py +215 -0
- bioguider/managers/generation_manager.py +686 -0
- bioguider/managers/generation_test_manager.py +107 -0
- bioguider/managers/generation_test_manager_v2.py +525 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +651 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +138 -0
- bioguider/settings.py +103 -0
- bioguider/utils/code_structure_builder.py +59 -0
- bioguider/utils/constants.py +135 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +215 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +551 -0
- bioguider/utils/utils.py +163 -0
- bioguider-0.2.52.dist-info/LICENSE +21 -0
- bioguider-0.2.52.dist-info/METADATA +51 -0
- bioguider-0.2.52.dist-info/RECORD +84 -0
- bioguider-0.2.52.dist-info/WHEEL +4 -0

bioguider/agents/evaluation_submission_requirements_task.py

@@ -0,0 +1,172 @@

from bioguider.agents.agent_utils import try_parse_json_object, try_parse_with_llm
from bioguider.agents.evaluation_task import EvaluationTask
from bioguider.agents.collection_task import CollectionTask
from bioguider.agents.identification_task import IdentificationTask
from bioguider.agents.prompt_utils import CollectionGoalItemEnum
from bioguider.utils.constants import (
    DEFAULT_TOKEN_USAGE,
    EvaluationInstallationResult,
    EvaluationREADMEResult,
    SoftwarePackageContentResult,
    DemoInstructionsResult,
    EvaluationSubmissionRequirementsResult,
)

DEMO_INSTRUCTION_GOAL = """
1. Identify if it provides the instructions to run on provided data
2. Identify if it provides the instructions to run on custom data
3. Identify if it provides the expected output
"""

DEMO_INSTRUCTION_FINAL_ANSWER = \
    '{{"run_on_data_instruction": <True or False>, "run_on_custom_instruction": <True or False>, "expected_output_description": <True Or False>}}'

class EvaluationSubmissionRequirementsTask(EvaluationTask):
    def __init__(
        self,
        llm,
        repo_path,
        gitignore_path,
        meta_data = None,
        step_callback = None,
        summarized_files_db = None,
        readme_files_evaluation: dict[str, EvaluationREADMEResult] | None = None,
        installation_evaluation: EvaluationInstallationResult | None = None,
        installation_files: list[str] | None = None
    ):
        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback, summarized_files_db)
        self.evaluation_name = "Submission Requirements Evaluation"
        self.readme_files_evaluation = readme_files_evaluation
        self.installation_evaluation = installation_evaluation
        self.installation_files = installation_files

    def _collect_software_package_content(self):
        collection_task = CollectionTask(
            llm = self.llm,
            step_callback=self.step_callback,
            summarize_instruction="We are collecting compiled standalone files, source code files and example data files.",
            summarized_files_db=self.summarized_files_db,
        )
        collection_task.compile(
            repo_path=self.repo_path,
            gitignore_path=self.gitignore_path,
            db=self.summarized_files_db,
            goal_item=CollectionGoalItemEnum.SoftwarePackageContent.name,
        )
        files = collection_task.collect()

        return files

    def _evaluate_software_package_content(self) -> tuple[SoftwarePackageContentResult, list[str]]:
        files = self._collect_software_package_content()
        if len(files) == 3:
            return SoftwarePackageContentResult(
                compiled_standalone_software=files[0].strip().lower() != "n/a",
                source_code=files[1].strip().lower() != "n/a",
                demo_dataset=files[2].strip().lower() != "n/a",
            ), files
        else:
            return SoftwarePackageContentResult(
                compiled_standalone_software=False,
                source_code=False,
                demo_dataset=False,
            ), files

    def _evaluatie_demo_instructions(self) -> tuple[DemoInstructionsResult | None, list[str]]:
        readme_files = [f for f in self.readme_files_evaluation.keys() \
            if self.readme_files_evaluation[f].project_level]
        installation_files = self.installation_files if self.installation_files is not None else []
        provided_files = readme_files + installation_files
        provided_files = provided_files if len(provided_files) > 0 else None
        identify_task = IdentificationTask(
            llm=self.llm,
            step_callback=self.step_callback,
            summarized_files_db=self.summarized_files_db,
            provided_files=provided_files
        )
        identify_task.compile(
            repo_path=self.repo_path,
            gitignore_path=self.gitignore_path,
        )
        final_answer = identify_task.identify_customize_goal(
            goal="demo instructions",
            final_answer_example=DEMO_INSTRUCTION_FINAL_ANSWER,
        )
        final_answer = final_answer["final_answer"] \
            if final_answer is not None and "final_answer" in final_answer else final_answer
        parsed_obj = self._parse_demo_instruction_result(final_answer)
        return parsed_obj, provided_files

    def _parse_demo_instruction_result(self, result: str | dict) -> DemoInstructionsResult:
        parsed_obj = None
        if isinstance(result, dict):
            parsed_obj = result
        else:
            parsed_obj = try_parse_json_object(result)
            if parsed_obj is None:
                parsed_obj, token_usage = try_parse_with_llm(
                    llm=self.llm,
                    input_text=result,
                    schema=DemoInstructionsResult,
                )
                parsed_obj = vars(parsed_obj) if parsed_obj is not None else parsed_obj
                self.print_step(token_usage=token_usage)
        self.print_step(step_output=str(parsed_obj))

        return DemoInstructionsResult(
            run_on_data_instruction = parsed_obj["run_on_data_instruction"] \
                if "run_on_data_instruction" in parsed_obj else False,
            run_on_custom_instruction = parsed_obj["run_on_custom_instruction"] \
                if "run_on_custom_instruction" in parsed_obj else False,
            expected_output_description = parsed_obj["expected_output_description"] \
                if "expected_output_description" in parsed_obj else False,
        )

    def _combine_evaluation(
        self,
        software_evaluation: SoftwarePackageContentResult,
        demo_evaluation: DemoInstructionsResult,
    ) -> EvaluationSubmissionRequirementsResult:
        readme_files = [f for f in self.readme_files_evaluation.keys() \
            if self.readme_files_evaluation[f].project_level]
        structured_install_evaluation = self.installation_evaluation.structured_evaluation
        software_dependency = structured_install_evaluation.dependency_number > 0
        install_tutorial = structured_install_evaluation.install_tutorial
        hardware_requirements = structured_install_evaluation.hardware_requirements
        compatible_os = structured_install_evaluation.compatible_os
        license = any([
            self.readme_files_evaluation[f].structured_evaluation.license_score \
            if self.readme_files_evaluation[f].structured_evaluation is not None \
            else False for f in readme_files
        ])
        return EvaluationSubmissionRequirementsResult(
            compiled_standalone_software=software_evaluation.compiled_standalone_software,
            source_code=software_evaluation.source_code,
            demo_dataset=software_evaluation.demo_dataset,
            run_on_data_instruction=demo_evaluation.run_on_data_instruction,
            run_on_custom_instruction=demo_evaluation.run_on_custom_instruction,
            expected_output_description=demo_evaluation.expected_output_description,
            complete_readme=len(readme_files) > 0,
            software_dependency=software_dependency,
            install_tutorial=install_tutorial,
            license=license,
            hardware_requirements=hardware_requirements,
            compatible_os=compatible_os,
        )

    def _evaluate(self, files):
        software_evaluation, software_files = self._evaluate_software_package_content()
        demo_evaluation, demo_files = self._evaluatie_demo_instructions()
        files = list(set(software_files + demo_files))

        return self._combine_evaluation(software_evaluation, demo_evaluation), {**DEFAULT_TOKEN_USAGE}, files

    def _collect_files(self):
        return []
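
For orientation, the following is a minimal, hypothetical usage sketch of the class above; it is not part of the package. The model name, repository paths, and `installation_files` value are placeholders, and the pre-computed README and installation evaluation objects would come from the package's other evaluation tasks, which this diff does not reproduce. `evaluate()` is the template method inherited from `EvaluationTask`, shown in the next hunk.

```python
# Hypothetical usage sketch -- not part of the package. Paths, the model
# name, and the pre-computed evaluation objects below are placeholders.
from langchain_openai import ChatOpenAI

from bioguider.agents.evaluation_submission_requirements_task import (
    EvaluationSubmissionRequirementsTask,
)

llm = ChatOpenAI(model="gpt-4o")  # any BaseChatOpenAI-compatible chat model

# Results produced by the README / installation evaluation tasks (not shown here).
readme_results = ...       # dict[str, EvaluationREADMEResult]
installation_result = ...  # EvaluationInstallationResult

task = EvaluationSubmissionRequirementsTask(
    llm=llm,
    repo_path="/path/to/cloned/repo",
    gitignore_path="/path/to/cloned/repo/.gitignore",
    readme_files_evaluation=readme_results,
    installation_evaluation=installation_result,
    installation_files=["INSTALL.md"],  # hypothetical
)

# evaluate() is inherited from EvaluationTask (next hunk): it collects files,
# calls _evaluate(), and returns (result, related_files).
result, related_files = task.evaluate()
print(result.complete_readme, result.demo_dataset, result.license)
```

Because the task mostly combines prior results with two sub-checks (software package content and demo instructions), it is comparatively cheap to re-run once the README and installation evaluations already exist.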

bioguider/agents/evaluation_task.py

@@ -0,0 +1,206 @@

import os
from pathlib import Path
import logging
from typing import Callable
from abc import ABC, abstractmethod
from langchain.prompts import ChatPromptTemplate
from langchain_openai.chat_models.base import BaseChatOpenAI

from bioguider.agents.agent_utils import read_file
from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION
from bioguider.database.summarized_file_db import SummarizedFilesDb
from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
from .common_conversation import CommonConversation
from ..utils.pyphen_utils import PyphenReadability

logger = logging.getLogger(__name__)

EVALUATION_README_SYSTEM_PROMPT = """
You are an expert in evaluating the quality of README files in software repositories.
Your task is to analyze the provided README file and generate a comprehensive quality report.

---

### **Step 1: Identify README type**

First, determine whether the provided README is a **project-level README** (typically at the root of a repository) or a **folder-level README** (typically inside subdirectories).

---

### **Evaluation Criteria**

#### If the README is a **project-level** file, evaluate it using the following criteria.

For each criterion below, provide a brief assessment followed by specific, actionable comments for improvement.

**1. Project Clarity & Purpose**
* **Assessment**: [Your evaluation of whether the project's purpose is clear.]
* **Improvement Suggestions**:
  * **Original text:** [Quote a specific line/section from the README.]
  * **Improving comments:** [Provide your suggestions to improve clarity.]
  * **Original text:** [Quote a specific line/section from the README.]
  * **Improving comments:** [Provide your suggestions to improve clarity.]
  ...

**2. Installation Instructions**
* **Assessment**: [Your evaluation of the installation instructions.]
* **Improvement Suggestions**:
  * **Original text:** [Quote text related to installation.]
  * **Improving comments:** [Provide your suggestions.]
  * **Original text:** [Quote text related to installation.]
  * **Improving comments:** [Provide your suggestions.]
  ...

**3. Usage Instructions**
* **Assessment**: [Your evaluation of the usage instructions.]
* **Improvement Suggestions**:
  * **Original text:** [Quote text related to usage.]
  * **Improving comments:** [Provide your suggestions.]
  * **Original text:** [Quote text related to usage.]
  * **Improving comments:** [Provide your suggestions.]
  ...

**4. Contributing Guidelines**
* **Assessment**: [Your evaluation of the contributing guidelines.]
* **Improvement Suggestions**:
  * **Original text:** [Quote text related to contributions.]
  * **Improving comments:** [Provide your suggestions.]
  * **Original text:** [Quote text related to contributions.]
  * **Improving comments:** [Provide your suggestions.]
  ...

**5. License Information**
* **Assessment**: [Your evaluation of the license information.]
* **Improvement Suggestions**:
  * **Original text:** [Quote text related to the license.]
  * **Improving comments:** [Provide your suggestions.]
  * **Original text:** [Quote text related to the license.]
  * **Improving comments:** [Provide your suggestions.]
  ...

**6. Readability Analysis**
* **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
* **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
* **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
* **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
* **Assessment**: Based on these scores, evaluate the overall readability and technical complexity of the language used.

---

#### If it is a **folder-level** file, use the following criteria instead.

For each criterion below, provide a brief assessment followed by specific, actionable comments for improvement.

**1. Folder Description**
* **Assessment**: [Your evaluation of whether it provides a clear **description** of what the folder contains (e.g., modules, scripts, data).]
* **Improvement Suggestions**:
  * **Original text:** [Quote a specific line/section from the README.]
  * **Improving comments:** [Provide your suggestions to improve clarity.]

**2. Folder Purpose**
* **Assessment**: [Your evaluation of whether it explains the **purpose** or **role** of the components inside this subfolder.]
* **Improvement Suggestions**:
  * **Original text:** [Quote text related to purpose.]
  * **Improving comments:** [Provide your suggestions.]

**3. Usage**
* **Assessment**: [Your evaluation of whether it includes **usage instructions** specific to this folder (e.g., commands, import paths, input/output files).]
* **Improvement Suggestions**:
  * **Original text:** [Quote text related to usage.]
  * **Improving comments:** [Provide your suggestions.]

**4. Readability Analysis**
* **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
* **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
* **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
* **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
* **Assessment**: Based on these scores, evaluate the overall readability and technical complexity of the language used.

---

### Final Report Format

#### Your output **must exactly match** the following template:

**FinalAnswer**

* Project-Level README: Yes / No
* **Score:** [Poor / Fair / Good / Excellent]
* **Key Strengths**: <brief summary of the README's strongest points in 2-3 sentences>
* **Overall Improvement Suggestions:**
  - "Original text snippet 1" - Improving comment 1
  - "Original text snippet 2" - Improving comment 2
  - ...

#### Notes

* **Project-Level README**: "Yes" if root-level; "No" if folder-level.
* **Score**: Overall quality rating, could be Poor / Fair / Good / Excellent.
* **Key Strengths**: Briefly highlight the README's strongest aspects.
* **Improvement Suggestions**: Provide concrete snippets and suggested improvements.

---

### **README path:**
{readme_path}

---

### **README Content:**
{readme_content}
"""

class EvaluationTask(ABC):
    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str,
        gitignore_path: str,
        meta_data: ProjectMetadata | None = None,
        step_callback: Callable | None = None,
        summarized_files_db: SummarizedFilesDb | None = None,
    ):
        self.evaluation_name = ""
        self.llm = llm
        self.repo_path = repo_path
        self.gitignore_path = gitignore_path
        self.step_callback = step_callback
        self.metadata = meta_data
        self.summarized_files_db = summarized_files_db

    def print_step(
        self,
        step_name: str | None = None,
        step_output: str | None = None,
        token_usage: dict | None = None,
    ):
        if self.step_callback is None:
            return
        self.step_callback(
            step_name=step_name,
            step_output=step_output,
            token_usage=token_usage,
        )

    def evaluate(self) -> tuple[dict, list[str]]:
        self._enter_evaluation()
        files = self._collect_files()
        evaluations, token_usage, files = self._evaluate(files)
        self._leave_evaluation(token_usage)
        return evaluations, files

    def _enter_evaluation(self):
        self.print_step(step_name=self.evaluation_name)

    def _leave_evaluation(self, token_usage):
        self.print_step(token_usage=token_usage)

    @abstractmethod
    def _evaluate(self, files: list[str]) -> tuple[dict, dict, list[str]]:
        pass

    @abstractmethod
    def _collect_files(self) -> list[str]:
        pass
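
To make the abstract contract concrete, here is a minimal, hypothetical subclass (again, not part of the package) showing the order in which `evaluate()` drives the hooks: `_enter_evaluation()` reports the step name, `_collect_files()` gathers inputs, `_evaluate()` returns `(evaluations, token_usage, files)`, and `_leave_evaluation()` reports the accumulated token usage. The toy file list and the `lambda` callback are illustrative only.

```python
# Hypothetical subclass -- illustrates the evaluate() template-method flow only.
from bioguider.agents.evaluation_task import EvaluationTask
from bioguider.utils.constants import DEFAULT_TOKEN_USAGE


class ToyEvaluationTask(EvaluationTask):
    """Flags Markdown files; stands in for a real evaluation."""

    def _collect_files(self) -> list[str]:
        # A real task would run a CollectionTask against self.repo_path here.
        return ["README.md", "docs/usage.md"]

    def _evaluate(self, files: list[str]) -> tuple[dict, dict, list[str]]:
        evaluations = {f: {"is_markdown": f.endswith(".md")} for f in files}
        return evaluations, {**DEFAULT_TOKEN_USAGE}, files


task = ToyEvaluationTask(
    llm=None,  # a BaseChatOpenAI instance in real use
    repo_path="/path/to/repo",
    gitignore_path="/path/to/repo/.gitignore",
    step_callback=lambda **kw: print({k: v for k, v in kw.items() if v is not None}),
)
task.evaluation_name = "Toy Evaluation"
# Prints the step name on entry and the token usage on exit, via print_step().
evaluations, files = task.evaluate()
```

This mirrors how the concrete tasks in this release (README, installation, tutorial, submission-requirements) plug into the same lifecycle.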

bioguider/agents/evaluation_tutorial_task.py

@@ -0,0 +1,169 @@

from pathlib import Path
from typing import Callable, Optional
from langchain.prompts import ChatPromptTemplate
from langchain_openai.chat_models.base import BaseChatOpenAI
from pydantic import BaseModel, Field
import logging

from bioguider.agents.consistency_evaluation_task import ConsistencyEvaluationResult
from bioguider.agents.evaluation_task import EvaluationTask
from bioguider.agents.collection_task import CollectionTask
from bioguider.agents.evaluation_tutorial_task_prompts import INDIVIDUAL_TUTORIAL_EVALUATION_SYSTEM_PROMPT
from bioguider.agents.prompt_utils import CollectionGoalItemEnum
from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
from bioguider.utils.file_utils import flatten_files
from bioguider.utils.utils import increase_token_usage, get_overall_score
from .evaluation_utils import (
    compute_readability_metrics,
    default_consistency_result,
    evaluate_consistency_on_content,
    normalize_evaluation_content,
    run_llm_evaluation,
    sanitize_files,
)

logger = logging.getLogger(__name__)

MAX_FILE_SIZE = 1024 * 100  # 100K

class TutorialEvaluationResult(BaseModel):
    overall_score: int=Field(description="A number between 0 and 100 representing the overall quality rating.")
    overall_key_strengths: str=Field(description="A string value, the key strengths of the tutorial")
    # overall_improvement_suggestions: str=Field(description="Suggestions to improve the overall score if necessary")
    readability_score: int=Field(description="A number between 0 and 100 representing the readability quality rating.")
    readability_error_count: Optional[int]=Field(default=0, description="Total number of ERROR INSTANCES found (count every occurrence, not types)")
    readability_errors_found: list[str]=Field(default_factory=list, description="List of ALL individual error instances with format: 'ERROR_TYPE: original → corrected - location'")
    readability_suggestions: list[str]=Field(default_factory=list, description="General readability improvement suggestions (non-error related)")
    setup_and_dependencies_score: int=Field(description="A number between 0 and 100 representing the setup and dependencies quality rating.")
    setup_and_dependencies_suggestions: list[str]=Field(description="A list of string values, suggestions to improve setup and dependencies if necessary")
    reproducibility_score: int=Field(description="A number between 0 and 100 representing the reproducibility quality rating.")
    reproducibility_suggestions: list[str]=Field(description="A list of string values, suggestions to improve reproducibility if necessary")
    structure_and_navigation_score: int=Field(description="A number between 0 and 100 representing the structure and navigation quality rating.")
    structure_and_navigation_suggestions: list[str]=Field(description="A list of string values, suggestions to improve structure and navigation if necessary")
    executable_code_quality_score: int=Field(description="A number between 0 and 100 representing the executable code quality rating.")
    executable_code_quality_suggestions: list[str]=Field(description="A list of string values, suggestions to improve executable code quality if necessary")
    result_verification_score: int=Field(description="A number between 0 and 100 representing the result verification quality rating.")
    result_verification_suggestions: list[str]=Field(description="A list of string values, suggestions to improve result verification if necessary")
    performance_and_resource_notes_score: int=Field(description="A number between 0 and 100 representing the performance and resource notes quality rating.")
    performance_and_resource_notes_suggestions: list[str]=Field(description="A list of string values, suggestions to improve performance and resource notes if necessary")

class IndividualTutorialEvaluationResult(BaseModel):
    tutorial_evaluation: TutorialEvaluationResult | None=Field(description="The evaluation result of the tutorial")
    consistency_evaluation: ConsistencyEvaluationResult | None=Field(description="The evaluation result of the consistency of the tutorial")

class EvaluationTutorialTask(EvaluationTask):
    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str,
        gitignore_path: str,
        meta_data: ProjectMetadata | None = None,
        step_callback: Callable | None = None,
        summarized_files_db = None,
        code_structure_db = None,
        collected_files: list[str] | None = None,
    ):
        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback, summarized_files_db)
        self.evaluation_name = "Tutorial Evaluation"
        self.code_structure_db = code_structure_db
        self.collected_files = collected_files

    def _sanitize_files(self, files: list[str]) -> list[str]:
        return sanitize_files(
            self.repo_path,
            files,
            max_size_bytes=MAX_FILE_SIZE,
            disallowed_exts={".svg"},
            check_ipynb_size=False,
        )

    def _collect_files(self):
        if self.collected_files is not None:
            return self.collected_files

        task = CollectionTask(
            llm=self.llm,
            step_callback=self.step_callback,
            summarized_files_db=self.summarized_files_db,
        )
        task.compile(
            repo_path=self.repo_path,
            gitignore_path=Path(self.repo_path, ".gitignore"),
            goal_item=CollectionGoalItemEnum.Tutorial.name,
        )
        files = task.collect()
        files = flatten_files(self.repo_path, files)
        files = self._sanitize_files(files)
        return files

    def _evaluate_individual_tutorial(self, file: str) -> tuple[IndividualTutorialEvaluationResult | None, dict]:
        content, readability_content = normalize_evaluation_content(
            self.repo_path, file
        )
        if content is None or readability_content is None:
            logger.error(f"Error in sanitizing file {file} - {Path(self.repo_path, file).resolve()}")
            return None, {**DEFAULT_TOKEN_USAGE}

        # evaluate general criteria
        flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index = \
            compute_readability_metrics(readability_content)
        system_prompt = ChatPromptTemplate.from_template(
            INDIVIDUAL_TUTORIAL_EVALUATION_SYSTEM_PROMPT
        ).format(
            flesch_reading_ease=flesch_reading_ease,
            flesch_kincaid_grade=flesch_kincaid_grade,
            gunning_fog_index=gunning_fog_index,
            smog_index=smog_index,
            tutorial_file_content=readability_content,
        )

        res, token_usage, reasoning_process = run_llm_evaluation(
            llm=self.llm,
            system_prompt=system_prompt,
            instruction_prompt="Now, let's begin the tutorial evaluation.",
            schema=TutorialEvaluationResult,
        )
        res: TutorialEvaluationResult = res

        # evaluate consistency
        consistency_evaluation_result, _temp_token_usage = evaluate_consistency_on_content(
            llm=self.llm,
            code_structure_db=self.code_structure_db,
            step_callback=self.step_callback,
            domain="tutorial/vignette",
            content=content,
        )
        if consistency_evaluation_result is None:
            # No sufficient information to evaluate the consistency of the tutorial
            consistency_evaluation_result = default_consistency_result("tutorial/vignette")

        # calculate overall score
        res.overall_score = get_overall_score(
            [
                consistency_evaluation_result.score,
                res.readability_score,
                res.setup_and_dependencies_score,
                res.reproducibility_score,
                res.structure_and_navigation_score,
                res.executable_code_quality_score,
                res.result_verification_score,
                res.performance_and_resource_notes_score,
            ],
            [3, 3, 3, 1, 1, 2, 1, 1],
        )

        return IndividualTutorialEvaluationResult(
            tutorial_evaluation=res,
            consistency_evaluation=consistency_evaluation_result,
        ), token_usage

    def _evaluate(self, files: list[str] | None = None) -> tuple[dict[str, IndividualTutorialEvaluationResult] | None, dict, list[str]]:
        total_token_usage = {**DEFAULT_TOKEN_USAGE}
        tutorial_evaluation_results = {}
        for file in files:
            tutorial_evaluation_result, token_usage = self._evaluate_individual_tutorial(file)
            total_token_usage = increase_token_usage(total_token_usage, token_usage)
            tutorial_evaluation_results[file] = tutorial_evaluation_result
        return tutorial_evaluation_results, total_token_usage, files
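
The overall tutorial score above is produced by `get_overall_score`, which combines the consistency score and the seven criterion scores with the weights `[3, 3, 3, 1, 1, 2, 1, 1]`. The implementation of `get_overall_score` is not included in this diff; the sketch below is one plausible reading of that call, a weighted mean rounded to an integer, shown purely for illustration and not claimed to match bioguider's actual formula.

```python
# Illustrative only: a plausible weighted-mean reading of the
# get_overall_score([...], [3, 3, 3, 1, 1, 2, 1, 1]) call above.
# bioguider's real implementation may differ.

def weighted_overall_score(scores: list[int], weights: list[int]) -> int:
    """Weighted mean of 0-100 sub-scores, rounded to the nearest integer."""
    assert len(scores) == len(weights) and sum(weights) > 0
    return round(sum(s * w for s, w in zip(scores, weights)) / sum(weights))


# Order follows the call above: consistency, readability, setup/dependencies,
# reproducibility, structure/navigation, executable code quality,
# result verification, performance/resource notes.
sub_scores = [80, 70, 90, 60, 75, 85, 50, 65]
weights = [3, 3, 3, 1, 1, 2, 1, 1]
print(weighted_overall_score(sub_scores, weights))  # -> 76
```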