bioguider-0.2.52-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. bioguider/__init__.py +0 -0
  2. bioguider/agents/__init__.py +0 -0
  3. bioguider/agents/agent_task.py +92 -0
  4. bioguider/agents/agent_tools.py +176 -0
  5. bioguider/agents/agent_utils.py +504 -0
  6. bioguider/agents/collection_execute_step.py +182 -0
  7. bioguider/agents/collection_observe_step.py +125 -0
  8. bioguider/agents/collection_plan_step.py +156 -0
  9. bioguider/agents/collection_task.py +184 -0
  10. bioguider/agents/collection_task_utils.py +142 -0
  11. bioguider/agents/common_agent.py +137 -0
  12. bioguider/agents/common_agent_2step.py +215 -0
  13. bioguider/agents/common_conversation.py +61 -0
  14. bioguider/agents/common_step.py +85 -0
  15. bioguider/agents/consistency_collection_step.py +102 -0
  16. bioguider/agents/consistency_evaluation_task.py +57 -0
  17. bioguider/agents/consistency_evaluation_task_utils.py +14 -0
  18. bioguider/agents/consistency_observe_step.py +110 -0
  19. bioguider/agents/consistency_query_step.py +77 -0
  20. bioguider/agents/dockergeneration_execute_step.py +186 -0
  21. bioguider/agents/dockergeneration_observe_step.py +154 -0
  22. bioguider/agents/dockergeneration_plan_step.py +158 -0
  23. bioguider/agents/dockergeneration_task.py +158 -0
  24. bioguider/agents/dockergeneration_task_utils.py +220 -0
  25. bioguider/agents/evaluation_installation_task.py +270 -0
  26. bioguider/agents/evaluation_readme_task.py +767 -0
  27. bioguider/agents/evaluation_submission_requirements_task.py +172 -0
  28. bioguider/agents/evaluation_task.py +206 -0
  29. bioguider/agents/evaluation_tutorial_task.py +169 -0
  30. bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
  31. bioguider/agents/evaluation_userguide_prompts.py +179 -0
  32. bioguider/agents/evaluation_userguide_task.py +154 -0
  33. bioguider/agents/evaluation_utils.py +127 -0
  34. bioguider/agents/identification_execute_step.py +181 -0
  35. bioguider/agents/identification_observe_step.py +104 -0
  36. bioguider/agents/identification_plan_step.py +140 -0
  37. bioguider/agents/identification_task.py +270 -0
  38. bioguider/agents/identification_task_utils.py +22 -0
  39. bioguider/agents/peo_common_step.py +64 -0
  40. bioguider/agents/prompt_utils.py +253 -0
  41. bioguider/agents/python_ast_repl_tool.py +69 -0
  42. bioguider/agents/rag_collection_task.py +130 -0
  43. bioguider/conversation.py +67 -0
  44. bioguider/database/code_structure_db.py +500 -0
  45. bioguider/database/summarized_file_db.py +146 -0
  46. bioguider/generation/__init__.py +39 -0
  47. bioguider/generation/benchmark_metrics.py +610 -0
  48. bioguider/generation/change_planner.py +189 -0
  49. bioguider/generation/document_renderer.py +157 -0
  50. bioguider/generation/llm_cleaner.py +67 -0
  51. bioguider/generation/llm_content_generator.py +1128 -0
  52. bioguider/generation/llm_injector.py +809 -0
  53. bioguider/generation/models.py +85 -0
  54. bioguider/generation/output_manager.py +74 -0
  55. bioguider/generation/repo_reader.py +37 -0
  56. bioguider/generation/report_loader.py +166 -0
  57. bioguider/generation/style_analyzer.py +36 -0
  58. bioguider/generation/suggestion_extractor.py +436 -0
  59. bioguider/generation/test_metrics.py +189 -0
  60. bioguider/managers/benchmark_manager.py +785 -0
  61. bioguider/managers/evaluation_manager.py +215 -0
  62. bioguider/managers/generation_manager.py +686 -0
  63. bioguider/managers/generation_test_manager.py +107 -0
  64. bioguider/managers/generation_test_manager_v2.py +525 -0
  65. bioguider/rag/__init__.py +0 -0
  66. bioguider/rag/config.py +117 -0
  67. bioguider/rag/data_pipeline.py +651 -0
  68. bioguider/rag/embedder.py +24 -0
  69. bioguider/rag/rag.py +138 -0
  70. bioguider/settings.py +103 -0
  71. bioguider/utils/code_structure_builder.py +59 -0
  72. bioguider/utils/constants.py +135 -0
  73. bioguider/utils/default.gitignore +140 -0
  74. bioguider/utils/file_utils.py +215 -0
  75. bioguider/utils/gitignore_checker.py +175 -0
  76. bioguider/utils/notebook_utils.py +117 -0
  77. bioguider/utils/pyphen_utils.py +73 -0
  78. bioguider/utils/python_file_handler.py +65 -0
  79. bioguider/utils/r_file_handler.py +551 -0
  80. bioguider/utils/utils.py +163 -0
  81. bioguider-0.2.52.dist-info/LICENSE +21 -0
  82. bioguider-0.2.52.dist-info/METADATA +51 -0
  83. bioguider-0.2.52.dist-info/RECORD +84 -0
  84. bioguider-0.2.52.dist-info/WHEEL +4 -0
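The listing above is the RECORD-level inventory of the wheel. A minimal sketch (assuming the wheel has been installed into the current environment, e.g. with pip install bioguider==0.2.52) that enumerates the same files at runtime through the standard importlib.metadata API:

# Enumerate the files recorded for the installed bioguider distribution.
# Assumes the wheel above is installed in this Python environment.
from importlib.metadata import files, version

print(version("bioguider"))            # expected to print 0.2.52
for path in files("bioguider") or []:  # RECORD entries, e.g. bioguider/agents/agent_task.py
    if str(path).endswith(".py"):
        print(path)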
bioguider/agents/evaluation_submission_requirements_task.py
@@ -0,0 +1,172 @@
+
+
+from bioguider.agents.agent_utils import try_parse_json_object, try_parse_with_llm
+from bioguider.agents.evaluation_task import EvaluationTask
+from bioguider.agents.collection_task import CollectionTask
+from bioguider.agents.identification_task import IdentificationTask
+from bioguider.agents.prompt_utils import CollectionGoalItemEnum
+from bioguider.utils.constants import (
+    DEFAULT_TOKEN_USAGE,
+    EvaluationInstallationResult,
+    EvaluationREADMEResult,
+    SoftwarePackageContentResult,
+    DemoInstructionsResult,
+    EvaluationSubmissionRequirementsResult,
+)
+
+DEMO_INSTRUCTION_GOAL = """
+1. Identify if it provides the instructions to run on provided data
+2. Identify if it provides the instructions to run on custom data
+3. Identify if it provides the expected output
+"""
+
+DEMO_INSTRUCTION_FINAL_ANSWER = \
+    '{{"run_on_data_instruction": <True or False>, "run_on_custom_instruction": <True or False>, "expected_output_description": <True or False>}}'
+
+class EvaluationSubmissionRequirementsTask(EvaluationTask):
+    def __init__(
+        self,
+        llm,
+        repo_path,
+        gitignore_path,
+        meta_data=None,
+        step_callback=None,
+        summarized_files_db=None,
+        readme_files_evaluation: dict[str, EvaluationREADMEResult] | None = None,
+        installation_evaluation: EvaluationInstallationResult | None = None,
+        installation_files: list[str] | None = None
+    ):
+        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback, summarized_files_db)
+        self.evaluation_name = "Submission Requirements Evaluation"
+        self.readme_files_evaluation = readme_files_evaluation
+        self.installation_evaluation = installation_evaluation
+        self.installation_files = installation_files
+
+    def _collect_software_package_content(self):
+        collection_task = CollectionTask(
+            llm=self.llm,
+            step_callback=self.step_callback,
+            summarize_instruction="We are collecting compiled standalone files, source code files and example data files.",
+            summarized_files_db=self.summarized_files_db,
+        )
+        collection_task.compile(
+            repo_path=self.repo_path,
+            gitignore_path=self.gitignore_path,
+            db=self.summarized_files_db,
+            goal_item=CollectionGoalItemEnum.SoftwarePackageContent.name,
+        )
+        files = collection_task.collect()
+
+        return files
+
+    def _evaluate_software_package_content(self) -> tuple[SoftwarePackageContentResult, list[str]]:
+        files = self._collect_software_package_content()
+        if len(files) == 3:
+            return SoftwarePackageContentResult(
+                compiled_standalone_software=files[0].strip().lower() != "n/a",
+                source_code=files[1].strip().lower() != "n/a",
+                demo_dataset=files[2].strip().lower() != "n/a",
+            ), files
+        else:
+            return SoftwarePackageContentResult(
+                compiled_standalone_software=False,
+                source_code=False,
+                demo_dataset=False,
+            ), files
+
+    def _evaluate_demo_instructions(self) -> tuple[DemoInstructionsResult | None, list[str]]:
+        readme_files = [f for f in self.readme_files_evaluation.keys()
+            if self.readme_files_evaluation[f].project_level]
+        installation_files = self.installation_files if self.installation_files is not None else []
+        provided_files = readme_files + installation_files
+        provided_files = provided_files if len(provided_files) > 0 else None
+        identify_task = IdentificationTask(
+            llm=self.llm,
+            step_callback=self.step_callback,
+            summarized_files_db=self.summarized_files_db,
+            provided_files=provided_files
+        )
+        identify_task.compile(
+            repo_path=self.repo_path,
+            gitignore_path=self.gitignore_path,
+        )
+        final_answer = identify_task.identify_customize_goal(
+            goal="demo instructions",
+            final_answer_example=DEMO_INSTRUCTION_FINAL_ANSWER,
+        )
+        final_answer = final_answer["final_answer"] \
+            if final_answer is not None and "final_answer" in final_answer else final_answer
+        parsed_obj = self._parse_demo_instruction_result(final_answer)
+        return parsed_obj, provided_files
+
+    def _parse_demo_instruction_result(self, result: str | dict) -> DemoInstructionsResult:
+        parsed_obj = None
+        if isinstance(result, dict):
+            parsed_obj = result
+        else:
+            parsed_obj = try_parse_json_object(result)
+            if parsed_obj is None:
+                parsed_obj, token_usage = try_parse_with_llm(
+                    llm=self.llm,
+                    input_text=result,
+                    schema=DemoInstructionsResult,
+                )
+                parsed_obj = vars(parsed_obj) if parsed_obj is not None else {}
+                self.print_step(token_usage=token_usage)
+        self.print_step(step_output=str(parsed_obj))
+
+        return DemoInstructionsResult(
+            run_on_data_instruction=parsed_obj["run_on_data_instruction"]
+                if "run_on_data_instruction" in parsed_obj else False,
+            run_on_custom_instruction=parsed_obj["run_on_custom_instruction"]
+                if "run_on_custom_instruction" in parsed_obj else False,
+            expected_output_description=parsed_obj["expected_output_description"]
+                if "expected_output_description" in parsed_obj else False,
+        )
+
+    def _combine_evaluation(
+        self,
+        software_evaluation: SoftwarePackageContentResult,
+        demo_evaluation: DemoInstructionsResult,
+    ) -> EvaluationSubmissionRequirementsResult:
+        readme_files = [f for f in self.readme_files_evaluation.keys()
+            if self.readme_files_evaluation[f].project_level]
+        structured_install_evaluation = self.installation_evaluation.structured_evaluation
+        software_dependency = structured_install_evaluation.dependency_number > 0
+        install_tutorial = structured_install_evaluation.install_tutorial
+        hardware_requirements = structured_install_evaluation.hardware_requirements
+        compatible_os = structured_install_evaluation.compatible_os
+        license = any([
+            self.readme_files_evaluation[f].structured_evaluation.license_score
+                if self.readme_files_evaluation[f].structured_evaluation is not None
+                else False for f in readme_files
+        ])
+        return EvaluationSubmissionRequirementsResult(
+            compiled_standalone_software=software_evaluation.compiled_standalone_software,
+            source_code=software_evaluation.source_code,
+            demo_dataset=software_evaluation.demo_dataset,
+            run_on_data_instruction=demo_evaluation.run_on_data_instruction,
+            run_on_custom_instruction=demo_evaluation.run_on_custom_instruction,
+            expected_output_description=demo_evaluation.expected_output_description,
+            complete_readme=len(readme_files) > 0,
+            software_dependency=software_dependency,
+            install_tutorial=install_tutorial,
+            license=license,
+            hardware_requirements=hardware_requirements,
+            compatible_os=compatible_os,
+        )
+
+    def _evaluate(self, files):
+
+        software_evaluation, software_files = self._evaluate_software_package_content()
+        demo_evaluation, demo_files = self._evaluate_demo_instructions()
+        files = list(set(software_files + (demo_files or [])))
+
+        return self._combine_evaluation(software_evaluation, demo_evaluation), {**DEFAULT_TOKEN_USAGE}, files
+
+
+    def _collect_files(self):
+        return []
+
+
+
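EvaluationSubmissionRequirementsTask combines three upstream signals: the collected package contents, the README and installation evaluations passed to its constructor, and an IdentificationTask pass for demo instructions. A usage sketch, assuming results from the upstream README and installation tasks are already available; the wrapper function below is illustrative and not part of the package:

# Wire upstream README/installation results into the submission-requirements task.
from bioguider.agents.evaluation_submission_requirements_task import EvaluationSubmissionRequirementsTask

def evaluate_submission_requirements(llm, repo_path, readme_results, install_result, install_files):
    """readme_results: dict[str, EvaluationREADMEResult]; install_result: EvaluationInstallationResult."""
    task = EvaluationSubmissionRequirementsTask(
        llm=llm,                                   # any BaseChatOpenAI-compatible chat model
        repo_path=repo_path,
        gitignore_path=f"{repo_path}/.gitignore",
        readme_files_evaluation=readme_results,
        installation_evaluation=install_result,
        installation_files=install_files,          # list[str] of installation-related files
    )
    # Returns (EvaluationSubmissionRequirementsResult, list of files that were consulted).
    return task.evaluate()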
bioguider/agents/evaluation_task.py
@@ -0,0 +1,206 @@
+
+import os
+from pathlib import Path
+import logging
+from typing import Callable
+from abc import ABC, abstractmethod
+from langchain.prompts import ChatPromptTemplate
+from langchain_openai.chat_models.base import BaseChatOpenAI
+
+from bioguider.agents.agent_utils import read_file
+from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION
+from bioguider.database.summarized_file_db import SummarizedFilesDb
+from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
+from .common_conversation import CommonConversation
+from ..utils.pyphen_utils import PyphenReadability
+
+logger = logging.getLogger(__name__)
+
+EVALUATION_README_SYSTEM_PROMPT = """
+You are an expert in evaluating the quality of README files in software repositories.
+Your task is to analyze the provided README file and generate a comprehensive quality report.
+
+---
+
+### **Step 1: Identify README type**
+
+First, determine whether the provided README is a **project-level README** (typically at the root of a repository) or a **folder-level README** (typically inside subdirectories).
+
+---
+
+### **Evaluation Criteria**
+
+#### If the README is a **project-level** file, evaluate it using the following criteria.
+
+For each criterion below, provide a brief assessment followed by specific, actionable comments for improvement.
+
+**1. Project Clarity & Purpose**
+* **Assessment**: [Your evaluation of whether the project's purpose is clear.]
+* **Improvement Suggestions**:
+  * **Original text:** [Quote a specific line/section from the README.]
+  * **Improving comments:** [Provide your suggestions to improve clarity.]
+  * **Original text:** [Quote a specific line/section from the README.]
+  * **Improving comments:** [Provide your suggestions to improve clarity.]
+  ...
+
+**2. Installation Instructions**
+* **Assessment**: [Your evaluation of the installation instructions.]
+* **Improvement Suggestions**:
+  * **Original text:** [Quote text related to installation.]
+  * **Improving comments:** [Provide your suggestions.]
+  * **Original text:** [Quote text related to installation.]
+  * **Improving comments:** [Provide your suggestions.]
+  ...
+
+**3. Usage Instructions**
+* **Assessment**: [Your evaluation of the usage instructions.]
+* **Improvement Suggestions**:
+  * **Original text:** [Quote text related to usage.]
+  * **Improving comments:** [Provide your suggestions.]
+  * **Original text:** [Quote text related to usage.]
+  * **Improving comments:** [Provide your suggestions.]
+  ...
+
+**4. Contributing Guidelines**
+* **Assessment**: [Your evaluation of the contributing guidelines.]
+* **Improvement Suggestions**:
+  * **Original text:** [Quote text related to contributions.]
+  * **Improving comments:** [Provide your suggestions.]
+  * **Original text:** [Quote text related to contributions.]
+  * **Improving comments:** [Provide your suggestions.]
+  ...
+
+**5. License Information**
+* **Assessment**: [Your evaluation of the license information.]
+* **Improvement Suggestions**:
+  * **Original text:** [Quote text related to the license.]
+  * **Improving comments:** [Provide your suggestions.]
+  * **Original text:** [Quote text related to the license.]
+  * **Improving comments:** [Provide your suggestions.]
+  ...
+
+**6. Readability Analysis**
+* **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
+* **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
+* **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
+* **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
+* **Assessment**: Based on these scores, evaluate the overall readability and technical complexity of the language used.
+
+---
+
+#### If it is a **folder-level** file, use the following criteria instead.
+
+For each criterion below, provide a brief assessment followed by specific, actionable comments for improvement.
+
+**1. Folder Description**
+* **Assessment**: [Your evaluation of whether it provides a clear **description** of what the folder contains (e.g., modules, scripts, data).]
+* **Improvement Suggestions**:
+  * **Original text:** [Quote a specific line/section from the README.]
+  * **Improving comments:** [Provide your suggestions to improve clarity.]
+
+**2. Folder Purpose**
+* **Assessment**: [Your evaluation of whether it explains the **purpose** or **role** of the components inside this subfolder.]
+* **Improvement Suggestions**:
+  * **Original text:** [Quote text related to purpose.]
+  * **Improving comments:** [Provide your suggestions.]
+
+**3. Usage**
+* **Assessment**: [Your evaluation of whether it includes **usage instructions** specific to this folder (e.g., commands, import paths, input/output files).]
+* **Improvement Suggestions**:
+  * **Original text:** [Quote text related to usage.]
+  * **Improving comments:** [Provide your suggestions.]
+
+**4. Readability Analysis**
+* **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
+* **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
+* **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
+* **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
+* **Assessment**: Based on these scores, evaluate the overall readability and technical complexity of the language used.
+
+---
+
+### Final Report Format
+
+#### Your output **must exactly match** the following template:
+
+**FinalAnswer**
+
+* Project-Level README: Yes / No
+* **Score:** [Poor / Fair / Good / Excellent]
+* **Key Strengths**: <brief summary of the README's strongest points in 2-3 sentences>
+* **Overall Improvement Suggestions:**
+  - "Original text snippet 1" - Improving comment 1
+  - "Original text snippet 2" - Improving comment 2
+  - ...
+
+#### Notes
+
+* **Project-Level README**: "Yes" if root-level; "No" if folder-level.
+* **Score**: Overall quality rating, one of Poor / Fair / Good / Excellent.
+* **Key Strengths**: Briefly highlight the README's strongest aspects.
+* **Improvement Suggestions**: Provide concrete snippets and suggested improvements.
+
+
+---
+
+### **README path:**
+{readme_path}
+
+---
+
+### **README Content:**
+{readme_content}
+"""
+
+class EvaluationTask(ABC):
+    def __init__(
+        self,
+        llm: BaseChatOpenAI,
+        repo_path: str,
+        gitignore_path: str,
+        meta_data: ProjectMetadata | None = None,
+        step_callback: Callable | None = None,
+        summarized_files_db: SummarizedFilesDb | None = None,
+    ):
+        self.evaluation_name = ""
+        self.llm = llm
+        self.repo_path = repo_path
+        self.gitignore_path = gitignore_path
+        self.step_callback = step_callback
+        self.metadata = meta_data
+        self.summarized_files_db = summarized_files_db
+
+    def print_step(
+        self,
+        step_name: str | None = None,
+        step_output: str | None = None,
+        token_usage: dict | None = None,
+    ):
+        if self.step_callback is None:
+            return
+        self.step_callback(
+            step_name=step_name,
+            step_output=step_output,
+            token_usage=token_usage,
+        )
+
+    def evaluate(self) -> tuple[dict, list[str]]:
+        self._enter_evaluation()
+        files = self._collect_files()
+        evaluations, token_usage, files = self._evaluate(files)
+        self._leave_evaluation(token_usage)
+        return evaluations, files
+
+    def _enter_evaluation(self):
+        self.print_step(step_name=self.evaluation_name)
+
+    def _leave_evaluation(self, token_usage):
+        self.print_step(token_usage=token_usage)
+
+    @abstractmethod
+    def _evaluate(self, files: list[str]) -> tuple[dict, dict, list[str]]:
+        pass
+
+    @abstractmethod
+    def _collect_files(self) -> list[str]:
+        pass
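EvaluationTask is a template-method base class: evaluate() runs _enter_evaluation, _collect_files, _evaluate, and _leave_evaluation in order, and concrete tasks override the two abstract hooks. A minimal sketch of a conforming subclass (illustrative only, not shipped in the wheel):

from bioguider.agents.evaluation_task import EvaluationTask
from bioguider.utils.constants import DEFAULT_TOKEN_USAGE

class TrivialEvaluationTask(EvaluationTask):
    """Illustrative subclass: returns a fixed verdict for whatever files it collects."""

    def __init__(self, llm, repo_path, gitignore_path):
        super().__init__(llm, repo_path, gitignore_path)
        self.evaluation_name = "Trivial Evaluation"

    def _collect_files(self) -> list[str]:
        # A real task would walk the repository, e.g. via CollectionTask as elsewhere in this diff.
        return ["README.md"]

    def _evaluate(self, files: list[str]) -> tuple[dict, dict, list[str]]:
        evaluations = {f: "not evaluated" for f in files}
        return evaluations, {**DEFAULT_TOKEN_USAGE}, files

Calling evaluate() on such a subclass returns the evaluations dict plus the list of files that were examined, with step and token-usage reporting routed through print_step.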
bioguider/agents/evaluation_tutorial_task.py
@@ -0,0 +1,169 @@
+
+
+from pathlib import Path
+from typing import Callable, Optional
+from langchain.prompts import ChatPromptTemplate
+from langchain_openai.chat_models.base import BaseChatOpenAI
+from pydantic import BaseModel, Field
+import logging
+
+from bioguider.agents.consistency_evaluation_task import ConsistencyEvaluationResult
+from bioguider.agents.evaluation_task import EvaluationTask
+from bioguider.agents.collection_task import CollectionTask
+from bioguider.agents.evaluation_tutorial_task_prompts import INDIVIDUAL_TUTORIAL_EVALUATION_SYSTEM_PROMPT
+from bioguider.agents.prompt_utils import CollectionGoalItemEnum
+from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
+from bioguider.utils.file_utils import flatten_files
+from bioguider.utils.utils import increase_token_usage, get_overall_score
+from .evaluation_utils import (
+    compute_readability_metrics,
+    default_consistency_result,
+    evaluate_consistency_on_content,
+    normalize_evaluation_content,
+    run_llm_evaluation,
+    sanitize_files,
+)
+
+logger = logging.getLogger(__name__)
+
+MAX_FILE_SIZE = 1024 * 100  # 100 KB
+
+class TutorialEvaluationResult(BaseModel):
+    overall_score: int = Field(description="A number between 0 and 100 representing the overall quality rating.")
+    overall_key_strengths: str = Field(description="A string value, the key strengths of the tutorial")
+    # overall_improvement_suggestions: str = Field(description="Suggestions to improve the overall score if necessary")
+    readability_score: int = Field(description="A number between 0 and 100 representing the readability quality rating.")
+    readability_error_count: Optional[int] = Field(default=0, description="Total number of ERROR INSTANCES found (count every occurrence, not types)")
+    readability_errors_found: list[str] = Field(default_factory=list, description="List of ALL individual error instances with format: 'ERROR_TYPE: original → corrected - location'")
+    readability_suggestions: list[str] = Field(default_factory=list, description="General readability improvement suggestions (non-error related)")
+    setup_and_dependencies_score: int = Field(description="A number between 0 and 100 representing the setup and dependencies quality rating.")
+    setup_and_dependencies_suggestions: list[str] = Field(description="A list of string values, suggestions to improve setup and dependencies if necessary")
+    reproducibility_score: int = Field(description="A number between 0 and 100 representing the reproducibility quality rating.")
+    reproducibility_suggestions: list[str] = Field(description="A list of string values, suggestions to improve reproducibility if necessary")
+    structure_and_navigation_score: int = Field(description="A number between 0 and 100 representing the structure and navigation quality rating.")
+    structure_and_navigation_suggestions: list[str] = Field(description="A list of string values, suggestions to improve structure and navigation if necessary")
+    executable_code_quality_score: int = Field(description="A number between 0 and 100 representing the executable code quality rating.")
+    executable_code_quality_suggestions: list[str] = Field(description="A list of string values, suggestions to improve executable code quality if necessary")
+    result_verification_score: int = Field(description="A number between 0 and 100 representing the result verification quality rating.")
+    result_verification_suggestions: list[str] = Field(description="A list of string values, suggestions to improve result verification if necessary")
+    performance_and_resource_notes_score: int = Field(description="A number between 0 and 100 representing the performance and resource notes quality rating.")
+    performance_and_resource_notes_suggestions: list[str] = Field(description="A list of string values, suggestions to improve performance and resource notes if necessary")
+
+class IndividualTutorialEvaluationResult(BaseModel):
+    tutorial_evaluation: TutorialEvaluationResult | None = Field(description="The evaluation result of the tutorial")
+    consistency_evaluation: ConsistencyEvaluationResult | None = Field(description="The evaluation result of the consistency of the tutorial")
+
+class EvaluationTutorialTask(EvaluationTask):
+    def __init__(
+        self,
+        llm: BaseChatOpenAI,
+        repo_path: str,
+        gitignore_path: str,
+        meta_data: ProjectMetadata | None = None,
+        step_callback: Callable | None = None,
+        summarized_files_db=None,
+        code_structure_db=None,
+        collected_files: list[str] | None = None,
+    ):
+        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback, summarized_files_db)
+        self.evaluation_name = "Tutorial Evaluation"
+        self.code_structure_db = code_structure_db
+        self.collected_files = collected_files
+
+    def _sanitize_files(self, files: list[str]) -> list[str]:
+        return sanitize_files(
+            self.repo_path,
+            files,
+            max_size_bytes=MAX_FILE_SIZE,
+            disallowed_exts={".svg"},
+            check_ipynb_size=False,
+        )
+
+    def _collect_files(self):
+        if self.collected_files is not None:
+            return self.collected_files
+
+        task = CollectionTask(
+            llm=self.llm,
+            step_callback=self.step_callback,
+            summarized_files_db=self.summarized_files_db,
+        )
+        task.compile(
+            repo_path=self.repo_path,
+            gitignore_path=Path(self.repo_path, ".gitignore"),
+            goal_item=CollectionGoalItemEnum.Tutorial.name,
+        )
+        files = task.collect()
+        files = flatten_files(self.repo_path, files)
+        files = self._sanitize_files(files)
+        return files
+
+    def _evaluate_individual_tutorial(self, file: str) -> tuple[IndividualTutorialEvaluationResult | None, dict]:
+        content, readability_content = normalize_evaluation_content(
+            self.repo_path, file
+        )
+        if content is None or readability_content is None:
+            logger.error(f"Error in sanitizing file {file} - {Path(self.repo_path, file).resolve()}")
+            return None, {**DEFAULT_TOKEN_USAGE}
+
+        # evaluate general criteria
+        flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index = \
+            compute_readability_metrics(readability_content)
+        system_prompt = ChatPromptTemplate.from_template(
+            INDIVIDUAL_TUTORIAL_EVALUATION_SYSTEM_PROMPT
+        ).format(
+            flesch_reading_ease=flesch_reading_ease,
+            flesch_kincaid_grade=flesch_kincaid_grade,
+            gunning_fog_index=gunning_fog_index,
+            smog_index=smog_index,
+            tutorial_file_content=readability_content,
+        )
+
+        res, token_usage, reasoning_process = run_llm_evaluation(
+            llm=self.llm,
+            system_prompt=system_prompt,
+            instruction_prompt="Now, let's begin the tutorial evaluation.",
+            schema=TutorialEvaluationResult,
+        )
+        res: TutorialEvaluationResult = res
+
+        # evaluate consistency
+        consistency_evaluation_result, _temp_token_usage = evaluate_consistency_on_content(
+            llm=self.llm,
+            code_structure_db=self.code_structure_db,
+            step_callback=self.step_callback,
+            domain="tutorial/vignette",
+            content=content,
+        )
+        if consistency_evaluation_result is None:
+            # Insufficient information to evaluate the consistency of the tutorial
+            consistency_evaluation_result = default_consistency_result("tutorial/vignette")
+
+        # calculate overall score
+        res.overall_score = get_overall_score(
+            [
+                consistency_evaluation_result.score,
+                res.readability_score,
+                res.setup_and_dependencies_score,
+                res.reproducibility_score,
+                res.structure_and_navigation_score,
+                res.executable_code_quality_score,
+                res.result_verification_score,
+                res.performance_and_resource_notes_score,
+            ],
+            [3, 3, 3, 1, 1, 2, 1, 1],
+        )
+
+        return IndividualTutorialEvaluationResult(
+            tutorial_evaluation=res,
+            consistency_evaluation=consistency_evaluation_result,
+        ), token_usage
+
+    def _evaluate(self, files: list[str] | None = None) -> tuple[dict[str, IndividualTutorialEvaluationResult] | None, dict, list[str]]:
+        total_token_usage = {**DEFAULT_TOKEN_USAGE}
+        tutorial_evaluation_results = {}
+        for file in files:
+            tutorial_evaluation_result, token_usage = self._evaluate_individual_tutorial(file)
+            total_token_usage = increase_token_usage(total_token_usage, token_usage)
+            tutorial_evaluation_results[file] = tutorial_evaluation_result
+        return tutorial_evaluation_results, total_token_usage, files
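Each tutorial's overall_score is a weighted combination of eight criterion scores, with the weights [3, 3, 3, 1, 1, 2, 1, 1] applied in order to consistency, readability, setup and dependencies, reproducibility, structure and navigation, executable code quality, result verification, and performance notes. get_overall_score itself lives in bioguider/utils/utils.py and is not shown in this diff; a plausible weighted-average sketch of the same calculation (an assumption, not the packaged implementation):

def weighted_overall_score(scores: list[int], weights: list[int]) -> int:
    """Weighted mean of 0-100 criterion scores; a sketch, not the package's get_overall_score."""
    if not scores or len(scores) != len(weights):
        raise ValueError("scores and weights must be equal-length, non-empty lists")
    return round(sum(s * w for s, w in zip(scores, weights)) / sum(weights))

# Example: consistency 80, readability 90, setup 70, reproducibility 60,
# structure 85, code quality 75, verification 65, performance 50
print(weighted_overall_score([80, 90, 70, 60, 85, 75, 65, 50], [3, 3, 3, 1, 1, 2, 1, 1]))  # -> 75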