bioguider 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bioguider might be problematic. Click here for more details.

@@ -83,6 +83,6 @@ class AgentTask(ABC):
83
83
  print(s)
84
84
 
85
85
  return s
86
-
86
+
87
87
 
88
88
 
@@ -1,5 +1,6 @@
1
1
 
2
2
  import json
3
+ from json import JSONDecodeError
3
4
  import os
4
5
  import re
5
6
  import subprocess
@@ -376,4 +377,33 @@ def escape_braces(text: str) -> str:
376
377
  text = re.sub(r'(?<!})}(?!})', '}}', text)
377
378
  # Then replace single { not part of {{
378
379
  text = re.sub(r'(?<!{){(?!{)', '{{', text)
379
- return text
380
+ return text
381
+
382
+ def try_parse_json_object(json_obj: str) -> dict | None:
383
+ json_obj = json_obj.strip()
384
+
385
+ # First, try to parse
386
+ try:
387
+ obj = json.loads(json_obj)
388
+ return obj
389
+ except JSONDecodeError as e:
390
+ logger.error(e)
391
+
392
+ # Second, let's handle some common errors
393
+ if not json_obj.startswith("{") and not json_obj.endswith("}") and ":" in json_obj:
394
+ json_obj = "{" + json_obj + "}"
395
+ if json_obj.startswith("{{"):
396
+ json_obj = json_obj[1:]
397
+ if json_obj.endswith("}}"):
398
+ json_obj = json_obj[:-1]
399
+
400
+ # Finally, let's try to parse again
401
+ try:
402
+ obj = json.loads(json_obj)
403
+ return obj
404
+ except JSONDecodeError as e:
405
+ logger.error(e)
406
+ return None
407
+ except Exception as e:
408
+ logger.error(e)
409
+ return None
@@ -24,7 +24,7 @@ from langgraph.graph import StateGraph, START, END
24
24
 
25
25
  from bioguider.database.summarized_file_db import SummarizedFilesDb
26
26
  from bioguider.utils.file_utils import get_file_type
27
- from bioguider.agents.agent_utils import read_directory
27
+ from bioguider.agents.agent_utils import read_directory, try_parse_json_object
28
28
  from bioguider.agents.collection_task_utils import (
29
29
  RELATED_FILE_GOAL_ITEM,
30
30
  CollectionWorkflowState,
@@ -172,28 +172,16 @@ class CollectionTask(AgentTask):
172
172
  if s["final_answer"] is None:
173
173
  return None
174
174
  result = s["final_answer"].strip()
175
- try:
176
- json_obj = json.loads(result)
177
- result = json_obj["final_answer"]
178
- if isinstance(result, str):
179
- result = result.strip()
180
- return [result]
181
- elif isinstance(result, list):
182
- return result
183
- else:
184
- logger.error(f"Final answer is not a valid JSON list or string: {result}")
185
- return None
186
- except json.JSONDecodeError:
175
+ the_obj = try_parse_json_object(result)
176
+ if the_obj is None or "final_answer" not in the_obj:
187
177
  logger.error(f"Final answer is not a valid JSON: {result}")
188
178
  return None
189
- except Exception as e:
190
- logger.error(str(e))
191
- return s
192
-
193
-
194
-
195
-
196
-
197
-
198
-
199
-
179
+ final_result = the_obj["final_answer"]
180
+ if isinstance(final_result, str):
181
+ final_result = final_result.strip()
182
+ return [final_result]
183
+ elif isinstance(final_result, list):
184
+ return final_result
185
+ else:
186
+ logger.error(f"Final answer is not a valid JSON list or string: {result}")
187
+ return None
@@ -154,7 +154,7 @@ class CommonAgentTwoChainSteps(CommonAgentTwoSteps):
154
154
  def _invoke_agent(self, system_prompt, instruction_prompt, schema, post_process = None, **kwargs):
155
155
  # Initialize the callback handler
156
156
  callback_handler = OpenAICallbackHandler()
157
- processed_system_prompt = system_prompt.replace("{", "(").replace("}", ")")
157
+ processed_system_prompt = system_prompt.replace("{", "{{").replace("}", "}}")
158
158
  cot_prompt = self._build_prompt_for_cot_step(
159
159
  system_prompt=processed_system_prompt,
160
160
  instruction_prompt=instruction_prompt
@@ -9,6 +9,7 @@ from pydantic import BaseModel, Field
9
9
  from markdownify import markdownify as md
10
10
 
11
11
  from bioguider.agents.agent_utils import read_file
12
+ from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION
12
13
  from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
13
14
  from bioguider.rag.data_pipeline import count_tokens
14
15
  from .common_agent_2step import CommonAgentTwoSteps, CommonAgentTwoChainSteps
@@ -16,11 +17,52 @@ from .common_agent import CommonConversation
16
17
  from ..utils.pyphen_utils import PyphenReadability
17
18
  from ..utils.gitignore_checker import GitignoreChecker
18
19
  from .evaluation_task import EvaluationTask
19
- from .agent_utils import read_file
20
+ from .agent_utils import increase_token_usage, read_file
20
21
 
21
22
 
22
23
  logger = logging.getLogger(__name__)
23
24
 
25
+ STRUCTURED_EVALUATION_INSTALLATION_SYSTEM_PROMPT = """
26
+ You are an expert in evaluating the quality of installation information in software repositories.
27
+ Your task is to analyze the provided files related to installation and generate a structured quality assessment based on the following criteria.
28
+ ---
29
+
30
+ ### **Evaluation Criteria**
31
+
32
+ 1. **Installation Available**: Is the installation documents accessible and present?
33
+ * Output: `Yes` or `No`
34
+
35
+ 2. **Installation Tutorial**: Is the installation tutorial provided?
36
+ * Ouput: `Yes` or `No`
37
+
38
+ 3. **Number of required Dependencies Installation**: The number of dependencies that are required to install
39
+ * Output: Number
40
+ * Suggest specific improvements if necessary, such as missing dependencies
41
+
42
+ 4. **Overall Score**: Give an overall quality rating of the Installation information.
43
+ * Output: `Poor`, `Fair`, `Good`, or `Excellent`
44
+
45
+ ---
46
+
47
+ ### **Final Report Ouput**
48
+ Your final report must **exactly match** the following format. Do not add or omit any sections.
49
+
50
+ **FinalAnswer**
51
+ **Install Available:** [Yes / No]
52
+ **Install Tutorial:** [Yes / No]
53
+ **Dependency:**
54
+ * number: [Number]
55
+ * suggestions: <suggestion to improve **dependency information** like missing dependencies
56
+ **Overall Score:** [Poor / Fair / Good / Excellent]
57
+
58
+ ---
59
+
60
+ ### Installation Files Provided:
61
+ {installation_files_content}
62
+
63
+ """
64
+
65
+
24
66
  EVALUATION_INSTALLATION_SYSTEM_PROMPT = """
25
67
  You are an expert in evaluating the quality of **installation instructions** in software repositories.
26
68
  Your task is to analyze the provided content of installation-related files and generate a **comprehensive, structured quality report**.
@@ -62,10 +104,17 @@ Your response **must exactly follow** the structure below:
62
104
  ---
63
105
 
64
106
  ### Installation Files Provided:
65
- {installation_file_contents}
107
+ {installation_files_content}
66
108
 
67
109
  """
68
110
 
111
# Structured verdict on a repository's installation documentation.
# Documented with comments instead of a class docstring because pydantic
# copies a class docstring into the generated schema, and this model is
# passed as the output schema in _structured_evaluate.
class StructuredEvaluationInstallationResult(BaseModel):
    # Whether installation documents are present and accessible.
    install_available: Optional[bool] = Field(description="A boolean value. Is the installation documents accessible and present?")
    # Whether an installation tutorial is provided.
    install_tutorial: Optional[bool] = Field(description="A boolean value. Is the installation tutorial provided?")
    # Count of dependencies required for installation.
    dependency_number: Optional[int] = Field(description="A number. It is the number of dependencies that are required to install.")
    # Suggested improvements to the dependency information.
    dependency_suggestions: Optional[str] = Field(description="A string value. It is the specific improvements if necessary, such as missing dependencies")
    # Overall rating; the description previously read "scroll" instead of "score".
    overall_score: Optional[str] = Field(description="An overall score for the installation quality, could be `Poor`, `Fair`, `Good`, or `Excellent`")
117
+
69
118
  class EvaluationInstallationResult(BaseModel):
70
119
  ease_of_access: Optional[str]=Field(description="Is the installation information easy to access")
71
120
  score: Optional[str]=Field(description="An overall score, could be Poor, Fair, Good or Excellent")
@@ -118,10 +167,10 @@ class EvaluationInstallationTask(EvaluationTask):
118
167
  super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback)
119
168
  self.evaluation_name = "Installation Evaluation"
120
169
 
121
- def _evaluate(self, files: list[str] | None = None):
170
+
171
+ def _collect_install_files_content(self, files: list[str] | None=None) -> str:
122
172
  if files is None or len(files) == 0:
123
- return None
124
-
173
+ return "N/A"
125
174
  files_content = ""
126
175
  MAX_TOKENS = os.environ.get("OPENAI_MAX_INPUT_TOKENS", 102400)
127
176
  for f in files:
@@ -137,24 +186,64 @@ class EvaluationInstallationTask(EvaluationTask):
137
186
  {content}
138
187
 
139
188
  """
189
+ return files_content
190
+
191
def _structured_evaluate(self, files: list[str] | None = None) -> tuple[dict|None, dict]:
    """Run the structured installation evaluation over ``files``.

    Args:
        files: Installation-related files to evaluate.

    Returns:
        ``(None, <copy of DEFAULT_TOKEN_USAGE>)`` when ``files`` is ``None``
        or empty. Otherwise a pair of a dict holding the agent's result under
        ``"structured_evaluation"`` and its reasoning under
        ``"structured_reasoning_process"``, and the agent's token usage.
    """
    if not files:
        return None, dict(DEFAULT_TOKEN_USAGE)

    # Gather the file contents first, then fill them into the system prompt.
    files_content = self._collect_install_files_content(files)
    prompt_template = ChatPromptTemplate.from_template(
        STRUCTURED_EVALUATION_INSTALLATION_SYSTEM_PROMPT,
    )
    system_prompt = prompt_template.format(
        installation_files_content=files_content,
    )

    two_step_agent = CommonAgentTwoChainSteps(llm=self.llm)
    res, _, token_usage, reasoning_process = two_step_agent.go(
        system_prompt=system_prompt,
        instruction_prompt=EVALUATION_INSTRUCTION,
        schema=StructuredEvaluationInstallationResult,
    )

    # Report the reasoning and the token cost of this step.
    self.print_step(step_output=reasoning_process)
    self.print_step(token_usage=token_usage)

    structured_result = {
        "structured_evaluation": res,
        "structured_reasoning_process": reasoning_process,
    }
    return structured_result, token_usage
214
+
215
+ def _free_evaluate(self, files: list[str] | None=None) -> tuple[dict|None, dict]:
216
+ if files is None or len(files) == 0:
217
+ return None, {**DEFAULT_TOKEN_USAGE}
218
+
219
+ files_content = self._collect_install_files_content(files)
140
220
  system_prompt = ChatPromptTemplate.from_template(EVALUATION_INSTALLATION_SYSTEM_PROMPT).format(
141
- installation_file_contents=files_content
221
+ installation_files_content=files_content
142
222
  )
143
223
  agent = CommonAgentTwoChainSteps(llm=self.llm)
144
224
  res, _, token_usage, reasoning_process = agent.go(
145
225
  system_prompt=system_prompt,
146
- instruction_prompt="Before arriving at the conclusion, clearly explain your reasoning step by step. Now, let's begin the evaluation.",
226
+ instruction_prompt=EVALUATION_INSTRUCTION,
147
227
  schema=EvaluationInstallationResultSchema,
148
228
  )
149
229
  res = EvaluationInstallationResult(**res)
150
230
  self.print_step(step_output=reasoning_process)
231
+ self.print_step(token_usage=token_usage)
151
232
  evaluation = {
152
- "score": res.score,
153
- "ease_of_access": res.ease_of_access,
154
- "hardware_requirements": res.hardware_requirements,
155
- "clarity_of_dependency": res.clarity_of_dependency,
156
- "installation_guide": res.installation_guide,
233
+ "evaluation": res,
157
234
  "reasoning_process": reasoning_process,
158
235
  }
159
236
  return evaluation, token_usage
237
+
238
def _evaluate(self, files: list[str] | None = None) -> tuple[dict | None, dict]:
    """Run the free-form and structured evaluations and merge their results.

    Args:
        files: Installation-related files to evaluate.

    Returns:
        A pair ``(combined_evaluation, total_token_usage)``. The first item
        is the union of both result dicts, or ``None`` when neither
        evaluation produced a result (both helpers return ``None`` for a
        missing or empty ``files``). The second item is the value returned by
        ``increase_token_usage`` for the two token usages.
    """
    evaluation, token_usage = self._free_evaluate(files)
    structured_evaluation, structured_token_usage = self._structured_evaluate(files)
    total_token_usage = increase_token_usage(token_usage, structured_token_usage)

    # Either helper may return None; unpacking None with ** raises TypeError,
    # so report "no evaluation" instead of crashing.
    if evaluation is None and structured_evaluation is None:
        return None, total_token_usage

    combined_evaluation = {
        **(evaluation or {}),
        **(structured_evaluation or {}),
    }
    return combined_evaluation, total_token_usage
160
249
 
@@ -0,0 +1,473 @@
1
+
2
+ import logging
3
+ from pathlib import Path
4
+ from typing import Callable, Optional
5
+ from langchain.prompts import ChatPromptTemplate
6
+ from langchain_openai.chat_models.base import BaseChatOpenAI
7
+ from pydantic import BaseModel, Field
8
+
9
+ from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION
10
+
11
+ from ..utils.pyphen_utils import PyphenReadability
12
+ from bioguider.agents.agent_utils import increase_token_usage, read_file, summarize_file
13
+ from bioguider.agents.common_agent_2step import CommonAgentTwoChainSteps
14
+ from bioguider.agents.evaluation_task import EvaluationTask
15
+ from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ STRUCTURED_EVALUATION_README_SYSTEM_PROMPT = """
20
+ You are an expert in evaluating the quality of README files in software repositories.
21
+ Your task is to analyze the provided README file and generate a structured quality assessment based on the following criteria.
22
+ If a LICENSE file is present in the repository, its content will also be provided to support your evaluation of license-related criteria.
23
+ ---
24
+
25
+ ### **Evaluation Criteria**
26
+
27
+ 1. **Available**: Is the README accessible and present?
28
+ * Output: `Yes` or `No`
29
+
30
+ 2. **Readability**: Evaluate based on readability metrics such as Flesch-Kincaid Grade Level, SMOG Index, etc.
31
+ * Output: `Poor`, `Fair`, `Good`, or `Excellent`
32
+ * Suggest specific improvements if necessary
33
+
34
+ 3. **Project Purpose**: Is the project's goal or function clearly stated?
35
+ * Output: `Yes` or `No`
36
+ * Provide suggestions if unclear
37
+
38
+ 4. **Hardware and Software Requirements**: Are hardware/software specs and compatibility details included?
39
+ * Output: `Poor`, `Fair`, `Good`, or `Excellent`
40
+ * Suggest how to improve the section if needed
41
+
42
+ 5. **Dependencies**: Are all necessary software libraries and dependencies clearly listed?
43
+ * Output: `Poor`, `Fair`, `Good`, or `Excellent`
44
+ * Suggest improvements if applicable
45
+
46
+ 6. **License Information**: Is license type clearly indicated?
47
+ * Output: `Yes` or `No`
48
+ * Suggest improvement if missing or unclear
49
+
50
+ 7. **Author / Contributor Info**: Are contributor or maintainer details provided?
51
+ * Output: `Yes` or `No`
52
+ * Suggest improvement if missing
53
+
54
+ 8. **Overall Score**: Give an overall quality rating of the README.
55
+ * Output: `Poor`, `Fair`, `Good`, or `Excellent`
56
+
57
+ ---
58
+
59
+ ### **Readability Metrics**
60
+ * **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
61
+ * **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
62
+ * **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
63
+ * **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
64
+
65
+ ---
66
+
67
+ ### **Final Report Ouput**
68
+ Your final report must **exactly match** the following format. Do not add or omit any sections.
69
+
70
+ **FinalAnswer**
71
+ **Available:** [Yes / No]
72
+ **Readability:**
73
+ * score: [Poor / Fair / Good / Excellent]
74
+ * suggestions: <suggestions to improve README readability>
75
+ **Project Purpose:**
76
+ * score: [Yes / No]
77
+ * suggestions: <suggestions to improve project purpose.>
78
+ **Hardware and software spec and compatibility description:**
79
+ * score: [Poor / Fair / Good / Excellent]
80
+ * suggestions: <suggestions to improve **hardware and software** description>
81
+ **Dependencies clearly stated:**
82
+ * score: [Poor / Fair / Good / Excellent]
83
+ * suggestions: <suggestions to improve **Dependencies** description>
84
+ **License Information Included:**
85
+ * score: [Yes / No]
86
+ * suggestions: <suggestions to improve **License Information**>
87
+ **Overall Score:** [Poor / Fair / Good / Excellent]
88
+
89
+ ---
90
+
91
+ ### **README Path**
92
+ {readme_path}
93
+
94
+ ---
95
+
96
+ ### **README content**
97
+ {readme_content}
98
+
99
+ ---
100
+
101
+ ### **LICENSE Path**
102
+ {license_path}
103
+
104
+ ---
105
+
106
+ ### **LICENSE Summarized Content**
107
+ {license_summarized_content}
108
+
109
+ """
110
+
111
+ EVALUATION_README_SYSTEM_PROMPT = """
112
+ You are an expert in evaluating the quality of README files in software repositories.
113
+ Your task is to analyze the provided README file and generate a comprehensive quality report.
114
+
115
+ ---
116
+
117
+ ### **Step 1: Identify README type
118
+
119
+ First, determine whether the provided README is a **project-level README** (typically at the root of a repository) or a **folder-level README** (typically inside subdirectories).
120
+
121
+ ---
122
+
123
+ ### **Evaluation Criteria**
124
+
125
+ #### If the README is a **project-level** file, evaluate it using the following criteria.
126
+
127
+ For each criterion below, provide a brief assessment followed by specific, actionable comments for improvement.
128
+
129
+ **1. Project Clarity & Purpose**
130
+ * **Assessment**: [Your evaluation of whether the project's purpose is clear.]
131
+ * **Improvement Suggestions**:
132
+ * **Original text:** [Quote a specific line/section from the README.]
133
+ * **Improving comments:** [Provide your suggestions to improve clarity.]
134
+ * **Original text:** [Quote a specific line/section from the README.]
135
+ * **Improving comments:** [Provide your suggestions to improve clarity.]
136
+ ...
137
+
138
+ **2. Installation Instructions**
139
+ * **Assessment**: [Your evaluation of the installation instructions.]
140
+ * **Improvement Suggestions**:
141
+ * **Original text:** [Quote text related to installation.]
142
+ * **Improving comments:** [Provide your suggestions.]
143
+ * **Original text:** [Quote text related to installation.]
144
+ * **Improving comments:** [Provide your suggestions.]
145
+ ...
146
+
147
+ **3. Usage Instructions**
148
+ * **Assessment**: [Your evaluation of the usage instructions.]
149
+ * **Improvement Suggestions**:
150
+ * **Original text:** [Quote text related to usage.]
151
+ * **Improving comments:** [Provide your suggestions.]
152
+ * **Original text:** [Quote text related to usage.]
153
+ * **Improving comments:** [Provide your suggestions.]
154
+ ...
155
+
156
+ **4. Contributing Guidelines**
157
+ * **Assessment**: [Your evaluation of the contributing guidelines.]
158
+ * **Improvement Suggestions**:
159
+ * **Original text:** [Quote text related to contributions.]
160
+ * **Improving comments:** [Provide your suggestions.]
161
+ * **Original text:** [Quote text related to contributions.]
162
+ * **Improving comments:** [Provide your suggestions.]
163
+ ...
164
+
165
+ **5. License Information**
166
+ * **Assessment**: [Your evaluation of the license information.]
167
+ * **Improvement Suggestions**:
168
+ * **Original text:** [Quote text related to the license.]
169
+ * **Improving comments:** [Provide your suggestions.]
170
+ * **Original text:** [Quote text related to the license.]
171
+ * **Improving comments:** [Provide your suggestions.]
172
+ ...
173
+
174
+ **6. Readability Analysis**
175
+ * **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
176
+ * **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
177
+ * **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
178
+ * **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
179
+ * **Assessment**: Based on these scores, evaluate the overall readability and technical complexity of the language used.
180
+
181
+ ---
182
+
183
+ #### If if is a **folder-level** file, use the following criteria instead.
184
+
185
+ For each criterion below, provide a brief assessment followed by specific, actionable comments for improvement.
186
+
187
+ **1. Folder Description**
188
+ * **Assessment**: [Your evaluation of whether it Provides a clear **description** of what the folder contains (e.g., modules, scripts, data).]
189
+ * **Improvement Suggestions**:
190
+ * **Original text:** [Quote a specific line/section from the README.]
191
+ * **Improving comments:** [Provide your suggestions to improve clarity.]
192
+
193
+ **2. Folder Purpose**
194
+ * **Assessment**: [Your evaluation of whether it explains the **purpose** or **role** of the components inside this subfolder.]
195
+ * **Improvement Suggestions**:
196
+ * **Original text:** [Quote text related to purpose.]
197
+ * **Improving comments:** [Provide your suggestions.]
198
+
199
+ **3. Usage**
200
+ * **Assessment**: [Your evaluation of whether it includes **usage instructions** specific to this folder (e.g., commands, import paths, input/output files).]
201
+ * **Improvement Suggestions**:
202
+ * **Original text:** [Quote text related to usage.]
203
+ * **Improving comments:** [Provide your suggestions.]
204
+
205
+ **4. Readability Analysis**
206
+ * **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
207
+ * **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
208
+ * **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
209
+ * **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
210
+ * **Assessment**: Based on these scores, evaluate the overall readability and technical complexity of the language used.
211
+
212
+ ---
213
+
214
+ ### Final Report Format
215
+
216
+ #### Your output **must exactly match** the following template:
217
+
218
+ **FinalAnswer**
219
+
220
+ * Project-Level README: Yes / No
221
+ * **Score:** [Poor / Fair / Good / Excellent]
222
+ * **Key Strengths**: <brief summary of the README's strongest points in 2-3 sentences>
223
+ * **Overall Improvement Suggestions:**
224
+ - "Original text snippet 1" - Improving comment 1
225
+ - "Original text snippet 2" - Improving comment 2
226
+ - ...
227
+
228
+ #### Notes
229
+
230
+ * **Project-Level README**: "Yes" if root-level; "No" if folder-level.
231
+ * **Score**: Overall quality rating, could be Poor / Fair / Good / Excellent.
232
+ * **Key Strengths**: Briefly highlight the README's strongest aspects.
233
+ * **Improvement Suggestions**: Provide concrete snippets and suggested improvements.
234
+
235
+
236
+ ---
237
+
238
+ ### **README path:**
239
+ {readme_path}
240
+
241
+ ---
242
+
243
+ ### **README Content:**
244
+ {readme_content}
245
+ """
246
+
247
+
248
# Structured verdict on a README file.
# Documented with comments instead of a class docstring because pydantic
# copies a class docstring into the generated schema, and this model is
# passed as the output schema in EvaluationREADMETask._structured_evaluate.
class StructuredEvaluationREADMEResult(BaseModel):
    # Availability of the README.
    available_score: Optional[bool] = Field(description="A boolean value, Is the README accessible and present?")
    # Readability rating and how to improve it.
    readability_score: Optional[str] = Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
    readability_suggestions: Optional[str] = Field(description="Suggestions to improve readability if necessary")
    # Whether the project's purpose is clearly stated.
    project_purpose_score: Optional[bool] = Field(description="A boolean value. Is the project's goal or function clearly stated?")
    project_purpose_suggestions: Optional[str] = Field(description="Suggestions if not clear")
    # Hardware/software requirements rating.
    hardware_and_software_spec_score: Optional[str] = Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
    hardware_and_software_spec_suggestions: Optional[str] = Field(description="Suggestions if not clear")
    # Dependency listing rating.
    dependency_score: Optional[str] = Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
    dependency_suggestions: Optional[str] = Field(description="Suggestions if dependencies are not clearly stated")
    # License criterion. The description previously repeated the contributor
    # question, which asked the model about the wrong criterion.
    license_score: Optional[bool] = Field(description="A boolean value, Is license type clearly indicated?")
    license_suggestions: Optional[str] = Field(description="Suggestions to improve license information")
    # Overall rating; the description previously read "scroll" instead of "score".
    overall_score: str = Field(description="An overall score for the README quality, could be `Poor`, `Fair`, `Good`, or `Excellent`")
261
+
262
# Result of the free-form README evaluation; EvaluationREADMETask._free_evaluate
# builds it from the dict returned by the agent.
class EvaluationREADMEResult(BaseModel):
    # True for a project-level README, False for a folder-level one.
    project_level: Optional[bool] = Field(
        description="A boolean value specifying if the README file is **project-level** README. TRUE: project-level, FALSE, folder-level",
    )
    # Overall rating of the README.
    score: Optional[str] = Field(
        description="An overall score",
    )
    # Short summary of what the README does well.
    key_strengths: Optional[str] = Field(
        description="A string specifying the key strengths of README file.",
    )
    # Concrete improvement suggestions.
    overall_improvement_suggestions: Optional[list[str]] = Field(
        description="A list of overall improvement suggestions",
    )
267
+
268
# Hand-written JSON schema for the free-form README evaluation result.
# Every property accepts either its value type or null, and all four are required.
EvaluationREADMEResultSchema = {
    "title": "EvaluationREADMEResult",
    "type": "object",
    "properties": {
        # Project-level (true) versus folder-level (false) README.
        "project_level": {
            "anyOf": [
                {"type": "boolean"},
                {"type": "null"},
            ],
            "description": "A boolean value specifying if the README file is **project-level** README. TRUE: project-level, FALSE: folder-level.",
            "title": "Project Level",
        },
        # Overall rating.
        "score": {
            "anyOf": [
                {"type": "string"},
                {"type": "null"},
            ],
            "description": "An overall score",
            "title": "Score",
        },
        # Summary of the README's strongest points.
        "key_strengths": {
            "anyOf": [
                {"type": "string"},
                {"type": "null"},
            ],
            "description": "A string specifying the key strengths of README file.",
            "title": "Key Strengths",
        },
        # List of improvement suggestions.
        "overall_improvement_suggestions": {
            "anyOf": [
                {"items": {"type": "string"}, "type": "array"},
                {"type": "null"},
            ],
            "description": "A list of improvement suggestions",
            "title": "Overall Improvement Suggestions",
        },
    },
    "required": [
        "project_level",
        "score",
        "key_strengths",
        "overall_improvement_suggestions",
    ],
}
295
+
296
class EvaluationREADMETask(EvaluationTask):
    """Evaluate README files in two passes and merge the results per file.

    ``_free_evaluate`` reviews every README it is given. ``_structured_evaluate``
    then scores only the READMEs the first pass marked as project-level.
    ``_evaluate`` combines both results under each README's path.
    """

    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str,
        gitignore_path: str,
        meta_data: ProjectMetadata | None = None,
        step_callback: Callable | None = None
    ):
        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback)
        self.evaluation_name = "README Evaluation"

    def _structured_evaluate(self, free_readme_evaluations: dict[str, dict]):
        """Score project-level READMEs against the structured criteria.

        The structured result covers availability, readability, project
        purpose, hardware/software requirements, dependencies, license
        information and an overall score.

        Args:
            free_readme_evaluations: Output of ``_free_evaluate``, keyed by
                README path. Entries whose ``"evaluation"["project_level"]``
                is falsy are skipped.

        Returns:
            ``(None, <copy of DEFAULT_TOKEN_USAGE>)`` when the argument is
            ``None``. Otherwise a pair of a dict keyed by README path, each
            value holding ``"structured_evaluation"`` and
            ``"structured_reasoning_process"``, and the accumulated token
            usage of the agent calls.
        """
        total_token_usage = {**DEFAULT_TOKEN_USAGE}
        if free_readme_evaluations is None:
            return None, total_token_usage

        # The LICENSE file, when readable, is summarized once and shared by
        # every README prompt; otherwise both placeholders become "N/A".
        license_path = "LICENSE"
        license_content = read_file(Path(self.repo_path, license_path))
        license_summarized_content = summarize_file(
            llm=self.llm,
            name=license_path,
            content=license_content,
            level=6,
            summary_instructions="What license is the repository using?",
        ) if license_content is not None else "N/A"
        license_path = license_path if license_content is not None else "N/A"

        structured_readme_evaluations = {}
        for readme_file, free_result in free_readme_evaluations.items():
            evaluation = free_result["evaluation"]
            if not evaluation["project_level"]:
                continue
            full_path = Path(self.repo_path, readme_file)
            readme_content = read_file(full_path)
            if readme_content is None:
                logger.error(f"Error in reading file {readme_file}")
                continue
            if len(readme_content.strip()) == 0:
                # An empty README gets a fixed worst-case result, no agent call.
                structured_readme_evaluations[readme_file] = {
                    "structured_evaluation": StructuredEvaluationREADMEResult(
                        available_score=False,
                        readability_score="Poor",
                        readability_suggestions="No readability provided",
                        project_purpose_score=False,
                        project_purpose_suggestions="No project purpose provided",
                        hardware_and_software_spec_score="Poor",
                        hardware_and_software_spec_suggestions="No hardware and software spec provided",
                        dependency_score="Poor",
                        dependency_suggestions="No dependency provided",
                        license_score=False,
                        license_suggestions="No license information",
                        overall_score="Poor",
                    ),
                    "structured_reasoning_process": f"{readme_file} is an empty file.",
                }
                continue

            # Only the first four of the nine returned metrics feed the prompt.
            readability = PyphenReadability()
            flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, \
                _, _, _, _, _ = readability.readability_metrics(readme_content)
            system_prompt = ChatPromptTemplate.from_template(
                STRUCTURED_EVALUATION_README_SYSTEM_PROMPT
            ).format(
                readme_path=readme_file,
                readme_content=readme_content,
                license_path=license_path,
                license_summarized_content=license_summarized_content,
                flesch_reading_ease=flesch_reading_ease,
                flesch_kincaid_grade=flesch_kincaid_grade,
                gunning_fog_index=gunning_fog_index,
                smog_index=smog_index,
            )
            agent = CommonAgentTwoChainSteps(llm=self.llm)
            response, _, token_usage, reasoning_process = agent.go(
                system_prompt=system_prompt,
                instruction_prompt=EVALUATION_INSTRUCTION,
                schema=StructuredEvaluationREADMEResult,
            )
            self.print_step(step_output=f"README: {readme_file} structured evaluation")
            self.print_step(step_output=reasoning_process)
            structured_readme_evaluations[readme_file] = {
                "structured_evaluation": response,
                "structured_reasoning_process": reasoning_process,
            }
            total_token_usage = increase_token_usage(total_token_usage, token_usage)

        return structured_readme_evaluations, total_token_usage

    def _free_evaluate(self, files: list[str]):
        """Produce a free-form evaluation for each README in ``files``.

        Args:
            files: README paths relative to ``self.repo_path``.

        Returns:
            ``(None, <copy of DEFAULT_TOKEN_USAGE>)`` when ``files`` is
            ``None`` or empty. Otherwise a pair of a dict keyed by README
            path, each value holding ``"evaluation"`` and
            ``"reasoning_process"``, and the accumulated token usage.
            READMEs that cannot be read are logged and left out of the dict.
        """
        readme_files = files
        if readme_files is None or len(readme_files) == 0:
            return None, {**DEFAULT_TOKEN_USAGE}

        readme_evaluations = {}
        total_token_usage = {**DEFAULT_TOKEN_USAGE}
        for readme_file in readme_files:
            readme_path = Path(self.repo_path, readme_file)
            readme_content = read_file(readme_path)
            if readme_content is None:
                logger.error(f"Error in reading file {readme_file}")
                continue
            if len(readme_content.strip()) == 0:
                # An empty README gets a fixed "Poor" result, no agent call.
                # A path without "/" is treated as project-level.
                readme_evaluations[readme_file] = {
                    "evaluation": {
                        "project_level": "/" not in readme_file,
                        "score": "Poor",
                        "key_strengths": f"{readme_file} is an empty file.",
                        "overall_improvement_suggestions": f"{readme_file} is an empty file.",
                    },
                    "reasoning_process": f"{readme_file} is an empty file.",
                }
                continue

            # Only the first four of the nine returned metrics feed the prompt.
            readability = PyphenReadability()
            flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, \
                _, _, _, _, _ = readability.readability_metrics(readme_content)
            system_prompt = ChatPromptTemplate.from_template(
                EVALUATION_README_SYSTEM_PROMPT
            ).format(
                readme_content=readme_content,
                readme_path=readme_file,
                flesch_reading_ease=flesch_reading_ease,
                flesch_kincaid_grade=flesch_kincaid_grade,
                gunning_fog_index=gunning_fog_index,
                smog_index=smog_index,
            )
            agent = CommonAgentTwoChainSteps(llm=self.llm)
            response, _, token_usage, reasoning_process = agent.go(
                system_prompt=system_prompt,
                instruction_prompt=EVALUATION_INSTRUCTION,
                schema=EvaluationREADMEResultSchema,
            )
            response = EvaluationREADMEResult(**response)
            self.print_step(step_output=f"README: {readme_file} free evaluation")
            self.print_step(step_output=reasoning_process)
            readme_evaluations[readme_file] = {
                "evaluation": {
                    "project_level": response.project_level,
                    "score": response.score,
                    "key_strengths": response.key_strengths,
                    "overall_improvement_suggestions": response.overall_improvement_suggestions,
                },
                "reasoning_process": reasoning_process
            }
            total_token_usage = increase_token_usage(total_token_usage, token_usage)
        return readme_evaluations, total_token_usage

    def _evaluate(self, files: list[str]) -> tuple[dict, dict]:
        """Run both evaluation passes and merge their results per README.

        Args:
            files: README paths relative to ``self.repo_path``.

        Returns:
            A pair ``(combined_evaluations, total_token_usage)``. The dict is
            keyed by README path and holds the free-form entry, extended with
            the structured entry when one exists for that path. It is empty
            when the free-form pass produced nothing.
        """
        free_readme_evaluations, free_token_usage = self._free_evaluate(files)
        structured_readme_evaluations, structured_token_usage = self._structured_evaluate(free_readme_evaluations)
        total_token_usage = increase_token_usage(free_token_usage, structured_token_usage)

        # No README was given, so there is nothing to merge.
        if free_readme_evaluations is None:
            return {}, total_token_usage

        # Iterate over the READMEs that were actually evaluated: files that
        # could not be read have no free-form entry, so indexing by `files`
        # would raise KeyError. Each result is stored under its own path
        # rather than replacing the whole mapping.
        combined_evaluations = {}
        for readme_file, free_evaluation in free_readme_evaluations.items():
            combined_evaluations[readme_file] = {
                **free_evaluation,
                **structured_readme_evaluations.get(readme_file, {}),
            }

        return combined_evaluations, total_token_usage
472
+
473
+
@@ -2,18 +2,16 @@
2
2
  import os
3
3
  from pathlib import Path
4
4
  import logging
5
- from typing import Callable, Optional
5
+ from typing import Callable
6
6
  from abc import ABC, abstractmethod
7
7
  from langchain.prompts import ChatPromptTemplate
8
8
  from langchain_openai.chat_models.base import BaseChatOpenAI
9
- from pydantic import BaseModel, Field
10
9
 
11
10
  from bioguider.agents.agent_utils import read_file
11
+ from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION
12
12
  from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
13
- from .common_agent_2step import CommonAgentTwoSteps, CommonAgentTwoChainSteps
14
13
  from .common_agent import CommonConversation
15
14
  from ..utils.pyphen_utils import PyphenReadability
16
- from ..utils.gitignore_checker import GitignoreChecker
17
15
 
18
16
  logger = logging.getLogger(__name__)
19
17
 
@@ -198,110 +196,7 @@ class EvaluationTask(ABC):
198
196
  def _evaluate(self, files: list[str]) -> tuple[dict, dict]:
199
197
  pass
200
198
 
201
- class EvaluationREADMEResult(BaseModel):
202
- project_level: Optional[bool]=Field(description="A boolean value specifying if the README file is **project-level** README. TRUE: project-level, FALSE, folder-level")
203
- score: Optional[float]=Field(description="An overall score")
204
- key_strengths: Optional[str]=Field(description="A string specifying the key strengths of README file.")
205
- overall_improvement_suggestions: Optional[list[str]]=Field(description="A list of overall improvement suggestions")
206
-
207
- EvaluationREADMEResultSchema = {
208
- "title": "EvaluationREADMEResult",
209
- "type": "object",
210
- "properties": {
211
- "project_level": {
212
- "anyOf": [{"type": "boolean"}, {"type": "null"}],
213
- "description": "A boolean value specifying if the README file is **project-level** README. TRUE: project-level, FALSE: folder-level.",
214
- "title": "Project Level"
215
- },
216
- "score": {
217
- "anyOf": [{"type": "number"}, {"type": "null"}],
218
- "description": "An overall score",
219
- "title": "Score"
220
- },
221
- "key_strengths": {
222
- "anyOf": [{"type": "string"}, {"type": "null"}],
223
- "description": "A string specifying the key strengths of README file.",
224
- "title": "Key Strengths",
225
- },
226
- "overall_improvement_suggestions": {
227
- "anyOf": [{"items": {"type": "string"}, "type": "array"}, {"type": "null"}],
228
- "description": "A list of improvement suggestions",
229
- "title": "Overall Improvement Suggestions"
230
- }
231
- },
232
- "required": ["project_level", "score", "key_strengths", "overall_improvement_suggestions"]
233
- }
234
-
235
- class EvaluationREADMETask(EvaluationTask):
236
- def __init__(
237
- self,
238
- llm: BaseChatOpenAI,
239
- repo_path: str,
240
- gitignore_path: str,
241
- meta_data: ProjectMetadata | None = None,
242
- step_callback: Callable | None = None
243
- ):
244
- super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback)
245
- self.evaluation_name = "README Evaluation"
246
-
247
- def _evaluate(self, files: list[str]) -> tuple[dict, dict]:
248
- readme_files = files
249
- if readme_files is None or len(readme_files) == 0:
250
- return None
251
-
252
- readme_evaluations = {}
253
- for readme_file in readme_files:
254
- readme_path = Path(self.repo_path, readme_file)
255
- readme_content = read_file(readme_path)
256
- if readme_content is None:
257
- logger.error(f"Error in reading file {readme_file}")
258
- continue
259
- if len(readme_content.strip()) == 0:
260
- readme_evaluations[readme_file] = {
261
- "evaluation": {
262
- "project_level": "/" in readme_file,
263
- "score": 0,
264
- "key_strengths": f"{readme_file} is an empty file.",
265
- "overall_improvement_suggestions": f"{readme_file} is an empty file.",
266
- },
267
- "reasoning_process": f"{readme_file} is an empty file.",
268
- }
269
- continue
270
-
271
- readability = PyphenReadability()
272
- flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, \
273
- _, _, _, _, _ = readability.readability_metrics(readme_content)
274
- system_prompt = ChatPromptTemplate.from_template(
275
- EVALUATION_README_SYSTEM_PROMPT
276
- ).format(
277
- readme_content=readme_content,
278
- readme_path=readme_file,
279
- flesch_reading_ease=flesch_reading_ease,
280
- flesch_kincaid_grade=flesch_kincaid_grade,
281
- gunning_fog_index=gunning_fog_index,
282
- smog_index=smog_index,
283
- )
284
- # conversation = CommonConversation(llm=self.llm)
285
- agent = CommonAgentTwoChainSteps(llm=self.llm)
286
- response, _, token_usage, reasoning_process = agent.go(
287
- system_prompt=system_prompt,
288
- instruction_prompt="Before arriving at the conclusion, clearly explain your reasoning step by step. Now, let's begin the evaluation.",
289
- schema=EvaluationREADMEResultSchema,
290
- )
291
- response = EvaluationREADMEResult(**response)
292
- self.print_step(step_output=f"README: {readme_file}")
293
- self.print_step(step_output=reasoning_process)
294
- readme_evaluations[readme_file] = {
295
- "evaluation": {
296
- "project_level": response.project_level,
297
- "score": response.score,
298
- "key_strengths": response.key_strengths,
299
- "overall_improvement_suggestions": response.overall_improvement_suggestions,
300
- },
301
- "reasoning_process": reasoning_process
302
- }
303
- return readme_evaluations, token_usage
304
-
199
+
305
200
  EVALUATION_TUTORIAL_SYSTEM_PROMPT="""
306
201
  You are an expert in software documentation and developer education.
307
202
  You are given the content of a tutorial file from a GitHub repository. Your task is to **critically evaluate** the quality of this tutorial based on best practices in technical writing and developer onboarding.
@@ -399,7 +294,7 @@ class EvaluationTutorialTask(EvaluationTask):
399
294
  conversation = CommonConversation(llm=self.llm)
400
295
  response, token_usage = conversation.generate(
401
296
  system_prompt=system_prompt,
402
- instruction_prompt="Before arriving at the conclusion, clearly explain your reasoning step by step. Now, let's begin the evaluation."
297
+ instruction_prompt=EVALUATION_INSTRUCTION,
403
298
  )
404
299
  self.print_step(step_output=f"Tutorial: {file}")
405
300
  self.print_step(step_output=response)
@@ -25,7 +25,8 @@ Carefully review the **Goal**, **Repository File Structure**, and **Intermediate
25
25
  - Then provide your result under **FinalAnswer**
26
26
  ```
27
27
  **Analysis**: your analysis here
28
- **FinalAnswer**: your final answer here, in json format **without** json fence (```json ... ```), like {final_answer_example}
28
+ **FinalAnswer**: your final answer here, in **raw json format**, **including** the surrounding "{{}}" but **without** using code fence (```json ... ```),
29
+ For example, output exactly: {final_answer_example}
29
30
  ```
30
31
  - If the information is **not sufficient** to achieve the goal, simply explain why under **Thoughts**:
31
32
  ```
@@ -18,6 +18,7 @@ from bioguider.agents.agent_tools import (
18
18
  )
19
19
  from bioguider.agents.agent_utils import (
20
20
  read_directory,
21
+ try_parse_json_object,
21
22
  )
22
23
  from bioguider.agents.identification_execute_step import IdentificationExecuteStep
23
24
  from bioguider.agents.identification_observe_step import IdentificationObserveStep
@@ -189,13 +190,18 @@ class IdentificationTask(AgentTask):
189
190
 
190
191
 
191
192
  def _parse_project_type(self, proj_type_obj: str) -> ProjectTypeEnum:
192
- try:
193
- json_obj = json.loads(proj_type_obj)
194
- proj_type = json_obj["project_type"]
195
- except Exception as e:
196
- logger.error(e)
197
- return ProjectTypeEnum.unknown
198
- proj_type = proj_type.strip()
193
+ proj_type_obj = proj_type_obj.strip()
194
+ the_obj = try_parse_json_object(proj_type_obj)
195
+ if not the_obj is None and "project_type" in the_obj:
196
+ proj_type = the_obj["project_type"]
197
+ elif proj_type_obj in [
198
+ ProjectTypeEnum.application.value,
199
+ ProjectTypeEnum.package.value,
200
+ ProjectTypeEnum.pipeline.value
201
+ ]:
202
+ return ProjectTypeEnum(proj_type_obj)
203
+ else:
204
+ proj_type = "unknown"
199
205
  if proj_type == "application":
200
206
  return ProjectTypeEnum.application
201
207
  elif proj_type == "package":
@@ -206,12 +212,19 @@ class IdentificationTask(AgentTask):
206
212
  return ProjectTypeEnum.unknown
207
213
 
208
214
  def _parse_primary_language(self, language_obj: str) -> PrimaryLanguageEnum:
209
- try:
210
- json_obj = json.loads(language_obj)
211
- language = json_obj["primary_language"]
212
- except Exception as e:
213
- logger.error(e)
214
- return PrimaryLanguageEnum.unknown
215
+ # try to handle some common errors
216
+ language_obj = language_obj.strip()
217
+ the_obj = try_parse_json_object(language_obj)
218
+ if not the_obj is None and "primary_language" in the_obj:
219
+ language = the_obj["primary_language"]
220
+ elif language_obj in [
221
+ PrimaryLanguageEnum.python.value,
222
+ PrimaryLanguageEnum.R.value,
223
+ ]:
224
+ return PrimaryLanguageEnum(language_obj)
225
+ else:
226
+ language = "unknown"
227
+
215
228
  language = language.strip()
216
229
  if language == "python":
217
230
  return PrimaryLanguageEnum.python
@@ -221,15 +234,14 @@ class IdentificationTask(AgentTask):
221
234
  return PrimaryLanguageEnum.unknown
222
235
 
223
236
  def _parse_meta_data(self, meta_data_obj: str) -> dict:
224
- try:
225
- json_obj = json.loads(meta_data_obj)
226
- meta_data = json_obj
227
- return meta_data
228
- except Exception as e:
229
- logger.error(e)
230
- return {
231
- "name": "unknown",
232
- "description": "unknown",
233
- "license": "unknown",
234
- "owner": "unknown",
235
- }
237
+ meta_data_obj = meta_data_obj.strip()
238
+ the_obj = try_parse_json_object(meta_data_obj)
239
+
240
+ return the_obj if the_obj is not None else {
241
+ "name": "unknown",
242
+ "description": "unknown",
243
+ "license": "unknown",
244
+ "owner": "unknown",
245
+ }
246
+
247
+
@@ -82,6 +82,7 @@ IDENTIFICATION_GOAL_META_DATA = """Identify the following meta data of the repos
82
82
  """
83
83
 
84
84
  COT_USER_INSTRUCTION = "Do not give the answer immediately. First, explain your reasoning process step by step, then provide the answer."
85
+ EVALUATION_INSTRUCTION="Before arriving at the conclusion, clearly explain your reasoning step by step. Now, let's begin the evaluation."
85
86
 
86
87
  class CollectionGoalItemEnum(Enum):
87
88
  UserGuide = "User Guide"
@@ -188,3 +189,5 @@ If **any one** of these is present, the document should be classified as Contrib
188
189
  },
189
190
  }
190
191
 
192
+
193
+
@@ -9,7 +9,7 @@ from ..agents.identification_task import IdentificationTask
9
9
  from ..rag.rag import RAG
10
10
  from ..utils.file_utils import parse_repo_url
11
11
  from ..database.summarized_file_db import SummarizedFilesDb
12
- from ..agents.evaluation_task import EvaluationREADMETask
12
+ from ..agents.evaluation_readme_task import EvaluationREADMETask
13
13
  from ..agents.evaluation_installation_task import EvaluationInstallationTask
14
14
  from ..agents.collection_task import CollectionTask
15
15
 
@@ -25,8 +25,8 @@ class EvaluationManager:
25
25
  self.repo_url = repo_url
26
26
  self.rag = RAG()
27
27
  self.rag.initialize_db_manager()
28
- self.rag.prepare_retriever(repo_url_or_path=repo_url)
29
-
28
+ self.rag.initialize_repo(repo_url_or_path=repo_url)
29
+
30
30
  author, repo_name = parse_repo_url(repo_url)
31
31
  self.summary_file_db = SummarizedFilesDb(author, repo_name)
32
32
 
@@ -504,7 +504,11 @@ class DatabaseManager:
504
504
  self.repo_url_or_path = None
505
505
  self.repo_paths = None
506
506
 
507
- def prepare_database(self, repo_url_or_path: str, access_token: str = None) -> Tuple[List[Document], List[Document]]:
507
+ def reset_database_and_create_repo(self, repo_url_or_path: str, access_token: str = None):
508
+ self._reset_database()
509
+ self._create_repo(repo_url_or_path, access_token)
510
+
511
+ def prepare_database(self) -> Tuple[List[Document], List[Document]]:
508
512
  """
509
513
  Create a new database from the repository.
510
514
 
@@ -515,9 +519,7 @@ class DatabaseManager:
515
519
  Returns:
516
520
  Tuple[List[Document], List[Document]]: Tuple of two Lists of Document objects
517
521
  """
518
- self.reset_database()
519
- self._create_repo(repo_url_or_path, access_token)
520
- return self.prepare_db_index()
522
+ return self._prepare_db_index()
521
523
 
522
524
  def _extract_repo_name_from_url(self, repo_url_or_path: str, repo_type: str) -> str:
523
525
  # Extract owner and repo name to create unique identifier
@@ -534,7 +536,7 @@ class DatabaseManager:
534
536
  repo_name = url_parts[-1].replace(".git", "")
535
537
  return repo_name
536
538
 
537
- def reset_database(self):
539
+ def _reset_database(self):
538
540
  """
539
541
  Reset the database to its initial state.
540
542
  """
@@ -608,7 +610,7 @@ class DatabaseManager:
608
610
  return self.repo_paths["save_repo_dir"]
609
611
  return None
610
612
 
611
- def prepare_db_index(self) -> Tuple[List[Document], List[Document]]:
613
+ def _prepare_db_index(self) -> Tuple[List[Document], List[Document]]:
612
614
  """
613
615
  Prepare the indexed database for the repository.
614
616
  :return: Tuple of two Lists of Document objects
@@ -647,16 +649,3 @@ class DatabaseManager:
647
649
  logger.info(f"Total transformed code documents: {len(transformed_code_documents)}")
648
650
  return transformed_doc_documents, transformed_code_documents
649
651
 
650
- def prepare_retriever(self, repo_url_or_path: str, access_token: str = None):
651
- """
652
- Prepare the retriever for a repository.
653
- This is a compatibility method for the isolated API.
654
-
655
- Args:
656
- repo_url_or_path (str): The URL or local path of the repository
657
- access_token (str, optional): Access token for private repositories
658
-
659
- Returns:
660
- List[Document]: List of Document objects
661
- """
662
- return self.prepare_database(repo_url_or_path, access_token)
bioguider/rag/rag.py CHANGED
@@ -50,22 +50,25 @@ class RAG(adal.Component):
50
50
  def initialize_db_manager(self):
51
51
  """Initialize the database manager with local storage"""
52
52
  self.db_manager = DatabaseManager()
53
- self.transformed_doc_documents = []
54
- self.transformed_code_documents = []
53
+ self.transformed_doc_documents: list | None = None
54
+ self.transformed_code_documents: list | None = None
55
+ self.access_token: str | None = None
55
56
 
56
- def prepare_retriever(self, repo_url_or_path: str, access_token: str = None):
57
+ def initialize_repo(self, repo_url_or_path: str, access_token: str = None):
58
+ self.repo_url_or_path = repo_url_or_path
59
+ self.access_token = access_token
60
+ self.db_manager.reset_database_and_create_repo(repo_url_or_path, access_token)
61
+
62
+ def _prepare_retriever(self):
57
63
  """
58
64
  Prepare the retriever for a repository.
59
65
  Will load database from local storage if available.
60
-
61
- Args:
62
- repo_url_or_path: URL or local path to the repository
63
- access_token: Optional access token for private repositories
64
66
  """
65
- self.initialize_db_manager()
66
- self.repo_url_or_path = repo_url_or_path
67
+ if self.transformed_code_documents is not None and self.transformed_doc_documents is not None:
68
+ # retrievers have been prepared
69
+ return
67
70
  self.transformed_doc_documents, self.transformed_code_documents \
68
- = self.db_manager.prepare_database(repo_url_or_path, access_token)
71
+ = self.db_manager.prepare_database()
69
72
  logger.info(f"Loaded {len(self.transformed_doc_documents)} doc documents for retrieval")
70
73
  logger.info(f"Loaded {len(self.transformed_code_documents)} code documents for retrieval")
71
74
  self.doc_retriever = FAISSRetriever(
@@ -93,6 +96,7 @@ class RAG(adal.Component):
93
96
  Returns:
94
97
  retrieved_documents: List of documents retrieved based on the query
95
98
  """
99
+ self._prepare_retriever()
96
100
  retrieved_documents = self.doc_retriever(query)
97
101
  # Fill in the documents
98
102
  retrieved_documents[0].documents = [
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: bioguider
3
- Version: 0.2.9
3
+ Version: 0.2.11
4
4
  Summary: An AI-Powered package to help biomedical developers to generate clear documentation
5
5
  License: MIT
6
6
  Author: Cankun Wang
@@ -1,40 +1,41 @@
1
1
  bioguider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  bioguider/agents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- bioguider/agents/agent_task.py,sha256=SX4iLdGqQttT39qvr-RtXiSpQEzm7Z3ECVw8IGQzpDc,2828
3
+ bioguider/agents/agent_task.py,sha256=FrWCq_mG-Oo745qcZT3Lai4rd8hQ5IGK3jMNe1vFQrs,2820
4
4
  bioguider/agents/agent_tools.py,sha256=YWF44vGjTzK0H9dxfdZyJ5K2H4z2j1bz-Q0bVw1UoE8,7014
5
- bioguider/agents/agent_utils.py,sha256=GASgM8pwGcbs3xQ8RaryBtWCim19rAcd3_c4EDranmU,12843
5
+ bioguider/agents/agent_utils.py,sha256=FxZsssGapnHe0zruopfuBcctkEHL0zaA9So7dJvvtAg,13671
6
6
  bioguider/agents/collection_execute_step.py,sha256=Ev4BLjjmBdsc52M1zrq7QK8g7fsffDkSxu-jN2rvedw,5614
7
7
  bioguider/agents/collection_observe_step.py,sha256=iNeV6f16Emk1LMStSR4FXBPZ6Sc0eTjwxEfmoeegV-U,4554
8
8
  bioguider/agents/collection_plan_step.py,sha256=mx-_5Y3pqKDPBaMMyFElKlpq1GWN7g03ZplnlTr9ppE,5699
9
- bioguider/agents/collection_task.py,sha256=blrsS71aR-Du0vtO4MpFI6q0aUJdMvlAAYvHb1pBUfY,7368
9
+ bioguider/agents/collection_task.py,sha256=CLtPOqhlOgAfysMX2WYiGs3_O9W7qp3kh0wck6COiac,7304
10
10
  bioguider/agents/collection_task_utils.py,sha256=WRzzpMV6r8aY0FlX_zroHbLDXyrmvS48OSiBr_fIq2Q,3677
11
11
  bioguider/agents/common_agent.py,sha256=eGs8m8bjO0dmW6lDIen7DQNdWdHD7j8Udf3XhL1k6vI,5242
12
- bioguider/agents/common_agent_2step.py,sha256=IJ5SxqsK26oj8W3U4wnGtbJxHRrHEznaGCYFBXKUHn4,7916
12
+ bioguider/agents/common_agent_2step.py,sha256=Vton0RKtmMyEgIIFhnBk4CFU_hynX0LvwREcZ9kvMxQ,7918
13
13
  bioguider/agents/common_step.py,sha256=GdOCbmj1pwh4etg-futVFYVDQuoUG89DnIrw-B6QbzM,2594
14
14
  bioguider/agents/dockergeneration_execute_step.py,sha256=F92jDlkc6KjAvTkX7q1FsCYP8J15SCaNgmwh3YPqfDo,6500
15
15
  bioguider/agents/dockergeneration_observe_step.py,sha256=93PO_Y4YyUShVTKRt0nErcjb-xYTcwcZCj7TgniS9t4,6098
16
16
  bioguider/agents/dockergeneration_plan_step.py,sha256=SB8tQM9PkIKsD2o1DFD7bedcxz6r6hSy8n_EVK60Fz0,7235
17
17
  bioguider/agents/dockergeneration_task.py,sha256=ezsweVHJsFpOyOI6rYMt1DZ3PE19dcq4J3Lm-d0IA8M,6220
18
18
  bioguider/agents/dockergeneration_task_utils.py,sha256=v7emqrJlVW-A5ZdLmPSdiaMSKCR8uzy9UYzx_1cgzyo,9041
19
- bioguider/agents/evaluation_installation_task.py,sha256=G8oFpyiT99bGyHGgqE6eCW6_i5le64i3Hd7hSQkrndE,6498
20
- bioguider/agents/evaluation_task.py,sha256=0kwUkKixljs15VpasMCUdDjQH-xJwXzHV4GyNkGrmPc,17364
19
+ bioguider/agents/evaluation_installation_task.py,sha256=Lxgp4GZo6zZjSLV1bibnIbO03wH5klcLexALU8c5-lo,10195
20
+ bioguider/agents/evaluation_readme_task.py,sha256=QqCnTwPy4r_WnmdNlY5CkLRi94NSWLFt-cpk4urynR0,21492
21
+ bioguider/agents/evaluation_task.py,sha256=e-yJWhty9hvlvWaMYRoSoZs6Sjq9eLzBxtJstYaEIKY,12261
21
22
  bioguider/agents/identification_execute_step.py,sha256=w3IjL8f2WiHCyiLjVSoySnIAXpi1-hK1DLKCnXbAN2Y,5587
22
- bioguider/agents/identification_observe_step.py,sha256=OENwf9XyOSIHvJMp7eoyQOYGjjtPnPT2S29xf1rCATk,3667
23
+ bioguider/agents/identification_observe_step.py,sha256=j4Fniv86jljkClTFc-p3pA39_zxhGJLPS9K7jNpxhJ0,3750
23
24
  bioguider/agents/identification_plan_step.py,sha256=p0BKziXdB4ph4D_T9FU5bH8CbHD5Gv0YuszMds_xh-Y,5224
24
- bioguider/agents/identification_task.py,sha256=vQxNEkX1Sw-XK391Z2_bi3kjr0tcIU1u6y7JBaEXUFU,8790
25
+ bioguider/agents/identification_task.py,sha256=qJ46FdmctibXIzO4C2wBwXR7VLHUksBtFiILH2eIHB4,9277
25
26
  bioguider/agents/identification_task_utils.py,sha256=5gevknha9hJiiQN5L7Yp9-pyhAlbR_j31aGRK5j0D_w,522
26
27
  bioguider/agents/peo_common_step.py,sha256=iw2c1h7X11WJzSE2tSRg0UAoXH0QOlQDxW9CCzSVMOY,2677
27
- bioguider/agents/prompt_utils.py,sha256=udl4PSTZtAc6vBRYJJq4ZGB2iy3ihRE4i9afFJLT5kM,12390
28
+ bioguider/agents/prompt_utils.py,sha256=qjHEvqyHLazGAc3PEx_-QN3rCy2-WYnC3mbUigwPtEM,12530
28
29
  bioguider/agents/python_ast_repl_tool.py,sha256=o7-4P1h8jS8ikhGSA4CI_OWQ2a0Eg5tEdmuAp_qrO-0,2519
29
30
  bioguider/agents/rag_collection_task.py,sha256=r_jPAMjQcC7dIydKxX77UuMqjJ3MiVKswNZ-yNw7yx8,5199
30
31
  bioguider/conversation.py,sha256=DIvk_d7pz_guuORByK1eaaF09FAK-8shcNTrbSUHz9Y,1779
31
32
  bioguider/database/summarized_file_db.py,sha256=tDSi2iCvm2-lrx0rBJo0C11gYl9FswsDZTG2-Yhu5cE,4646
32
- bioguider/managers/evaluation_manager.py,sha256=93XOE2Q2a-mRa8DMF3IZC7mhE2CxxqOZZ5MLbWlPsjo,4904
33
+ bioguider/managers/evaluation_manager.py,sha256=czLjC3Cl6Xb1R2-sKFUHUNVle_G3O-g66x0-LISdP_w,4917
33
34
  bioguider/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
35
  bioguider/rag/config.py,sha256=5g4IqTzgyfZfax9Af9CTkXShgItPOt4_9TEMSekCPik,4602
35
- bioguider/rag/data_pipeline.py,sha256=OXnsqETVytHBMXHerg9gACtNhwpWSODYWvzxVDTP_So,27767
36
+ bioguider/rag/data_pipeline.py,sha256=bkJ2IUCgPx_OL2uZtPd6cIBor2VFZEIfGd5kVlmiPjw,27292
36
37
  bioguider/rag/embedder.py,sha256=jofR8hOj3Aj2IyBQ9y6FeAc84tgq5agbIfCGyFxYpJ8,650
37
- bioguider/rag/rag.py,sha256=2G2b7JIDsjrR74lnkIFyEuMPF14kn6B-WhphZkUxd3c,4481
38
+ bioguider/rag/rag.py,sha256=JFPwrJlKDSyd3U3Gce_NSxI5343eNUbqPG9Fs5Pfoq0,4696
38
39
  bioguider/settings.py,sha256=BD_iz9aYarxmWUl0XaKl4-D4oTXMhFzljsXLNn2phis,3143
39
40
  bioguider/utils/constants.py,sha256=_xMAhwE3py2RR0pIimnb2qfucXdnTj4ZNeKGACouh2w,932
40
41
  bioguider/utils/default.gitignore,sha256=XjPdyO2KV8z8iyuqluaNR_70tBQftMpyKL8HboVNyeI,1605
@@ -42,7 +43,7 @@ bioguider/utils/file_utils.py,sha256=9VfAHsz1UkFPtzAmvWZvPl1TMaKIYNjNlLgsfB8tNjg
42
43
  bioguider/utils/gitignore_checker.py,sha256=pOYUwsS9D5014LxcZb0cj3s2CAYaD2uF_pYJpaNKcho,6532
43
44
  bioguider/utils/pyphen_utils.py,sha256=cdZc3qphkvMDeL5NiZ8Xou13M_uVNP7ifJ-FwxO-0BE,2680
44
45
  bioguider/utils/utils.py,sha256=YP3HXgU_rvYDWkEcTzWGiYZw-mlfVrqGhUGSc0_4Pms,900
45
- bioguider-0.2.9.dist-info/LICENSE,sha256=qzkvZcKwwA5DuSuhXMOm2LcO6BdEr4V7jwFZVL2-jL4,1065
46
- bioguider-0.2.9.dist-info/METADATA,sha256=kctbCb5iK21lTibqx01l7hnLYUz10Be66afv61LwuJA,1867
47
- bioguider-0.2.9.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
48
- bioguider-0.2.9.dist-info/RECORD,,
46
+ bioguider-0.2.11.dist-info/LICENSE,sha256=qzkvZcKwwA5DuSuhXMOm2LcO6BdEr4V7jwFZVL2-jL4,1065
47
+ bioguider-0.2.11.dist-info/METADATA,sha256=Vy7ZfE7701TohPSZoK9H_VoEhvw7pLyPvvvBkhs8RTY,1868
48
+ bioguider-0.2.11.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
49
+ bioguider-0.2.11.dist-info/RECORD,,