bioguider 0.2.12__tar.gz → 0.2.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of bioguider has been flagged as possibly problematic.

Files changed (50)
  1. {bioguider-0.2.12 → bioguider-0.2.14}/PKG-INFO +1 -1
  2. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/agent_task.py +8 -4
  3. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/agent_tools.py +17 -14
  4. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/agent_utils.py +40 -4
  5. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/collection_observe_step.py +7 -5
  6. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/collection_plan_step.py +9 -7
  7. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/collection_task.py +15 -5
  8. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/collection_task_utils.py +46 -15
  9. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/dockergeneration_task.py +1 -1
  10. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/evaluation_installation_task.py +31 -7
  11. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/evaluation_readme_task.py +26 -4
  12. bioguider-0.2.14/bioguider/agents/evaluation_submission_requirements_task.py +153 -0
  13. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/evaluation_task.py +19 -6
  14. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/identification_observe_step.py +7 -1
  15. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/identification_plan_step.py +6 -1
  16. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/identification_task.py +23 -4
  17. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/identification_task_utils.py +2 -0
  18. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/prompt_utils.py +44 -4
  19. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/database/summarized_file_db.py +1 -1
  20. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/managers/evaluation_manager.py +38 -46
  21. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/utils/constants.py +2 -0
  22. {bioguider-0.2.12 → bioguider-0.2.14}/pyproject.toml +1 -1
  23. {bioguider-0.2.12 → bioguider-0.2.14}/LICENSE +0 -0
  24. {bioguider-0.2.12 → bioguider-0.2.14}/README.md +0 -0
  25. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/__init__.py +0 -0
  26. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/__init__.py +0 -0
  27. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/collection_execute_step.py +0 -0
  28. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/common_agent.py +0 -0
  29. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/common_agent_2step.py +0 -0
  30. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/common_step.py +0 -0
  31. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/dockergeneration_execute_step.py +0 -0
  32. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/dockergeneration_observe_step.py +0 -0
  33. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/dockergeneration_plan_step.py +0 -0
  34. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/dockergeneration_task_utils.py +0 -0
  35. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/identification_execute_step.py +0 -0
  36. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/peo_common_step.py +0 -0
  37. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/python_ast_repl_tool.py +0 -0
  38. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/rag_collection_task.py +0 -0
  39. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/conversation.py +0 -0
  40. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/rag/__init__.py +0 -0
  41. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/rag/config.py +0 -0
  42. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/rag/data_pipeline.py +0 -0
  43. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/rag/embedder.py +0 -0
  44. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/rag/rag.py +0 -0
  45. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/settings.py +0 -0
  46. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/utils/default.gitignore +0 -0
  47. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/utils/file_utils.py +0 -0
  48. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/utils/gitignore_checker.py +0 -0
  49. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/utils/pyphen_utils.py +0 -0
  50. {bioguider-0.2.12 → bioguider-0.2.14}/bioguider/utils/utils.py +0 -0
{bioguider-0.2.12 → bioguider-0.2.14}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: bioguider
-Version: 0.2.12
+Version: 0.2.14
 Summary: An AI-Powered package to help biomedical developers to generate clear documentation
 License: MIT
 Author: Cankun Wang
{bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/agent_task.py

@@ -13,7 +13,12 @@ class AgentTask(ABC):
     A class representing a step in an agent's process.
     """

-    def __init__(self, llm: BaseChatOpenAI, step_callback: Callable | None = None):
+    def __init__(
+        self,
+        llm: BaseChatOpenAI,
+        step_callback: Callable | None = None,
+        summarized_files_db: SummarizedFilesDb | None = None,
+    ):
         """
         Initialize the AgentStep with a language model and a callback function.

@@ -23,7 +28,7 @@ class AgentTask(ABC):
         """
         self.llm = llm
         self.step_callback = step_callback
-        self.summary_file_db = None
+        self.summarized_files_db = summarized_files_db
         self.graph: CompiledGraph | None = None

     def _print_step(
@@ -45,7 +50,7 @@ class AgentTask(ABC):
             token_usage=token_usage,
         )

-    def compile(self, repo_path: str, gitignore_path: str, db: SummarizedFilesDb | None = None, **kwargs):
+    def compile(self, repo_path: str, gitignore_path: str, **kwargs):
         """
         Compile the agent step with the given repository and gitignore paths.

@@ -55,7 +60,6 @@ class AgentTask(ABC):
         **kwargs: derived class may pass more arguments to implmented _compile(), that is,
         what **kwargs is depends on derived class
         """
-        self.summary_file_db = db
         self._compile(repo_path, gitignore_path, **kwargs)

     @abstractmethod
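The change above moves the summarized-files cache from `compile()` into the constructor. A rough sketch of what wiring a derived task might look like after this change; the task class name, model name, and `SummarizedFilesDb` constructor arguments below are assumptions, not taken from the diff:

```python
# Hedged sketch only: illustrates the new wiring, not the package's documented API.
from langchain_openai import ChatOpenAI
from bioguider.database.summarized_file_db import SummarizedFilesDb

llm = ChatOpenAI(model="gpt-4o")               # assumed model name
db = SummarizedFilesDb("summaries.sqlite")     # constructor arguments are assumed

# SomeDerivedAgentTask stands in for any AgentTask subclass (e.g. CollectionTask).
task = SomeDerivedAgentTask(llm=llm, step_callback=print, summarized_files_db=db)
# compile() no longer takes a `db` argument; the cache now travels with the task itself.
task.compile(repo_path="/path/to/repo", gitignore_path="/path/to/repo/.gitignore")
```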
{bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/agent_tools.py

@@ -1,4 +1,5 @@
 import os
+import logging
 from typing import Callable
 from markdownify import markdownify as md
 from langchain_openai.chat_models.base import BaseChatOpenAI
@@ -7,6 +8,8 @@ from bioguider.utils.file_utils import get_file_type
 from bioguider.agents.agent_utils import read_directory, read_file, summarize_file
 from bioguider.rag.data_pipeline import count_tokens

+logger = logging.getLogger(__name__)
+
 class agent_tool:
     def __init__(
         self,
@@ -53,19 +56,12 @@ Returns:
 class summarize_file_tool(agent_tool):
     """ Read a file and generate a summary according to a specified prompt.

-    Arguments
-    ----------
-    file_path : str, required
-        Path to the file to read.
-    summarize_prompt : str, optional
-        Instruction guiding the summarization focus (default is "N/A").
-        Use this to emphasize specific aspects of the content.
+    Args:
+        file_path str: required. The file path to read.
+        summarize_prompt str: optional. A string instruction guiding the summarization focus (default is "N/A"). Use this to emphasize specific aspects of the content.

-    Returns
-    -------
-    str or None
-        A summarized version of the file content.
-        Returns None if the file does not exist or cannot be read.
+    Returns:
+        str or None: A summarized version of the file content. Returns None if the file does not exist or cannot be read.
     """
     def __init__(
         self,
@@ -124,8 +120,15 @@ Returns
         if summarized_content is not None:
             return f"summarized content of file {file_path}: " + summarized_content

-        file_content = read_file(abs_file_path)
-        file_content = file_content.replace("{", "{{").replace("}", "}}")
+        try:
+            file_content = read_file(abs_file_path)
+            file_content = file_content.replace("{", "{{").replace("}", "}}")
+        except UnicodeDecodeError as e:
+            logger.error(str(e))
+            return f"{file_path} is a binary, can't be summarized."
+        except Exception as e:
+            logger.error(str(e))
+            return f"Failed to read {file_path}."
         summarized_content, token_usage = summarize_file(
             self.llm, abs_file_path, file_content, self.detailed_level,
             summary_instructions=self.summarize_instruction,
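The guard around `read_file` keeps a binary or otherwise unreadable file from crashing the agent loop: the tool now reports the problem as its return value. Below is a standalone sketch of the same pattern; it is not the package's `read_file`, just an illustration of the behavior the try/except above introduces:

```python
import logging

logger = logging.getLogger(__name__)

def read_text_or_report(path: str) -> str:
    """Return escaped text content, or a human-readable message on failure."""
    try:
        with open(path, encoding="utf-8") as fh:  # binary files raise UnicodeDecodeError here
            content = fh.read()
        # Escape braces so the content can be embedded safely in a prompt template.
        return content.replace("{", "{{").replace("}", "}}")
    except UnicodeDecodeError as exc:
        logger.error(str(exc))
        return f"{path} is a binary, can't be summarized."
    except Exception as exc:
        logger.error(str(exc))
        return f"Failed to read {path}."
```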
{bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/agent_utils.py

@@ -16,11 +16,12 @@ from langchain.tools import BaseTool
 from langchain.schema import AgentAction, AgentFinish
 from langchain.agents import AgentOutputParser
 from langgraph.prebuilt import create_react_agent
+from langchain_community.callbacks.openai_info import OpenAICallbackHandler
 import logging

 from pydantic import BaseModel, Field

-from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
+from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, MAX_FILE_LENGTH, MAX_SENTENCE_NUM
 from bioguider.utils.file_utils import get_file_type
 from ..utils.gitignore_checker import GitignoreChecker
 from ..database.summarized_file_db import SummarizedFilesDb
@@ -178,8 +179,7 @@ Here is the file content:
 Now, let's start to summarize.
 """)

-MAX_FILE_LENGTH=20 *1024 # 20K
-MAX_SENTENCE_NUM=20
+
 def summarize_file(
     llm: BaseChatOpenAI,
     name: str,
@@ -379,6 +379,20 @@ def escape_braces(text: str) -> str:
     text = re.sub(r'(?<!{){(?!{)', '{{', text)
     return text

+STRING_TO_OBJECT_SYSTEM_PROMPT = """
+You are an expert to understand data. You will be provided a text, and your task is to extracted structured data from the provided text.
+
+---
+
+### **Instructions**
+1. If no structured data can be extracted, return None
+
+---
+
+### **Input Text**
+{input_text}
+"""
+
 def try_parse_json_object(json_obj: str) -> dict | None:
     json_obj = json_obj.strip()

@@ -406,4 +420,26 @@ def try_parse_json_object(json_obj: str) -> dict | None:
             return None
     except Exception as e:
         logger.error(e)
-        return None
+        return None
+
+def try_parse_with_llm(llm: BaseChatOpenAI, input_text: str, schema: any):
+    system_prompt = ChatPromptTemplate.from_template(
+        STRING_TO_OBJECT_SYSTEM_PROMPT
+    ).format(input_text=input_text)
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", system_prompt)
+    ])
+    agent = prompt | llm.with_structured_output(schema)
+    callback_handler = OpenAICallbackHandler()
+
+    try:
+        res = agent.invoke(
+            input={},
+            config={
+                "callbacks": [callback_handler],
+            },
+        )
+        return res, vars(callback_handler)
+    except Exception as e:
+        logger.error(e)
+        return None
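The new `try_parse_with_llm` helper is a fallback for when plain JSON parsing fails: it asks the model to re-emit the text as a structured object matching a given schema and returns `(parsed, token_usage)` on success or `None` on failure. A hedged usage sketch; the schema and model name below are made up for illustration:

```python
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from bioguider.agents.agent_utils import try_parse_with_llm

class InstallInfo(BaseModel):
    # Hypothetical schema, only for this example.
    package_name: str = Field(description="Name of the package being installed")
    dependency_count: int = Field(description="Number of required dependencies")

llm = ChatOpenAI(model="gpt-4o")  # assumed model name
text = "Install bioguider with pip; it pulls in 12 required dependencies."

result = try_parse_with_llm(llm, text, InstallInfo)
if result is not None:            # on failure the helper returns None, not a tuple
    parsed, token_usage = result
    print(parsed.dependency_count)
```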
{bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/collection_observe_step.py

@@ -5,7 +5,7 @@ from langchain_openai.chat_models.base import BaseChatOpenAI
 from langchain_core.prompts import ChatPromptTemplate
 from bioguider.agents.agent_utils import ObservationResult
 from bioguider.agents.collection_task_utils import CollectionWorkflowState
-from bioguider.agents.common_agent_2step import CommonAgentTwoSteps
+from bioguider.agents.common_agent_2step import CommonAgentTwoChainSteps, CommonAgentTwoSteps
 from bioguider.agents.peo_common_step import PEOCommonStep
 from bioguider.agents.prompt_utils import COLLECTION_GOAL, COLLECTION_PROMPTS

@@ -34,11 +34,13 @@ Here is the 2-level file structure of the repository (`f` = file, `d` = director

 * Provide your reasoning under **Analysis**
 * Then list all relevant files and folders under **FinalAnswer**
+* **FinalAnswer** format must exactly match this format:
+  **FinalAnswer**: {{"final_answer": [<file path>, <file path>, <file path>, ...]}}
 * Be sure to include the **full relative paths** with respect to the repository root.
-* Your answer **must follow this exact format** (note: no JSON code block, no additional comments):
+* Your answer **must exactly match the follwing format** (note: no JSON code block, no additional comments), **do not** make up anything:

 ```
-**Analysis**: your analysis here
+**Analysis**: your analysis here
 **FinalAnswer**: {{"final_answer": ["path/to/file1", "path/to/file2", ...]}}
 ```
 4. If you believe **more files still need to be collected**:
@@ -80,8 +82,8 @@ class CollectionObserveStep(PEOCommonStep):
         repo_structure = self.repo_structure
         intermediate_steps = self._build_intermediate_steps(state)
         prompt = ChatPromptTemplate.from_template(COLLECTION_OBSERVE_SYSTEM_PROMPT)
-        important_instructions = "N/A" if "important_instructions" not in collection_item or len(collection_item["important_instructions"]) == 0 \
-            else collection_item["important_instructions"]
+        important_instructions = "N/A" if "observe_important_instructions" not in collection_item or len(collection_item["observe_important_instructions"]) == 0 \
+            else collection_item["observe_important_instructions"]
         return prompt.format(
             goal_item_desc=goal_item_desc,
             related_file_description=collection_item["related_file_description"],
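The stricter instruction above pins the observe step's reply to a single `**FinalAnswer**` line carrying a JSON object (the doubled braces are template escapes). A small, illustrative sketch of a compliant reply and one way to pull the file list out of it; the parsing code here is not the package's own:

```python
import json
import re

reply = (
    "**Analysis**: README and INSTALL cover setup.\n"
    '**FinalAnswer**: {"final_answer": ["README.md", "docs/INSTALL.md"]}'
)

# Grab the JSON object that follows the FinalAnswer marker.
match = re.search(r"\*\*FinalAnswer\*\*:\s*(\{.*\})", reply, re.S)
final_files = json.loads(match.group(1))["final_answer"] if match else []
print(final_files)  # ['README.md', 'docs/INSTALL.md']
```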
{bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/collection_plan_step.py

@@ -8,7 +8,7 @@ from bioguider.agents.agent_utils import (
     PlanAgentResultJsonSchema,
     PlanAgentResult,
 )
-from bioguider.agents.common_agent_2step import CommonAgentTwoSteps
+from bioguider.agents.common_agent_2step import CommonAgentTwoChainSteps, CommonAgentTwoSteps
 from bioguider.agents.peo_common_step import PEOCommonStep
 from bioguider.agents.collection_task_utils import CollectionWorkflowState
 from bioguider.agents.prompt_utils import COLLECTION_GOAL, COLLECTION_PROMPTS
@@ -57,7 +57,9 @@ Here are the results from previous steps:

 3. You may use the `read_directory` tool to explore directory contents, but avoid using it in the first step unless necessary.

-4. You may use the `python_repl` tool to execute Python code, but this should **also be avoided in the first step**.
+4. Your plan can only use the above tools, **do not** make up any tools not in the above tools list.
+
+5. Your planned step input file or input directory must come from the above repository files structure, **do not** make up file name or directory name.

 ---

@@ -65,12 +67,12 @@ Here are the results from previous steps:
 {important_instructions}

 ### **Output Format**
-Your plan should be returned as a sequence of steps in the following format:
+Your plan **must exactly match** a sequence of steps in the following format, **do not** make up anything:

-Step: <tool name> # Tool name must be one of {tool_names}
+Step: <tool name> # Tool name **must be one** of {tool_names}
 Step Input: <file or directory name>

-Step: <tool name>
+Step: <tool name> # Tool name **must be one** of {tool_names}
 Step Input: <file or directory name>
 ...
 """)
@@ -105,8 +107,8 @@ class CollectionPlanStep(PEOCommonStep):
         step_analysis, step_thoughts = self._build_intermediate_analysis_and_thoughts(state)
         goal = ChatPromptTemplate.from_template(COLLECTION_GOAL).format(goal_item=collection_item["goal_item"])
         related_file_description = collection_item["related_file_description"]
-        important_instructions="N/A" if "important_instructions" not in collection_item or len(collection_item["important_instructions"]) == 0 \
-            else collection_item["important_instructions"]
+        important_instructions="N/A" if "plan_important_instructions" not in collection_item or len(collection_item["plan_important_instructions"]) == 0 \
+            else collection_item["plan_important_instructions"]
         tool_names, tools_desc = get_tool_names_and_descriptions(self.custom_tools)
         system_prompt = COLLECTION_PLAN_SYSTEM_PROMPT.format(
             goal=goal,
{bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/collection_task.py

@@ -50,9 +50,12 @@ class CollectionTask(AgentTask):
     def __init__(
         self,
         llm: BaseChatOpenAI,
-        step_callback: Callable | None = None
+        step_callback: Callable | None = None,
+        summarize_instruction: str | None = "N/A",
+        summarized_files_db: SummarizedFilesDb | None = None,
+        provided_files: list[str] | None = None,
     ):
-        super().__init__(llm, step_callback)
+        super().__init__(llm, step_callback, summarized_files_db=summarized_files_db)
         self.repo_path: str | None = None
         self.gitignore_path: str | None = None
         self.repo_structure: str | None = None
@@ -60,6 +63,8 @@ class CollectionTask(AgentTask):
         self.steps: list[PEOCommonStep] = []
         self.tools: list[any] | None = None
         self.custom_tools: list[Tool] | None = None
+        self.summarize_instruction = summarize_instruction
+        self.provided_files = provided_files

     def _prepare_tools(self, related_file_goal_item_desc):
         tool_rd = read_directory_tool(repo_path=self.repo_path)
@@ -67,7 +72,8 @@ class CollectionTask(AgentTask):
             llm=self.llm,
             repo_path=self.repo_path,
             output_callback=self.step_callback,
-            db=self.summary_file_db,
+            db=self.summarized_files_db,
+            summaize_instruction=self.summarize_instruction,
         )
         tool_rf = read_file_tool(repo_path=self.repo_path)
         tool_cf = check_file_related_tool(
@@ -75,6 +81,8 @@ class CollectionTask(AgentTask):
             repo_path=self.repo_path,
             goal_item_desc=related_file_goal_item_desc,
             output_callback=self.step_callback,
+            summarize_instruction=self.summarize_instruction,
+            summarized_files_db=self.summarized_files_db,
         )
         self.tools = [tool_rd, tool_sum, tool_rf, tool_cf]
         self.custom_tools = [
@@ -99,13 +107,15 @@ class CollectionTask(AgentTask):
                 description=tool_cf.__class__.__doc__,
             ),
         ]
-        self.custom_tools.append(CustomPythonAstREPLTool())
+        # self.custom_tools.append(CustomPythonAstREPLTool())

     def _initialize(self):
         # initialize the 2-level file structure of the repo
         if not os.path.exists(self.repo_path):
             raise ValueError(f"Repository path {self.repo_path} does not exist.")
-        files = read_directory(self.repo_path, os.path.join(self.repo_path, ".gitignore"))
+        files = self.provided_files
+        if files is None:
+            files = read_directory(self.repo_path, os.path.join(self.repo_path, ".gitignore"))
         file_pairs = [(f, get_file_type(os.path.join(self.repo_path, f)).value) for f in files]
         self.repo_structure = ""
         for f, f_type in file_pairs:
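Taken together, `CollectionTask` now threads the summary cache and instruction into its tools and can work from a pre-selected file list instead of scanning the repository. A hedged construction sketch; the `llm` and `db` objects and the `goal_item` keyword passed to `compile()` follow the usage visible later in this diff, everything else is illustrative:

```python
from bioguider.agents.collection_task import CollectionTask
from bioguider.agents.prompt_utils import CollectionGoalItemEnum

task = CollectionTask(
    llm=llm,                                    # BaseChatOpenAI instance, assumed to exist
    step_callback=print,
    summarize_instruction="Focus on installation steps.",
    summarized_files_db=db,                     # SummarizedFilesDb instance, assumed to exist
    provided_files=["README.md", "setup.py"],   # skips the gitignore-based scan in _initialize()
)
task.compile(
    repo_path="/path/to/repo",
    gitignore_path="/path/to/repo/.gitignore",
    goal_item=CollectionGoalItemEnum.Installation.name,  # kwarg as used by _collect_files() later in this diff
)
files = task.collect()                          # collect() is called this way in _collect_files()
```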
{bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/collection_task_utils.py

@@ -4,13 +4,17 @@ from langchain.prompts import ChatPromptTemplate
 from langchain_openai.chat_models.base import BaseChatOpenAI
 from langchain_core.messages import AIMessage
 from pydantic import BaseModel, Field
+import logging

 from bioguider.agents.agent_tools import agent_tool
 from bioguider.agents.agent_utils import read_file, summarize_file
 from bioguider.agents.peo_common_step import PEOWorkflowState
 from bioguider.agents.common_agent import CommonAgent
 from bioguider.agents.common_agent_2step import CommonAgentTwoSteps
+from bioguider.database.summarized_file_db import SummarizedFilesDb
+from bioguider.utils.constants import MAX_FILE_LENGTH

+logger = logging.getLogger(__name__)

 class CollectionWorkflowState(TypedDict):
     llm: Optional[BaseChatOpenAI]
@@ -46,20 +50,22 @@ Does this file appear to contain related information?

 ---

-### **Output Format:**
-Respond with a single word: "Yes" or "No" to indicate whether the file is related to the goal item.
-Do not include any additional text, explanation, or formatting.
+### **Output Format:**
+Respond with exactly two parts:
+1. A single word: Yes or No (indicating if the file meets the goal criteria)
+2. One brief explanatory sentence.
+For example: Yes. This file is a compiled binary file, so, it is related to the compiled standalone file (goal item).
 """)

 class CheckFileRelatedResult(BaseModel):
-    is_related: bool = Field(description="True if the file is related to the goal item, False otherwise.")
+    is_related: str = Field(description="A string conclusion specify if the provided file is related. The string value contains two parts:\n 1. A single word: Yes or No (indicating if the file meets the goal criteria).\n 2. One brief explanatory sentence.")

 class check_file_related_tool(agent_tool):
     """ Check if the file is related to the goal item
     Args:
         file_path str: file path
     Returns:
-        bool: True if the file is related to the goal item, False otherwise.
+        str: A string conclusion. The string conclusion contains two parts:\n 1. A single word: Yes or No (indicating if the file meets the goal criteria).\n 2. One brief explanatory sentence.
     """
     def __init__(
         self,
@@ -67,23 +73,51 @@ Returns:
         repo_path: str,
         goal_item_desc: str,
         output_callback: Callable | None = None,
+        summarize_instruction: str | None = None,
+        summarize_level: int | None = 6,
+        summarized_files_db: SummarizedFilesDb | None = None,
     ):
         super().__init__(llm=llm, output_callback=output_callback)
         self.repo_path = repo_path
         self.goal_item_desc = goal_item_desc
+        self.summarize_instruction = summarize_instruction \
+            if summarize_instruction is not None else "N/A"
+        self.summarize_level = summarize_level
+        self.summarized_files_db = summarized_files_db

     def run(self, file_path: str) -> str:
         if not self.repo_path in file_path:
             file_path = os.path.join(self.repo_path, file_path)
         if not os.path.isfile(file_path):
             return "Can't read file"
-        file_content = read_file(file_path)
-        if file_content is None:
+
+        check_prompts = None
+        try:
+            file_content = read_file(file_path)
+        except UnicodeDecodeError as e:
+            logger.error(str(e))
+            check_prompts = "Can't summarize binary file, please decide according to file name and extension."
+        except Exception as e:
+            logger.error(str(e))
+            check_prompts = "Failed to summarize file, please decide according to file name and extension."
+        if check_prompts is None and file_content is None:
             return "Failed to read file"
-        summarized_content, token_usage = summarize_file(self.llm, file_path, file_content, 6)
-        if summarized_content is None:
-            return "Failed to summarize file"
-        self._print_token_usage(token_usage)
+        if check_prompts is not None:
+            summarized_content = check_prompts
+        else:
+            if len(file_content) > MAX_FILE_LENGTH:
+                file_content = file_content[:MAX_FILE_LENGTH]
+            summarized_content, token_usage = summarize_file(
+                llm=self.llm,
+                name=file_path,
+                content=file_content,
+                level=self.summarize_level,
+                summary_instructions=self.summarize_instruction,
+                db=self.summarized_files_db,
+            )
+            if summarized_content is None:
+                return "Failed to summarize file"
+            self._print_token_usage(token_usage)

         prompt = CHECK_FILE_RELATED_USER_PROMPT.format(
             goal_item_desc=self.goal_item_desc,
@@ -102,8 +136,5 @@ Returns:

         self._print_step_output(step_output=reasoning)
         self._print_token_usage(token_usage)
-        if out:
-            return "Yes, the file is related to the goal item."
-        else:
-            return "No, the file **is not** related to the goal item."
+        return res.is_related

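Because `check_file_related_tool.run()` now returns a free-text verdict rather than a boolean, any caller that used to branch on `True`/`False` has to inspect the leading word instead. A small hedged sketch of that adjustment; the instance name and file path are illustrative:

```python
# check_tool is an assumed check_file_related_tool instance; the reply format follows
# the new prompt, e.g. "Yes. The file documents installation steps."
verdict = check_tool.run("docs/INSTALL.md")
is_related = verdict.strip().lower().startswith("yes")
explanation = verdict.split(".", 1)[1].strip() if "." in verdict else ""
```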
{bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/dockergeneration_task.py

@@ -47,7 +47,7 @@ class DockerGenerationTask(AgentTask):
     def __init__(
         self,
         llm,
-        step_callback = None
+        step_callback = None,
     ):
         super().__init__(llm, step_callback)
         self.repo_path: str | None = None
{bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/evaluation_installation_task.py

@@ -9,7 +9,8 @@ from pydantic import BaseModel, Field
 from markdownify import markdownify as md

 from bioguider.agents.agent_utils import read_file
-from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION
+from bioguider.agents.collection_task import CollectionTask
+from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION, CollectionGoalItemEnum
 from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
 from bioguider.rag.data_pipeline import count_tokens
 from .common_agent_2step import CommonAgentTwoSteps, CommonAgentTwoChainSteps
@@ -32,14 +33,17 @@ Your task is to analyze the provided files related to installation and generate
 1. **Installation Available**: Is the installation section in document (like README.md or INSTALLATION)?
    * Output: `Yes` or `No`

-2. **Installation Tutorial**: Is the installation tutorial provided?
+2. **Installation Tutorial**: Is the step-by-step installation tutorial provided?
    * Ouput: `Yes` or `No`

 3. **Number of required Dependencies Installation**: The number of dependencies that are required to install
    * Output: Number
    * Suggest specific improvements if necessary, such as missing dependencies

-4. **Overall Score**: Give an overall quality rating of the Installation information.
+4. **Compatible Operating System**: Is the compatible operating system described?
+   * Output: `Yes` or `No`
+
+5. **Overall Score**: Give an overall quality rating of the Installation information.
    * Output: `Poor`, `Fair`, `Good`, or `Excellent`

 ---
@@ -53,6 +57,7 @@ Your final report must **exactly match** the following format. Do not add or omi
 **Dependency:**
 * number: [Number]
 * suggestions: <suggestion to improve **dependency information** like missing dependencies
+**Compatible Operating System:** [Yes / No]
 **Overall Score:** [Poor / Fair / Good / Excellent]

 ---
@@ -113,6 +118,7 @@ class StructuredEvaluationInstallationResult(BaseModel):
     install_tutorial: Optional[bool]=Field(description="A boolean value. Is the installation tutorial provided?")
     dependency_number: Optional[int]=Field(description="A number. It is the number of dependencies that are required to install.")
     dependency_suggestions: Optional[str]=Field(description="A string value. It is the specific improvements if necessary, such as missing dependencies")
+    compatible_os: Optional[bool]=Field(description="A boolean value. Is compatible operating system described?")
     overall_score: Optional[str]=Field(description="A overall scroll for the installation quality, could be `Poor`, `Fair`, `Good`, or `Excellent`")

 class EvaluationInstallationResult(BaseModel):
@@ -163,8 +169,9 @@ class EvaluationInstallationTask(EvaluationTask):
         gitignore_path,
         meta_data = None,
         step_callback = None,
+        summarized_files_db = None,
     ):
-        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback)
+        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback, summarized_files_db)
         self.evaluation_name = "Installation Evaluation"


@@ -204,6 +211,8 @@ class EvaluationInstallationTask(EvaluationTask):
             instruction_prompt=EVALUATION_INSTRUCTION,
             schema=StructuredEvaluationInstallationResult,
         )
+        res: StructuredEvaluationInstallationResult = res
+        res.dependency_number = 0 if res.dependency_number is None else res.dependency_number
         self.print_step(step_output=reasoning_process)
         self.print_step(token_usage=token_usage)

@@ -235,7 +244,7 @@ class EvaluationInstallationTask(EvaluationTask):
         }
         return evaluation, token_usage

-    def _evaluate(self, files: list[str] | None = None) -> tuple[dict | None, dict]:
+    def _evaluate(self, files: list[str] | None = None) -> tuple[dict | None, dict, list[str]]:
         evaluation, token_usage = self._free_evaluate(files)
         structured_evaluation, structured_token_usage = self._structured_evaluate(files)

@@ -245,5 +254,20 @@ class EvaluationInstallationTask(EvaluationTask):
         }
         total_token_usage = increase_token_usage(token_usage, structured_token_usage)

-        return combined_evaluation, total_token_usage
-
+        return combined_evaluation, total_token_usage, files
+
+    def _collect_files(self):
+        task = CollectionTask(
+            llm=self.llm,
+            step_callback=self.step_callback,
+        )
+        task.compile(
+            repo_path=self.repo_path,
+            gitignore_path=Path(self.repo_path, ".gitignore"),
+            db=self.summarized_files_db,
+            goal_item=CollectionGoalItemEnum.Installation.name,
+        )
+        files = task.collect()
+        if files is None:
+            return []
+        return files
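With `_collect_files()` in place, the installation evaluation can discover its own input files through `CollectionTask` when the caller supplies none, and `_evaluate()` now returns that file list as a third element. A hedged sketch of the adjusted call pattern; how `EvaluationManager` actually drives these (protected) methods is outside the shown diff:

```python
from bioguider.agents.evaluation_installation_task import EvaluationInstallationTask

task = EvaluationInstallationTask(
    llm,                                      # BaseChatOpenAI instance, assumed to exist
    repo_path="/path/to/repo",
    gitignore_path="/path/to/repo/.gitignore",
    meta_data=None,
    step_callback=print,
    summarized_files_db=db,                   # SummarizedFilesDb instance, assumed to exist
)
files = task._collect_files()                 # CollectionTask-driven discovery of installation files
evaluation, token_usage, evaluated_files = task._evaluate(files)  # 0.2.12 returned a 2-tuple
```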
{bioguider-0.2.12 → bioguider-0.2.14}/bioguider/agents/evaluation_readme_task.py

@@ -7,6 +7,7 @@ from langchain_openai.chat_models.base import BaseChatOpenAI
 from pydantic import BaseModel, Field

 from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION
+from bioguider.utils.gitignore_checker import GitignoreChecker

 from ..utils.pyphen_utils import PyphenReadability
 from bioguider.agents.agent_utils import increase_token_usage, read_file, summarize_file
@@ -303,9 +304,10 @@ class EvaluationREADMETask(EvaluationTask):
         repo_path: str,
         gitignore_path: str,
         meta_data: ProjectMetadata | None = None,
-        step_callback: Callable | None = None
+        step_callback: Callable | None = None,
+        summarized_files_db = None,
     ):
-        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback)
+        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback, summarized_files_db)
         self.evaluation_name = "README Evaluation"

     def _structured_evaluate(self, free_readme_evaluations: dict[str, dict]):
@@ -455,7 +457,7 @@ class EvaluationREADMETask(EvaluationTask):
         total_token_usage = increase_token_usage(total_token_usage, token_usage)
         return readme_evaluations, total_token_usage

-    def _evaluate(self, files: list[str]) -> tuple[dict, dict]:
+    def _evaluate(self, files: list[str]) -> tuple[dict, dict, list[str]]:
         free_readme_evaluations, free_token_usage = self._free_evaluate(files)
         structured_readme_evaluations, structured_token_usage = self._structured_evaluate(free_readme_evaluations)

@@ -472,6 +474,26 @@ class EvaluationREADMETask(EvaluationTask):

         total_token_usage = increase_token_usage(free_token_usage, structured_token_usage)

-        return combined_evaluations, total_token_usage
+        return combined_evaluations, total_token_usage, files

+    def _collect_files(self):
+        """
+        Search for a README file in the repository directory.
+        """
+        possible_readme_files = [
+            "readme.md",
+            "readme.rst",
+            "readme.txt",
+            "readme",
+        ]
+        repo_path = self.repo_path
+        gitignore_path = Path(repo_path, ".gitignore")
+        gitignore_checker = GitignoreChecker(
+            directory=repo_path, gitignore_path=gitignore_path
+        )
+        found_readme_files = gitignore_checker.check_files_and_folders(
+            check_file_cb=lambda root_dir, relative_path: Path(relative_path).name.lower() in possible_readme_files,
+        )
+
+        return found_readme_files

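The new `_collect_files()` makes README discovery gitignore-aware and case-insensitive: only the file name is compared, lower-cased, against a short allow-list. A standalone sketch of that matching rule (illustrative; the package's own version runs inside `GitignoreChecker.check_files_and_folders`):

```python
from pathlib import Path

POSSIBLE_README_FILES = {"readme.md", "readme.rst", "readme.txt", "readme"}

def is_readme(relative_path: str) -> bool:
    # Compare only the file name, case-insensitively, against the allow-list.
    return Path(relative_path).name.lower() in POSSIBLE_README_FILES

assert is_readme("README.md")
assert is_readme("docs/Readme.rst")          # matches in subdirectories too
assert not is_readme("readme_template.md")   # near-misses are rejected
```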