bioguider 0.2.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +92 -0
- bioguider/agents/agent_tools.py +176 -0
- bioguider/agents/agent_utils.py +504 -0
- bioguider/agents/collection_execute_step.py +182 -0
- bioguider/agents/collection_observe_step.py +125 -0
- bioguider/agents/collection_plan_step.py +156 -0
- bioguider/agents/collection_task.py +184 -0
- bioguider/agents/collection_task_utils.py +142 -0
- bioguider/agents/common_agent.py +137 -0
- bioguider/agents/common_agent_2step.py +215 -0
- bioguider/agents/common_conversation.py +61 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +110 -0
- bioguider/agents/consistency_query_step.py +77 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +154 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_installation_task.py +270 -0
- bioguider/agents/evaluation_readme_task.py +767 -0
- bioguider/agents/evaluation_submission_requirements_task.py +172 -0
- bioguider/agents/evaluation_task.py +206 -0
- bioguider/agents/evaluation_tutorial_task.py +169 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
- bioguider/agents/evaluation_userguide_prompts.py +179 -0
- bioguider/agents/evaluation_userguide_task.py +154 -0
- bioguider/agents/evaluation_utils.py +127 -0
- bioguider/agents/identification_execute_step.py +181 -0
- bioguider/agents/identification_observe_step.py +104 -0
- bioguider/agents/identification_plan_step.py +140 -0
- bioguider/agents/identification_task.py +270 -0
- bioguider/agents/identification_task_utils.py +22 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +253 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/code_structure_db.py +500 -0
- bioguider/database/summarized_file_db.py +146 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/benchmark_metrics.py +610 -0
- bioguider/generation/change_planner.py +189 -0
- bioguider/generation/document_renderer.py +157 -0
- bioguider/generation/llm_cleaner.py +67 -0
- bioguider/generation/llm_content_generator.py +1128 -0
- bioguider/generation/llm_injector.py +809 -0
- bioguider/generation/models.py +85 -0
- bioguider/generation/output_manager.py +74 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +166 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +436 -0
- bioguider/generation/test_metrics.py +189 -0
- bioguider/managers/benchmark_manager.py +785 -0
- bioguider/managers/evaluation_manager.py +215 -0
- bioguider/managers/generation_manager.py +686 -0
- bioguider/managers/generation_test_manager.py +107 -0
- bioguider/managers/generation_test_manager_v2.py +525 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +651 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +138 -0
- bioguider/settings.py +103 -0
- bioguider/utils/code_structure_builder.py +59 -0
- bioguider/utils/constants.py +135 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +215 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +551 -0
- bioguider/utils/utils.py +163 -0
- bioguider-0.2.52.dist-info/LICENSE +21 -0
- bioguider-0.2.52.dist-info/METADATA +51 -0
- bioguider-0.2.52.dist-info/RECORD +84 -0
- bioguider-0.2.52.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
|
|
2
|
+
from typing import Callable
|
|
3
|
+
from langchain.tools import BaseTool
|
|
4
|
+
from langchain_openai.chat_models.base import BaseChatOpenAI
|
|
5
|
+
from langchain_core.prompts import ChatPromptTemplate
|
|
6
|
+
from bioguider.agents.agent_utils import ObservationResult
|
|
7
|
+
from bioguider.agents.collection_task_utils import CollectionWorkflowState
|
|
8
|
+
from bioguider.agents.common_agent_2step import CommonAgentTwoChainSteps, CommonAgentTwoSteps
|
|
9
|
+
from bioguider.agents.peo_common_step import PEOCommonStep
|
|
10
|
+
from bioguider.agents.prompt_utils import COLLECTION_GOAL, COLLECTION_PROMPTS
|
|
11
|
+
from bioguider.utils.constants import MAX_STEP_COUNT
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# System prompt for the Observe step of the collection PEO loop.
# Rendered via ChatPromptTemplate.from_template, so single braces are template
# variables and double braces ({{ }}) are literal braces in the emitted text.
# FIX: corrected the typo "follwing" -> "following" in the output-format rule.
COLLECTION_OBSERVE_SYSTEM_PROMPT = """You are an expert software developer and technical documentation analyst.
{goal_item_desc}

{related_file_description}
---

### **Repository Structure**
Here is the 2-level file structure of the repository (`f` = file, `d` = directory, `l` - symlink, `u` - unknown):
{repo_structure}
---

### **Intermediate Output**
{intermediate_output}
---

### **Instructions**
1. Your goal is to identify files that are relevant to the **goal item**.
2. Carefully review the **Goal**, **Repository Structure**, and **Intermediate Output**.
3. If you believe **all relevant files** have been collected:

  * Proceed with the following format:

  * Provide your reasoning under **Analysis**
  * Then list all relevant files and folders under **FinalAnswer**
  * **FinalAnswer** format must exactly match this format:
  **FinalAnswer**: {{"final_answer": [<file path>, <file path>, <file path>, ...]}}
  * Be sure to include the **full relative paths** with respect to the repository root.
  * Your answer **must exactly match the following format** (note: no JSON code block, no additional comments), **do not** make up anything:

```
**Analysis**: your analysis here
**FinalAnswer**: {{"final_answer": ["path/to/file1", "path/to/file2", ...]}}
```
4. If you believe **more files still need to be collected**:
  * Provide your reasoning under **Thoughts**:

```
**Thoughts**: your explanation here
```

5. Important instructions:
{important_instructions}
Be precise and support your reasoning with evidence from the input.
---

### Notes
- We are collecting information over multiple rounds, your thoughts and the output of this step will be persisted, so please **do not rush to provide a Final Answer**.
  If you find the current information insufficient, share your thoughts instead—we’ll continue with the next round accordingly.
"""
|
|
63
|
+
|
|
64
|
+
class CollectionObserveStep(PEOCommonStep):
    """Observe step of the collection Plan -> Execute -> Observe (PEO) loop.

    Builds the observation prompt from the current workflow state, asks the
    LLM (via CommonAgentTwoSteps) whether enough files have been collected,
    and writes the resulting analysis / thoughts / optional final answer back
    into the shared state.
    """

    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str,
        repo_structure: str,
        gitignore_path: str,
    ):
        super().__init__(llm=llm)
        self.repo_path = repo_path
        # Pre-rendered 2-level repository listing, injected into the prompt.
        self.repo_structure = repo_structure
        self.gitignore_path = gitignore_path
        self.step_name = "Collection Observation Step"

    def _build_prompt(self, state) -> str:
        """Render COLLECTION_OBSERVE_SYSTEM_PROMPT for the current goal item.

        Pulls the per-goal prompt fragments from COLLECTION_PROMPTS and the
        accumulated intermediate step output from `state`.
        """
        str_goal_item = state["goal_item"]
        collection_item = COLLECTION_PROMPTS[str_goal_item]
        goal_item_desc = \
            ChatPromptTemplate.from_template(COLLECTION_GOAL).format(goal_item=collection_item["goal_item"])
        repo_structure = self.repo_structure
        intermediate_steps = self._build_intermediate_steps(state)
        prompt = ChatPromptTemplate.from_template(COLLECTION_OBSERVE_SYSTEM_PROMPT)
        # Fall back to "N/A" when the goal item defines no observe-specific rules.
        important_instructions = "N/A" if "observe_important_instructions" not in collection_item or len(collection_item["observe_important_instructions"]) == 0 \
            else collection_item["observe_important_instructions"]
        return prompt.format(
            goal_item_desc=goal_item_desc,
            related_file_description=collection_item["related_file_description"],
            repo_structure=repo_structure,
            intermediate_output=intermediate_steps,
            important_instructions=important_instructions,
        )

    def _execute_directly(self, state: CollectionWorkflowState):
        """Run one observation round; returns (updated state, token usage)."""
        step_count = state["step_count"]
        plan = state["plan_actions"]
        plan = plan.strip()
        if len(plan) == 0:
            # No actions were planned: push the model to conclude now.
            instruction = "No plan provided, please let's generate the final answer based on the current information."
        else:
            # NOTE(review): `step_count == MAX_STEP_COUNT/3 - 2` uses true
            # division, so the equality can only ever hold when MAX_STEP_COUNT
            # is a multiple of 3 — confirm that invariant (or `//` was meant).
            instruction = "Now, we have reached max recursion limit, please give me the **final answer** based on the current information" \
                if step_count == MAX_STEP_COUNT/3 - 2 else "Let's begin thinking."
        system_prompt = self._build_prompt(state)
        agent = CommonAgentTwoSteps(llm=self.llm)
        res, _, token_usage, reasoning_process = agent.go(
            system_prompt=system_prompt,
            instruction_prompt=instruction,
            schema=ObservationResult,
        )
        # FinalAnswer stays None while the model wants more collection rounds;
        # the graph's conditional edge keys off this field.
        state["final_answer"] = res.FinalAnswer
        analysis = res.Analysis
        thoughts = res.Thoughts
        state["step_analysis"] = analysis
        state["step_thoughts"] = thoughts
        state["step_count"] += 1
        self._print_step(
            state,
            step_output=f"**Observation Reasoning Process: {state['step_count']}**\n{reasoning_process}"
        )
        self._print_step(
            state,
            step_output=f"Final Answer: {res.FinalAnswer if res.FinalAnswer else None}\nAnalysis: {analysis}\nThoughts: {thoughts}",
        )
        return state, token_usage
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
|
|
2
|
+
from langchain_openai.chat_models.base import BaseChatOpenAI
|
|
3
|
+
from langchain.tools import BaseTool
|
|
4
|
+
from langchain_core.prompts import ChatPromptTemplate, StringPromptTemplate
|
|
5
|
+
from bioguider.agents.agent_utils import (
|
|
6
|
+
convert_plan_to_string,
|
|
7
|
+
get_tool_names_and_descriptions,
|
|
8
|
+
PlanAgentResultJsonSchema,
|
|
9
|
+
PlanAgentResult,
|
|
10
|
+
)
|
|
11
|
+
from bioguider.agents.common_agent_2step import CommonAgentTwoChainSteps, CommonAgentTwoSteps
|
|
12
|
+
from bioguider.agents.peo_common_step import PEOCommonStep
|
|
13
|
+
from bioguider.agents.collection_task_utils import CollectionWorkflowState
|
|
14
|
+
from bioguider.agents.prompt_utils import COLLECTION_GOAL, COLLECTION_PROMPTS
|
|
15
|
+
|
|
16
|
+
COLLECTION_PLAN_SYSTEM_PROMPT = ChatPromptTemplate.from_template("""### **Goal**
|
|
17
|
+
You are an expert developer specializing in the biomedical domain.
|
|
18
|
+
**{goal}**
|
|
19
|
+
|
|
20
|
+
{related_file_description}
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
### **Repository File Structure**
|
|
24
|
+
Below is the 2-level file structure of the repository (`f` = file, `d` = directory, `l` - symlink, `u` - unknown):
|
|
25
|
+
{repo_structure}
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
### **Function Tools**
|
|
30
|
+
You have access to the following function tools:
|
|
31
|
+
{tools}
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
### **Intermediate Steps**
|
|
36
|
+
Here are the results from previous steps:
|
|
37
|
+
{intermediate_steps}
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
### **Intermediate Thoughts**
|
|
42
|
+
- **Analysis**: {intermediate_analysis}
|
|
43
|
+
- **Thoughts**: {intermediate_thoughts}
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
### **Instructions**
|
|
48
|
+
|
|
49
|
+
1. We will iterate through multiple **Plan -> Execution -> Observation** loops as needed.
|
|
50
|
+
- All variables and tool outputs are **persisted across rounds**, so you can build on prior results.
|
|
51
|
+
- Develop your plan **incrementally**, and reflect on intermediate observations before proceeding.
|
|
52
|
+
- Limit each step to **one or two actions** — avoid trying to complete everything in a single step.
|
|
53
|
+
|
|
54
|
+
2. Your task is to collect all files that are relevant to the goal.
|
|
55
|
+
- Start by using the `summarize_file` tool to inspect file content quickly.
|
|
56
|
+
- If needed, follow up with the `read_file` tool for full content extraction.
|
|
57
|
+
|
|
58
|
+
3. You may use the `read_directory` tool to explore directory contents, but avoid using it in the first step unless necessary.
|
|
59
|
+
|
|
60
|
+
4. Your plan can only use the above tools, **do not** make up any tools not in the above tools list.
|
|
61
|
+
|
|
62
|
+
5. Your planned step input file or input directory must come from the above repository files structure, **do not** make up file name or directory name.
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
### **Important Instructions**
|
|
67
|
+
{important_instructions}
|
|
68
|
+
|
|
69
|
+
### **Output Format**
|
|
70
|
+
Your plan **must exactly match** a sequence of steps in the following format, **do not** make up anything:
|
|
71
|
+
|
|
72
|
+
Step: <tool name> # Tool name **must be one** of {tool_names}
|
|
73
|
+
Step Input: <file or directory name>
|
|
74
|
+
|
|
75
|
+
Step: <tool name> # Tool name **must be one** of {tool_names}
|
|
76
|
+
Step Input: <file or directory name>
|
|
77
|
+
...
|
|
78
|
+
""")
|
|
79
|
+
|
|
80
|
+
class CollectionPlanStep(PEOCommonStep):
    """Plan step of the collection Plan -> Execute -> Observe (PEO) loop.

    Renders COLLECTION_PLAN_SYSTEM_PROMPT with the goal, repository
    structure, tool descriptions and prior-round results, then asks the LLM
    to produce the next batch of tool actions, which are stored back into
    state["plan_actions"] as a string.
    """

    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str,
        repo_structure: str,
        gitignore_path: str,
        custom_tools: list[BaseTool] | None = None,
    ):
        super().__init__(llm)
        self.step_name = "Collection Plan Step"
        self.repo_path = repo_path
        # Pre-rendered 2-level repository listing, injected into the prompt.
        self.repo_structure = repo_structure
        self.gitignore_path = gitignore_path
        self.custom_tools = custom_tools if custom_tools is not None else []

    def _prepare_system_prompt(self, state: CollectionWorkflowState) -> str:
        """Build the planning system prompt from state and per-goal prompts."""
        collection_state = state
        goal_item = collection_state["goal_item"]
        collection_item = COLLECTION_PROMPTS[goal_item]
        intermediate_steps = self._build_intermediate_steps(state)
        step_analysis, step_thoughts = self._build_intermediate_analysis_and_thoughts(state)
        goal = ChatPromptTemplate.from_template(COLLECTION_GOAL).format(goal_item=collection_item["goal_item"])
        related_file_description = collection_item["related_file_description"]
        # Fall back to "N/A" when the goal item defines no plan-specific rules.
        important_instructions = "N/A" if "plan_important_instructions" not in collection_item or len(collection_item["plan_important_instructions"]) == 0 \
            else collection_item["plan_important_instructions"]
        tool_names, tools_desc = get_tool_names_and_descriptions(self.custom_tools)
        system_prompt = COLLECTION_PLAN_SYSTEM_PROMPT.format(
            goal=goal,
            related_file_description=related_file_description,
            repo_structure=self.repo_structure,
            tools=tools_desc,
            intermediate_steps=intermediate_steps,
            intermediate_analysis=step_analysis,
            intermediate_thoughts=step_thoughts,
            tool_names=tool_names,
            important_instructions=important_instructions,
        )
        self._print_step(
            state,
            step_output="**Intermediate Step Output**\n" + intermediate_steps
        )
        # BUG FIX: this was a plain string missing the `f` prefix, so the
        # literal text "{step_analysis}"/"{step_thoughts}" was logged instead
        # of the actual intermediate analysis and thoughts.
        self._print_step(
            state,
            step_output=f"**Intermediate Step Analysis**\n{step_analysis}\n**Intermediate Step Thoughts**\n{step_thoughts}",
        )
        return system_prompt

    def _execute_directly(self, state: CollectionWorkflowState):
        """Run one planning round; returns (updated state, token usage)."""
        system_prompt = self._prepare_system_prompt(state)
        agent = CommonAgentTwoSteps(llm=self.llm)
        res, _, token_usage, reasoning_process = agent.go(
            system_prompt=system_prompt,
            instruction_prompt="Now, let's begin the collection plan step.",
            schema=PlanAgentResultJsonSchema,
        )
        # Clear the per-round fields before recording this round's plan.
        PEOCommonStep._reset_step_state(state)
        res = PlanAgentResult(**res)
        self._print_step(state, step_output=f"**Reasoning Process**\n{reasoning_process}")
        self._print_step(state, step_output=f"**Plan**\n{str(res.actions)}")
        # Stored as a string; the observe step calls .strip() on it.
        state["plan_actions"] = convert_plan_to_string(res)

        return state, token_usage
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
|
|
2
|
+
import os
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Callable
|
|
5
|
+
from langchain_core.prompts import ChatPromptTemplate
|
|
6
|
+
from langchain_openai.chat_models.base import BaseChatOpenAI
|
|
7
|
+
from langchain.tools import StructuredTool, Tool
|
|
8
|
+
from langgraph.graph import StateGraph, START, END
|
|
9
|
+
|
|
10
|
+
from bioguider.database.summarized_file_db import SummarizedFilesDb
|
|
11
|
+
from bioguider.utils.file_utils import flatten_files, get_file_type
|
|
12
|
+
from bioguider.agents.agent_utils import parse_final_answer, read_directory
|
|
13
|
+
from bioguider.agents.collection_task_utils import (
|
|
14
|
+
RELATED_FILE_GOAL_ITEM,
|
|
15
|
+
CollectionWorkflowState,
|
|
16
|
+
check_file_related_tool,
|
|
17
|
+
)
|
|
18
|
+
from bioguider.agents.agent_tools import (
|
|
19
|
+
read_directory_tool,
|
|
20
|
+
summarize_file_tool,
|
|
21
|
+
read_file_tool,
|
|
22
|
+
)
|
|
23
|
+
from bioguider.agents.peo_common_step import PEOCommonStep
|
|
24
|
+
from bioguider.agents.prompt_utils import COLLECTION_PROMPTS
|
|
25
|
+
from bioguider.agents.agent_task import AgentTask
|
|
26
|
+
from bioguider.agents.collection_plan_step import CollectionPlanStep
|
|
27
|
+
from bioguider.agents.collection_execute_step import CollectionExecuteStep
|
|
28
|
+
from bioguider.agents.collection_observe_step import CollectionObserveStep
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
class CollectionTask(AgentTask):
    """Agent task that collects repository files relevant to a goal item.

    Wires a LangGraph state machine of plan -> execute -> observe steps and
    loops until the observe step produces a final answer (a JSON list of
    file paths), which `collect()` parses and flattens.
    """

    def __init__(
        self,
        llm: BaseChatOpenAI,
        step_callback: Callable | None = None,
        summarize_instruction: str | None = "N/A",
        summarized_files_db: SummarizedFilesDb | None = None,
        provided_files: list[str] | None = None,
    ):
        super().__init__(llm, step_callback, summarized_files_db=summarized_files_db)
        # These are populated later by _compile()/_initialize(), not here.
        self.repo_path: str | None = None
        self.gitignore_path: str | None = None
        self.repo_structure: str | None = None
        self.goal_item: str | None = None
        self.steps: list[PEOCommonStep] = []
        # NOTE(review): `list[any]` subscripts the builtin function `any`;
        # `list[Any]` (typing.Any) was presumably intended — harmless at
        # runtime but wrong for type checkers.
        self.tools: list[any] | None = None
        self.custom_tools: list[Tool] | None = None
        self.summarize_instruction = summarize_instruction
        # Optional pre-computed file list; skips the directory walk if set.
        self.provided_files = provided_files

    def _prepare_tools(self, related_file_goal_item_desc):
        """Instantiate the four collection tools and their LangChain wrappers."""
        tool_rd = read_directory_tool(repo_path=self.repo_path)
        tool_sum = summarize_file_tool(
            llm=self.llm,
            repo_path=self.repo_path,
            output_callback=self.step_callback,
            db=self.summarized_files_db,
            # NOTE(review): "summaize_instruction" looks misspelled — confirm
            # it matches summarize_file_tool's actual keyword.
            summaize_instruction=self.summarize_instruction,
        )
        tool_rf = read_file_tool(repo_path=self.repo_path)
        tool_cf = check_file_related_tool(
            llm=self.llm,
            repo_path=self.repo_path,
            goal_item_desc=related_file_goal_item_desc,
            output_callback=self.step_callback,
            summarize_instruction=self.summarize_instruction,
            summarized_files_db=self.summarized_files_db,
        )
        self.tools = [tool_rd, tool_sum, tool_rf, tool_cf]
        # Tool name/description come from each wrapper class's name/docstring,
        # so those docstrings are runtime-visible prompt text.
        self.custom_tools = [
            Tool(
                name = tool_rd.__class__.__name__,
                func = tool_rd.run,
                description=tool_rd.__class__.__doc__,
            ),
            StructuredTool.from_function(
                tool_sum.run,
                description=tool_sum.__class__.__doc__,
                name=tool_sum.__class__.__name__,
            ),
            Tool(
                name = tool_rf.__class__.__name__,
                func = tool_rf.run,
                description=tool_rf.__class__.__doc__,
            ),
            Tool(
                name = tool_cf.__class__.__name__,
                func = tool_cf.run,
                description=tool_cf.__class__.__doc__,
            ),
        ]
        # self.custom_tools.append(CustomPythonAstREPLTool())

    def _initialize(self):
        """Build the repo structure listing, tools, and the three PEO steps."""
        # initialize the 2-level file structure of the repo
        if not os.path.exists(self.repo_path):
            raise ValueError(f"Repository path {self.repo_path} does not exist.")
        files = self.provided_files
        if files is None:
            files = read_directory(self.repo_path, os.path.join(self.repo_path, ".gitignore"))
        file_pairs = [(f, get_file_type(os.path.join(self.repo_path, f)).value) for f in files]
        self.repo_structure = ""
        for f, f_type in file_pairs:
            self.repo_structure += f"{f} - {f_type}\n"

        collection_item = COLLECTION_PROMPTS[self.goal_item]
        related_file_goal_item_desc = ChatPromptTemplate.from_template(RELATED_FILE_GOAL_ITEM).format(
            goal_item=collection_item["goal_item"],
            related_file_description=collection_item["related_file_description"],
        )

        self._prepare_tools(related_file_goal_item_desc)
        self.steps = [
            CollectionPlanStep(
                llm=self.llm,
                repo_path=self.repo_path,
                repo_structure=self.repo_structure,
                gitignore_path=self.gitignore_path,
                custom_tools=self.custom_tools,
            ),
            CollectionExecuteStep(
                llm=self.llm,
                repo_path=self.repo_path,
                repo_structure=self.repo_structure,
                gitignore_path=self.gitignore_path,
                custom_tools=self.custom_tools,
            ),
            CollectionObserveStep(
                llm=self.llm,
                repo_path=self.repo_path,
                repo_structure=self.repo_structure,
                gitignore_path=self.gitignore_path,
            ),
        ]

    def _compile(self, repo_path: str, gitignore_path: str, **kwargs):
        """Configure the task for a repository and compile the PEO graph.

        Expects kwargs["goal_item"] to name an entry in COLLECTION_PROMPTS.
        """
        self.repo_path = repo_path
        self.gitignore_path = gitignore_path
        self.goal_item = kwargs.get("goal_item")
        self._initialize()

        def check_observe_step(state):
            # Loop back to planning until the observe step sets final_answer.
            if "final_answer" in state and state["final_answer"] is not None:
                # NOTE(review): other _print_step call sites pass `state`
                # positionally first — confirm these keyword-only calls match
                # the method's signature.
                self._print_step(step_name="Final Answer")
                self._print_step(step_output=state["final_answer"])
                return END
            return "plan_step"

        graph = StateGraph(CollectionWorkflowState)
        graph.add_node("plan_step", self.steps[0].execute)
        graph.add_node("execute_step", self.steps[1].execute)
        graph.add_node("observe_step", self.steps[2].execute)
        graph.add_edge(START, "plan_step")
        graph.add_edge("plan_step", "execute_step")
        graph.add_edge("execute_step", "observe_step")
        # NOTE(review): the path map here is a set literal {"plan_step", END};
        # LangGraph typically takes a list or dict — verify this is accepted.
        graph.add_conditional_edges("observe_step", check_observe_step, {"plan_step", END})

        self.graph = graph.compile()

    def collect(self) -> list[str] | None:
        """Run the graph and return the collected file paths, or None.

        Returns None when the graph yields no final answer or the answer is
        not valid JSON containing a "final_answer" string/list.
        """
        s = self._go_graph({"goal_item": self.goal_item, "step_count": 0})
        if s is None or 'final_answer' not in s:
            return None
        if s["final_answer"] is None:
            return None
        result = s["final_answer"].strip()
        the_obj = parse_final_answer(result)
        if the_obj is None or "final_answer" not in the_obj:
            logger.error(f"Final answer is not a valid JSON: {result}")
            return None
        final_result = the_obj["final_answer"]
        files = None
        # Accept either a single path string or a list of paths.
        if isinstance(final_result, str):
            final_result = final_result.strip()
            files = [final_result]
        elif isinstance(final_result, list):
            files = final_result
        else:
            logger.error(f"Final answer is not a valid JSON list or string: {result}")
            return None

        # Expand any directories in the answer into their contained files.
        files = flatten_files(self.repo_path, files)
        return files
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Callable, Optional, TypedDict
|
|
3
|
+
from langchain.prompts import ChatPromptTemplate
|
|
4
|
+
from langchain_openai.chat_models.base import BaseChatOpenAI
|
|
5
|
+
from langchain_core.messages import AIMessage
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
import logging
|
|
8
|
+
|
|
9
|
+
from bioguider.agents.agent_tools import agent_tool
|
|
10
|
+
from bioguider.agents.agent_utils import read_file, summarize_file
|
|
11
|
+
from bioguider.agents.peo_common_step import PEOWorkflowState
|
|
12
|
+
from bioguider.agents.common_agent import CommonAgent
|
|
13
|
+
from bioguider.agents.common_agent_2step import CommonAgentTwoSteps
|
|
14
|
+
from bioguider.database.summarized_file_db import SummarizedFilesDb
|
|
15
|
+
from bioguider.utils.constants import MAX_FILE_LENGTH
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
class CollectionWorkflowState(TypedDict):
    """Shared state threaded through the collection plan/execute/observe graph."""

    # Model and per-step output callback shared by all steps.
    llm: Optional[BaseChatOpenAI]
    step_output_callback: Optional[Callable]

    # Accumulated tool output from prior rounds (rendered into prompts).
    intermediate_steps: Optional[str]
    # Output, analysis and thoughts produced by the most recent step.
    step_output: Optional[str]
    step_analysis: Optional[str]
    step_thoughts: Optional[str]
    # Annotation fixed: the plan step stores convert_plan_to_string(res)
    # (a string) and the observe step calls .strip() on it, so this is a
    # str, not a list[dict].
    plan_actions: Optional[str]

    # The goal item key (indexes COLLECTION_PROMPTS).
    goal_item: Optional[str]
    # Set by the observe step when collection is done; ends the graph loop.
    final_answer: Optional[str]
    # Number of observe rounds completed so far.
    step_count: Optional[int]
|
|
32
|
+
|
|
33
|
+
# Template for the goal description handed to check_file_related_tool;
# formatted with ChatPromptTemplate.from_template in CollectionTask.
RELATED_FILE_GOAL_ITEM = """
Your task is to determine whether the file is related to **{goal_item}**.

{related_file_description}
"""

# Prompt asking the LLM to judge, from a file summary, whether the file is
# related to the goal item; expects a Yes/No word plus one sentence.
CHECK_FILE_RELATED_USER_PROMPT = ChatPromptTemplate.from_template("""
You are given a summary of a file’s content.

{goal_item_desc}

Here is the file summary:
```
{summarized_file_content}
```

### **Question:**
Does this file appear to contain related information?

---

### **Output Format:**
Respond with exactly two parts:
1. A single word: Yes or No (indicating if the file meets the goal criteria)
2. One brief explanatory sentence.
For example: Yes. This file is a compiled binary file, so, it is related to the compiled standalone file (goal item).
""")
|
|
60
|
+
|
|
61
|
+
class CheckFileRelatedResult(BaseModel):
    """Structured LLM output schema for check_file_related_tool."""

    # Two-part verdict string: "Yes"/"No" plus one explanatory sentence.
    is_related: str = Field(description="A string conclusion specify if the provided file is related. The string value contains two parts:\n 1. A single word: Yes or No (indicating if the file meets the goal criteria).\n 2. One brief explanatory sentence.")
|
|
64
|
+
class check_file_related_tool(agent_tool):
    """ Check if the file is related to the goal item
    Args:
        file_path str: file path
    Returns:
        str: A string conclusion. The string conclusion contains two parts:\n 1. A single word: Yes or No (indicating if the file meets the goal criteria).\n 2. One brief explanatory sentence.
    """
    # NOTE: this docstring is runtime-visible — CollectionTask registers it as
    # the LangChain tool description — so its wording must not drift.

    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str,
        goal_item_desc: str,
        output_callback: Callable | None = None,
        summarize_instruction: str | None = None,
        summarize_level: int | None = 6,
        summarized_files_db: SummarizedFilesDb | None = None,
    ):
        super().__init__(llm=llm, output_callback=output_callback)
        self.repo_path = repo_path
        # Rendered RELATED_FILE_GOAL_ITEM text describing what "related" means.
        self.goal_item_desc = goal_item_desc
        self.summarize_instruction = summarize_instruction \
            if summarize_instruction is not None else "N/A"
        self.summarize_level = summarize_level
        self.summarized_files_db = summarized_files_db

    def run(self, file_path: str) -> str:
        """Summarize the file and ask the LLM whether it matches the goal.

        Returns a "Yes./No. <sentence>" verdict, or a short error string when
        the file cannot be read or summarized.
        """
        # FIX: strip before joining — previously the strip happened after
        # os.path.join, so an argument like " README.md" produced
        # "<repo>/ README.md" whose inner space survived the outer strip and
        # made os.path.isfile fail.
        file_path = file_path.strip()
        if self.repo_path not in file_path:
            file_path = os.path.join(self.repo_path, file_path)
        if not os.path.isfile(file_path):
            return "Can't read file"

        # FIX: initialize file_content so the combined check below can never
        # hit an unbound local if read_file raises.
        file_content = None
        check_prompts = None
        try:
            file_content = read_file(file_path)
        except UnicodeDecodeError as e:
            # Binary file: fall back to judging by name/extension only.
            logger.error(str(e))
            check_prompts = "Can't summarize binary file, please decide according to file name and extension."
        except Exception as e:
            logger.error(str(e))
            check_prompts = "Failed to summarize file, please decide according to file name and extension."
        if check_prompts is None and file_content is None:
            return "Failed to read file"
        if check_prompts is not None:
            # Feed the fallback instruction in place of a real summary.
            summarized_content = check_prompts
        else:
            # Truncate oversized files before summarizing to bound token cost.
            if len(file_content) > MAX_FILE_LENGTH:
                file_content = file_content[:MAX_FILE_LENGTH]
            summarized_content, token_usage = summarize_file(
                llm=self.llm,
                name=file_path,
                content=file_content,
                level=self.summarize_level,
                summary_instructions=self.summarize_instruction,
                db=self.summarized_files_db,
            )
            if summarized_content is None:
                return "Failed to summarize file"
            self._print_token_usage(token_usage)

        prompt = CHECK_FILE_RELATED_USER_PROMPT.format(
            goal_item_desc=self.goal_item_desc,
            summarized_file_content=summarized_content,
        )

        agent = CommonAgentTwoSteps(llm=self.llm)
        res, _, token_usage, reasoning = agent.go(
            system_prompt=prompt,
            instruction_prompt="Now, please check if the file is related to the goal item.",
            schema=CheckFileRelatedResult,
        )
        res: CheckFileRelatedResult = res

        self._print_step_output(step_output=reasoning)
        self._print_token_usage(token_usage)
        return res.is_related
|
|
142
|
+
|