bioguider-0.2.52-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +92 -0
- bioguider/agents/agent_tools.py +176 -0
- bioguider/agents/agent_utils.py +504 -0
- bioguider/agents/collection_execute_step.py +182 -0
- bioguider/agents/collection_observe_step.py +125 -0
- bioguider/agents/collection_plan_step.py +156 -0
- bioguider/agents/collection_task.py +184 -0
- bioguider/agents/collection_task_utils.py +142 -0
- bioguider/agents/common_agent.py +137 -0
- bioguider/agents/common_agent_2step.py +215 -0
- bioguider/agents/common_conversation.py +61 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +110 -0
- bioguider/agents/consistency_query_step.py +77 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +154 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_installation_task.py +270 -0
- bioguider/agents/evaluation_readme_task.py +767 -0
- bioguider/agents/evaluation_submission_requirements_task.py +172 -0
- bioguider/agents/evaluation_task.py +206 -0
- bioguider/agents/evaluation_tutorial_task.py +169 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
- bioguider/agents/evaluation_userguide_prompts.py +179 -0
- bioguider/agents/evaluation_userguide_task.py +154 -0
- bioguider/agents/evaluation_utils.py +127 -0
- bioguider/agents/identification_execute_step.py +181 -0
- bioguider/agents/identification_observe_step.py +104 -0
- bioguider/agents/identification_plan_step.py +140 -0
- bioguider/agents/identification_task.py +270 -0
- bioguider/agents/identification_task_utils.py +22 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +253 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/code_structure_db.py +500 -0
- bioguider/database/summarized_file_db.py +146 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/benchmark_metrics.py +610 -0
- bioguider/generation/change_planner.py +189 -0
- bioguider/generation/document_renderer.py +157 -0
- bioguider/generation/llm_cleaner.py +67 -0
- bioguider/generation/llm_content_generator.py +1128 -0
- bioguider/generation/llm_injector.py +809 -0
- bioguider/generation/models.py +85 -0
- bioguider/generation/output_manager.py +74 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +166 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +436 -0
- bioguider/generation/test_metrics.py +189 -0
- bioguider/managers/benchmark_manager.py +785 -0
- bioguider/managers/evaluation_manager.py +215 -0
- bioguider/managers/generation_manager.py +686 -0
- bioguider/managers/generation_test_manager.py +107 -0
- bioguider/managers/generation_test_manager_v2.py +525 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +651 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +138 -0
- bioguider/settings.py +103 -0
- bioguider/utils/code_structure_builder.py +59 -0
- bioguider/utils/constants.py +135 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +215 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +551 -0
- bioguider/utils/utils.py +163 -0
- bioguider-0.2.52.dist-info/LICENSE +21 -0
- bioguider-0.2.52.dist-info/METADATA +51 -0
- bioguider-0.2.52.dist-info/RECORD +84 -0
- bioguider-0.2.52.dist-info/WHEEL +4 -0
bioguider/agents/identification_execute_step.py

@@ -0,0 +1,181 @@

import logging

from langchain_openai.chat_models.base import BaseChatOpenAI
from langchain.tools import BaseTool
from langchain.agents import AgentExecutor, create_react_agent
from langchain_community.callbacks.openai_info import OpenAICallbackHandler

from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
from bioguider.agents.agent_utils import CustomOutputParser, CustomPromptTemplate
from bioguider.agents.peo_common_step import (
    PEOCommonStep,
)

logger = logging.getLogger(__name__)

## execution system prompt
IDENTIFICATION_EXECUTION_SYSTEM_PROMPT = """You are an expert Python developer.

You are given a **plan** and are expected to complete it using Python code and the available tools.

---

### **Available Tools**
{tools}

---

### **Your Task**

Execute the plan step by step using the format below:

```
Thought: Describe what you are thinking or planning to do next.
Action: The tool you are going to use (must be one of: {tool_names})
Action Input: The input to the selected action
Observation: The result returned by the action
```

You may repeat the **Thought → Action → Action Input → Observation** loop as many times as needed.

Once the plan is fully completed, output the result in the following format:
```
Thought: I have completed the plan.
Final Answer:
Action: {{tool_name}}
Action Input: {{file_name1}}
Action Observation: {{Observation1}}
---
Action: {{tool_name}}
Action Input: {{file_name2}}
Action Observation: {{Observation2}}
---
...
```

---

### **Example**
```
Action: summarize_file_tool
Action Input: README.md
Action Input: "Please extract license information in summarized file content."
Observation: # BioGuider\nBioGuider is a Python package for bioinformatics.\n...
...
Final Answer:
Action: summarize_file_tool
Action Input: README.md
Action Input: "N/A"
Action Observation: # BioGuider\nBioGuider is a Python package for bioinformatics.\n...
---
Action: check_file_related_tool
Action Input: pyproject.toml
Action Observation: Yes, the file is related to the project.
---
...
```

---

### **Important Notes**

- You must strictly follow the provided plan.
- **Do not take any additional or alternative actions**, even if:
  - No relevant result is found
  - The file content is missing, empty, or irrelevant
- If no information is found in a step, simply proceed to the next action in the plan without improvising.
- Only use the tools specified in the plan actions. No independent decisions or extra steps are allowed.

### **Plan**
{plan_actions}

### **Actions Already Taken**
{agent_scratchpad}

---

{input}
"""


class IdentificationExecuteStep(PEOCommonStep):
    """
    Execution step of the identification PEO (Plan-Execute-Observe) workflow.
    It runs the planned tool actions with a ReAct agent and stores the result
    in state["step_output"].
    """
    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str,
        repo_structure: str,
        gitignore_path: str,
        custom_tools: list[BaseTool] | None = None,
    ):
        super().__init__(llm=llm)
        self.llm = llm
        self.step_name = "Identification Execution Step"
        self.repo_path = repo_path
        self.repo_structure = repo_structure
        self.gitignore_path = gitignore_path
        self.custom_tools = custom_tools if custom_tools is not None else []

    def _execute_directly(self, state):
        plan_actions = state["plan_actions"]
        prompt = CustomPromptTemplate(
            template=IDENTIFICATION_EXECUTION_SYSTEM_PROMPT,
            tools=self.custom_tools,
            plan_actions=plan_actions,
            input_variables=[
                "tools",
                "tool_names",
                "agent_scratchpad",
                "intermediate_steps",
                "plan_actions",
            ],
        )
        output_parser = CustomOutputParser()
        agent = create_react_agent(
            llm=self.llm,
            tools=self.custom_tools,
            prompt=prompt,
            output_parser=output_parser,
            stop_sequence=["\nObservation:"],
        )
        callback_handler = OpenAICallbackHandler()
        agent_executor = AgentExecutor(
            agent=agent,
            tools=self.custom_tools,
            max_iterations=10,
        )
        response = agent_executor.invoke(
            input={"plan_actions": plan_actions, "input": "Now, let's begin."},
            callbacks=[callback_handler],
        )
        # parse the response
        if "output" in response:
            output = response["output"]
            if "**Final Answer:**" in output:
                final_answer = output.split("**Final Answer:**")[-1].strip().strip(":")
                step_output = final_answer
            elif "Final Answer" in output:
                final_answer = output.split("Final Answer")[-1].strip().strip(":")
                step_output = final_answer
            else:
                step_output = output
            step_output = step_output.strip().strip("```").strip('"""')
            self._print_step(state, step_output=step_output)
            state["step_output"] = step_output
        else:
            logger.error("No output found in the response.")
            self._print_step(
                state,
                step_output="Error: No output found in the response.",
            )
            state["step_output"] = "Error: No output found in the response."

        token_usage = vars(callback_handler)
        token_usage = {**DEFAULT_TOKEN_USAGE, **token_usage}

        return state, token_usage
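A minimal sketch of how this execution step could be driven on its own (not part of the package diff): it assumes a `ChatOpenAI` model, wraps `read_file_tool` the same way `identification_task.py` later in this diff does, and calls `_execute_directly` with a hand-written plan; the exact state keys the base `PEOCommonStep` expects beyond `plan_actions` are an assumption.

# Hypothetical standalone driver for IdentificationExecuteStep (a sketch, not the
# packaged entry point): build the step, hand it a plan in the "Step / Step Input"
# text format produced by the plan step, and read back state["step_output"].
from langchain_openai import ChatOpenAI
from langchain.tools import Tool

from bioguider.agents.identification_execute_step import IdentificationExecuteStep
from bioguider.agents.agent_tools import read_file_tool

repo_path = "/path/to/repo"           # assumption: a locally cloned repository
llm = ChatOpenAI(model="gpt-4o")      # assumption: any BaseChatOpenAI-compatible model

tool_rf = read_file_tool(repo_path=repo_path)
custom_tools = [
    Tool(
        name=tool_rf.__class__.__name__,
        func=tool_rf.run,
        description=tool_rf.__class__.__doc__,
    ),
]

step = IdentificationExecuteStep(
    llm=llm,
    repo_path=repo_path,
    repo_structure="README.md - f\npyproject.toml - f\n",
    gitignore_path=f"{repo_path}/.gitignore",
    custom_tools=custom_tools,
)

# The plan text mirrors the format emitted by IdentificationPlanStep.
state = {"plan_actions": "Step: read_file_tool\nStep Input: README.md\n"}
state, token_usage = step._execute_directly(state)
print(state["step_output"])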
bioguider/agents/identification_observe_step.py

@@ -0,0 +1,104 @@

from langchain.prompts import ChatPromptTemplate

from bioguider.agents.agent_utils import ObservationResult
from bioguider.agents.common_agent_2step import CommonAgentTwoSteps, CommonAgentTwoChainSteps
from bioguider.agents.identification_task_utils import IdentificationWorkflowState
from bioguider.agents.peo_common_step import PEOWorkflowState, PEOCommonStep
from bioguider.utils.constants import MAX_STEP_COUNT


## observation system prompt
IDENTIFICATION_OBSERVATION_SYSTEM_PROMPT = """Your goal is:
{goal}

### **Repository File Structure**
Here is the 2-level file structure of the repository (f - file, d - directory, l - symlink, u - unknown):
{repo_structure}

### **Intermediate Output**
{intermediate_output}

### **Instructions**
Carefully review the **Goal**, **Repository File Structure**, and **Intermediate Output**.
- If you believe the goal **can be achieved**, proceed as follows:
  - Provide your reasoning under **Analysis**
  - Then provide your result under **FinalAnswer**
```
**Analysis**: your analysis here
**FinalAnswer**: your final answer here, in **raw json format**, **including** the surrounding "{{}}" but **without** using code fence (```json ... ```),
For example, output exactly: {final_answer_example}
```
- If the information is **not sufficient** to achieve the goal, simply explain why under **Thoughts**:
```
**Thoughts**: your thoughts here
```
Be precise and support your reasoning with evidence from the input.

### **Important Instructions**
{important_instructions}

### Notes
We are collecting information over multiple rounds; your thoughts and the output of this step will be persisted, so please **do not rush to provide a Final Answer**.
If you find the current information insufficient, share your reasoning or thoughts instead; we'll continue with the next round accordingly.
"""


class IdentificationObserveStep(PEOCommonStep):
    def __init__(
        self,
        llm,
        repo_path: str,
        repo_structure: str,
        gitignore_path: str,
        custom_tools: list | None = None,
    ):
        super().__init__(llm)
        self.step_name = "Identification Observe Step"
        self.repo_path = repo_path
        self.repo_structure = repo_structure
        self.gitignore_path = gitignore_path
        self.custom_tools = custom_tools if custom_tools is not None else []

    def _prepare_system_prompt(self, state: IdentificationWorkflowState):
        goal = state["goal"]
        important_instructions = "N/A" \
            if "observe_instructions" not in state else state["observe_instructions"]
        final_answer_example = state["final_answer_example"]
        intermediate_output = self._build_intermediate_steps(state)
        prompt = ChatPromptTemplate.from_template(IDENTIFICATION_OBSERVATION_SYSTEM_PROMPT)

        return prompt.format(
            goal=goal,
            repo_structure=self.repo_structure,
            intermediate_output=intermediate_output,
            final_answer_example=final_answer_example,
            important_instructions=important_instructions,
        )

    def _execute_directly(self, state: IdentificationWorkflowState):
        step_count = state["step_count"]
        instruction = "Now, we have reached max recursion limit, please give me the **final answer** based on the current information" \
            if step_count == MAX_STEP_COUNT / 3 - 2 else "Now, let's begin."
        system_prompt = self._prepare_system_prompt(state)
        agent = CommonAgentTwoSteps(llm=self.llm)
        res, _, token_usage, reasoning_process = agent.go(
            system_prompt=system_prompt,
            instruction_prompt=instruction,
            schema=ObservationResult,
        )
        state["final_answer"] = res.FinalAnswer
        analysis = res.Analysis
        thoughts = res.Thoughts
        state["step_analysis"] = analysis
        state["step_thoughts"] = thoughts
        state["step_count"] += 1
        self._print_step(
            state,
            step_output=f"**Observation Reasoning Process {state['step_count']}**\n{reasoning_process}"
        )
        self._print_step(
            state,
            step_output=f"Final Answer: {res.FinalAnswer if res.FinalAnswer else None}\nAnalysis: {analysis}\nThoughts: {thoughts}",
        )
        return state, token_usage
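`ObservationResult` is imported from `agent_utils.py`, which is not reproduced in this diff; the sketch below is only an assumed shape, inferred from the three attributes the step reads (`FinalAnswer`, `Analysis`, `Thoughts`), together with the two ways an observation round can end.

# Assumed shape of ObservationResult, inferred from the attribute accesses in
# IdentificationObserveStep._execute_directly; the real definition lives in
# bioguider/agents/agent_utils.py and may differ.
from pydantic import BaseModel, Field

class ObservationResultSketch(BaseModel):
    Analysis: str | None = Field(default=None, description="reasoning when the goal looks achievable")
    FinalAnswer: str | None = Field(default=None, description="raw JSON string, e.g. '{\"project_type\": \"package\"}'")
    Thoughts: str | None = Field(default=None, description="explanation when information is still insufficient")

# A round that reaches a conclusion sets FinalAnswer, which ends the PEO loop:
done = ObservationResultSketch(
    Analysis="pyproject.toml declares a package with console scripts.",
    FinalAnswer='{"project_type": "package"}',
)
# A round that needs more evidence sets only Thoughts, so the loop plans again:
undecided = ObservationResultSketch(Thoughts="README.md has not been summarized yet.")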
bioguider/agents/identification_plan_step.py

@@ -0,0 +1,140 @@

from langchain.prompts import ChatPromptTemplate
from langchain_openai.chat_models.base import BaseChatOpenAI
from langchain.tools import BaseTool
from pydantic import BaseModel, Field

from bioguider.agents.agent_utils import get_tool_names_and_descriptions
from bioguider.agents.common_agent_2step import CommonAgentTwoChainSteps, CommonAgentTwoSteps
from bioguider.agents.identification_task_utils import IdentificationWorkflowState
from bioguider.agents.peo_common_step import PEOCommonStep

## plan system prompt
IDENTIFICATION_PLAN_SYSTEM_PROMPT = ChatPromptTemplate.from_template("""### **Goal**
You are an expert developer in the biomedical domain. Your goal is:
{goal}

### **Repository File Structure**
Here is the 2-level file structure of the repository (f - file, d - directory, l - symlink, u - unknown):
{repo_structure}

### **Function Tools**
You are provided the following function tools:
{tools}

### Intermediate Steps
Here are the intermediate step results:
{intermediate_steps}

### Intermediate Thoughts
Analysis: {intermediate_analysis}
Thoughts: {intermediate_thoughts}

### **Instruction**
We will repeat **Plan - Execution - Observation** loops as many times as needed. All the results in each round will be persisted,
meaning that states and variables will be persisted through multiple rounds of plan execution. Be sure to take advantage of this by
developing your collection plan incrementally and reflecting on the intermediate observations at each round, instead of coding up
everything in one go. Be sure to take only one or two actions in each step.

### **Important Instructions**
{important_instructions}

### **Output**
Your plan should follow this format:
Step: tool name, should be one of {tool_names}
Step Input: file name or directory name
Step: tool name, should be one of {tool_names}
Step Input: file name or directory name
""")


class IdentificationPlanResult(BaseModel):
    """ Identification Plan Result """
    actions: list[dict] = Field(description="a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]")


IdentificationPlanResultJsonSchema = {
    "title": "identification_plan_result",
    "description": "plan result",
    "type": "object",
    "properties": {
        "actions": {
            "type": "array",
            "description": """a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]""",
            "title": "Actions",
            "items": {"type": "object"}
        },
    },
    "required": ["actions"],
}


class IdentificationPlanStep(PEOCommonStep):
    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str,
        repo_structure: str,
        gitignore_path: str,
        custom_tools: list[BaseTool] | None = None,
    ):
        super().__init__(llm)
        self.step_name = "Identification Plan Step"
        self.repo_path = repo_path
        self.repo_structure = repo_structure
        self.gitignore_path = gitignore_path
        self.custom_tools = custom_tools if custom_tools is not None else []

    def _prepare_system_prompt(self, state: IdentificationWorkflowState) -> str:
        goal = state["goal"]
        important_instructions = "N/A" if "plan_instructions" not in state else state["plan_instructions"]
        repo_structure = self.repo_structure
        intermediate_steps = self._build_intermediate_steps(state)
        step_analysis, step_thoughts = self._build_intermediate_analysis_and_thoughts(state)
        self._print_step(
            state,
            step_output="**Intermediate Step Output**\n" + intermediate_steps
        )
        self._print_step(
            state,
            step_output=f"**Intermediate Step Analysis**\n{step_analysis}\n**Intermediate Step Thoughts**\n{step_thoughts}",
        )
        tool_names, tools_desc = get_tool_names_and_descriptions(self.custom_tools)
        return IDENTIFICATION_PLAN_SYSTEM_PROMPT.format(
            goal=goal,
            repo_structure=repo_structure,
            tools=tools_desc,
            intermediate_steps=intermediate_steps,
            intermediate_analysis=step_analysis,
            intermediate_thoughts=step_thoughts,
            tool_names=tool_names,
            important_instructions=important_instructions,
        )

    def _convert_to_plan_actions_text(self, actions: list[dict]) -> str:
        plan_str = ""
        for action in actions:
            action_str = f"Step: {action['name']}\n"
            action_str += f"Step Input: {action['input']}\n"
            plan_str += action_str
        return plan_str

    def _execute_directly(self, state: IdentificationWorkflowState):
        system_prompt = self._prepare_system_prompt(state)
        agent = CommonAgentTwoSteps(llm=self.llm)
        res, _, token_usage, reasoning_process = agent.go(
            system_prompt=system_prompt,
            instruction_prompt="Now, let's begin.",
            schema=IdentificationPlanResultJsonSchema,
        )
        PEOCommonStep._reset_step_state(state)
        res = IdentificationPlanResult(**res)
        self._print_step(
            state,
            step_output="**Reasoning Process**\n" + reasoning_process,
        )
        self._print_step(
            state,
            step_output=f"**Plan**\n{res.actions}"
        )
        state["plan_actions"] = self._convert_to_plan_actions_text(res.actions)

        return state, token_usage
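For reference, a hypothetical plan payload that fits `IdentificationPlanResultJsonSchema`, and the `Step / Step Input` text it would be rendered to in the style of `_convert_to_plan_actions_text`; the tool names are illustrative, since the real ones come from the tools registered in `identification_task.py`.

# Illustrative plan payload and its "Step / Step Input" rendering, mirroring
# IdentificationPlanStep._convert_to_plan_actions_text; tool names are examples only.
from bioguider.agents.identification_plan_step import IdentificationPlanResult

plan = IdentificationPlanResult(actions=[
    {"name": "summarize_file_tool", "input": "README.md"},
    {"name": "read_file_tool", "input": "pyproject.toml"},
])

plan_text = ""
for action in plan.actions:
    plan_text += f"Step: {action['name']}\nStep Input: {action['input']}\n"

print(plan_text)
# Step: summarize_file_tool
# Step Input: README.md
# Step: read_file_tool
# Step Input: pyproject.toml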
bioguider/agents/identification_task.py

@@ -0,0 +1,270 @@

import os
import json
import logging
from enum import Enum
from typing import Callable
from pydantic import BaseModel, Field
from langchain_openai.chat_models.base import BaseChatOpenAI
from langchain.tools import Tool, StructuredTool
from langgraph.graph import StateGraph, START, END

from bioguider.utils.constants import PrimaryLanguageEnum, ProjectTypeEnum
from bioguider.utils.file_utils import get_file_type
from bioguider.agents.agent_tools import (
    read_file_tool,
    read_directory_tool,
    summarize_file_tool,
)
from bioguider.agents.agent_utils import (
    read_directory,
    try_parse_json_object,
)
from bioguider.agents.identification_execute_step import IdentificationExecuteStep
from bioguider.agents.identification_observe_step import IdentificationObserveStep
from bioguider.agents.identification_plan_step import IdentificationPlanStep
from bioguider.agents.identification_task_utils import IdentificationWorkflowState
from bioguider.agents.peo_common_step import PEOCommonStep
from bioguider.agents.prompt_utils import (
    IDENTIFICATION_GOAL_PROJECT_TYPE,
    IDENTIFICATION_GOAL_PRIMARY_LANGUAGE,
    IDENTIFICATION_GOAL_META_DATA,
)
from bioguider.agents.python_ast_repl_tool import CustomPythonAstREPLTool
from bioguider.agents.agent_task import AgentTask
from bioguider.database.summarized_file_db import SummarizedFilesDb

logger = logging.getLogger(__name__)

META_DATA_FINAL_ANSWER_EXAMPLE = '{{"name": "repo name", ...}}'
PROJECT_TYPE_FINAL_ANSWER_EXAMPLE = '{{"project_type": "project type"}}'
PRIMARY_LANGUAGE_FINAL_ANSWER_EXAMPLE = '{{"primary_language": "primary language"}}'


class IdentificationPlanResult(BaseModel):
    """ Identification Plan Result """
    actions: list[dict] = Field(description="a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]")


IdentificationPlanResultJsonSchema = {
    "title": "identification_plan_result",
    "description": "plan result",
    "type": "object",
    "properties": {
        "actions": {
            "type": "array",
            "description": """a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]""",
            "title": "Actions",
            "items": {"type": "object"}
        },
    },
    "required": ["actions"],
}


class IdentificationTask(AgentTask):
    def __init__(
        self,
        llm: BaseChatOpenAI,
        step_callback: Callable | None = None,
        summarized_files_db: SummarizedFilesDb | None = None,
        provided_files: list[str] | None = None,
    ):
        super().__init__(llm=llm, step_callback=step_callback, summarized_files_db=summarized_files_db)
        self.repo_path: str | None = None
        self.gitignore_path: str | None = None
        self.repo_structure: str | None = None
        self.tools = []
        self.custom_tools = []
        self.steps: list[PEOCommonStep] = []
        self.provided_files = provided_files

    def _prepare_tools(self):
        tool_rd = read_directory_tool(repo_path=self.repo_path)
        tool_sum = summarize_file_tool(
            llm=self.llm,
            repo_path=self.repo_path,
            output_callback=self.step_callback,
            db=self.summarized_files_db,
        )
        tool_rf = read_file_tool(repo_path=self.repo_path)

        self.tools = [tool_rd, tool_sum, tool_rf]
        self.custom_tools = [
            Tool(
                name=tool_rd.__class__.__name__,
                func=tool_rd.run,
                description=tool_rd.__class__.__doc__,
            ),
            StructuredTool.from_function(
                tool_sum.run,
                description=tool_sum.__class__.__doc__,
                name=tool_sum.__class__.__name__,
            ),
            Tool(
                name=tool_rf.__class__.__name__,
                func=tool_rf.run,
                description=tool_rf.__class__.__doc__,
            ),
        ]
        # self.custom_tools.append(CustomPythonAstREPLTool())

    def _initialize(self):
        if not os.path.exists(self.repo_path):
            raise ValueError(f"Repository path {self.repo_path} does not exist.")
        files = self.provided_files
        if files is None:
            files = read_directory(self.repo_path, os.path.join(self.repo_path, ".gitignore"))
        file_pairs = [(f, get_file_type(os.path.join(self.repo_path, f)).value) for f in files]
        self.repo_structure = ""
        for f, f_type in file_pairs:
            self.repo_structure += f"{f} - {f_type}\n"

        self._prepare_tools()
        self.steps = [
            IdentificationPlanStep(
                llm=self.llm,
                repo_path=self.repo_path,
                repo_structure=self.repo_structure,
                gitignore_path=self.gitignore_path,
                custom_tools=self.custom_tools,
            ),
            IdentificationExecuteStep(
                llm=self.llm,
                repo_path=self.repo_path,
                repo_structure=self.repo_structure,
                gitignore_path=self.gitignore_path,
                custom_tools=self.custom_tools,
            ),
            IdentificationObserveStep(
                llm=self.llm,
                repo_path=self.repo_path,
                repo_structure=self.repo_structure,
                gitignore_path=self.gitignore_path,
                custom_tools=self.custom_tools,
            ),
        ]

    def _compile(
        self,
        repo_path: str,
        gitignore_path: str,
        **kwargs,
    ):
        self.repo_path = repo_path
        self.gitignore_path = gitignore_path
        self._initialize()

        def check_observation_step(state: IdentificationWorkflowState):
            if "final_answer" in state and state["final_answer"] is not None:
                return END
            return "plan_step"

        graph = StateGraph(IdentificationWorkflowState)
        graph.add_node("plan_step", self.steps[0].execute)
        graph.add_node("execute_step", self.steps[1].execute)
        graph.add_node("observe_step", self.steps[2].execute)
        graph.add_edge(START, "plan_step")
        graph.add_edge("plan_step", "execute_step")
        graph.add_edge("execute_step", "observe_step")
        graph.add_conditional_edges("observe_step", check_observation_step, {"plan_step", END})

        self.graph = graph.compile()

    def identify_project_type(self):
        s = self._go_graph({
            "goal": IDENTIFICATION_GOAL_PROJECT_TYPE,
            "final_answer_example": PROJECT_TYPE_FINAL_ANSWER_EXAMPLE,
            "step_count": 0,
        })
        proj_type = s["final_answer"] if "final_answer" in s else "unknown type"
        return self._parse_project_type(proj_type)

    def identify_primary_language(self):
        s = self._go_graph({
            "goal": IDENTIFICATION_GOAL_PRIMARY_LANGUAGE,
            "final_answer_example": PRIMARY_LANGUAGE_FINAL_ANSWER_EXAMPLE,
            "step_count": 0,
        })
        language = s["final_answer"] if "final_answer" in s else "unknown type"
        return self._parse_primary_language(language)

    def identify_meta_data(self):
        s = self._go_graph({
            "goal": IDENTIFICATION_GOAL_META_DATA,
            "final_answer_example": META_DATA_FINAL_ANSWER_EXAMPLE,
            "step_count": 0,
        })
        meta_data = s["final_answer"] if "final_answer" in s else "unknown type"
        return self._parse_meta_data(meta_data)

    def identify_customize_goal(
        self,
        goal: str,
        final_answer_example: str,
        plan_instructions: str = "N/A",
        observe_instructions: str = "N/A",
    ):
        s = self._go_graph({
            "goal": goal,
            "final_answer_example": final_answer_example,
            "plan_instructions": plan_instructions,
            "observe_instructions": observe_instructions,
            "step_count": 0,
        })
        return s["final_answer"] if "final_answer" in s else None

    def _parse_project_type(self, proj_type_obj: str) -> ProjectTypeEnum:
        proj_type_obj = proj_type_obj.strip()
        the_obj = try_parse_json_object(proj_type_obj)
        if the_obj is not None and "project_type" in the_obj:
            proj_type = the_obj["project_type"]
        elif proj_type_obj in [
            ProjectTypeEnum.application.value,
            ProjectTypeEnum.package.value,
            ProjectTypeEnum.pipeline.value,
        ]:
            return ProjectTypeEnum(proj_type_obj)
        else:
            proj_type = "unknown"
        if proj_type == "application":
            return ProjectTypeEnum.application
        elif proj_type == "package":
            return ProjectTypeEnum.package
        elif proj_type == "pipeline":
            return ProjectTypeEnum.pipeline
        else:
            return ProjectTypeEnum.unknown

    def _parse_primary_language(self, language_obj: str) -> PrimaryLanguageEnum:
        # try to handle some common errors
        language_obj = language_obj.strip()
        the_obj = try_parse_json_object(language_obj)
        if the_obj is not None and "primary_language" in the_obj:
            language = the_obj["primary_language"]
        elif language_obj in [
            PrimaryLanguageEnum.python.value,
            PrimaryLanguageEnum.R.value,
        ]:
            return PrimaryLanguageEnum(language_obj)
        else:
            language = "unknown"

        language = language.strip()
        if language == "python":
            return PrimaryLanguageEnum.python
        elif language == "R":
            return PrimaryLanguageEnum.R
        else:
            return PrimaryLanguageEnum.unknown

    def _parse_meta_data(self, meta_data_obj: str) -> dict:
        meta_data_obj = meta_data_obj.strip()
        the_obj = try_parse_json_object(meta_data_obj)

        return the_obj if the_obj is not None else {
            "name": "unknown",
            "description": "unknown",
            "license": "unknown",
            "owner": "unknown",
        }
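A minimal end-to-end sketch of driving `IdentificationTask` (not part of the package diff): it calls `_compile` directly because the public entry point lives on `AgentTask` in `agent_task.py`, which is outside the files reproduced above; the model name and paths are assumptions.

# Hypothetical end-to-end driver for IdentificationTask: compile the
# Plan -> Execute -> Observe graph for a local repository, then run the three
# built-in identification goals. Paths and the model name are assumptions.
from langchain_openai import ChatOpenAI

from bioguider.agents.identification_task import IdentificationTask

repo_path = "/path/to/local/repo"            # assumption: repository already cloned
llm = ChatOpenAI(model="gpt-4o")             # assumption: any BaseChatOpenAI model

task = IdentificationTask(llm=llm)
# _compile builds the LangGraph state machine; a public compile() wrapper on
# AgentTask is expected to call this, but that file is not part of this diff.
task._compile(repo_path=repo_path, gitignore_path=f"{repo_path}/.gitignore")

print(task.identify_project_type())      # e.g. ProjectTypeEnum.package
print(task.identify_primary_language())  # e.g. PrimaryLanguageEnum.python
print(task.identify_meta_data())         # e.g. {"name": ..., "license": ..., ...}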