bioguider 0.2.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +92 -0
- bioguider/agents/agent_tools.py +176 -0
- bioguider/agents/agent_utils.py +504 -0
- bioguider/agents/collection_execute_step.py +182 -0
- bioguider/agents/collection_observe_step.py +125 -0
- bioguider/agents/collection_plan_step.py +156 -0
- bioguider/agents/collection_task.py +184 -0
- bioguider/agents/collection_task_utils.py +142 -0
- bioguider/agents/common_agent.py +137 -0
- bioguider/agents/common_agent_2step.py +215 -0
- bioguider/agents/common_conversation.py +61 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +110 -0
- bioguider/agents/consistency_query_step.py +77 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +154 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_installation_task.py +270 -0
- bioguider/agents/evaluation_readme_task.py +767 -0
- bioguider/agents/evaluation_submission_requirements_task.py +172 -0
- bioguider/agents/evaluation_task.py +206 -0
- bioguider/agents/evaluation_tutorial_task.py +169 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
- bioguider/agents/evaluation_userguide_prompts.py +179 -0
- bioguider/agents/evaluation_userguide_task.py +154 -0
- bioguider/agents/evaluation_utils.py +127 -0
- bioguider/agents/identification_execute_step.py +181 -0
- bioguider/agents/identification_observe_step.py +104 -0
- bioguider/agents/identification_plan_step.py +140 -0
- bioguider/agents/identification_task.py +270 -0
- bioguider/agents/identification_task_utils.py +22 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +253 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/code_structure_db.py +500 -0
- bioguider/database/summarized_file_db.py +146 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/benchmark_metrics.py +610 -0
- bioguider/generation/change_planner.py +189 -0
- bioguider/generation/document_renderer.py +157 -0
- bioguider/generation/llm_cleaner.py +67 -0
- bioguider/generation/llm_content_generator.py +1128 -0
- bioguider/generation/llm_injector.py +809 -0
- bioguider/generation/models.py +85 -0
- bioguider/generation/output_manager.py +74 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +166 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +436 -0
- bioguider/generation/test_metrics.py +189 -0
- bioguider/managers/benchmark_manager.py +785 -0
- bioguider/managers/evaluation_manager.py +215 -0
- bioguider/managers/generation_manager.py +686 -0
- bioguider/managers/generation_test_manager.py +107 -0
- bioguider/managers/generation_test_manager_v2.py +525 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +651 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +138 -0
- bioguider/settings.py +103 -0
- bioguider/utils/code_structure_builder.py +59 -0
- bioguider/utils/constants.py +135 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +215 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +551 -0
- bioguider/utils/utils.py +163 -0
- bioguider-0.2.52.dist-info/LICENSE +21 -0
- bioguider-0.2.52.dist-info/METADATA +51 -0
- bioguider-0.2.52.dist-info/RECORD +84 -0
- bioguider-0.2.52.dist-info/WHEEL +4 -0
bioguider/agents/agent_utils.py

@@ -0,0 +1,504 @@

```python
import json
from json import JSONDecodeError
import os
from pathlib import Path
import re
from typing import List, Optional, Tuple, Union
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from langchain_deepseek import ChatDeepSeek
from langchain_core.utils.interactive_env import is_interactive_env
from langchain_core.messages.base import get_msg_title_repr
from langchain_core.prompts import ChatPromptTemplate, StringPromptTemplate
from langchain_core.messages import AIMessage
from langchain_openai.chat_models.base import BaseChatOpenAI
from langchain.tools import BaseTool
from langchain.schema import AgentAction, AgentFinish
from langchain.agents import AgentOutputParser
from langgraph.prebuilt import create_react_agent
from langchain_community.callbacks.openai_info import OpenAICallbackHandler
import logging

from pydantic import BaseModel, Field

from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, MAX_FILE_LENGTH, MAX_SENTENCE_NUM
from bioguider.utils.file_utils import get_file_type
from bioguider.utils.utils import clean_action_input
from ..utils.gitignore_checker import GitignoreChecker
from ..database.summarized_file_db import SummarizedFilesDb
from bioguider.agents.common_conversation import CommonConversation
from bioguider.rag.config import configs

logger = logging.getLogger(__name__)

class PlanAgentResult(BaseModel):
    """ Identification Plan Result """
    actions: list[dict] = Field(description="a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]")

PlanAgentResultJsonSchema = {
    "title": "identification_plan_result",
    "description": "plan result",
    "type": "object",
    "properties": {
        "actions": {
            "type": "array",
            "description": """a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]""",
            "title": "Actions",
            "items": {"type": "object"}
        },
    },
    "required": ["actions"],
}
```
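A minimal sketch of how the plan payload validates, assuming the wheel is installed so `bioguider.agents.agent_utils` is importable. `PlanAgentResult` checks a parsed plan dict, while `PlanAgentResultJsonSchema` is the same shape expressed as a JSON schema for structured-output APIs:

```python
# Hypothetical usage sketch, not part of the package source.
from bioguider.agents.agent_utils import PlanAgentResult

plan = PlanAgentResult(actions=[{"name": "read_file", "input": "README.md"}])
print(plan.actions)  # [{'name': 'read_file', 'input': 'README.md'}]
```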
```python
def get_openai():
    return get_llm(
        api_key=os.environ.get("OPENAI_API_KEY"),
        model_name=os.environ.get("OPENAI_MODEL"),
        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
        api_version=os.environ.get("OPENAI_API_VERSION"),
        azure_deployment=os.environ.get("OPENAI_DEPLOYMENT_NAME"),
        max_tokens=os.environ.get("OPENAI_MAX_OUTPUT_TOKEN"),
    )

def get_llm(
    api_key: str,
    model_name: str="gpt-4o",
    azure_endpoint: str=None,
    api_version: str=None,
    azure_deployment: str=None,
    temperature: float = 0.0,
    max_tokens: int = 16384,  # Set high by default - enough for any document type
):
    """
    Create an LLM instance with appropriate parameters based on model type and API version.

    Handles parameter compatibility across different models and API versions:
    - DeepSeek models: Use max_tokens parameter
    - GPT models (newer): Use max_completion_tokens parameter
    - GPT-5+: Don't support custom temperature (uses default)
    """

    if model_name.startswith("deepseek"):
        chat = ChatDeepSeek(
            api_key=api_key,
            model=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
        )
    elif model_name.startswith("gpt"):
        llm_params = {
            "api_key": api_key,
            "model": model_name,
        }
        # Handle temperature parameter based on model capabilities
        # GPT-5+ models don't support custom temperature values
        supports_temperature = not any(restricted in model_name for restricted in ["gpt-5", "o1", "o3"])
        if supports_temperature:
            llm_params["temperature"] = temperature

        if azure_endpoint is None:
            # OpenAI
            llm_params["max_tokens"] = max_tokens
            chat = ChatOpenAI(**llm_params)
        else:
            # Azure OpenAI
            llm_params["azure_endpoint"] = azure_endpoint
            llm_params["api_version"] = api_version
            llm_params["deployment_name"] = azure_deployment
            # Determine token limit parameter name based on API version
            # Newer APIs (2024-08+) use max_completion_tokens instead of max_tokens
            use_completion_tokens = api_version and api_version >= "2024-08-01-preview"
            token_param = "max_completion_tokens" if use_completion_tokens else "max_tokens"
            llm_params[token_param] = max_tokens
            chat = AzureChatOpenAI(**llm_params)
    else:
        raise ValueError(f"Unsupported model type: {model_name}")

    # Validate the LLM instance with a simple test
    try:
        chat.invoke("Hi")
    except Exception as e:
        logger.error(f"Failed to initialize LLM {model_name}: {e}")
        return None

    return chat
```
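A minimal sketch of driving `get_llm` directly on the plain-OpenAI path, assuming `OPENAI_API_KEY` is set; the model name is illustrative. Note that `get_llm` issues a real `"Hi"` probe call, so it only returns a client when the credentials actually work:

```python
# Hypothetical usage sketch for get_llm (non-Azure path).
import os
from bioguider.agents.agent_utils import get_llm

llm = get_llm(
    api_key=os.environ["OPENAI_API_KEY"],  # assumed to be set
    model_name="gpt-4o",                   # any "gpt*" name routes to ChatOpenAI
    temperature=0.0,
)
if llm is None:  # get_llm returns None when the "Hi" probe call fails
    raise RuntimeError("LLM initialization failed; check credentials")
```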
```python
def pretty_print(message, printout = True):
    if isinstance(message, tuple):
        title = message
    else:
        if isinstance(message.content, list):
            title = get_msg_title_repr(message.type.title().upper() + " Message", bold=is_interactive_env())
            if message.name is not None:
                title += f"\nName: {message.name}"

            for i in message.content:
                if i['type'] == 'text':
                    title += f"\n{i['text']}\n"
                elif i['type'] == 'tool_use':
                    title += f"\nTool: {i['name']}"
                    title += f"\nInput: {i['input']}"
            if printout:
                print(f"{title}")
        else:
            title = get_msg_title_repr(message.type.title() + " Message", bold=is_interactive_env())
            if message.name is not None:
                title += f"\nName: {message.name}"
            title += f"\n\n{message.content}"
            if printout:
                print(f"{title}")
    return title

HUGE_FILE_LENGTH = 10 * 1024  # 10K

def read_file(
    file_path: str | Path,
) -> str | None:
    file_path = str(file_path).strip()
    if not os.path.isfile(file_path):
        return None
    with open(file_path, 'r') as f:
        content = f.read()
    return content

def write_file(file_path: str | Path, content: str):
    try:
        file_path = str(file_path).strip()
        with open(file_path, "w") as fobj:
            fobj.write(content)
        return True
    except Exception as e:
        logger.error(e)
        return False

def read_directory(
    dir_path: str | Path,
    gitignore_path: str,
    level: int=1,
) -> list[str] | None:
    dir_path = str(dir_path).strip()
    if not os.path.isdir(dir_path):
        return None
    gitignore_checker = GitignoreChecker(
        directory=dir_path,
        gitignore_path=gitignore_path,
        exclude_dir_patterns=configs["file_filters"]["excluded_dirs"],
        exclude_file_patterns=configs["file_filters"]["excluded_files"],
    )
    files = gitignore_checker.check_files_and_folders(level=level)
    return files


EVALUATION_SUMMARIZE_FILE_PROMPT = ChatPromptTemplate.from_template("""
You will be provided with the content of the file **{file_name}**:

---

### **Summary Instructions**
{summary_instructions}
The content is lengthy. Please generate a concise summary ({sentence_num1}-{sentence_num2} sentences).

---

### **Important Instructions**
{summarize_prompt}

---

### **File Content**
Here is the file content:
{file_content}

---

Now, let's start to summarize.
""")


def summarize_file(
    llm: BaseChatOpenAI,
    name: str | Path,
    content: str | None = None,
    level: int = 3,
    summary_instructions: str | None = None,
    summarize_prompt: str = "N/A",
    db: SummarizedFilesDb | None = None,
) -> Tuple[str, dict]:
    name = str(name).strip()
    if content is None:
        try:
            with open(name, "r") as fobj:
                content = fobj.read()
        except Exception as e:
            logger.error(e)
            # Return an empty summary plus zero token usage so callers can
            # always unpack the (summary, token_usage) tuple.
            return "", {**DEFAULT_TOKEN_USAGE}
    # First, query from database
    if db is not None:
        res = db.select_summarized_text(name, summary_instructions, level)
        if res is not None:
            return res, {**DEFAULT_TOKEN_USAGE}

    file_content = content
    level = level if level > 0 else 1
    level = level if level < MAX_SENTENCE_NUM+1 else MAX_SENTENCE_NUM
    if len(file_content) > MAX_FILE_LENGTH:
        file_content = content[:MAX_FILE_LENGTH] + " ..."
    prompt = EVALUATION_SUMMARIZE_FILE_PROMPT.format(
        file_name=name,
        file_content=file_content,
        sentence_num1=level,
        sentence_num2=level+1,
        summary_instructions=summary_instructions \
            if summary_instructions is not None and len(summary_instructions) > 0 \
            else "N/A",
        summarize_prompt=summarize_prompt,
    )

    config = {"recursion_limit": 500}
    res: AIMessage = llm.invoke([("human", prompt)], config=config)
    out = res.content
    token_usage = {
        "prompt_tokens": res.usage_metadata["input_tokens"],
        "completion_tokens": res.usage_metadata["output_tokens"],
        "total_tokens": res.usage_metadata["total_tokens"],
    }
    if db is not None:
        db.upsert_summarized_file(
            file_path=name,
            instruction=summary_instructions,
            summarize_level=level,
            summarize_prompt=summarize_prompt,
            summarized_text=out,
            token_usage=token_usage,
        )

    return out, token_usage
```
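A sketch of the typical `summarize_file` call, assuming the `OPENAI_*` environment variables are configured so `get_openai()` succeeds and a `README.md` exists in the working directory; without a `db`, the cache lookup and upsert are skipped:

```python
# Hypothetical usage sketch for summarize_file.
from bioguider.agents.agent_utils import get_openai, summarize_file

llm = get_openai()
summary, token_usage = summarize_file(
    llm,
    name="README.md",  # read from disk because content=None
    level=3,           # request a 3-4 sentence summary
)
print(summary)
print(token_usage["total_tokens"])
```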
```python
# Set up a prompt template
class CustomPromptTemplate(StringPromptTemplate):
    # The template to use
    template: str
    # The list of tools available
    tools: List[BaseTool]
    # Plan
    plan_actions: str

    def format(self, **kwargs) -> str:
        # Get the intermediate steps (AgentAction, Observation tuples)
        # Format them in a particular way
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = ""
        for action, observation in intermediate_steps:
            thoughts += action.log
            thoughts += f"\nObservation: {observation}\n"
        # Set plan_step
        kwargs["plan_actions"] = self.plan_actions
        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in self.tools])
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
        prompt = self.template.format(**kwargs)
        # print([prompt])
        return prompt

class CustomOutputParser(AgentOutputParser):
    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        # Check if agent should finish
        if "Final Answer:" in llm_output:
            return AgentFinish(
                return_values={"output": llm_output},
                log=llm_output,
            )
        # Parse out the action and action input
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            # raise ValueError(f"Could not parse LLM output: `{llm_output}`")
            print(f"Warning: could not parse LLM output: `{llm_output}`, finishing chain...")
            return AgentFinish(
                return_values={"output": llm_output},
                log=llm_output,
            )
        action = match.group(1).strip()
        action_input = match.group(2)
        # Return the action and action input
        action_dict = None
        action_input_replaced = clean_action_input(action_input)
        try:
            action_dict = json.loads(action_input_replaced)
        except json.JSONDecodeError:
            pass
        if action_dict is None:
            # try using ast to parse input string
            import ast
            try:
                action_dict = ast.literal_eval(action_input_replaced)
                if not isinstance(action_dict, dict):
                    action_dict = None
            except Exception as e:
                logger.error(f"Error parsing action input: {action_input} -> {action_input_replaced}\n{e}")
                pass
        return AgentAction(
            tool=action,
            tool_input=action_dict if action_dict is not None else action_input,
            log=llm_output
        )
```
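`CustomOutputParser.parse` is a pure string transform, so it can be exercised without an LLM. A small sketch; the tool name and JSON input are made up, and the exact `tool_input` depends on what `clean_action_input` does to the raw string:

```python
from bioguider.agents.agent_utils import CustomOutputParser

parser = CustomOutputParser()

step = parser.parse(
    "Thought: I should inspect the README.\n"
    "Action: read_file\n"
    'Action Input: {"file_path": "README.md"}'
)
print(step.tool)        # "read_file"
print(step.tool_input)  # typically the parsed dict {'file_path': 'README.md'}

done = parser.parse("Final Answer: the license is MIT")
print(type(done).__name__)  # AgentFinish; return_values["output"] holds the raw text
```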
```python
def get_tool_names_and_descriptions(tools: List[BaseTool]) -> Tuple[str, str]:
    tool_names = []
    tools_descriptions = ""
    for tool in tools:
        tools_descriptions += f"name: {tool.name}, description: {tool.description}\n"
        tool_names.append(tool.name)
    return str(tool_names), tools_descriptions
```
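For example, with a `@tool`-decorated function (the tool below is hypothetical, not one the package ships):

```python
from langchain.tools import tool
from bioguider.agents.agent_utils import get_tool_names_and_descriptions

@tool
def read_file_tool(file_path: str) -> str:
    """Read a file and return its content."""
    with open(file_path) as f:
        return f.read()

names, descriptions = get_tool_names_and_descriptions([read_file_tool])
print(names)         # "['read_file_tool']"
print(descriptions)  # "name: read_file_tool, description: Read a file and ..."
```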
```python
def generate_repo_structure_prompt(
    files: List[str],
    dir_path: str="",
) -> str:
    # Convert the repo structure to a string
    file_pairs = [(f, get_file_type(os.path.join(dir_path, f)).value) for f in files]
    repo_structure = ""
    for f, f_type in file_pairs:
        repo_structure += f"{f} - {f_type}\n"
    return repo_structure

class ObservationResult(BaseModel):
    Analysis: Optional[str]=Field(description="Analyzing the goal, repository file structure and intermediate output.")
    FinalAnswer: Optional[str]=Field(description="the final answer for the goal")
    Thoughts: Optional[str]=Field(description="If the information is insufficient, the thoughts will be given and be taken into consideration in next round.")

def convert_plan_to_string(plan: PlanAgentResult) -> str:
    plan_str = ""
    for action in plan.actions:
        action_str = f"Step: {action['name']}\n"
        action_str += f"Step Input: {action['input']}\n"
        plan_str += action_str
    return plan_str
```
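`convert_plan_to_string` flattens a validated plan into the "Step / Step Input" text that the execute-step prompts consume; for instance:

```python
from bioguider.agents.agent_utils import PlanAgentResult, convert_plan_to_string

plan = PlanAgentResult(actions=[
    {"name": "read_file", "input": "README.md"},
    {"name": "summarize_file_tool", "input": "pyproject.toml"},
])
print(convert_plan_to_string(plan))
# Step: read_file
# Step Input: README.md
# Step: summarize_file_tool
# Step Input: pyproject.toml
```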
```python
STRING_TO_OBJECT_SYSTEM_PROMPT = """
You are an expert at understanding data. You will be provided with a text, and your task is to extract structured data from the provided text.

---

### **Instructions**
1. If no structured data can be extracted, return None

---

### **Input Text**
{input_text}
"""

def try_parse_json_object(json_obj: str) -> dict | None:
    json_obj = json_obj.strip()

    # First, try to parse
    try:
        obj = json.loads(json_obj)
        return obj
    except JSONDecodeError as e:
        logger.error(e)

    # Second, let's handle some common errors
    # 1. handle the case that the json object is not wrapped in { and }
    if not json_obj.startswith("{") and not json_obj.endswith("}") and ":" in json_obj:
        json_obj = "{" + json_obj + "}"
    if json_obj.startswith("{{"):
        json_obj = json_obj[1:]
    if json_obj.endswith("}}"):
        json_obj = json_obj[:-1]

    # Finally, let's try to parse again
    try:
        obj = json.loads(json_obj)
        return obj
    except JSONDecodeError as e:
        logger.error(e)
        return None
    except Exception as e:
        logger.error(e)
        return None
```
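The two repairs `try_parse_json_object` applies are easiest to see on concrete inputs:

```python
from bioguider.agents.agent_utils import try_parse_json_object

# A bare key/value pair gets wrapped in { } before the retry.
print(try_parse_json_object('"final_answer": "MIT"'))      # {'final_answer': 'MIT'}

# Doubled braces (a common prompt-template artifact) are trimmed.
print(try_parse_json_object('{{"final_answer": "MIT"}}'))  # {'final_answer': 'MIT'}

# Anything else still fails: the errors are logged and None is returned.
print(try_parse_json_object("not json at all"))            # None
```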
```python
def try_parse_with_llm(llm: BaseChatOpenAI, input_text: str, schema: any):
    system_prompt = ChatPromptTemplate.from_template(
        STRING_TO_OBJECT_SYSTEM_PROMPT
    ).format(input_text=input_text)

    conversation = CommonConversation(llm=llm)
    res, token_usage = conversation.generate_with_schema(
        system_prompt=system_prompt,
        instruction_prompt="Let's start to parse the input text.",
        schema=schema,
    )
    return res, token_usage

def parse_final_answer(final_answer: str | None) -> dict | None:
    if final_answer is None:
        return None
    final_answer = final_answer.strip()
    the_obj = try_parse_json_object(final_answer)
    if the_obj is not None and "final_answer" in the_obj:
        return the_obj

    final_answer_cases = [
        "**FinalAnswer:**",
        "FinalAnswer:",
        "**FinalAnswer**",
        "FinalAnswer",
        "**FinalAnswer**:",
        "**Final Answer:**",
        "**Final Answer**:",
        "Final Answer:",
        "Final Answer",
        "**final_answer**:",
        "**final_answer:**",
        "final_answer:",
        "**final_answer**",
        "final_answer",
        "**final answer**:",
        "**final answer:**",
        "final answer:",
        "final answer",
    ]
    for case in final_answer_cases:
        if case in final_answer:
            splitted_answer = final_answer.split(case)[-1].strip().strip(":")
            the_obj = try_parse_json_object(splitted_answer)
            if the_obj is not None and "final_answer" in the_obj:
                return the_obj
    return None
```
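`parse_final_answer` strips whichever marker variant the model emitted, then insists the remaining payload carries a `final_answer` key:

```python
from bioguider.agents.agent_utils import parse_final_answer

print(parse_final_answer('Final Answer: {"final_answer": "MIT License"}'))
# {'final_answer': 'MIT License'}

# A JSON payload without the "final_answer" key is rejected.
print(parse_final_answer('Final Answer: {"license": "MIT"}'))  # None
```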
```python
def read_license_file(repo_path: str) -> tuple[str | None, str|None]:
    # find hardcoded license file
    hardcoded_license_files = [
        "LICENSE",
        "LICENSE.txt",
        "LICENSE.md",
        "LICENSE.rst",
    ]
    license_files = []
    for file in hardcoded_license_files:
        file_path = os.path.join(str(repo_path), file)
        file_path = file_path.strip()
        if os.path.exists(file_path):
            with open(file_path, "r") as f:
                license_files.append((f.read(), os.path.join(repo_path, file)))

    max_item = max(license_files, key=lambda x: len(x[0])) if len(license_files) > 0 else (None, None)
    if max_item[0] is not None:
        return max_item[0], max_item[1]

    # find in root directory
    for root, _, files in os.walk(repo_path):
        for file in files:
            if file.lower() == "license":
                with open(os.path.join(root, file), "r") as f:
                    return f.read(), os.path.join(root, file)
            if file[:8].lower() == "license.":
                with open(os.path.join(root, file), "r") as f:
                    return f.read(), os.path.join(root, file)
    return None, None
```
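`read_license_file` can be smoke-tested against a throwaway directory; it prefers the longest hardcoded `LICENSE*` candidate and only falls back to walking the tree when none exists:

```python
import os
import tempfile
from bioguider.agents.agent_utils import read_license_file

with tempfile.TemporaryDirectory() as repo:
    with open(os.path.join(repo, "LICENSE"), "w") as f:
        f.write("MIT License\n...")
    text, path = read_license_file(repo)
    print(path)       # <tmpdir>/LICENSE
    print(text[:11])  # "MIT License"
```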
bioguider/agents/collection_execute_step.py

@@ -0,0 +1,182 @@

````python
import logging
from langchain_openai.chat_models.base import BaseChatOpenAI
from langchain.tools import BaseTool
from langchain_core.prompts import ChatPromptTemplate, StringPromptTemplate
from langchain.agents import create_react_agent, AgentExecutor
from langchain_community.callbacks.openai_info import OpenAICallbackHandler

from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
from bioguider.agents.agent_utils import (
    CustomPromptTemplate,
    CustomOutputParser,
)
from bioguider.agents.common_agent_2step import CommonAgentTwoSteps
from bioguider.agents.peo_common_step import PEOCommonStep, PEOWorkflowState
from bioguider.agents.collection_task_utils import CollectionWorkflowState

logger = logging.getLogger(__name__)

COLLECTION_EXECUTION_SYSTEM_PROMPT = """---

You are an expert Python developer.
You are given a **plan** and must complete it strictly using Python code and the available tools.

---

### **Available Tools**
{tools}

---

### **Your Task**
Follow the given plan step by step using the exact format below:

```
Thought: Describe what you are thinking or planning to do next.
Action: The tool you are going to use (must be one of: {tool_names})
Action Input: The input to the selected action
Observation: The result returned by the action
```

You may repeat the **Thought → Action → Action Input → Observation** loop as needed.

Once all steps in the plan have been executed, output all the results using this format:

```
Thought: I have completed the plan.
Final Answer:
Action: {{tool_name}}
Action Input: {{input1}}
Action Observation: {{Observation1}}
---
Action: {{tool_name}}
Action Input: {{input2}}
Action Observation: {{Observation2}}
---
...
```

---

### **Example**
```
Action: summarize_file_tool
Action Input: README.md
Action Input: "Please extract license information in summarized file content."
Observation: # BioGuider\nBioGuider is a Python package for bioinformatics.\n...
...
Final Answer:
Action: summarize_file_tool
Action Input: README.md
Action Input: "N/A"
Action Observation: # BioGuider\nBioGuider is a Python package for bioinformatics.\n...
---
Action: check_file_related_tool
Action Input: pyproject.toml
Action Observation: Yes, the file is related to the project.
---
...
```

---

### **Important Notes**

- You must strictly follow the provided plan.
- **Do not take any additional or alternative actions**, even if:
  - No relevant result is found
  - The file content is missing, empty, or irrelevant
- If no information is found in a step, simply proceed to the next action in the plan without improvising.
- Only use the tools specified in the plan actions. No independent decisions or extra steps are allowed.

---

### **Plan**
{plan_actions}

### **Actions Already Taken**
{agent_scratchpad}

---

{input}

---
"""

class CollectionExecuteStep(PEOCommonStep):
    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str,
        repo_structure: str,
        gitignore_path: str,
        custom_tools: list[BaseTool] | None = None,
    ):
        super().__init__(llm)
        self.step_name = "Collection Execution Step"
        self.repo_path = repo_path
        self.repo_structure = repo_structure
        self.gitignore_path = gitignore_path
        self.custom_tools = custom_tools if custom_tools is not None else []

    def _execute_directly(self, state: PEOWorkflowState):
        plan_actions = state["plan_actions"]
        prompt = CustomPromptTemplate(
            template=COLLECTION_EXECUTION_SYSTEM_PROMPT,
            tools=self.custom_tools,
            plan_actions=plan_actions,
            input_variables=[
                "tools", "tool_names", "agent_scratchpad",
                "intermediate_steps", "plan_actions",
            ],
        )
        output_parser = CustomOutputParser()
        agent = create_react_agent(
            llm=self.llm,
            tools=self.custom_tools,
            prompt=prompt,
            output_parser=output_parser,
            stop_sequence=["\nObservation:"],
        )
        callback_handler = OpenAICallbackHandler()
        agent_executor = AgentExecutor(
            agent=agent,
            tools=self.custom_tools,
            max_iterations=30,
        )
        response = agent_executor.invoke(
            input={"plan_actions": plan_actions, "input": "Now, let's begin."},
            config={
                "callbacks": [callback_handler],
                "recursion_limit": 20,
            },
        )

        # parse the response
        if "output" in response:
            output = response["output"]
            if "**Final Answer:**" in output:
                # split on the same marker that was matched above
                final_answer = output.split("**Final Answer:**")[-1].strip().strip(":")
                step_output = final_answer
            elif "Final Answer" in output:
                final_answer = output.split("Final Answer")[-1].strip().strip(":")
                step_output = final_answer
            else:
                step_output = output
            self._print_step(state, step_output=step_output)
            state["step_output"] = step_output
        else:
            logger.error("No output found in the response.")
            self._print_step(
                state,
                step_output="Error: No output found in the response.",
            )
            state["step_output"] = "Error: No output found in the response."

        token_usage = vars(callback_handler)
        token_usage = {**DEFAULT_TOKEN_USAGE, **token_usage}

        return state, token_usage
````
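A rough wiring sketch for the step above. The paths, plan text, and empty tool list are placeholders (real runs get their tools from the collection task and their plan from `CollectionPlanStep`), and `_execute_directly` is called here only to illustrate the state contract:

```python
# Hypothetical wiring sketch, not a supported entry point.
from bioguider.agents.agent_utils import get_openai
from bioguider.agents.collection_execute_step import CollectionExecuteStep

llm = get_openai()  # assumes OPENAI_* env vars are configured
step = CollectionExecuteStep(
    llm=llm,
    repo_path="/path/to/repo",                  # placeholder
    repo_structure="README.md - file\n",        # placeholder
    gitignore_path="/path/to/repo/.gitignore",  # placeholder
    custom_tools=[],  # normally summarize_file_tool, check_file_related_tool, ...
)
state = {"plan_actions": "Step: summarize_file_tool\nStep Input: README.md\n"}
state, token_usage = step._execute_directly(state)
print(state["step_output"])
```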