bioguider-0.2.52-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. bioguider/__init__.py +0 -0
  2. bioguider/agents/__init__.py +0 -0
  3. bioguider/agents/agent_task.py +92 -0
  4. bioguider/agents/agent_tools.py +176 -0
  5. bioguider/agents/agent_utils.py +504 -0
  6. bioguider/agents/collection_execute_step.py +182 -0
  7. bioguider/agents/collection_observe_step.py +125 -0
  8. bioguider/agents/collection_plan_step.py +156 -0
  9. bioguider/agents/collection_task.py +184 -0
  10. bioguider/agents/collection_task_utils.py +142 -0
  11. bioguider/agents/common_agent.py +137 -0
  12. bioguider/agents/common_agent_2step.py +215 -0
  13. bioguider/agents/common_conversation.py +61 -0
  14. bioguider/agents/common_step.py +85 -0
  15. bioguider/agents/consistency_collection_step.py +102 -0
  16. bioguider/agents/consistency_evaluation_task.py +57 -0
  17. bioguider/agents/consistency_evaluation_task_utils.py +14 -0
  18. bioguider/agents/consistency_observe_step.py +110 -0
  19. bioguider/agents/consistency_query_step.py +77 -0
  20. bioguider/agents/dockergeneration_execute_step.py +186 -0
  21. bioguider/agents/dockergeneration_observe_step.py +154 -0
  22. bioguider/agents/dockergeneration_plan_step.py +158 -0
  23. bioguider/agents/dockergeneration_task.py +158 -0
  24. bioguider/agents/dockergeneration_task_utils.py +220 -0
  25. bioguider/agents/evaluation_installation_task.py +270 -0
  26. bioguider/agents/evaluation_readme_task.py +767 -0
  27. bioguider/agents/evaluation_submission_requirements_task.py +172 -0
  28. bioguider/agents/evaluation_task.py +206 -0
  29. bioguider/agents/evaluation_tutorial_task.py +169 -0
  30. bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
  31. bioguider/agents/evaluation_userguide_prompts.py +179 -0
  32. bioguider/agents/evaluation_userguide_task.py +154 -0
  33. bioguider/agents/evaluation_utils.py +127 -0
  34. bioguider/agents/identification_execute_step.py +181 -0
  35. bioguider/agents/identification_observe_step.py +104 -0
  36. bioguider/agents/identification_plan_step.py +140 -0
  37. bioguider/agents/identification_task.py +270 -0
  38. bioguider/agents/identification_task_utils.py +22 -0
  39. bioguider/agents/peo_common_step.py +64 -0
  40. bioguider/agents/prompt_utils.py +253 -0
  41. bioguider/agents/python_ast_repl_tool.py +69 -0
  42. bioguider/agents/rag_collection_task.py +130 -0
  43. bioguider/conversation.py +67 -0
  44. bioguider/database/code_structure_db.py +500 -0
  45. bioguider/database/summarized_file_db.py +146 -0
  46. bioguider/generation/__init__.py +39 -0
  47. bioguider/generation/benchmark_metrics.py +610 -0
  48. bioguider/generation/change_planner.py +189 -0
  49. bioguider/generation/document_renderer.py +157 -0
  50. bioguider/generation/llm_cleaner.py +67 -0
  51. bioguider/generation/llm_content_generator.py +1128 -0
  52. bioguider/generation/llm_injector.py +809 -0
  53. bioguider/generation/models.py +85 -0
  54. bioguider/generation/output_manager.py +74 -0
  55. bioguider/generation/repo_reader.py +37 -0
  56. bioguider/generation/report_loader.py +166 -0
  57. bioguider/generation/style_analyzer.py +36 -0
  58. bioguider/generation/suggestion_extractor.py +436 -0
  59. bioguider/generation/test_metrics.py +189 -0
  60. bioguider/managers/benchmark_manager.py +785 -0
  61. bioguider/managers/evaluation_manager.py +215 -0
  62. bioguider/managers/generation_manager.py +686 -0
  63. bioguider/managers/generation_test_manager.py +107 -0
  64. bioguider/managers/generation_test_manager_v2.py +525 -0
  65. bioguider/rag/__init__.py +0 -0
  66. bioguider/rag/config.py +117 -0
  67. bioguider/rag/data_pipeline.py +651 -0
  68. bioguider/rag/embedder.py +24 -0
  69. bioguider/rag/rag.py +138 -0
  70. bioguider/settings.py +103 -0
  71. bioguider/utils/code_structure_builder.py +59 -0
  72. bioguider/utils/constants.py +135 -0
  73. bioguider/utils/default.gitignore +140 -0
  74. bioguider/utils/file_utils.py +215 -0
  75. bioguider/utils/gitignore_checker.py +175 -0
  76. bioguider/utils/notebook_utils.py +117 -0
  77. bioguider/utils/pyphen_utils.py +73 -0
  78. bioguider/utils/python_file_handler.py +65 -0
  79. bioguider/utils/r_file_handler.py +551 -0
  80. bioguider/utils/utils.py +163 -0
  81. bioguider-0.2.52.dist-info/LICENSE +21 -0
  82. bioguider-0.2.52.dist-info/METADATA +51 -0
  83. bioguider-0.2.52.dist-info/RECORD +84 -0
  84. bioguider-0.2.52.dist-info/WHEEL +4 -0
bioguider/agents/agent_utils.py
@@ -0,0 +1,504 @@
+
+import json
+from json import JSONDecodeError
+import os
+from pathlib import Path
+import re
+from typing import List, Optional, Tuple, Union
+from langchain_openai import AzureChatOpenAI, ChatOpenAI
+from langchain_deepseek import ChatDeepSeek
+from langchain_core.utils.interactive_env import is_interactive_env
+from langchain_core.messages.base import get_msg_title_repr
+from langchain_core.prompts import ChatPromptTemplate, StringPromptTemplate
+from langchain_core.messages import AIMessage
+from langchain_openai.chat_models.base import BaseChatOpenAI
+from langchain.tools import BaseTool
+from langchain.schema import AgentAction, AgentFinish
+from langchain.agents import AgentOutputParser
+from langgraph.prebuilt import create_react_agent
+from langchain_community.callbacks.openai_info import OpenAICallbackHandler
+import logging
+
+from pydantic import BaseModel, Field
+
+from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, MAX_FILE_LENGTH, MAX_SENTENCE_NUM
+from bioguider.utils.file_utils import get_file_type
+from bioguider.utils.utils import clean_action_input
+from ..utils.gitignore_checker import GitignoreChecker
+from ..database.summarized_file_db import SummarizedFilesDb
+from bioguider.agents.common_conversation import CommonConversation
+from bioguider.rag.config import configs
+
+logger = logging.getLogger(__name__)
+
+class PlanAgentResult(BaseModel):
+    """ Identification Plan Result """
+    actions: list[dict] = Field(description="a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]")
+
+PlanAgentResultJsonSchema = {
+    "title": "identification_plan_result",
+    "description": "plan result",
+    "type": "object",
+    "properties": {
+        "actions": {
+            "type": "array",
+            "description": """a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]""",
+            "title": "Actions",
+            "items": {"type": "object"}
+        },
+    },
+    "required": ["actions"],
+}
+
+def get_openai():
+    return get_llm(
+        api_key=os.environ.get("OPENAI_API_KEY"),
+        model_name=os.environ.get("OPENAI_MODEL"),
+        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
+        api_version=os.environ.get("OPENAI_API_VERSION"),
+        azure_deployment=os.environ.get("OPENAI_DEPLOYMENT_NAME"),
+        max_tokens=os.environ.get("OPENAI_MAX_OUTPUT_TOKEN"),
+    )
+
+def get_llm(
+    api_key: str,
+    model_name: str="gpt-4o",
+    azure_endpoint: str=None,
+    api_version: str=None,
+    azure_deployment: str=None,
+    temperature: float = 0.0,
+    max_tokens: int = 16384, # Set high by default - enough for any document type
+):
+    """
+    Create an LLM instance with appropriate parameters based on model type and API version.
+
+    Handles parameter compatibility across different models and API versions:
+    - DeepSeek models: Use max_tokens parameter
+    - GPT models (newer): Use max_completion_tokens parameter
+    - GPT-5+: Don't support custom temperature (uses default)
+    """
+
+    if model_name.startswith("deepseek"):
+        chat = ChatDeepSeek(
+            api_key=api_key,
+            model=model_name,
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+    elif model_name.startswith("gpt"):
+        llm_params = {
+            "api_key": api_key,
+            "model": model_name,
+        }
+        # Handle temperature parameter based on model capabilities
+        # GPT-5+ models don't support custom temperature values
+        supports_temperature = not any(restricted in model_name for restricted in ["gpt-5", "o1", "o3"])
+        if supports_temperature:
+            llm_params["temperature"] = temperature
+
+        if azure_endpoint is None:
+            # OpenAI
+            llm_params["max_tokens"] = max_tokens
+            chat = ChatOpenAI(**llm_params)
+        else:
+            # Azure OpenAI
+            llm_params["azure_endpoint"] = azure_endpoint
+            llm_params["api_version"] = api_version
+            llm_params["deployment_name"] = azure_deployment
+            # Determine token limit parameter name based on API version
+            # Newer APIs (2024-08+) use max_completion_tokens instead of max_tokens
+            use_completion_tokens = api_version and api_version >= "2024-08-01-preview"
+            token_param = "max_completion_tokens" if use_completion_tokens else "max_tokens"
+            llm_params[token_param] = max_tokens
+            chat = AzureChatOpenAI(**llm_params)
+    else:
+        raise ValueError(f"Unsupported model type: {model_name}")
+
+    # Validate the LLM instance with a simple test
+    try:
+        chat.invoke("Hi")
+    except Exception as e:
+        logger.error(f"Failed to initialize LLM {model_name}: {e}")
+        return None
+
+    return chat
+
+def pretty_print(message, printout = True):
+    if isinstance(message, tuple):
+        title = message
+    else:
+        if isinstance(message.content, list):
+            title = get_msg_title_repr(message.type.title().upper() + " Message", bold=is_interactive_env())
+            if message.name is not None:
+                title += f"\nName: {message.name}"
+
+            for i in message.content:
+                if i['type'] == 'text':
+                    title += f"\n{i['text']}\n"
+                elif i['type'] == 'tool_use':
+                    title += f"\nTool: {i['name']}"
+                    title += f"\nInput: {i['input']}"
+            if printout:
+                print(f"{title}")
+        else:
+            title = get_msg_title_repr(message.type.title() + " Message", bold=is_interactive_env())
+            if message.name is not None:
+                title += f"\nName: {message.name}"
+            title += f"\n\n{message.content}"
+            if printout:
+                print(f"{title}")
+    return title
+
+HUGE_FILE_LENGTH = 10 * 1024 # 10K
+
+def read_file(
+    file_path: str | Path,
+) -> str | None:
+    file_path = str(file_path).strip()
+    if not os.path.isfile(file_path):
+        return None
+    with open(file_path, 'r') as f:
+        content = f.read()
+    return content
+
+def write_file(file_path: str | Path, content: str):
+    try:
+        file_path = str(file_path).strip()
+        with open(file_path, "w") as fobj:
+            fobj.write(content)
+        return True
+    except Exception as e:
+        logger.error(e)
+        return False
+
+def read_directory(
+    dir_path: str | Path,
+    gitignore_path: str,
+    level: int=1,
+) -> list[str] | None:
+    dir_path = str(dir_path).strip()
+    if not os.path.isdir(dir_path):
+        return None
+    gitignore_checker = GitignoreChecker(
+        directory=dir_path,
+        gitignore_path=gitignore_path,
+        exclude_dir_patterns=configs["file_filters"]["excluded_dirs"],
+        exclude_file_patterns=configs["file_filters"]["excluded_files"],
+    )
+    files = gitignore_checker.check_files_and_folders(level=level)
+    return files
+
+
+EVALUATION_SUMMARIZE_FILE_PROMPT = ChatPromptTemplate.from_template("""
+You will be provided with the content of the file **{file_name}**:
+
+---
+
+### **Summary Instructions**
+{summary_instructions}
+The content is lengthy. Please generate a concise summary ({sentence_num1}-{sentence_num2} sentences).
+
+---
+
+### **Important Instructions**
+{summarize_prompt}
+
+---
+
+### **File Content**
+Here is the file content:
+{file_content}
+
+---
+
+Now, let's start to summarize.
+""")
+
+
+def summarize_file(
+    llm: BaseChatOpenAI,
+    name: str | Path,
+    content: str | None = None,
+    level: int = 3,
+    summary_instructions: str | None = None,
+    summarize_prompt: str = "N/A",
+    db: SummarizedFilesDb | None = None,
+) -> Tuple[str, dict]:
+    name = str(name).strip()
+    if content is None:
+        try:
+            with open(name, "r") as fobj:
+                content = fobj.read()
+        except Exception as e:
+            logger.error(e)
+            return ""
+    # First, query from database
+    if db is not None:
+        res = db.select_summarized_text(name, summary_instructions, level)
+        if res is not None:
+            return res, {**DEFAULT_TOKEN_USAGE}
+
+    file_content = content
+    level = level if level > 0 else 1
+    level = level if level < MAX_SENTENCE_NUM+1 else MAX_SENTENCE_NUM
+    if len(file_content) > MAX_FILE_LENGTH:
+        file_content = content[:MAX_FILE_LENGTH] + " ..."
+    prompt = EVALUATION_SUMMARIZE_FILE_PROMPT.format(
+        file_name=name,
+        file_content=file_content,
+        sentence_num1=level,
+        sentence_num2=level+1,
+        summary_instructions=summary_instructions \
+            if summary_instructions is not None and len(summary_instructions) > 0 \
+            else "N/A",
+        summarize_prompt=summarize_prompt,
+    )
+
+    config = {"recursion_limit": 500}
+    res: AIMessage = llm.invoke([("human", prompt)], config=config)
+    out = res.content
+    token_usage = {
+        "prompt_tokens": res.usage_metadata["input_tokens"],
+        "completion_tokens": res.usage_metadata["output_tokens"],
+        "total_tokens": res.usage_metadata["total_tokens"],
+    }
+    if db is not None:
+        db.upsert_summarized_file(
+            file_path=name,
+            instruction=summary_instructions,
+            summarize_level=level,
+            summarize_prompt=summarize_prompt,
+            summarized_text=out,
+            token_usage=token_usage,
+        )
+
+    return out, token_usage
+
+# Set up a prompt template
+class CustomPromptTemplate(StringPromptTemplate):
+    # The template to use
+    template: str
+    # The list of tools available
+    tools: List[BaseTool]
+    # Plan
+    plan_actions: str
+
+    def format(self, **kwargs) -> str:
+        # Get the intermediate steps (AgentAction, Observation tuples)
+        # Format them in a particular way
+        intermediate_steps = kwargs.pop("intermediate_steps")
+        thoughts = ""
+        for action, observation in intermediate_steps:
+            thoughts += action.log
+            thoughts += f"\nObservation: {observation}\n"
+        # Set plan_step
+        kwargs["plan_actions"] = self.plan_actions
+        # Set the agent_scratchpad variable to that value
+        kwargs["agent_scratchpad"] = thoughts
+        # Create a tools variable from the list of tools provided
+        kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in self.tools])
+        # Create a list of tool names for the tools provided
+        kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
+        prompt = self.template.format(**kwargs)
+        # print([prompt])
+        return prompt
+
+class CustomOutputParser(AgentOutputParser):
+    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
+        # Check if agent should finish
+        if "Final Answer:" in llm_output:
+            return AgentFinish(
+                return_values={"output": llm_output},
+                log=llm_output,
+            )
+        # Parse out the action and action input
+        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
+        match = re.search(regex, llm_output, re.DOTALL)
+        if not match:
+            # raise ValueError(f"Could not parse LLM output: `{llm_output}`")
+            print(f"Warning: could not parse LLM output: `{llm_output}`, finishing chain...")
+            return AgentFinish(
+                return_values={"output": llm_output},
+                log=llm_output,
+            )
+        action = match.group(1).strip()
+        action_input = match.group(2)
+        # Return the action and action input
+        action_dict = None
+        action_input_replaced = clean_action_input(action_input)
+        try:
+            action_dict = json.loads(action_input_replaced)
+        except json.JSONDecodeError:
+            pass
+        if action_dict is None:
+            # try using ast to parse input string
+            import ast
+            try:
+                action_dict = ast.literal_eval(action_input_replaced)
+                if not isinstance(action_dict, dict):
+                    action_dict = None
+            except Exception as e:
+                logger.error(f"Error parsing action input: {action_input} -> {action_input_replaced}\n{e}")
+                pass
+        return AgentAction(
+            tool=action,
+            tool_input=action_dict if action_dict is not None else action_input,
+            log=llm_output
+        )
+
+def get_tool_names_and_descriptions(tools: List[BaseTool]) -> str:
+    tool_names = []
+    tools_descriptions = ""
+    for tool in tools:
+        tools_descriptions += f"name: {tool.name}, description: {tool.description}\n"
+        tool_names.append(tool.name)
+    return str(tool_names), tools_descriptions
+
+def generate_repo_structure_prompt(
+    files: List[str],
+    dir_path: str="",
+) -> str:
+    # Convert the repo structure to a string
+    file_pairs = [(f, get_file_type(os.path.join(dir_path, f)).value) for f in files]
+    repo_structure = ""
+    for f, f_type in file_pairs:
+        repo_structure += f"{f} - {f_type}\n"
+    return repo_structure
+
+class ObservationResult(BaseModel):
+    Analysis: Optional[str]=Field(description="Analyzing the goal, repository file structure and intermediate output.")
+    FinalAnswer: Optional[str]=Field(description="the final answer for the goal")
+    Thoughts: Optional[str]=Field(description="If the information is insufficient, the thoughts will be given and be taken into consideration in next round.")
+
+def convert_plan_to_string(plan: PlanAgentResult) -> str:
+    plan_str = ""
+    for action in plan.actions:
+        action_str = f"Step: {action['name']}\n"
+        action_str += f"Step Input: {action['input']}\n"
+        plan_str += action_str
+    return plan_str
+
+STRING_TO_OBJECT_SYSTEM_PROMPT = """
+You are an expert to understand data. You will be provided a text, and your task is to extracted structured data from the provided text.
+
+---
+
+### **Instructions**
+1. If no structured data can be extracted, return None
+
+---
+
+### **Input Text**
+{input_text}
+"""
+
+def try_parse_json_object(json_obj: str) -> dict | None:
+    json_obj = json_obj.strip()
+
+    # First, try to parse
+    try:
+        obj = json.loads(json_obj)
+        return obj
+    except JSONDecodeError as e:
+        logger.error(e)
+
+    # Second, let's handle some common errors
+    # 1. handle the case that the json object is not wrapped in { and }
+    if not json_obj.startswith("{") and not json_obj.endswith("}") and ":" in json_obj:
+        json_obj = "{" + json_obj + "}"
+    if json_obj.startswith("{{"):
+        json_obj = json_obj[1:]
+    if json_obj.endswith("}}"):
+        json_obj = json_obj[:-1]
+
+    # Finally, let's try to parse again
+    try:
+        obj = json.loads(json_obj)
+        return obj
+    except JSONDecodeError as e:
+        logger.error(e)
+        return None
+    except Exception as e:
+        logger.error(e)
+        return None
+
+def try_parse_with_llm(llm: BaseChatOpenAI, input_text: str, schema: any):
+    system_prompt = ChatPromptTemplate.from_template(
+        STRING_TO_OBJECT_SYSTEM_PROMPT
+    ).format(input_text=input_text)
+
+    conversation = CommonConversation(llm=llm)
+    res, token_usage = conversation.generate_with_schema(
+        system_prompt=system_prompt,
+        instruction_prompt="Let's start to parse the input text.",
+        schema=schema,
+    )
+    return res, token_usage
+
+def parse_final_answer(final_answer: str | None) -> dict | None:
+    if final_answer is None:
+        return None
+    final_answer = final_answer.strip()
+    the_obj = try_parse_json_object(final_answer)
+    if the_obj is not None and "final_answer" in the_obj:
+        return the_obj
+
+    final_answer_cases = [
+        "**FinalAnswer:**",
+        "FinalAnswer:",
+        "**FinalAnswer**",
+        "FinalAnswer",
+        "**FinalAnswer**:",
+        "**Final Answer:**",
+        "**Final Answer**:",
+        "Final Answer:",
+        "Final Answer",
+        "**final_answer**:",
+        "**final_answer:**",
+        "final_answer:",
+        "**final_answer**",
+        "final_answer",
+        "**final answer**:",
+        "**final answer:**",
+        "final answer:",
+        "final answer",
+    ]
+    for case in final_answer_cases:
+        if case in final_answer:
+            splitted_answer = final_answer.split(case)[-1].strip().strip(":")
+            the_obj = try_parse_json_object(splitted_answer)
+            if the_obj is not None and "final_answer" in the_obj:
+                return the_obj
+    return None
+
+def read_license_file(repo_path: str) -> tuple[str | None, str|None]:
+    # find hardcoded license file
+    hardcoded_license_files = [
+        "LICENSE",
+        "LICENSE.txt",
+        "LICENSE.md",
+        "LICENSE.rst",
+    ]
+    license_files = []
+    for file in hardcoded_license_files:
+        file_path = os.path.join(str(repo_path), file)
+        file_path = file_path.strip()
+        if os.path.exists(file_path):
+            with open(file_path, "r") as f:
+                license_files.append((f.read(), os.path.join(repo_path, file)))
+
+    max_item = max(license_files, key=lambda x: len(x[0])) if len(license_files) > 0 else (None, None)
+    if max_item[0] is not None:
+        return max_item[0], max_item[1]
+
+    # find in root directory
+    for root, _, files in os.walk(repo_path):
+        for file in files:
+            if file.lower() == "license":
+                with open(os.path.join(root, file), "r") as f:
+                    return f.read(), os.path.join(root, file)
+            if file[:8].lower() == "license.":
+                with open(os.path.join(root, file), "r") as f:
+                    return f.read(), os.path.join(root, file)
+    return None, None
+
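For orientation, here is a minimal usage sketch of the helpers above. It is not part of the packaged code: the README.md path and instruction string are illustrative, and it assumes OPENAI_API_KEY is set (mirroring get_openai()).

```python
import os

from bioguider.agents.agent_utils import get_llm, summarize_file

# get_llm() probes the model with a "Hi" invoke and returns None on failure.
llm = get_llm(
    api_key=os.environ["OPENAI_API_KEY"],
    model_name="gpt-4o",  # non-Azure path: azure_endpoint stays None
)
if llm is None:
    raise RuntimeError("LLM initialization failed")

# summarize_file() returns (summary_text, token_usage); level is clamped to
# [1, MAX_SENTENCE_NUM] and oversized content is truncated to MAX_FILE_LENGTH.
summary, usage = summarize_file(
    llm,
    name="README.md",  # illustrative path
    level=3,
    summary_instructions="Focus on installation requirements.",
)
print(summary)
print(usage["total_tokens"])
```

Passing a SummarizedFilesDb instance would let summarize_file() serve a cached summary via select_summarized_text() instead of re-invoking the LLM.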
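The repair path in try_parse_json_object() is easiest to see by example. A sketch with illustrative inputs, not from the package's test suite:

```python
from bioguider.agents.agent_utils import try_parse_json_object

# Well-formed JSON parses on the first attempt.
assert try_parse_json_object('{"final_answer": "yes"}') == {"final_answer": "yes"}

# A bare key/value pair is wrapped in braces before the retry.
assert try_parse_json_object('"final_answer": "yes"') == {"final_answer": "yes"}

# Doubled braces (a common prompt-template artifact) are trimmed.
assert try_parse_json_object('{{"final_answer": "yes"}}') == {"final_answer": "yes"}

# Unrecoverable input returns None rather than raising.
assert try_parse_json_object("not json at all") is None
```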
bioguider/agents/collection_execute_step.py
@@ -0,0 +1,182 @@
+import logging
+from langchain_openai.chat_models.base import BaseChatOpenAI
+from langchain.tools import BaseTool
+from langchain_core.prompts import ChatPromptTemplate, StringPromptTemplate
+from langchain.agents import create_react_agent, AgentExecutor
+from langchain_community.callbacks.openai_info import OpenAICallbackHandler
+
+from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
+from bioguider.agents.agent_utils import (
+    CustomPromptTemplate,
+    CustomOutputParser,
+)
+from bioguider.agents.common_agent_2step import CommonAgentTwoSteps
+from bioguider.agents.peo_common_step import PEOCommonStep, PEOWorkflowState
+from bioguider.agents.collection_task_utils import CollectionWorkflowState
+
+logger = logging.getLogger(__name__)
+
+COLLECTION_EXECUTION_SYSTEM_PROMPT = """---
+
+You are an expert Python developer.
+You are given a **plan** and must complete it strictly using Python code and the available tools.
+
+---
+
+### **Available Tools**
+{tools}
+
+---
+
+### **Your Task**
+Follow the given plan step by step using the exact format below:
+
+```
+Thought: Describe what you are thinking or planning to do next.
+Action: The tool you are going to use (must be one of: {tool_names})
+Action Input: The input to the selected action
+Observation: The result returned by the action
+```
+
+You may repeat the **Thought → Action → Action Input → Observation** loop as needed.
+
+Once all steps in the plan have been executed, output all the results using this format:
+
+```
+Thought: I have completed the plan.
+Final Answer:
+Action: {{tool_name}}
+Action Input: {{input1}}
+Action Observation: {{Observation1}}
+---
+Action: {{tool_name}}
+Action Input: {{input2}}
+Action Observation: {{Observation2}}
+---
+...
+```
+
+---
+
+### **Example**
+```
+Action: summarize_file_tool
+Action Input: README.md
+Action Input: "Please extract license information in summarized file content."
+Observation: # BioGuider\nBioGuider is a Python package for bioinformatics.\n...
+...
+Final Answer:
+Action: summarize_file_tool
+Action Input: README.md
+Action Input: "N/A"
+Action Observation: # BioGuider\nBioGuider is a Python package for bioinformatics.\n...
+---
+Action: check_file_related_tool
+Action Input: pyproject.toml
+Action Observation: Yes, the file is related to the project.
+---
+...
+```
+
+---
+
+### **Important Notes**
+
+- You must strictly follow the provided plan.
+- **Do not take any additional or alternative actions**, even if:
+  - No relevant result is found
+  - The file content is missing, empty, or irrelevant
+- If no information is found in a step, simply proceed to the next action in the plan without improvising.
+- Only use the tools specified in the plan actions. No independent decisions or extra steps are allowed.
+
+---
+
+### **Plan**
+{plan_actions}
+
+### **Actions Already Taken**
+{agent_scratchpad}
+
+---
+
+{input}
+
+---
+"""
+
+class CollectionExecuteStep(PEOCommonStep):
+    def __init__(
+        self,
+        llm: BaseChatOpenAI,
+        repo_path: str,
+        repo_structure: str,
+        gitignore_path: str,
+        custom_tools: list[BaseTool] | None = None,
+    ):
+        super().__init__(llm)
+        self.step_name = "Collection Execution Step"
+        self.repo_path = repo_path
+        self.repo_structure = repo_structure
+        self.gitignore_path = gitignore_path
+        self.custom_tools = custom_tools if custom_tools is not None else []
+
+
+    def _execute_directly(self, state: PEOWorkflowState):
+        plan_actions = state["plan_actions"]
+        prompt = CustomPromptTemplate(
+            template=COLLECTION_EXECUTION_SYSTEM_PROMPT,
+            tools=self.custom_tools,
+            plan_actions=plan_actions,
+            input_variables=[
+                "tools", "tool_names", "agent_scratchpad",
+                "intermediate_steps", "plan_actions",
+            ],
+        )
+        output_parser = CustomOutputParser()
+        agent = create_react_agent(
+            llm=self.llm,
+            tools=self.custom_tools,
+            prompt=prompt,
+            output_parser=output_parser,
+            stop_sequence=["\nObservation:"],
+        )
+        callback_handler = OpenAICallbackHandler()
+        agent_executor = AgentExecutor(
+            agent=agent,
+            tools=self.custom_tools,
+            max_iterations=30,
+        )
+        response = agent_executor.invoke(
+            input={"plan_actions": plan_actions, "input": "Now, let's begin."},
+            config={
+                "callbacks": [callback_handler],
+                "recursion_limit": 20,
+            },
+        )
+
+        # parse the response
+        if "output" in response:
+            output = response["output"]
+            if "**Final Answer**" in output:
+                final_answer = output.split("**Final Answer:**")[-1].strip().strip(":")
+                step_output = final_answer
+            elif "Final Answer" in output:
+                final_answer = output.split("Final Answer")[-1].strip().strip(":")
+                step_output = final_answer
+            else:
+                step_output = output
+            self._print_step(state, step_output=step_output)
+            state["step_output"] = step_output
+        else:
+            logger.error("No output found in the response.")
+            self._print_step(
+                state,
+                step_output="Error: No output found in the response.",
+            )
+            state["step_output"] = "Error: No output found in the response."
+
+
+        token_usage = vars(callback_handler)
+        token_usage = {**DEFAULT_TOKEN_USAGE, **token_usage}
+
+        return state, token_usage
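A minimal sketch of driving this step, again editorial rather than from the wheel. The state dict stands in for PEOWorkflowState (matching how _execute_directly() reads "plan_actions" and writes "step_output"); the stand-in tool, paths, and plan text are illustrative, and the plan string follows the "Step:/Step Input:" layout produced by convert_plan_to_string() in agent_utils.py.

```python
import os

from langchain.tools import Tool

from bioguider.agents.agent_utils import get_llm
from bioguider.agents.collection_execute_step import CollectionExecuteStep

# A stand-in tool; the real pipeline wires in tools such as
# summarize_file_tool from agent_tools.py.
read_file_tool = Tool(
    name="read_file_tool",
    description="Read a repository file and return its content.",
    func=lambda path: f"(content of {path})",
)

llm = get_llm(api_key=os.environ["OPENAI_API_KEY"])  # defaults to gpt-4o

step = CollectionExecuteStep(
    llm=llm,
    repo_path="/path/to/repo",
    repo_structure="README.md - doc file\n",
    gitignore_path="/path/to/repo/.gitignore",
    custom_tools=[read_file_tool],
)

# Runs the ReAct loop under COLLECTION_EXECUTION_SYSTEM_PROMPT and returns
# the mutated state plus an OpenAI token-usage snapshot.
state = {"plan_actions": "Step: read_file_tool\nStep Input: README.md\n"}
state, token_usage = step._execute_directly(state)
print(state["step_output"])
```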