bioguider 0.2.52__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. bioguider/__init__.py +0 -0
  2. bioguider/agents/__init__.py +0 -0
  3. bioguider/agents/agent_task.py +92 -0
  4. bioguider/agents/agent_tools.py +176 -0
  5. bioguider/agents/agent_utils.py +504 -0
  6. bioguider/agents/collection_execute_step.py +182 -0
  7. bioguider/agents/collection_observe_step.py +125 -0
  8. bioguider/agents/collection_plan_step.py +156 -0
  9. bioguider/agents/collection_task.py +184 -0
  10. bioguider/agents/collection_task_utils.py +142 -0
  11. bioguider/agents/common_agent.py +137 -0
  12. bioguider/agents/common_agent_2step.py +215 -0
  13. bioguider/agents/common_conversation.py +61 -0
  14. bioguider/agents/common_step.py +85 -0
  15. bioguider/agents/consistency_collection_step.py +102 -0
  16. bioguider/agents/consistency_evaluation_task.py +57 -0
  17. bioguider/agents/consistency_evaluation_task_utils.py +14 -0
  18. bioguider/agents/consistency_observe_step.py +110 -0
  19. bioguider/agents/consistency_query_step.py +77 -0
  20. bioguider/agents/dockergeneration_execute_step.py +186 -0
  21. bioguider/agents/dockergeneration_observe_step.py +154 -0
  22. bioguider/agents/dockergeneration_plan_step.py +158 -0
  23. bioguider/agents/dockergeneration_task.py +158 -0
  24. bioguider/agents/dockergeneration_task_utils.py +220 -0
  25. bioguider/agents/evaluation_installation_task.py +270 -0
  26. bioguider/agents/evaluation_readme_task.py +767 -0
  27. bioguider/agents/evaluation_submission_requirements_task.py +172 -0
  28. bioguider/agents/evaluation_task.py +206 -0
  29. bioguider/agents/evaluation_tutorial_task.py +169 -0
  30. bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
  31. bioguider/agents/evaluation_userguide_prompts.py +179 -0
  32. bioguider/agents/evaluation_userguide_task.py +154 -0
  33. bioguider/agents/evaluation_utils.py +127 -0
  34. bioguider/agents/identification_execute_step.py +181 -0
  35. bioguider/agents/identification_observe_step.py +104 -0
  36. bioguider/agents/identification_plan_step.py +140 -0
  37. bioguider/agents/identification_task.py +270 -0
  38. bioguider/agents/identification_task_utils.py +22 -0
  39. bioguider/agents/peo_common_step.py +64 -0
  40. bioguider/agents/prompt_utils.py +253 -0
  41. bioguider/agents/python_ast_repl_tool.py +69 -0
  42. bioguider/agents/rag_collection_task.py +130 -0
  43. bioguider/conversation.py +67 -0
  44. bioguider/database/code_structure_db.py +500 -0
  45. bioguider/database/summarized_file_db.py +146 -0
  46. bioguider/generation/__init__.py +39 -0
  47. bioguider/generation/benchmark_metrics.py +610 -0
  48. bioguider/generation/change_planner.py +189 -0
  49. bioguider/generation/document_renderer.py +157 -0
  50. bioguider/generation/llm_cleaner.py +67 -0
  51. bioguider/generation/llm_content_generator.py +1128 -0
  52. bioguider/generation/llm_injector.py +809 -0
  53. bioguider/generation/models.py +85 -0
  54. bioguider/generation/output_manager.py +74 -0
  55. bioguider/generation/repo_reader.py +37 -0
  56. bioguider/generation/report_loader.py +166 -0
  57. bioguider/generation/style_analyzer.py +36 -0
  58. bioguider/generation/suggestion_extractor.py +436 -0
  59. bioguider/generation/test_metrics.py +189 -0
  60. bioguider/managers/benchmark_manager.py +785 -0
  61. bioguider/managers/evaluation_manager.py +215 -0
  62. bioguider/managers/generation_manager.py +686 -0
  63. bioguider/managers/generation_test_manager.py +107 -0
  64. bioguider/managers/generation_test_manager_v2.py +525 -0
  65. bioguider/rag/__init__.py +0 -0
  66. bioguider/rag/config.py +117 -0
  67. bioguider/rag/data_pipeline.py +651 -0
  68. bioguider/rag/embedder.py +24 -0
  69. bioguider/rag/rag.py +138 -0
  70. bioguider/settings.py +103 -0
  71. bioguider/utils/code_structure_builder.py +59 -0
  72. bioguider/utils/constants.py +135 -0
  73. bioguider/utils/default.gitignore +140 -0
  74. bioguider/utils/file_utils.py +215 -0
  75. bioguider/utils/gitignore_checker.py +175 -0
  76. bioguider/utils/notebook_utils.py +117 -0
  77. bioguider/utils/pyphen_utils.py +73 -0
  78. bioguider/utils/python_file_handler.py +65 -0
  79. bioguider/utils/r_file_handler.py +551 -0
  80. bioguider/utils/utils.py +163 -0
  81. bioguider-0.2.52.dist-info/LICENSE +21 -0
  82. bioguider-0.2.52.dist-info/METADATA +51 -0
  83. bioguider-0.2.52.dist-info/RECORD +84 -0
  84. bioguider-0.2.52.dist-info/WHEEL +4 -0
@@ -0,0 +1,181 @@
1
+
2
+ import logging
3
+
4
+ from langchain_openai.chat_models.base import BaseChatOpenAI
5
+ from langchain.tools import BaseTool
6
+ from langchain.agents import AgentExecutor, create_react_agent
7
+ from langchain_community.callbacks.openai_info import OpenAICallbackHandler
8
+
9
+ from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
10
+ from bioguider.agents.agent_utils import CustomOutputParser, CustomPromptTemplate
11
+ from bioguider.agents.peo_common_step import (
12
+ PEOCommonStep,
13
+ )
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ ## execution system prompt
18
+ IDENTIFICATION_EXECUTION_SYSTEM_PROMPT = """You are an expert Python developer.
19
+
20
+ You are given a **plan** and are expected to complete it using Python code and the available tools.
21
+
22
+ ---
23
+
24
+ ### **Available Tools**
25
+ {tools}
26
+
27
+ ---
28
+
29
+ ### **Your Task**
30
+
31
+ Execute the plan step by step using the format below:
32
+
33
+ ```
34
+ Thought: Describe what you are thinking or planning to do next.
35
+ Action: The tool you are going to use (must be one of: {tool_names})
36
+ Action Input: The input to the selected action
37
+ Observation: The result returned by the action
38
+ ```
39
+
40
+ You may repeat the **Thought → Action → Action Input → Observation** loop as many times as needed.
41
+
42
+ Once the plan is fully completed, output the result in the following format:
43
+ ```
44
+ Thought: I have completed the plan.
45
+ Final Answer:
46
+ Action: {{tool_name}}
47
+ Action Input: {{file_name1}}
48
+ Action Observation: {{Observation1}}
49
+ ---
50
+ Action: {{tool_name}}
51
+ Action Input: {{file_name2}}
52
+ Action Observation: {{Observation2}}
53
+ ---
54
+ ...
55
+ ```
56
+
57
+ ---
58
+
59
+ ### **Example**
60
+ ```
61
+ Action: summarize_file_tool
62
+ Action Input: README.md
63
+ Action Input: "Please extract license information in summarized file content."
64
+ Observation: # BioGuider\nBioGuider is a Python package for bioinformatics.\n...
65
+ ...
66
+ Final Answer:
67
+ Action: summarize_file_tool
68
+ Action Input: README.md
69
+ Action Input: "N/A"
70
+ Action Observation: # BioGuider\nBioGuider is a Python package for bioinformatics.\n...
71
+ ---
72
+ Action: check_file_related_tool
73
+ Action Input: pyproject.toml
74
+ Action Observation: Yes, the file is related to the project.
75
+ ---
76
+ ...
77
+ ```
78
+
79
+ ---
80
+
81
+ ### **Important Notes**
82
+
83
+ - You must strictly follow the provided plan.
84
+ - **Do not take any additional or alternative actions**, even if:
85
+ - No relevant result is found
86
+ - The file content is missing, empty, or irrelevant
87
+ - If no information is found in a step, simply proceed to the next action in the plan without improvising.
88
+ - Only use the tools specified in the plan actions. No independent decisions or extra steps are allowed.
89
+
90
+ ### **Plan**
91
+ {plan_actions}
92
+
93
+ ### **Actions Already Taken**
94
+ {agent_scratchpad}
95
+
96
+ ---
97
+
98
+ {input}
99
+ """
100
+
101
class IdentificationExecuteStep(PEOCommonStep):
    """
    Execution step of the identification Plan-Execute-Observe (PEO) loop.

    Feeds the plan text found in ``state["plan_actions"]`` to a LangChain ReAct
    agent that may only call ``custom_tools``, then stores the text of the
    agent's final answer in ``state["step_output"]``.
    """

    # Headings the agent may use to introduce its final answer, ordered from
    # most to least specific so the whole heading is consumed by one split.
    _FINAL_ANSWER_MARKERS = (
        "**Final Answer:**",
        "**Final Answer**",
        "Final Answer",
    )
    _NO_OUTPUT_MESSAGE = "Error: No output found in the response."

    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str,
        repo_structure: str,
        gitignore_path: str,
        custom_tools: list[BaseTool] | None = None,
    ):
        """
        Args:
            llm: Chat model that drives the ReAct agent.
            repo_path: Stored on the instance as-is.
            repo_structure: Stored on the instance as-is.
            gitignore_path: Stored on the instance as-is.
            custom_tools: Tools the agent may call; ``None`` means no tools.
        """
        super().__init__(llm=llm)
        self.llm = llm
        self.step_name = "Identification Execution Step"
        self.repo_path = repo_path
        self.repo_structure = repo_structure
        self.gitignore_path = gitignore_path
        self.custom_tools = custom_tools if custom_tools is not None else []

    @classmethod
    def _extract_final_answer(cls, output: str) -> str:
        """
        Return the text following the last final-answer heading in ``output``.

        When no heading is present the whole output is used. Surrounding
        whitespace, back-ticks and double quotes are trimmed from the result.
        """
        answer = output
        for marker in cls._FINAL_ANSWER_MARKERS:
            if marker in output:
                # Split on the very marker that was detected; testing for one
                # spelling and splitting on another leaves the heading behind.
                answer = output.split(marker)[-1].strip().strip(":")
                break
        # str.strip treats its argument as a character set, so these remove
        # any run of back-ticks / double quotes from both ends.
        return answer.strip().strip("`").strip('"')

    def _execute_directly(self, state):
        """
        Run the plan through a ReAct agent and record its final answer.

        Args:
            state: Workflow state; ``state["plan_actions"]`` is read and
                ``state["step_output"]`` is written.

        Returns:
            A ``(state, token_usage)`` tuple where ``token_usage`` is
            ``DEFAULT_TOKEN_USAGE`` overlaid with the callback handler's
            attributes.
        """
        plan_actions = state["plan_actions"]
        prompt = CustomPromptTemplate(
            template=IDENTIFICATION_EXECUTION_SYSTEM_PROMPT,
            tools=self.custom_tools,
            plan_actions=plan_actions,
            input_variables=[
                "tools",
                "tool_names",
                "agent_scratchpad",
                "intermediate_steps",
                "plan_actions",
            ],
        )
        agent = create_react_agent(
            llm=self.llm,
            tools=self.custom_tools,
            prompt=prompt,
            output_parser=CustomOutputParser(),
            stop_sequence=["\nObservation:"],
        )
        callback_handler = OpenAICallbackHandler()
        agent_executor = AgentExecutor(
            agent=agent,
            tools=self.custom_tools,
            max_iterations=10,
        )
        # Callbacks travel in the runnable config; that is the documented way
        # to attach a handler to a single `invoke` call.
        response = agent_executor.invoke(
            input={"plan_actions": plan_actions, "input": "Now, let's begin."},
            config={"callbacks": [callback_handler]},
        )

        if "output" in response:
            step_output = self._extract_final_answer(response["output"])
        else:
            logger.error("No output found in the response.")
            step_output = self._NO_OUTPUT_MESSAGE
        self._print_step(state, step_output=step_output)
        state["step_output"] = step_output

        token_usage = {**DEFAULT_TOKEN_USAGE, **vars(callback_handler)}
        return state, token_usage
180
+
181
+
@@ -0,0 +1,104 @@
1
+
2
+ from langchain.prompts import ChatPromptTemplate
3
+
4
+ from bioguider.agents.agent_utils import ObservationResult
5
+ from bioguider.agents.common_agent_2step import CommonAgentTwoSteps, CommonAgentTwoChainSteps
6
+ from bioguider.agents.identification_task_utils import IdentificationWorkflowState
7
+ from bioguider.agents.peo_common_step import PEOWorkflowState, PEOCommonStep
8
+ from bioguider.utils.constants import MAX_STEP_COUNT
9
+
10
+
11
+ ## observation system prompt
12
+ IDENTIFICATION_OBSERVATION_SYSTEM_PROMPT = """Your goal is:
13
+ {goal}
14
+
15
+ ### **Repository File Structure**
16
+ Here is the 2-level file structure of the repository (f - file, d - directory, l - symlink, u - unknown):
17
+ {repo_structure}
18
+
19
+ ### **Intermediate Output**
20
+ {intermediate_output}
21
+
22
+ ### **Instructions**
23
+ Carefully review the **Goal**, **Repository File Structure**, and **Intermediate Output**.
24
+ - If you believe the goal **can be achieved**, proceed as follows:
25
+ - Provide your reasoning under **Analysis**
26
+ - Then provide your result under **FinalAnswer**
27
+ ```
28
+ **Analysis**: your analysis here
29
+ **FinalAnswer**: your final answer here, in **raw json format**, **including** the surrounding "{{}}" but **without** using code fence (```json ... ```),
30
+ For example, output exactly: {final_answer_example}
31
+ ```
32
+ - If the information is **not sufficient** to achieve the goal, simply explain why under **Thoughts**:
33
+ ```
34
+ **Thoughts**: your thoughts here
35
+ ```
36
+ Be precise and support your reasoning with evidence from the input.
37
+
38
+ ### **Important Instructions**
39
+ {important_instructions}
40
+
41
+ ### Notes
42
+ We are collecting information over multiple rounds, your thoughts and the output of this step will be persisted, so please **do not rush to provide a Final Answer**.
43
+ If you find the current information insufficient, share your reasoning or thoughts instead—we’ll continue with the next round accordingly.
44
+ """
45
+
46
+
47
class IdentificationObserveStep(PEOCommonStep):
    """
    Observation step of the identification Plan-Execute-Observe (PEO) loop.

    Asks the LLM to review the goal, the repository structure and the
    intermediate output, then records its final answer (if any), analysis
    and thoughts in the workflow state.
    """

    def __init__(
        self,
        llm,
        repo_path: str,
        repo_structure: str,
        gitignore_path: str,
        custom_tools: list | None = None,
    ):
        """
        Args:
            llm: Chat model used for the observation.
            repo_path: Stored on the instance as-is.
            repo_structure: Text inserted into the prompt's file-structure section.
            gitignore_path: Stored on the instance as-is.
            custom_tools: Stored on the instance; ``None`` becomes an empty list.
        """
        super().__init__(llm)
        self.step_name = "Identification Observe Step"
        self.repo_path = repo_path
        self.repo_structure = repo_structure
        self.gitignore_path = gitignore_path
        self.custom_tools = custom_tools if custom_tools is not None else []

    def _prepare_system_prompt(self, state: IdentificationWorkflowState):
        """
        Fill the observation prompt template from ``state``.

        Reads ``goal`` and ``final_answer_example`` (both required) and the
        optional ``observe_instructions`` (defaults to ``"N/A"``).
        """
        if "observe_instructions" in state:
            important_instructions = state["observe_instructions"]
        else:
            important_instructions = "N/A"
        prompt = ChatPromptTemplate.from_template(IDENTIFICATION_OBSERVATION_SYSTEM_PROMPT)

        return prompt.format(
            goal=state["goal"],
            repo_structure=self.repo_structure,
            intermediate_output=self._build_intermediate_steps(state),
            final_answer_example=state["final_answer_example"],
            important_instructions=important_instructions,
        )

    def _execute_directly(self, state: IdentificationWorkflowState):
        """
        Run one observation round and update ``state``.

        Writes ``final_answer``, ``step_analysis`` and ``step_thoughts`` and
        increments ``step_count``.

        Returns:
            A ``(state, token_usage)`` tuple.
        """
        step_count = state["step_count"]
        # Use `>=` with integer division: an exact `==` against a float
        # threshold never matches when MAX_STEP_COUNT is not a multiple of 3,
        # and stops matching as soon as the counter moves past the threshold.
        out_of_rounds = step_count >= MAX_STEP_COUNT // 3 - 2
        if out_of_rounds:
            instruction = "Now, we have reached max recursion limit, please give me the **final answer** based on the current information"
        else:
            instruction = "Now, Let's begin."

        system_prompt = self._prepare_system_prompt(state)
        agent = CommonAgentTwoSteps(llm=self.llm)
        res, _, token_usage, reasoning_process = agent.go(
            system_prompt=system_prompt,
            instruction_prompt=instruction,
            schema=ObservationResult,
        )

        analysis = res.Analysis
        thoughts = res.Thoughts
        state["final_answer"] = res.FinalAnswer
        state["step_analysis"] = analysis
        state["step_thoughts"] = thoughts
        state["step_count"] += 1

        self._print_step(
            state,
            step_output=f"**Observation Reasoning Process {state['step_count']}**\n{reasoning_process}"
        )
        self._print_step(
            state,
            step_output=f"Final Answer: {res.FinalAnswer if res.FinalAnswer else None}\nAnalysis: {analysis}\nThoughts: {thoughts}",
        )
        return state, token_usage
@@ -0,0 +1,140 @@
1
+
2
+ from langchain.prompts import ChatPromptTemplate
3
+ from langchain_openai.chat_models.base import BaseChatOpenAI
4
+ from langchain.tools import BaseTool
5
+ from pydantic import BaseModel, Field
6
+
7
+ from bioguider.agents.agent_utils import get_tool_names_and_descriptions
8
+ from bioguider.agents.common_agent_2step import CommonAgentTwoChainSteps, CommonAgentTwoSteps
9
+ from bioguider.agents.identification_task_utils import IdentificationWorkflowState
10
+ from bioguider.agents.peo_common_step import PEOCommonStep
11
+
12
+ ## plan system prompt
13
+ IDENTIFICATION_PLAN_SYSTEM_PROMPT = ChatPromptTemplate.from_template("""### **Goal**
14
+ You are an expert developer in the field of biomedical domain. Your goal is:
15
+ {goal}
16
+
17
+ ### **Repository File Structure**
18
+ Here is the 2-level file structure of the repository (f - file, d - directory, l - symlink, u - unknown):
19
+ {repo_structure}
20
+
21
+ ### **Function Tools**
22
+ You are provided the following function tools:
23
+ {tools}
24
+
25
+ ### Intermediate Steps
26
+ Hers are the intermediate steps results:
27
+ {intermediate_steps}
28
+
29
+ ### Intermediate Thoughts
30
+ Analysis: {intermediate_analysis}
31
+ Thoughts: {intermediate_thoughts}
32
+
33
+ ### **Instruction**
34
+ We will repeat **Plan - Execution - Observation** loops as many times as needed. All the results in each round will be persisted,
35
+ meaning that states and variables will persisted through multiple rounds of plan execution. Be sure to take advantage of this by
36
+ developing your collection plan incrementally and reflect on the intermediate observations at each round, instead of coding up
37
+ everything in one go. Be sure to take only one or two actions in each step.
38
+
39
+ ### **Important Instructions**
40
+ {important_instructions}
41
+
42
+ ### **Output**
43
+ You plan should follow this format:
44
+ Step: tool name, should be one of {tool_names}
45
+ Step Input: file name or directory name
46
+ Step: tool name, should be one of {tool_names}
47
+ Step Input: file name or directory name
48
+ """)
49
+
50
# Pydantic model used to validate the planner's structured output.
class IdentificationPlanResult(BaseModel):
    """ Identification Plan Result """
    # Planned actions; each item is a dict such as {'name': ..., 'input': ...}
    # (see the Field description).
    actions: list[dict] = Field(description="a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]")
53
+
54
# JSON schema describing the plan result: an object with one required
# "actions" array whose items are free-form objects.
IdentificationPlanResultJsonSchema = dict(
    title="identification_plan_result",
    description="plan result",
    type="object",
    properties=dict(
        actions=dict(
            type="array",
            description="a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]",
            title="Actions",
            items=dict(type="object"),
        ),
    ),
    required=["actions"],
)
68
+
69
class IdentificationPlanStep(PEOCommonStep):
    """
    Planning step of the identification Plan-Execute-Observe (PEO) loop.

    Asks the LLM for the next tool actions to take and stores them, rendered
    as ``Step:`` / ``Step Input:`` text, in ``state["plan_actions"]``.
    """

    def __init__(
        self,
        llm: BaseChatOpenAI,
        repo_path: str,
        repo_structure: str,
        gitignore_path: str,
        custom_tools: list[BaseTool] | None = None,
    ):
        """
        Args:
            llm: Chat model used for planning.
            repo_path: Stored on the instance as-is.
            repo_structure: Text inserted into the prompt's file-structure section.
            gitignore_path: Stored on the instance as-is.
            custom_tools: Tools advertised to the planner; ``None`` means none.
        """
        super().__init__(llm)
        self.step_name = "Identification Plan Step"
        self.repo_path = repo_path
        self.repo_structure = repo_structure
        self.gitignore_path = gitignore_path
        self.custom_tools = custom_tools if custom_tools is not None else []

    def _prepare_system_prompt(self, state: IdentificationWorkflowState) -> str:
        """
        Fill the planning prompt template from ``state`` and the tool list.

        Reads ``goal`` (required) and the optional ``plan_instructions``
        (defaults to ``"N/A"``); also logs the intermediate results through
        ``_print_step``.
        """
        if "plan_instructions" in state:
            important_instructions = state["plan_instructions"]
        else:
            important_instructions = "N/A"
        intermediate_steps = self._build_intermediate_steps(state)
        step_analysis, step_thoughts = self._build_intermediate_analysis_and_thoughts(state)
        self._print_step(
            state,
            step_output="**Intermediate Step Output**\n" + intermediate_steps
        )
        # Must be an f-string: a plain literal would print the placeholders
        # "{step_analysis}" / "{step_thoughts}" verbatim.
        self._print_step(
            state,
            step_output=f"**Intermediate Step Analysis**\n{step_analysis}\n**Intermediate Step Thoughts**\n{step_thoughts}",
        )
        tool_names, tools_desc = get_tool_names_and_descriptions(self.custom_tools)
        return IDENTIFICATION_PLAN_SYSTEM_PROMPT.format(
            goal=state["goal"],
            repo_structure=self.repo_structure,
            tools=tools_desc,
            intermediate_steps=intermediate_steps,
            intermediate_analysis=step_analysis,
            intermediate_thoughts=step_thoughts,
            tool_names=tool_names,
            important_instructions=important_instructions,
        )

    def _convert_to_plan_actions_text(self, actions: list[dict]) -> str:
        """
        Render actions as consecutive ``Step: <name>`` / ``Step Input: <input>`` lines.

        Raises:
            KeyError: If an action lacks the ``name`` or ``input`` key.
        """
        return "".join(
            f"Step: {action['name']}\nStep Input: {action['input']}\n"
            for action in actions
        )

    def _execute_directly(self, state: IdentificationWorkflowState):
        """
        Ask the LLM for a plan and store it in ``state["plan_actions"]``.

        Returns:
            A ``(state, token_usage)`` tuple.
        """
        system_prompt = self._prepare_system_prompt(state)
        agent = CommonAgentTwoSteps(llm=self.llm)
        res, _, token_usage, reasoning_process = agent.go(
            system_prompt=system_prompt,
            instruction_prompt="Now, let's begin.",
            schema=IdentificationPlanResultJsonSchema,
        )
        # The prompt above was built from the previous round's results, so the
        # per-step state is reset only after the LLM call.
        PEOCommonStep._reset_step_state(state)
        res = IdentificationPlanResult(**res)
        self._print_step(
            state,
            step_output="**Reasoning Process**\n" + reasoning_process,
        )
        self._print_step(
            state,
            step_output=f"**Plan**\n{res.actions}"
        )
        state["plan_actions"] = self._convert_to_plan_actions_text(res.actions)

        return state, token_usage
140
+
@@ -0,0 +1,270 @@
1
+
2
+ import os
3
+ import json
4
+ import logging
5
+ from enum import Enum
6
+ from typing import Callable
7
+ from pydantic import BaseModel, Field
8
+ from langchain_openai.chat_models.base import BaseChatOpenAI
9
+ from langchain.tools import Tool, StructuredTool
10
+ from langgraph.graph import StateGraph, START, END
11
+
12
+ from bioguider.utils.constants import PrimaryLanguageEnum, ProjectTypeEnum
13
+ from bioguider.utils.file_utils import get_file_type
14
+ from bioguider.agents.agent_tools import (
15
+ read_file_tool,
16
+ read_directory_tool,
17
+ summarize_file_tool,
18
+ )
19
+ from bioguider.agents.agent_utils import (
20
+ read_directory,
21
+ try_parse_json_object,
22
+ )
23
+ from bioguider.agents.identification_execute_step import IdentificationExecuteStep
24
+ from bioguider.agents.identification_observe_step import IdentificationObserveStep
25
+ from bioguider.agents.identification_plan_step import IdentificationPlanStep
26
+ from bioguider.agents.identification_task_utils import IdentificationWorkflowState
27
+ from bioguider.agents.peo_common_step import PEOCommonStep
28
+ from bioguider.agents.prompt_utils import (
29
+ IDENTIFICATION_GOAL_PROJECT_TYPE,
30
+ IDENTIFICATION_GOAL_PRIMARY_LANGUAGE,
31
+ IDENTIFICATION_GOAL_META_DATA,
32
+ )
33
+ from bioguider.agents.python_ast_repl_tool import CustomPythonAstREPLTool
34
+ from bioguider.agents.agent_task import AgentTask
35
+ from bioguider.database.summarized_file_db import SummarizedFilesDb
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
# Example final answers, passed to the workflow as `final_answer_example`
# by the `identify_*` methods of IdentificationTask.
# NOTE(review): the braces are doubled, presumably so they survive a later
# prompt-template formatting pass — confirm against the prompt consumer.
META_DATA_FINAL_ANSWER_EXAMPLE = '{{"name": "repo name", ...}}'
PROJECT_TYPE_FINAL_ANSWER_EXAMPLE = '{{"project_type": "project type"}}'
PRIMARY_LANGUAGE_FINAL_ANSWER_EXAMPLE = '{{"primary_language": "primary language"}}'
42
+
43
# Pydantic model describing a plan as a list of action dictionaries.
# NOTE(review): no use of this class is visible in this module — confirm
# whether it is still needed here.
class IdentificationPlanResult(BaseModel):
    """ Identification Plan Result """
    # Planned actions; each item is a dict such as {'name': ..., 'input': ...}
    # (see the Field description).
    actions: list[dict] = Field(description="a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]")
46
+
47
# JSON schema for a plan result: an object whose single required property,
# "actions", is an array of free-form objects.
IdentificationPlanResultJsonSchema = dict(
    title="identification_plan_result",
    description="plan result",
    type="object",
    properties=dict(
        actions=dict(
            type="array",
            description="a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]",
            title="Actions",
            items=dict(type="object"),
        ),
    ),
    required=["actions"],
)
61
+
62
class IdentificationTask(AgentTask):
    """
    Identify facts about a repository with a Plan-Execute-Observe loop.

    After compilation the task can identify the project type, the primary
    language, repository meta data, or answer a caller-supplied goal. Each
    query runs the plan -> execute -> observe graph until the observe step
    stores a non-``None`` ``final_answer`` in the workflow state.
    """

    def __init__(
        self,
        llm: BaseChatOpenAI,
        step_callback: Callable | None=None,
        summarized_files_db: SummarizedFilesDb | None = None,
        provided_files: list[str] | None = None,
    ):
        """
        Args:
            llm: Chat model shared by all steps and tools.
            step_callback: Forwarded to the base class and to the summarize tool.
            summarized_files_db: Forwarded to the base class and to the summarize tool.
            provided_files: Files to list in the repository structure; when
                ``None`` the repository directory is read instead.
        """
        super().__init__(llm=llm, step_callback=step_callback, summarized_files_db=summarized_files_db)
        self.repo_path: str | None = None
        self.gitignore_path: str | None = None
        self.repo_structure: str | None = None
        self.tools = []
        self.custom_tools = []
        self.steps: list[PEOCommonStep] = []
        self.provided_files = provided_files

    def _prepare_tools(self):
        """Create the repository tools and wrap them as LangChain tools."""
        tool_rd = read_directory_tool(repo_path=self.repo_path)
        tool_sum = summarize_file_tool(
            llm=self.llm,
            repo_path=self.repo_path,
            output_callback=self.step_callback,
            db=self.summarized_files_db,
        )
        tool_rf = read_file_tool(repo_path=self.repo_path)

        self.tools = [tool_rd, tool_sum, tool_rf]
        # Each wrapper takes its name and description from the tool's class.
        self.custom_tools = [
            Tool(
                name=tool_rd.__class__.__name__,
                func=tool_rd.run,
                description=tool_rd.__class__.__doc__,
            ),
            StructuredTool.from_function(
                tool_sum.run,
                description=tool_sum.__class__.__doc__,
                name=tool_sum.__class__.__name__,
            ),
            Tool(
                name=tool_rf.__class__.__name__,
                func=tool_rf.run,
                description=tool_rf.__class__.__doc__,
            ),
        ]

    def _initialize(self):
        """
        Build the repository structure text, the tools and the three steps.

        Raises:
            ValueError: If ``self.repo_path`` does not exist.
        """
        if not os.path.exists(self.repo_path):
            raise ValueError(f"Repository path {self.repo_path} does not exist.")
        files = self.provided_files
        if files is None:
            # NOTE(review): this reads the repository's own ".gitignore" rather
            # than self.gitignore_path — confirm that is intended.
            files = read_directory(self.repo_path, os.path.join(self.repo_path, ".gitignore"))
        # One "<path> - <file type>" line per file.
        self.repo_structure = "".join(
            f"{f} - {get_file_type(os.path.join(self.repo_path, f)).value}\n"
            for f in files
        )

        self._prepare_tools()
        step_kwargs = {
            "llm": self.llm,
            "repo_path": self.repo_path,
            "repo_structure": self.repo_structure,
            "gitignore_path": self.gitignore_path,
            "custom_tools": self.custom_tools,
        }
        self.steps = [
            IdentificationPlanStep(**step_kwargs),
            IdentificationExecuteStep(**step_kwargs),
            IdentificationObserveStep(**step_kwargs),
        ]

    def _compile(
        self,
        repo_path: str,
        gitignore_path: str,
        **kwargs,
    ):
        """
        Initialise the task for ``repo_path`` and compile the workflow graph.

        The graph runs plan -> execute -> observe and loops back to the plan
        step until the state holds a non-``None`` ``final_answer``.
        """
        self.repo_path = repo_path
        self.gitignore_path = gitignore_path
        self._initialize()

        def check_observation_step(state: IdentificationWorkflowState):
            if "final_answer" in state and state["final_answer"] is not None:
                return END
            return "plan_step"

        graph = StateGraph(IdentificationWorkflowState)
        graph.add_node("plan_step", self.steps[0].execute)
        graph.add_node("execute_step", self.steps[1].execute)
        graph.add_node("observe_step", self.steps[2].execute)
        graph.add_edge(START, "plan_step")
        graph.add_edge("plan_step", "execute_step")
        graph.add_edge("execute_step", "observe_step")
        # The possible targets are given as a list (a documented path_map
        # form) instead of a set literal.
        graph.add_conditional_edges("observe_step", check_observation_step, ["plan_step", END])

        self.graph = graph.compile()

    @staticmethod
    def _final_answer_or(state, default):
        """Return ``state["final_answer"]``, or ``default`` when it is missing or ``None``."""
        answer = state["final_answer"] if "final_answer" in state else None
        return default if answer is None else answer

    def identify_project_type(self):
        """Run the workflow for the project-type goal and return a ``ProjectTypeEnum``."""
        s = self._go_graph({
            "goal": IDENTIFICATION_GOAL_PROJECT_TYPE,
            "final_answer_example": PROJECT_TYPE_FINAL_ANSWER_EXAMPLE,
            "step_count": 0,
        })
        return self._parse_project_type(self._final_answer_or(s, "unknown type"))

    def identify_primary_language(self):
        """Run the workflow for the primary-language goal and return a ``PrimaryLanguageEnum``."""
        s = self._go_graph({
            "goal": IDENTIFICATION_GOAL_PRIMARY_LANGUAGE,
            "final_answer_example": PRIMARY_LANGUAGE_FINAL_ANSWER_EXAMPLE,
            "step_count": 0,
        })
        return self._parse_primary_language(self._final_answer_or(s, "unknown type"))

    def identify_meta_data(self):
        """Run the workflow for the meta-data goal and return the parsed dict."""
        s = self._go_graph({
            "goal": IDENTIFICATION_GOAL_META_DATA,
            "final_answer_example": META_DATA_FINAL_ANSWER_EXAMPLE,
            "step_count": 0,
        })
        return self._parse_meta_data(self._final_answer_or(s, "unknown type"))

    def identify_customize_goal(
        self,
        goal: str,
        final_answer_example: str,
        plan_instructions: str = "N/A",
        observe_instructions: str = "N/A",
    ):
        """
        Run the workflow for a caller-supplied goal.

        Returns:
            The raw ``final_answer`` from the final state, or ``None`` when absent.
        """
        s = self._go_graph({
            "goal": goal,
            "final_answer_example": final_answer_example,
            "plan_instructions": plan_instructions,
            "observe_instructions": observe_instructions,
            "step_count": 0,
        })
        return self._final_answer_or(s, None)

    def _parse_project_type(self, proj_type_obj: str) -> ProjectTypeEnum:
        """
        Map the final answer text to a ``ProjectTypeEnum``.

        Accepts either a JSON object containing ``project_type`` or a bare
        enum value; anything else yields ``ProjectTypeEnum.unknown``.
        """
        proj_type_obj = proj_type_obj.strip()
        the_obj = try_parse_json_object(proj_type_obj)
        if the_obj is not None and "project_type" in the_obj:
            proj_type = the_obj["project_type"]
        elif proj_type_obj in [
            ProjectTypeEnum.application.value,
            ProjectTypeEnum.package.value,
            ProjectTypeEnum.pipeline.value,
        ]:
            return ProjectTypeEnum(proj_type_obj)
        else:
            proj_type = "unknown"

        # The JSON value may be null, a number, ...; only strings can match.
        if not isinstance(proj_type, str):
            return ProjectTypeEnum.unknown
        known_types = {
            "application": ProjectTypeEnum.application,
            "package": ProjectTypeEnum.package,
            "pipeline": ProjectTypeEnum.pipeline,
        }
        # Whitespace is stripped for parity with _parse_primary_language.
        return known_types.get(proj_type.strip(), ProjectTypeEnum.unknown)

    def _parse_primary_language(self, language_obj: str) -> PrimaryLanguageEnum:
        """
        Map the final answer text to a ``PrimaryLanguageEnum``.

        Accepts either a JSON object containing ``primary_language`` or a bare
        enum value; anything else yields ``PrimaryLanguageEnum.unknown``.
        """
        language_obj = language_obj.strip()
        the_obj = try_parse_json_object(language_obj)
        if the_obj is not None and "primary_language" in the_obj:
            language = the_obj["primary_language"]
        elif language_obj in [
            PrimaryLanguageEnum.python.value,
            PrimaryLanguageEnum.R.value,
        ]:
            return PrimaryLanguageEnum(language_obj)
        else:
            language = "unknown"

        # A non-string JSON value (e.g. null) cannot name a language.
        if not isinstance(language, str):
            return PrimaryLanguageEnum.unknown
        known_languages = {
            "python": PrimaryLanguageEnum.python,
            "R": PrimaryLanguageEnum.R,
        }
        return known_languages.get(language.strip(), PrimaryLanguageEnum.unknown)

    def _parse_meta_data(self, meta_data_obj: str) -> dict:
        """
        Parse the final answer text as a JSON object.

        Returns:
            The parsed object, or a dict with ``name``, ``description``,
            ``license`` and ``owner`` all set to ``"unknown"`` when parsing
            yields ``None``.
        """
        the_obj = try_parse_json_object(meta_data_obj.strip())
        if the_obj is not None:
            return the_obj
        return {
            "name": "unknown",
            "description": "unknown",
            "license": "unknown",
            "owner": "unknown",
        }
269
+
270
+