bioguider 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bioguider might be problematic. Click here for more details.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +88 -0
- bioguider/agents/agent_tools.py +147 -0
- bioguider/agents/agent_utils.py +357 -0
- bioguider/agents/collection_execute_step.py +180 -0
- bioguider/agents/collection_observe_step.py +113 -0
- bioguider/agents/collection_plan_step.py +154 -0
- bioguider/agents/collection_task.py +179 -0
- bioguider/agents/collection_task_utils.py +109 -0
- bioguider/agents/common_agent.py +159 -0
- bioguider/agents/common_agent_2step.py +126 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +153 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_task.py +269 -0
- bioguider/agents/identification_execute_step.py +179 -0
- bioguider/agents/identification_observe_step.py +92 -0
- bioguider/agents/identification_plan_step.py +135 -0
- bioguider/agents/identification_task.py +220 -0
- bioguider/agents/identification_task_utils.py +18 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +190 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/summarized_file_db.py +140 -0
- bioguider/managers/evaluation_manager.py +108 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +648 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +134 -0
- bioguider/settings.py +103 -0
- bioguider/utils/constants.py +40 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +126 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/utils.py +27 -0
- bioguider-0.2.3.dist-info/LICENSE +21 -0
- bioguider-0.2.3.dist-info/METADATA +44 -0
- bioguider-0.2.3.dist-info/RECORD +47 -0
- bioguider-0.2.3.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
|
|
2
|
+
import os
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import Callable
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
from langchain_openai.chat_models.base import BaseChatOpenAI
|
|
9
|
+
from langchain.tools import Tool
|
|
10
|
+
from langgraph.graph import StateGraph, START, END
|
|
11
|
+
|
|
12
|
+
from bioguider.utils.constants import PrimaryLanguageEnum, ProjectTypeEnum
|
|
13
|
+
from bioguider.utils.file_utils import get_file_type
|
|
14
|
+
from bioguider.agents.agent_tools import (
|
|
15
|
+
read_file_tool,
|
|
16
|
+
read_directory_tool,
|
|
17
|
+
summarize_file_tool,
|
|
18
|
+
)
|
|
19
|
+
from bioguider.agents.agent_utils import (
|
|
20
|
+
read_directory,
|
|
21
|
+
)
|
|
22
|
+
from bioguider.agents.identification_execute_step import IdentificationExecuteStep
|
|
23
|
+
from bioguider.agents.identification_observe_step import IdentificationObserveStep
|
|
24
|
+
from bioguider.agents.identification_plan_step import IdentificationPlanStep
|
|
25
|
+
from bioguider.agents.identification_task_utils import IdentificationWorkflowState
|
|
26
|
+
from bioguider.agents.peo_common_step import PEOCommonStep
|
|
27
|
+
from bioguider.agents.prompt_utils import (
|
|
28
|
+
IDENTIFICATION_GOAL_PROJECT_TYPE,
|
|
29
|
+
IDENTIFICATION_GOAL_PRIMARY_LANGUAGE,
|
|
30
|
+
IDENTIFICATION_GOAL_META_DATA,
|
|
31
|
+
)
|
|
32
|
+
from bioguider.agents.python_ast_repl_tool import CustomPythonAstREPLTool
|
|
33
|
+
from bioguider.agents.agent_task import AgentTask
|
|
34
|
+
from bioguider.database.summarized_file_db import SummarizedFilesDb
|
|
35
|
+
|
|
36
|
+
logger = logging.getLogger(__name__)

# Example "final answer" payloads injected into prompts so the LLM knows the
# exact JSON shape to emit for each identification goal. Braces are doubled
# because these strings are later passed through prompt-template formatting.
META_DATA_FINAL_ANSWER_EXAMPLE = '{{"name": "repo name", ...}}'
PROJECT_TYPE_FINAL_ANSWER_EXAMPLE = '{{"project_type": "project type"}}'
PRIMARY_LANGUAGE_FINAL_ANSWER_EXAMPLE = '{{"primary_language": "primary language"}}'
|
|
41
|
+
|
|
42
|
+
class IdentificationPlanResult(BaseModel):
    """ Identification Plan Result """
    # Each action dict names a tool and its input, e.g.
    # {'name': 'read_file', 'input': 'README.md'}.
    actions: list[dict] = Field(description="a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]")
|
|
45
|
+
|
|
46
|
+
# JSON-schema equivalent of IdentificationPlanResult, for when the LLM is
# asked for structured output via a raw schema instead of a pydantic model.
IdentificationPlanResultJsonSchema = {
    "title": "identification_plan_result",
    "description": "plan result",
    "type": "object",
    "properties": {
        "actions": {
            "type": "array",
            "description": """a list of action dictionary, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]""",
            "title": "Actions",
            "items": {"type": "object"}
        },
    },
    "required": ["actions"],
}
|
|
60
|
+
|
|
61
|
+
class IdentificationTask(AgentTask):
    """Identify repository attributes via a plan -> execute -> observe LLM loop.

    The task compiles a langgraph state machine whose three nodes are the
    plan / execute / observe steps; the loop repeats until the observe step
    produces a final answer, which is then parsed into a project type,
    primary language, or meta-data dict.
    """

    def __init__(
        self,
        llm: BaseChatOpenAI,
        step_callback: Callable | None = None,
    ):
        super().__init__(llm=llm, step_callback=step_callback)
        self.repo_path: str | None = None
        self.gitignore_path: str | None = None
        # Textual "path - type" listing of repository files, one per line.
        self.repo_structure: str | None = None
        self.tools = []
        self.custom_tools = []
        self.steps: list[PEOCommonStep] = []

    def _initialize(self):
        """Build the repository listing, the tool set, and the PEO steps.

        Raises:
            ValueError: if ``self.repo_path`` does not exist.
        """
        if not os.path.exists(self.repo_path):
            raise ValueError(f"Repository path {self.repo_path} does not exist.")
        # NOTE(review): the listing honors the repository's own .gitignore,
        # while the tools below use self.gitignore_path — confirm this
        # asymmetry is intentional.
        files = read_directory(self.repo_path, os.path.join(self.repo_path, ".gitignore"))
        file_pairs = [(f, get_file_type(os.path.join(self.repo_path, f)).value) for f in files]
        self.repo_structure = "".join(f"{f} - {f_type}\n" for f, f_type in file_pairs)

        self.tools = [
            summarize_file_tool(
                llm=self.llm,
                repo_path=self.repo_path,
                output_callback=self._print_step,
                db=self.summary_file_db,  # presumably provided by AgentTask — TODO confirm
            ),
            read_directory_tool(repo_path=self.repo_path, gitignore_path=self.gitignore_path),
            read_file_tool(repo_path=self.repo_path),
        ]
        # Expose each tool to the agent under its class name / docstring.
        self.custom_tools = [Tool(
            name=tool.__class__.__name__,
            func=tool.run,
            description=tool.__class__.__doc__,
        ) for tool in self.tools]
        self.custom_tools.append(CustomPythonAstREPLTool())

        # All three PEO steps share the same construction arguments.
        step_kwargs = dict(
            llm=self.llm,
            repo_path=self.repo_path,
            repo_structure=self.repo_structure,
            gitignore_path=self.gitignore_path,
            custom_tools=self.custom_tools,
        )
        self.steps = [
            IdentificationPlanStep(**step_kwargs),
            IdentificationExecuteStep(**step_kwargs),
            IdentificationObserveStep(**step_kwargs),
        ]

    def _compile(
        self,
        repo_path: str,
        gitignore_path: str,
        **kwargs,
    ):
        """Compile the plan -> execute -> observe graph for ``repo_path``."""
        self.repo_path = repo_path
        self.gitignore_path = gitignore_path
        self._initialize()

        def check_observation_step(state: IdentificationWorkflowState):
            # Terminate once the observe step has produced a final answer;
            # otherwise loop back for another planning round.
            if "final_answer" in state and state["final_answer"] is not None:
                return END
            return "plan_step"

        graph = StateGraph(IdentificationWorkflowState)
        graph.add_node("plan_step", self.steps[0].execute)
        graph.add_node("execute_step", self.steps[1].execute)
        graph.add_node("observe_step", self.steps[2].execute)
        graph.add_edge(START, "plan_step")
        graph.add_edge("plan_step", "execute_step")
        graph.add_edge("execute_step", "observe_step")
        # BUG FIX: langgraph's path_map parameter must be a dict or a list;
        # the original passed the set {"plan_step", END}, which is not a
        # supported path_map type.
        graph.add_conditional_edges(
            "observe_step", check_observation_step, ["plan_step", END]
        )

        self.graph = graph.compile()

    def _identify(self, goal: str, final_answer_example: str) -> str:
        """Run the compiled graph for one goal; return the raw final answer.

        Returns "unknown type" when the graph finished without producing a
        final answer.
        """
        s = self._go_graph({
            "goal": goal,
            "final_answer_example": final_answer_example,
        })
        return s["final_answer"] if "final_answer" in s else "unknown type"

    def identify_project_type(self):
        """Identify and return the repository's ProjectTypeEnum."""
        answer = self._identify(
            IDENTIFICATION_GOAL_PROJECT_TYPE, PROJECT_TYPE_FINAL_ANSWER_EXAMPLE
        )
        return self._parse_project_type(answer)

    def identify_primary_language(self):
        """Identify and return the repository's PrimaryLanguageEnum."""
        answer = self._identify(
            IDENTIFICATION_GOAL_PRIMARY_LANGUAGE, PRIMARY_LANGUAGE_FINAL_ANSWER_EXAMPLE
        )
        return self._parse_primary_language(answer)

    def identify_meta_data(self):
        """Identify and return the repository meta data (name/owner/description/license)."""
        answer = self._identify(
            IDENTIFICATION_GOAL_META_DATA, META_DATA_FINAL_ANSWER_EXAMPLE
        )
        return self._parse_meta_data(answer)

    def _parse_project_type(self, proj_type_obj: str) -> ProjectTypeEnum:
        """Parse a JSON answer like '{"project_type": "..."}' into the enum.

        Falls back to ProjectTypeEnum.unknown on malformed JSON, a missing
        key, or an unrecognized value.
        """
        try:
            proj_type = json.loads(proj_type_obj)["project_type"]
        except Exception as e:
            logger.error(e)
            return ProjectTypeEnum.unknown
        mapping = {
            "application": ProjectTypeEnum.application,
            "package": ProjectTypeEnum.package,
            "pipeline": ProjectTypeEnum.pipeline,
        }
        return mapping.get(proj_type.strip(), ProjectTypeEnum.unknown)

    def _parse_primary_language(self, language_obj: str) -> PrimaryLanguageEnum:
        """Parse a JSON answer like '{"primary_language": "..."}' into the enum.

        Falls back to PrimaryLanguageEnum.unknown on malformed JSON, a missing
        key, or an unrecognized value.
        """
        try:
            language = json.loads(language_obj)["primary_language"]
        except Exception as e:
            logger.error(e)
            return PrimaryLanguageEnum.unknown
        mapping = {
            "python": PrimaryLanguageEnum.python,
            "R": PrimaryLanguageEnum.R,
        }
        return mapping.get(language.strip(), PrimaryLanguageEnum.unknown)

    def _parse_meta_data(self, meta_data_obj: str) -> dict:
        """Parse the meta-data JSON answer; fall back to 'unknown' fields."""
        try:
            return json.loads(meta_data_obj)
        except Exception as e:
            logger.error(e)
            return {
                "name": "unknown",
                "description": "unknown",
                "license": "unknown",
                "owner": "unknown",
            }
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
from typing import Callable, TypedDict, Optional
|
|
4
|
+
from langchain_openai.chat_models.base import BaseChatOpenAI
|
|
5
|
+
|
|
6
|
+
class IdentificationWorkflowState(TypedDict):
    """Mutable state shared across the identification plan/execute/observe graph."""
    llm: BaseChatOpenAI
    step_output_callback: Optional[Callable]
    # The identification goal text (project type / primary language / meta data).
    goal: str

    # Outputs of the planning step.
    plan_actions: Optional[str]
    plan_reasoning: Optional[str]
    # Accumulated outputs from completed plan/execute/observe rounds.
    intermediate_steps: Optional[list[str]]
    # Set when the goal is answered; a non-None value ends the graph loop.
    final_answer: Optional[str]
    # Example JSON payload showing the expected final-answer shape.
    final_answer_example: Optional[str]
    # Per-round scratch fields, cleared between rounds.
    step_output: Optional[str]
    step_analysis: Optional[str]
    step_thoughts: Optional[str]
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
from langchain_openai.chat_models.base import BaseChatOpenAI
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
from bioguider.agents.common_step import CommonState, CommonStep
|
|
7
|
+
|
|
8
|
+
class PEOWorkflowState(CommonState):
    """State shared by the generic Plan-Execute-Observe workflow."""
    # Concatenated text of completed step outputs (see
    # PEOCommonStep._build_intermediate_steps); may be absent or None early on.
    intermediate_steps: Optional[str]
    # Per-round scratch fields, cleared by _reset_step_state between rounds.
    step_output: Optional[str]
    step_analysis: Optional[str]
    step_thoughts: Optional[str]
    # Actions proposed by the plan step, as tool-name/input dicts.
    plan_actions: Optional[list[dict]]
|
|
14
|
+
|
|
15
|
+
class PEOCommonStep(CommonStep):
    """
    Common helpers shared by Plan-Execute-Observe (PEO) workflow steps:
    building prompt-safe intermediate-step text and resetting per-step state.
    """
    def __init__(self, llm: BaseChatOpenAI):
        super().__init__()
        self.llm = llm

    def _build_intermediate_steps(self, state: PEOWorkflowState) -> str:
        """
        Concatenate previous step outputs plus the current step output.

        Curly braces are swapped for parentheses so the text can be embedded
        in prompt templates without being mistaken for format placeholders.
        """
        # BUG FIX: the original indexed state["intermediate_steps"] whenever
        # the key was present, crashing when its value was None — a state
        # that _reset_step_state explicitly models.
        parts = [
            step.replace("{", "(").replace("}", ")") + "\n"
            for step in (state.get("intermediate_steps") or [])
        ]
        current = state.get("step_output")
        if current is not None:
            parts.append(current.replace("{", "(").replace("}", ")"))
        return "".join(parts)

    def _build_intermediate_analysis_and_thoughts(self, state: PEOWorkflowState):
        """Return (analysis, thoughts), brace-sanitized, defaulting to 'N/A'."""
        analysis = state.get("step_analysis")
        analysis = "N/A" if analysis is None else analysis
        analysis = analysis.replace("{", "(").replace("}", ")")
        thoughts = state.get("step_thoughts")
        thoughts = "N/A" if thoughts is None else thoughts
        thoughts = thoughts.replace("{", "(").replace("}", ")")
        return analysis, thoughts

    @staticmethod
    def _reset_step_state(state):
        """Archive step_output into intermediate_steps, then clear the
        per-step fields so the next PEO round starts clean."""
        if state.get("intermediate_steps") is None:
            state["intermediate_steps"] = []
        if state.get("step_output") is not None:
            state["intermediate_steps"].append(state["step_output"])

        state["step_analysis"] = None
        state["step_thoughts"] = None
        state["step_output"] = None
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from langchain_core.prompts import ChatPromptTemplate
|
|
3
|
+
|
|
4
|
+
# Generic chain-of-thought instruction appended to user prompts.
USER_INSTRUCTION = """Do not give the final result immediately. First, explain your reasoning process step by step, then provide the answer."""

# Documentation-evaluation criteria paired with their maximum scores.
# NOTE(review): these weights sum to 90, but EVALUATION_SYSTEM_PROMPT claims
# a 100-point total — confirm which is intended.
EVALUATION_ITEMS = [
    ("1. Clarity & Readability", 20),
    ("2. Completeness", 20),
    ("3. Organization & Navigation", 10),
    ("4. Examples & Tutorials", 10),
    ("5. Maintainability & Updates", 15),
    ("6. Accessibility & Formatting", 15),
]
|
|
14
|
+
|
|
15
|
+
EVALUATION_SYSTEM_PROMPT = ChatPromptTemplate.from_template("""Please act as both a **biomedical researcher** and an **experienced software developer** to evaluate the documentation quality of a GitHub repository using the evaluation criteria below.
|
|
16
|
+
|
|
17
|
+
### **Evaluation Criteria (Total: 100 points)**
|
|
18
|
+
|
|
19
|
+
1. **Clarity & Readability (20 points)** - Is the documentation written in a clear, concise, and easy-to-understand manner?
|
|
20
|
+
2. **Completeness (20 points)** - Does the documentation cover all essential information needed for understanding, usage, and further development?
|
|
21
|
+
3. **Organization & Navigation (10 points)** - Is the structure logical and easy to navigate? Are key sections easy to find?
|
|
22
|
+
4. **Examples & Tutorials (10 points)** - Are there sufficient examples or tutorials to help users get started and understand core functionality?
|
|
23
|
+
5. **Maintainability & Updates (15 points)** - Does the documentation reflect ongoing maintenance and version history (e.g., changelogs, version tags)?
|
|
24
|
+
6. **Accessibility & Formatting (15 points)** - Is the documentation well-formatted and easy to read (e.g., Markdown formatting, appropriate use of code blocks, headers, etc.)?
|
|
25
|
+
### **Repository Structure Overview**
|
|
26
|
+
_(f = file, d = directory)_
|
|
27
|
+
```
|
|
28
|
+
{repository_structure}
|
|
29
|
+
```""")
|
|
30
|
+
|
|
31
|
+
# Per-criterion evaluation prompt. Template variables:
# {files_or_directories}, {evaluation_item}, {score_point}.
# BUG FIX: "**Criterion {evaluation_item}*." had mismatched markdown bold
# markers (** opened, single * closed); closed it with **.
EVALUATION_ITEM_PROMPT = ChatPromptTemplate.from_template("""Here are the content of files or directories in the repository that you need to take into account:
{files_or_directories}

### **Instructions**

Let's begin by evaluating **Criterion {evaluation_item}**.

- If the information provided is **sufficient**, please proceed with your evaluation using the following format:
```
{evaluation_item} ({score_point} points)
a. Score: [score out of {score_point}]
b. Reason: [brief explanation justifying the score]
```
- If the information provided is **insufficient**, do **not** attempt to evaluate. Instead, list the specific files or directories for which you need more detail, using the format below:
```
[files/directories needed for evaluation]
```""")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
## goal: identify project type
# BUG FIX: corrected LLM-facing typos "may server as" -> "may serve as" and
# "one of the above project type" -> "project types".
IDENTIFICATION_GOAL_PROJECT_TYPE = """Identify the following key attribute of the repository:
**project type**: The primary functional type of the project.
Options and their definitions:
- **package**: A reusable Python or R library intended to be imported by other software.
- **application**: A standalone Python or R program that can be directly executed by users.
- **pipeline**: A biomedical data processing workflow that integrates multiple tools or steps.
- **unknown type**: Use this only if the type cannot be determined reliably from available information.
**Notes**:
1. The project can be identified as one of the above project types.
2. The project may serve as multiple project types, like package & pipeline, standalone application & package,
However, you need to investigate closely to find out the primary project type.
3. Do **not** rely heavily on directories like 'benchmark/' or 'tests/' when determining the project type, as they are often auxiliary."""
|
|
63
|
+
|
|
64
|
+
## goal: identify primary language
# BUG FIX: corrected LLM-facing grammar "one of the above primary language"
# -> "primary languages".
IDENTIFICATION_GOAL_PRIMARY_LANGUAGE = """Identify the following key attribute of the repository:
**primary language**: The primary language of the project.
Options and their definitions:
- **python**: Python language
- **R**: R language
- **unknown type**: Use this only if the type cannot be determined reliably from available information.
**Notes**:
The project can be identified as one of the above primary languages."""
|
|
73
|
+
|
|
74
|
+
## goal: identify meta data: repo name, owner, description, license
# BUG FIX: corrected LLM-facing typo "orgnization" -> "organization".
IDENTIFICATION_GOAL_META_DATA = """Identify the following meta data of the repository:
**name**: The repository name.
**owner**: The repository user or organization.
**description**: The description of the repository.
**license**: The license of the repository, like 'MIT', 'Apache 2.0' or 'unknown'.

**Notes**: If the above meta data can't be identified, please return 'unknown' or 'N/A'.
"""

# Chain-of-thought instruction appended to user prompts.
COT_USER_INSTRUCTION = "Do not give the answer immediately. First, explain your reasoning process step by step, then provide the answer."
|
|
85
|
+
|
|
86
|
+
class CollectionGoalItemEnum(Enum):
    """Kinds of documentation artifacts the collection task can gather.

    Member values are the human-readable goal-item labels used in prompts.
    """
    UserGuide = "User Guide"
    Tutorial = "Tutorials & Vignettes"
    DockerGeneration = "Docker Generation"
    Installation = "Installation"
    License = "License"
    Contributing = "Contributing"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
COLLECTION_GOAL = """Your goal is to collect the names of all files that are relevant to **{goal_item}**.
|
|
97
|
+
**Note:**
|
|
98
|
+
- You only need to collect the **file names**, not their contents."""
|
|
99
|
+
|
|
100
|
+
# Per-goal-item prompt fragments for the collection task. Keys mirror
# CollectionGoalItemEnum member names; each entry supplies the goal item
# label, a description of what counts as a relevant file, and (for some
# items) extra instructions that constrain the collection.
# BUG FIX: corrected LLM-facing typo "souce code" -> "source code" in the
# UserGuide description.
COLLECTION_PROMPTS = {
    "UserGuide": {
        "goal_item": "User Guide",
        "related_file_description": """A document qualifies as a **User Guide** if it includes **at least one** of the following elements.
If **any one** of these is present, the document should be classified as a User Guide — full coverage is **not required**:
- Overview: A brief introduction to the software, its purpose, and its intended audience.
- Installation Instructions: Step-by-step setup procedures.
- Input/Output Specifications: Detailed information on the data the software accepts and produces.
- Configuration Options: Descriptions of settings and parameters that can be adjusted.
- Function/Interface Listings: Comprehensive lists of available functions or interfaces, including their descriptions, parameters, and return values.
- Mathematical Equations/Numerical Methods: Embedded documentation explaining the underlying mathematical concepts or algorithms.
- Developer Guidance: Instructions on how to extend the software or contribute to its development.
**Do not** classify the document as a User Guide if it primarily serves as a Tutorial or Example. Such documents typically include:
- Sample Datasets: Example data used to illustrate functionality.
- Narrative Explanations: Story-like descriptions guiding the user through examples.
- Code Walkthroughs: Detailed explanations of code snippets in a tutorial format.
**Do not** classify the document as a User Guide if it is source code or a script (*.py, *.R) that is not intended for end-user interaction.
- You can include directory names if all files in the directory are relevant to the goal item.""",
    },
    "Tutorial": {
        "goal_item": "Tutorials & Vignettes",
        "related_file_description": """
**Tutorials and Vignettes** are instructional documents or interactive notebooks that provide step-by-step guidance on using a software package or library. They typically include:
- Code Examples: Practical code snippets demonstrating how to use the software's features and functions.
- Explanatory Text: Clear explanations accompanying the code examples to help users understand the concepts and techniques being presented.
- Visualizations: Graphical representations of data or results to enhance understanding.
- Interactive Elements: Features that allow users to experiment with the code in real-time, such as Jupyter notebooks or R Markdown files.
- Use Cases: Real-world applications or scenarios where the software can be applied effectively.
- You can include directory names if all files in the directory are relevant to the goal item.
""",
    },
    "DockerGeneration": {
        "goal_item": "Generating a Dockerfile for reproducibility testing",

        "related_file_description": """A document qualifies **Dockerfile Generation** related if it includes **at least one** of the following elements.
If **any one** of these is present, the document should be classified as a Dockerfile — full coverage is **not required**:
- Existing Docker Configuration
  * Files like `Dockerfile`, `docker-compose.yml`, or any Docker-related build scripts.
- Installation & Environment Setup
  * Files used to define or install dependencies.
  * Examples: `README.md` `requirements.txt`, `environment.yml`, `setup.py`, `install.R`, `DESCRIPTION`, `pyproject.toml`, etc.
- Build/Runtime Scripts
  * Shell or batch scripts used for setup, building, or launching the application.
  * Examples: `install.sh`, `build.sh`, `run.sh`, etc.
- Minimal Code Examples or Get-Started Files
  * Files that demonstrate a minimal working example of the software (e.g., for testing or reproducing results).
  * Examples: `example.py`, `main.py`, `demo.R`, `notebooks/get_started.ipynb`, etc.
  * These should be runnable with minimal configuration.""",

        "important_instructions": """- Only include minimal code examples that demonstrate basic functionality.
If multiple example files are found, select only the simplest and most lightweight one that is sufficient to verify the repository works.
- Give priority to analyzing files whose names include **"install"** or **"Dockerfile"**, as these are most likely to be useful for generating our Dockerfile
- The total number of collected files should **not exceed 5**.
- Make sure to include **only one code example**, selecting the most minimal and representative one.
"""
    },
    "Installation": {
        "goal_item": "Installation Instructions",
        "related_file_description": """A document qualifies as **Installation Instructions** if it includes **at least one** of the following elements.
If **any one** of these is present, the document should be classified as Installation Instructions — full coverage is **not required**:
- Step-by-step setup procedures for the software.
- Prerequisites or dependencies that need to be installed before using the software.
- Configuration steps required to get the software running.
- Troubleshooting tips related to installation issues.
- You can include directory names if all files in the directory are relevant to the goal item.""",
        "important_instructions": """- Give priority to analyzing README file that contain installation instructions and the files whose names include **"install"** or **"setup"**.
- If multiple files are found, select the most comprehensive one that covers the installation process.
- The total number of collected files should **not exceed 3**.
- Make sure to include **only one installation instruction file**, selecting the most comprehensive and representative one.
"""
    },
    "License": {
        "goal_item": "License Information",
        "related_file_description": """A document qualifies as **License Information** if it includes **at least one** of the following elements.
If **any one** of these is present, the document should be classified as License Information — full coverage is **not required**:
- A file named `LICENSE`, `LICENSE.txt`, or similar that explicitly states the software's license.
- A section in the README or documentation that describes the licensing terms.
- Any file that contains legal information regarding the use, distribution, or modification of the software.
- You can include directory names if all files in the directory are relevant to the goal item.""",
    },
    "Contributing": {
        "goal_item": "Contributing Guidelines",
        "related_file_description": """A document qualifies as **Contributing Guidelines** if it includes **at least one** of the following elements.
If **any one** of these is present, the document should be classified as Contributing Guidelines — full coverage is **not required**:
- A file named `CONTRIBUTING.md`, `CONTRIBUTING.rst`, or similar that provides guidelines for contributing to the project.
- A section in the README or documentation that outlines how to contribute, report issues, or submit pull requests.
- Any file that contains instructions for developers on how to contribute to the project, including coding standards, testing procedures, and submission processes.
- You can include directory names if all files in the directory are relevant to the goal item.""",
    },
}
|
|
190
|
+
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
|
|
2
|
+
import ast
import contextlib
import io
import logging
import re

from langchain_experimental.tools.python.tool import PythonAstREPLTool
from pydantic import PrivateAttr
|
|
8
|
+
|
|
9
|
+
class CustomPythonAstREPLTool(PythonAstREPLTool):
    """
    Custom Python REPL tool that executes Python code and captures output.

    This tool is designed to be used in a LangChain agent for executing Python code
    and capturing the output, including any print statements.

    SECURITY NOTE: this exec()s LLM-provided code with full builtins; it must
    only run inside an appropriately sandboxed environment.
    """
    __name__ = "Custom_Python_AST_REPL"
    # Execution namespace shared across calls so state persists between runs.
    _exec_globals: dict = PrivateAttr()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._set_globals()

    def _set_globals(self, table_dict=None):
        """Reset the execution namespace, optionally seeding it with table_dict."""
        # BUG FIX: __builtins__ is a module in __main__ but a dict in imported
        # modules; normalize before updating, otherwise dict.update() fails.
        builtin_ns = __builtins__ if isinstance(__builtins__, dict) else vars(__builtins__)
        self._exec_globals = {}
        self._exec_globals.update(builtin_ns)

        if table_dict is not None:
            self._exec_globals.update(table_dict)

    @staticmethod
    def _extract_code(query: str) -> str:
        """Strip markdown fences, a leading 'python' tag, and a trailing 'Observation'."""
        code_match = re.search(r"```(.*?)```", query, re.DOTALL)
        code = code_match.group(1) if code_match else query
        code = code.strip()
        if code.startswith("python"):
            code = code[len("python"):].lstrip()
        if code.endswith("Observation"):
            code = code[:-len("Observation")].rstrip()
        return code

    def _run(self, query: str, run_manager=None):
        """Execute the code in ``query`` and return its captured stdout/stderr.

        Returns the exception text on failure, or a placeholder message when
        execution produced no output.
        """
        print("================================== code here ==============================")
        print(query)
        print("===========================================================================")
        code = self._extract_code(query)

        # BUG FIX: the original blindly exec'd all-but-the-last line and
        # eval'd the last line; when the last line was not an expression
        # (e.g. the final line of a loop body), eval failed silently and the
        # statement was dropped entirely. Use the AST to split off a trailing
        # expression only when one actually exists.
        try:
            tree = ast.parse(code)
        except SyntaxError as e:
            return str(e)
        trailing_expr = None
        if tree.body and isinstance(tree.body[-1], ast.Expr):
            trailing_expr = ast.Expression(tree.body[-1].value)
            tree.body = tree.body[:-1]

        output_capture = io.StringIO()
        with contextlib.redirect_stdout(output_capture), contextlib.redirect_stderr(output_capture):
            # Redirect the root logger's first handler (if any) into the
            # capture buffer so log output is returned too.
            root_handlers = logging.getLogger().handlers
            if root_handlers:  # BUG FIX: original raised IndexError when no handlers exist
                root_handlers[0].stream = output_capture
            try:
                exec(compile(tree, "<agent-code>", "exec"), self._exec_globals)
                if trailing_expr is not None:
                    # Echo the value of a trailing expression, REPL-style.
                    result = eval(compile(trailing_expr, "<agent-code>", "eval"), self._exec_globals)
                    if result is not None:
                        print(result, file=output_capture)
            except Exception as e:
                return str(e)

        # Retrieve the captured output and return it
        output = output_capture.getvalue()
        return output if output else "Execution completed without output."
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
|