bioguider-0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bioguider might be problematic.

Files changed (47)
  1. bioguider/__init__.py +0 -0
  2. bioguider/agents/__init__.py +0 -0
  3. bioguider/agents/agent_task.py +88 -0
  4. bioguider/agents/agent_tools.py +147 -0
  5. bioguider/agents/agent_utils.py +357 -0
  6. bioguider/agents/collection_execute_step.py +180 -0
  7. bioguider/agents/collection_observe_step.py +113 -0
  8. bioguider/agents/collection_plan_step.py +154 -0
  9. bioguider/agents/collection_task.py +179 -0
  10. bioguider/agents/collection_task_utils.py +109 -0
  11. bioguider/agents/common_agent.py +159 -0
  12. bioguider/agents/common_agent_2step.py +126 -0
  13. bioguider/agents/common_step.py +85 -0
  14. bioguider/agents/dockergeneration_execute_step.py +186 -0
  15. bioguider/agents/dockergeneration_observe_step.py +153 -0
  16. bioguider/agents/dockergeneration_plan_step.py +158 -0
  17. bioguider/agents/dockergeneration_task.py +158 -0
  18. bioguider/agents/dockergeneration_task_utils.py +220 -0
  19. bioguider/agents/evaluation_task.py +269 -0
  20. bioguider/agents/identification_execute_step.py +179 -0
  21. bioguider/agents/identification_observe_step.py +92 -0
  22. bioguider/agents/identification_plan_step.py +135 -0
  23. bioguider/agents/identification_task.py +220 -0
  24. bioguider/agents/identification_task_utils.py +18 -0
  25. bioguider/agents/peo_common_step.py +64 -0
  26. bioguider/agents/prompt_utils.py +190 -0
  27. bioguider/agents/python_ast_repl_tool.py +69 -0
  28. bioguider/agents/rag_collection_task.py +130 -0
  29. bioguider/conversation.py +67 -0
  30. bioguider/database/summarized_file_db.py +140 -0
  31. bioguider/managers/evaluation_manager.py +108 -0
  32. bioguider/rag/__init__.py +0 -0
  33. bioguider/rag/config.py +117 -0
  34. bioguider/rag/data_pipeline.py +648 -0
  35. bioguider/rag/embedder.py +24 -0
  36. bioguider/rag/rag.py +134 -0
  37. bioguider/settings.py +103 -0
  38. bioguider/utils/constants.py +40 -0
  39. bioguider/utils/default.gitignore +140 -0
  40. bioguider/utils/file_utils.py +126 -0
  41. bioguider/utils/gitignore_checker.py +175 -0
  42. bioguider/utils/pyphen_utils.py +73 -0
  43. bioguider/utils/utils.py +27 -0
  44. bioguider-0.2.3.dist-info/LICENSE +21 -0
  45. bioguider-0.2.3.dist-info/METADATA +44 -0
  46. bioguider-0.2.3.dist-info/RECORD +47 -0
  47. bioguider-0.2.3.dist-info/WHEEL +4 -0
bioguider/agents/identification_task.py
@@ -0,0 +1,220 @@
+
+import os
+import json
+import logging
+from enum import Enum
+from typing import Callable
+from pydantic import BaseModel, Field
+from langchain_openai.chat_models.base import BaseChatOpenAI
+from langchain.tools import Tool
+from langgraph.graph import StateGraph, START, END
+
+from bioguider.utils.constants import PrimaryLanguageEnum, ProjectTypeEnum
+from bioguider.utils.file_utils import get_file_type
+from bioguider.agents.agent_tools import (
+    read_file_tool,
+    read_directory_tool,
+    summarize_file_tool,
+)
+from bioguider.agents.agent_utils import (
+    read_directory,
+)
+from bioguider.agents.identification_execute_step import IdentificationExecuteStep
+from bioguider.agents.identification_observe_step import IdentificationObserveStep
+from bioguider.agents.identification_plan_step import IdentificationPlanStep
+from bioguider.agents.identification_task_utils import IdentificationWorkflowState
+from bioguider.agents.peo_common_step import PEOCommonStep
+from bioguider.agents.prompt_utils import (
+    IDENTIFICATION_GOAL_PROJECT_TYPE,
+    IDENTIFICATION_GOAL_PRIMARY_LANGUAGE,
+    IDENTIFICATION_GOAL_META_DATA,
+)
+from bioguider.agents.python_ast_repl_tool import CustomPythonAstREPLTool
+from bioguider.agents.agent_task import AgentTask
+from bioguider.database.summarized_file_db import SummarizedFilesDb
+
+logger = logging.getLogger(__name__)
+
+META_DATA_FINAL_ANSWER_EXAMPLE = '{{"name": "repo name", ...}}'
+PROJECT_TYPE_FINAL_ANSWER_EXAMPLE = '{{"project_type": "project type"}}'
+PRIMARY_LANGUAGE_FINAL_ANSWER_EXAMPLE = '{{"primary_language": "primary language"}}'
+
+class IdentificationPlanResult(BaseModel):
+    """ Identification Plan Result """
+    actions: list[dict] = Field(description="a list of action dictionaries, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]")
+
+IdentificationPlanResultJsonSchema = {
+    "title": "identification_plan_result",
+    "description": "plan result",
+    "type": "object",
+    "properties": {
+        "actions": {
+            "type": "array",
+            "description": """a list of action dictionaries, e.g. [{'name': 'read_file', 'input': 'README.md'}, ...]""",
+            "title": "Actions",
+            "items": {"type": "object"}
+        },
+    },
+    "required": ["actions"],
+}
+
+class IdentificationTask(AgentTask):
+    def __init__(
+        self,
+        llm: BaseChatOpenAI,
+        step_callback: Callable | None = None,
+    ):
+        super().__init__(llm=llm, step_callback=step_callback)
+        self.repo_path: str | None = None
+        self.gitignore_path: str | None = None
+        self.repo_structure: str | None = None
+        self.tools = []
+        self.custom_tools = []
+        self.steps: list[PEOCommonStep] = []
+
+    def _initialize(self):
+        if not os.path.exists(self.repo_path):
+            raise ValueError(f"Repository path {self.repo_path} does not exist.")
+        files = read_directory(self.repo_path, os.path.join(self.repo_path, ".gitignore"))
+        file_pairs = [(f, get_file_type(os.path.join(self.repo_path, f)).value) for f in files]
+        self.repo_structure = ""
+        for f, f_type in file_pairs:
+            self.repo_structure += f"{f} - {f_type}\n"
+
+        self.tools = [
+            summarize_file_tool(
+                llm=self.llm,
+                repo_path=self.repo_path,
+                output_callback=self._print_step,
+                db=self.summary_file_db,
+            ),
+            read_directory_tool(repo_path=self.repo_path, gitignore_path=self.gitignore_path),
+            read_file_tool(repo_path=self.repo_path),
+        ]
+        self.custom_tools = [Tool(
+            name=tool.__class__.__name__,
+            func=tool.run,
+            description=tool.__class__.__doc__,
+        ) for tool in self.tools]
+        self.custom_tools.append(CustomPythonAstREPLTool())
+        self.steps = [
+            IdentificationPlanStep(
+                llm=self.llm,
+                repo_path=self.repo_path,
+                repo_structure=self.repo_structure,
+                gitignore_path=self.gitignore_path,
+                custom_tools=self.custom_tools,
+            ),
+            IdentificationExecuteStep(
+                llm=self.llm,
+                repo_path=self.repo_path,
+                repo_structure=self.repo_structure,
+                gitignore_path=self.gitignore_path,
+                custom_tools=self.custom_tools,
+            ),
+            IdentificationObserveStep(
+                llm=self.llm,
+                repo_path=self.repo_path,
+                repo_structure=self.repo_structure,
+                gitignore_path=self.gitignore_path,
+                custom_tools=self.custom_tools,
+            )
+        ]
+
+    def _compile(
+        self,
+        repo_path: str,
+        gitignore_path: str,
+        **kwargs,
+    ):
+        self.repo_path = repo_path
+        self.gitignore_path = gitignore_path
+        self._initialize()
+
+        def check_observation_step(state: IdentificationWorkflowState):
+            if "final_answer" in state and state["final_answer"] is not None:
+                return END
+            return "plan_step"
+
+        graph = StateGraph(IdentificationWorkflowState)
+        graph.add_node("plan_step", self.steps[0].execute)
+        graph.add_node("execute_step", self.steps[1].execute)
+        graph.add_node("observe_step", self.steps[2].execute)
+        graph.add_edge(START, "plan_step")
+        graph.add_edge("plan_step", "execute_step")
+        graph.add_edge("execute_step", "observe_step")
+        graph.add_conditional_edges("observe_step", check_observation_step, ["plan_step", END])
+
+        self.graph = graph.compile()
+
+    def identify_project_type(self):
+        s = self._go_graph({
+            "goal": IDENTIFICATION_GOAL_PROJECT_TYPE,
+            "final_answer_example": PROJECT_TYPE_FINAL_ANSWER_EXAMPLE,
+        })
+        proj_type = s["final_answer"] if "final_answer" in s else "unknown type"
+        return self._parse_project_type(proj_type)
+
+    def identify_primary_language(self):
+        s = self._go_graph({
+            "goal": IDENTIFICATION_GOAL_PRIMARY_LANGUAGE,
+            "final_answer_example": PRIMARY_LANGUAGE_FINAL_ANSWER_EXAMPLE,
+        })
+        language = s["final_answer"] if "final_answer" in s else "unknown type"
+        return self._parse_primary_language(language)
+
+    def identify_meta_data(self):
+        s = self._go_graph({
+            "goal": IDENTIFICATION_GOAL_META_DATA,
+            "final_answer_example": META_DATA_FINAL_ANSWER_EXAMPLE,
+        })
+        meta_data = s["final_answer"] if "final_answer" in s else "unknown type"
+        return self._parse_meta_data(meta_data)
+
+    def _parse_project_type(self, proj_type_obj: str) -> ProjectTypeEnum:
+        try:
+            json_obj = json.loads(proj_type_obj)
+            proj_type = json_obj["project_type"]
+        except Exception as e:
+            logger.error(e)
+            return ProjectTypeEnum.unknown
+        proj_type = proj_type.strip()
+        if proj_type == "application":
+            return ProjectTypeEnum.application
+        elif proj_type == "package":
+            return ProjectTypeEnum.package
+        elif proj_type == "pipeline":
+            return ProjectTypeEnum.pipeline
+        else:
+            return ProjectTypeEnum.unknown
+
+    def _parse_primary_language(self, language_obj: str) -> PrimaryLanguageEnum:
+        try:
+            json_obj = json.loads(language_obj)
+            language = json_obj["primary_language"]
+        except Exception as e:
+            logger.error(e)
+            return PrimaryLanguageEnum.unknown
+        language = language.strip()
+        if language == "python":
+            return PrimaryLanguageEnum.python
+        elif language == "R":
+            return PrimaryLanguageEnum.R
+        else:
+            return PrimaryLanguageEnum.unknown
+
+    def _parse_meta_data(self, meta_data_obj: str) -> dict:
+        try:
+            json_obj = json.loads(meta_data_obj)
+            meta_data = json_obj
+            return meta_data
+        except Exception as e:
+            logger.error(e)
+            return {
+                "name": "unknown",
+                "description": "unknown",
+                "license": "unknown",
+                "owner": "unknown",
+            }
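
The identification workflow above is driven by compiling the plan/execute/observe graph against a local repository and then calling one of the `identify_*` helpers. A minimal usage sketch follows; it is not part of the wheel, the repository path and model name are made up, and it assumes the `AgentTask` base class supplies `_go_graph()`, `_print_step`, and a usable `summary_file_db` once `_compile()` has run.

```python
# Hypothetical usage sketch; assumes OPENAI_API_KEY is set and a repository
# has been cloned to ./my_repo (path and model name are illustrative only).
from langchain_openai import ChatOpenAI
from bioguider.agents.identification_task import IdentificationTask

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
task = IdentificationTask(llm=llm)

# _compile() stores the paths, builds the tools, and wires the
# plan_step -> execute_step -> observe_step LangGraph shown above.
task._compile(repo_path="./my_repo", gitignore_path="./my_repo/.gitignore")

print(task.identify_project_type())      # ProjectTypeEnum.package / .application / .pipeline / .unknown
print(task.identify_primary_language())  # PrimaryLanguageEnum.python / .R / .unknown
print(task.identify_meta_data())         # dict with name, owner, description, license
```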
bioguider/agents/identification_task_utils.py
@@ -0,0 +1,18 @@
+
+
+from typing import Callable, TypedDict, Optional
+from langchain_openai.chat_models.base import BaseChatOpenAI
+
+class IdentificationWorkflowState(TypedDict):
+    llm: BaseChatOpenAI
+    step_output_callback: Optional[Callable]
+    goal: str
+
+    plan_actions: Optional[str]
+    plan_reasoning: Optional[str]
+    intermediate_steps: Optional[list[str]]
+    final_answer: Optional[str]
+    final_answer_example: Optional[str]
+    step_output: Optional[str]
+    step_analysis: Optional[str]
+    step_thoughts: Optional[str]
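
For orientation only (not part of the package): the `identify_*` methods above seed this state with just `goal` and `final_answer_example`, and the plan/execute/observe loop fills in the remaining keys as it runs. The goal text below is abbreviated.

```python
# Illustrative partial state; at runtime LangGraph treats it as a plain dict,
# and plan_actions, step_output, final_answer, etc. are populated by the steps.
initial_state = {
    "goal": "Identify the following key attribute of the repository: **project type** ...",
    "final_answer_example": '{"project_type": "project type"}',
}
```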
bioguider/agents/peo_common_step.py
@@ -0,0 +1,64 @@
+
+
+from typing import Optional
+from langchain_openai.chat_models.base import BaseChatOpenAI
+from pydantic import BaseModel, Field
+from bioguider.agents.common_step import CommonState, CommonStep
+
+class PEOWorkflowState(CommonState):
+    intermediate_steps: Optional[list[str]]
+    step_output: Optional[str]
+    step_analysis: Optional[str]
+    step_thoughts: Optional[str]
+    plan_actions: Optional[list[dict]]
+
+class PEOCommonStep(CommonStep):
+    """
+    Common functionality shared by the plan/execute/observe (PEO) steps:
+    assembling intermediate-step context and resetting per-step state.
+    """
+    def __init__(self, llm: BaseChatOpenAI):
+        super().__init__()
+        self.llm = llm
+
+    def _build_intermediate_steps(self, state: PEOWorkflowState):
+        """
+        Build intermediate steps for the PEO workflow.
+        """
+        intermediate_steps = ""
+        # previous steps
+        if "intermediate_steps" in state and state["intermediate_steps"] is not None:
+            for i in range(len(state['intermediate_steps'])):
+                step = state['intermediate_steps'][i].replace("{", "(").replace("}", ")")
+                intermediate_steps += step + "\n"
+        # current step
+        if "step_output" in state and state["step_output"] is not None:
+            step_content = state["step_output"]
+            step_content = step_content.replace("{", "(").replace("}", ")")
+            intermediate_steps += step_content
+        return intermediate_steps
+
+    def _build_intermediate_analysis_and_thoughts(self, state: PEOWorkflowState):
+        intermediate_analysis = "N/A" if "step_analysis" not in state or \
+            state["step_analysis"] is None \
+            else state["step_analysis"]
+        intermediate_analysis = intermediate_analysis.replace("{", "(").replace("}", ")")
+        intermediate_thoughts = "N/A" if "step_thoughts" not in state or \
+            state["step_thoughts"] is None \
+            else state["step_thoughts"]
+        intermediate_thoughts = intermediate_thoughts.replace("{", "(").replace("}", ")")
+        return intermediate_analysis, intermediate_thoughts
+
+    @staticmethod
+    def _reset_step_state(state):
+        # move step_output to intermediate steps
+        if "intermediate_steps" not in state or state["intermediate_steps"] is None:
+            state["intermediate_steps"] = []
+        intermediate_steps = state["intermediate_steps"]
+        if "step_output" in state and state["step_output"] is not None:
+            intermediate_steps.append(state["step_output"])
+            state["intermediate_steps"] = intermediate_steps
+
+        state["step_analysis"] = None
+        state["step_thoughts"] = None
+        state["step_output"] = None
bioguider/agents/prompt_utils.py
@@ -0,0 +1,190 @@
+from enum import Enum
+from langchain_core.prompts import ChatPromptTemplate
+
+USER_INSTRUCTION = """Do not give the final result immediately. First, explain your reasoning process step by step, then provide the answer."""
+
+EVALUATION_ITEMS = [
+    ("1. Clarity & Readability", 20),
+    ("2. Completeness", 20),
+    ("3. Organization & Navigation", 10),
+    ("4. Examples & Tutorials", 10),
+    ("5. Maintainability & Updates", 15),
+    ("6. Accessibility & Formatting", 15),
+]
+
+EVALUATION_SYSTEM_PROMPT = ChatPromptTemplate.from_template("""Please act as both a **biomedical researcher** and an **experienced software developer** to evaluate the documentation quality of a GitHub repository using the evaluation criteria below.
+
+### **Evaluation Criteria (Total: 100 points)**
+
+1. **Clarity & Readability (20 points)** - Is the documentation written in a clear, concise, and easy-to-understand manner?
+2. **Completeness (20 points)** - Does the documentation cover all essential information needed for understanding, usage, and further development?
+3. **Organization & Navigation (10 points)** - Is the structure logical and easy to navigate? Are key sections easy to find?
+4. **Examples & Tutorials (10 points)** - Are there sufficient examples or tutorials to help users get started and understand core functionality?
+5. **Maintainability & Updates (15 points)** - Does the documentation reflect ongoing maintenance and version history (e.g., changelogs, version tags)?
+6. **Accessibility & Formatting (15 points)** - Is the documentation well-formatted and easy to read (e.g., Markdown formatting, appropriate use of code blocks, headers, etc.)?
+### **Repository Structure Overview**
+_(f = file, d = directory)_
+```
+{repository_structure}
+```""")
+
+EVALUATION_ITEM_PROMPT = ChatPromptTemplate.from_template("""Here is the content of the files or directories in the repository that you need to take into account:
+{files_or_directories}
+
+### **Instructions**
+
+Let's begin by evaluating **Criterion {evaluation_item}**.
+
+- If the information provided is **sufficient**, please proceed with your evaluation using the following format:
+```
+{evaluation_item} ({score_point} points)
+a. Score: [score out of {score_point}]
+b. Reason: [brief explanation justifying the score]
+```
+- If the information provided is **insufficient**, do **not** attempt to evaluate. Instead, list the specific files or directories for which you need more detail, using the format below:
+```
+[files/directories needed for evaluation]
+```""")
+
+
+## goal: identify project type
+IDENTIFICATION_GOAL_PROJECT_TYPE = """Identify the following key attribute of the repository:
+**project type**: The primary functional type of the project.
+Options and their definitions:
+- **package**: A reusable Python or R library intended to be imported by other software.
+- **application**: A standalone Python or R program that can be directly executed by users.
+- **pipeline**: A biomedical data processing workflow that integrates multiple tools or steps.
+- **unknown type**: Use this only if the type cannot be determined reliably from available information.
+**Notes**:
+1. The project can be identified as one of the above project types.
+2. The project may serve as multiple project types, such as package & pipeline or standalone application & package;
+however, you need to investigate closely to find out the primary project type.
+3. Do **not** rely heavily on directories like 'benchmark/' or 'tests/' when determining the project type, as they are often auxiliary."""
+
+## goal: identify primary language
+IDENTIFICATION_GOAL_PRIMARY_LANGUAGE = """Identify the following key attribute of the repository:
+**primary language**: The primary language of the project.
+Options and their definitions:
+- **python**: Python language
+- **R**: R language
+- **unknown type**: Use this only if the type cannot be determined reliably from available information.
+**Notes**:
+The project can be identified as one of the above primary languages."""
+
+## goal: identify metadata: repo name, owner, description, license
+IDENTIFICATION_GOAL_META_DATA = """Identify the following metadata of the repository:
+**name**: The repository name.
+**owner**: The repository user or organization.
+**description**: The description of the repository.
+**license**: The license of the repository, like 'MIT', 'Apache 2.0' or 'unknown'.
+
+**Notes**: If the above metadata can't be identified, please return 'unknown' or 'N/A'.
+"""
+
+COT_USER_INSTRUCTION = "Do not give the answer immediately. First, explain your reasoning process step by step, then provide the answer."
+
+class CollectionGoalItemEnum(Enum):
+    UserGuide = "User Guide"
+    Tutorial = "Tutorials & Vignettes"
+    DockerGeneration = "Docker Generation"
+    Installation = "Installation"
+    License = "License"
+    Contributing = "Contributing"
+
+
+COLLECTION_GOAL = """Your goal is to collect the names of all files that are relevant to **{goal_item}**.
+**Note:**
+- You only need to collect the **file names**, not their contents."""
+
+COLLECTION_PROMPTS = {
+    "UserGuide": {
+        "goal_item": "User Guide",
+        "related_file_description": """A document qualifies as a **User Guide** if it includes **at least one** of the following elements.
+If **any one** of these is present, the document should be classified as a User Guide — full coverage is **not required**:
+- Overview: A brief introduction to the software, its purpose, and its intended audience.
+- Installation Instructions: Step-by-step setup procedures.
+- Input/Output Specifications: Detailed information on the data the software accepts and produces.
+- Configuration Options: Descriptions of settings and parameters that can be adjusted.
+- Function/Interface Listings: Comprehensive lists of available functions or interfaces, including their descriptions, parameters, and return values.
+- Mathematical Equations/Numerical Methods: Embedded documentation explaining the underlying mathematical concepts or algorithms.
+- Developer Guidance: Instructions on how to extend the software or contribute to its development.
+**Do not** classify the document as a User Guide if it primarily serves as a Tutorial or Example. Such documents typically include:
+- Sample Datasets: Example data used to illustrate functionality.
+- Narrative Explanations: Story-like descriptions guiding the user through examples.
+- Code Walkthroughs: Detailed explanations of code snippets in a tutorial format.
+**Do not** classify the document as a User Guide if it is source code or a script (*.py, *.R) that is not intended for end-user interaction.
+- You can include directory names if all files in the directory are relevant to the goal item.""",
+    },
+    "Tutorial": {
+        "goal_item": "Tutorials & Vignettes",
+        "related_file_description": """
+**Tutorials and Vignettes** are instructional documents or interactive notebooks that provide step-by-step guidance on using a software package or library. They typically include:
+- Code Examples: Practical code snippets demonstrating how to use the software's features and functions.
+- Explanatory Text: Clear explanations accompanying the code examples to help users understand the concepts and techniques being presented.
+- Visualizations: Graphical representations of data or results to enhance understanding.
+- Interactive Elements: Features that allow users to experiment with the code in real time, such as Jupyter notebooks or R Markdown files.
+- Use Cases: Real-world applications or scenarios where the software can be applied effectively.
+- You can include directory names if all files in the directory are relevant to the goal item.
+""",
+    },
+    "DockerGeneration": {
+        "goal_item": "Generating a Dockerfile for reproducibility testing",
+
+        "related_file_description": """A document qualifies as **Dockerfile Generation** related if it includes **at least one** of the following elements.
+If **any one** of these is present, the document should be classified as Dockerfile Generation related — full coverage is **not required**:
+- Existing Docker Configuration
+  * Files like `Dockerfile`, `docker-compose.yml`, or any Docker-related build scripts.
+- Installation & Environment Setup
+  * Files used to define or install dependencies.
+  * Examples: `README.md`, `requirements.txt`, `environment.yml`, `setup.py`, `install.R`, `DESCRIPTION`, `pyproject.toml`, etc.
+- Build/Runtime Scripts
+  * Shell or batch scripts used for setup, building, or launching the application.
+  * Examples: `install.sh`, `build.sh`, `run.sh`, etc.
+- Minimal Code Examples or Get-Started Files
+  * Files that demonstrate a minimal working example of the software (e.g., for testing or reproducing results).
+  * Examples: `example.py`, `main.py`, `demo.R`, `notebooks/get_started.ipynb`, etc.
+  * These should be runnable with minimal configuration.""",
+
+        "important_instructions": """- Only include minimal code examples that demonstrate basic functionality.
+If multiple example files are found, select only the simplest and most lightweight one that is sufficient to verify the repository works.
+- Give priority to analyzing files whose names include **"install"** or **"Dockerfile"**, as these are most likely to be useful for generating our Dockerfile.
+- The total number of collected files should **not exceed 5**.
+- Make sure to include **only one code example**, selecting the most minimal and representative one.
+""",
+    },
+    "Installation": {
+        "goal_item": "Installation Instructions",
+        "related_file_description": """A document qualifies as **Installation Instructions** if it includes **at least one** of the following elements.
+If **any one** of these is present, the document should be classified as Installation Instructions — full coverage is **not required**:
+- Step-by-step setup procedures for the software.
+- Prerequisites or dependencies that need to be installed before using the software.
+- Configuration steps required to get the software running.
+- Troubleshooting tips related to installation issues.
+- You can include directory names if all files in the directory are relevant to the goal item.""",
+        "important_instructions": """- Give priority to analyzing README files that contain installation instructions and files whose names include **"install"** or **"setup"**.
+- If multiple files are found, select the most comprehensive one that covers the installation process.
+- The total number of collected files should **not exceed 3**.
+- Make sure to include **only one installation instruction file**, selecting the most comprehensive and representative one.
+""",
+    },
+    "License": {
+        "goal_item": "License Information",
+        "related_file_description": """A document qualifies as **License Information** if it includes **at least one** of the following elements.
+If **any one** of these is present, the document should be classified as License Information — full coverage is **not required**:
+- A file named `LICENSE`, `LICENSE.txt`, or similar that explicitly states the software's license.
+- A section in the README or documentation that describes the licensing terms.
+- Any file that contains legal information regarding the use, distribution, or modification of the software.
+- You can include directory names if all files in the directory are relevant to the goal item.""",
+    },
+    "Contributing": {
+        "goal_item": "Contributing Guidelines",
+        "related_file_description": """A document qualifies as **Contributing Guidelines** if it includes **at least one** of the following elements.
+If **any one** of these is present, the document should be classified as Contributing Guidelines — full coverage is **not required**:
+- A file named `CONTRIBUTING.md`, `CONTRIBUTING.rst`, or similar that provides guidelines for contributing to the project.
+- A section in the README or documentation that outlines how to contribute, report issues, or submit pull requests.
+- Any file that contains instructions for developers on how to contribute to the project, including coding standards, testing procedures, and submission processes.
+- You can include directory names if all files in the directory are relevant to the goal item.""",
+    },
+}
+
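
As a quick orientation (not shipped in the wheel), the two evaluation templates above are ordinary LangChain `ChatPromptTemplate`s, so rendering them only requires supplying the placeholder values; the repository-structure string below is fabricated and uses the `f`/`d` legend from the system prompt.

```python
from bioguider.agents.prompt_utils import (
    EVALUATION_ITEMS,
    EVALUATION_SYSTEM_PROMPT,
    EVALUATION_ITEM_PROMPT,
)

# Fabricated "path - type" structure matching the prompt's (f = file, d = directory) legend.
repo_structure = "README.md - f\ndocs - d\nbioguider/rag/rag.py - f"

system_msgs = EVALUATION_SYSTEM_PROMPT.format_messages(repository_structure=repo_structure)

item, score = EVALUATION_ITEMS[0]  # ("1. Clarity & Readability", 20)
item_msgs = EVALUATION_ITEM_PROMPT.format_messages(
    files_or_directories="README.md:\n# bioguider\n(placeholder file content)",
    evaluation_item=item,
    score_point=score,
)

print(system_msgs[0].content[:120])
print(item_msgs[0].content[:120])
```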
bioguider/agents/python_ast_repl_tool.py
@@ -0,0 +1,69 @@
+
+from pydantic import PrivateAttr
+import re
+import io
+import contextlib
+import logging
+from langchain_experimental.tools.python.tool import PythonAstREPLTool
+
+class CustomPythonAstREPLTool(PythonAstREPLTool):
+    """
+    Custom Python REPL tool that executes Python code and captures output.
+    This tool is designed to be used in a LangChain agent for executing Python code
+    and capturing the output, including any print statements.
+    """
+    __name__ = "Custom_Python_AST_REPL"
+    _exec_globals: dict = PrivateAttr()
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._exec_globals = {}
+        self._exec_globals.update(__builtins__)
+
+    def _set_globals(self, table_dict=None):
+        self._exec_globals = {}
+        self._exec_globals.update(__builtins__)
+
+        if table_dict is not None:
+            self._exec_globals.update(table_dict)
+
+    def _run(self, query: str, run_manager=None):
+        print("================================== code here ==============================")
+        print(query)
+        print("===========================================================================")
+        code_match = re.search(r"```(.*?)```", query, re.DOTALL)
+        if code_match:
+            # Extract code within backticks
+            code = code_match.group(1)
+        else:
+            code = query
+        code = code.strip()
+        if code.startswith("python"):
+            code = code[len("python"):].lstrip()
+
+        if code.endswith("Observation"):
+            code = code[:-len("Observation")].rstrip()
+
+        code_lines = code.strip().split('\n')
+        code = '\n'.join(code_lines[:-1])  # avoid printing the last line twice
+        last_line = code_lines[-1]
+
+        output_capture = io.StringIO()
+        with contextlib.redirect_stdout(output_capture), contextlib.redirect_stderr(output_capture):
+            if logging.getLogger().handlers:
+                logging.getLogger().handlers[0].stream = output_capture
+            try:
+                exec(code, self._exec_globals)
+                try:
+                    # Echo the value of the last line, REPL-style, if it is an expression
+                    result = eval(last_line, self._exec_globals)
+                    if result is not None:
+                        print(result, file=output_capture)
+                except Exception:
+                    # The last line is a statement rather than an expression; run it instead
+                    exec(last_line, self._exec_globals)
+            except Exception as e:
+                return str(e)
+
+        # Retrieve the output and return it
+        output = output_capture.getvalue()
+        return output if output else "Execution completed without output."
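
Finally, a hedged example of calling this tool directly (not part of the release): `run()` is inherited from LangChain's `BaseTool`, the fenced block plus trailing `Observation` mimics what an agent would emit, and `_run` strips both before executing and returns the captured output.

```python
from bioguider.agents.python_ast_repl_tool import CustomPythonAstREPLTool

tool = CustomPythonAstREPLTool()

# Agent-style wrapping (``` fences, "python" hint, trailing "Observation") is stripped;
# prints and the value of the final expression are captured and returned.
query = "```python\nx = 6 * 7\nprint('computing')\nx\n```\nObservation"
print(tool.run(query))  # expected: "computing\n42\n" (the debug banner goes to real stdout)
```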