bioguider 0.2.11__tar.gz → 0.2.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release
This version of bioguider might be problematic; details are available on the package's registry page.
- {bioguider-0.2.11 → bioguider-0.2.13}/PKG-INFO +1 -1
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/agent_task.py +8 -4
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/agent_tools.py +17 -14
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/agent_utils.py +40 -4
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/collection_observe_step.py +7 -5
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/collection_plan_step.py +9 -7
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/collection_task.py +15 -5
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/collection_task_utils.py +46 -15
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/dockergeneration_task.py +1 -1
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/evaluation_installation_task.py +30 -8
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/evaluation_readme_task.py +30 -4
- bioguider-0.2.13/bioguider/agents/evaluation_submission_requirements_task.py +153 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/evaluation_task.py +19 -6
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/identification_observe_step.py +7 -1
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/identification_plan_step.py +6 -1
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/identification_task.py +23 -4
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/identification_task_utils.py +2 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/prompt_utils.py +44 -4
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/managers/evaluation_manager.py +38 -46
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/utils/constants.py +2 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/pyproject.toml +1 -1
- {bioguider-0.2.11 → bioguider-0.2.13}/LICENSE +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/README.md +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/__init__.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/__init__.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/collection_execute_step.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/common_agent.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/common_agent_2step.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/common_step.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/dockergeneration_execute_step.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/dockergeneration_observe_step.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/dockergeneration_plan_step.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/dockergeneration_task_utils.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/identification_execute_step.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/peo_common_step.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/python_ast_repl_tool.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/agents/rag_collection_task.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/conversation.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/database/summarized_file_db.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/rag/__init__.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/rag/config.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/rag/data_pipeline.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/rag/embedder.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/rag/rag.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/settings.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/utils/default.gitignore +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/utils/file_utils.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/utils/gitignore_checker.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/utils/pyphen_utils.py +0 -0
- {bioguider-0.2.11 → bioguider-0.2.13}/bioguider/utils/utils.py +0 -0
--- bioguider-0.2.11/bioguider/agents/agent_task.py
+++ bioguider-0.2.13/bioguider/agents/agent_task.py
@@ -13,7 +13,12 @@ class AgentTask(ABC):
     A class representing a step in an agent's process.
     """
 
-    def __init__(
+    def __init__(
+        self,
+        llm: BaseChatOpenAI,
+        step_callback: Callable | None = None,
+        summarized_files_db: SummarizedFilesDb | None = None,
+    ):
         """
         Initialize the AgentStep with a language model and a callback function.
 
@@ -23,7 +28,7 @@ class AgentTask(ABC):
         """
         self.llm = llm
         self.step_callback = step_callback
-        self.
+        self.summarized_files_db = summarized_files_db
         self.graph: CompiledGraph | None = None
 
     def _print_step(
@@ -45,7 +50,7 @@ class AgentTask(ABC):
             token_usage=token_usage,
         )
 
-    def compile(self, repo_path: str, gitignore_path: str,
+    def compile(self, repo_path: str, gitignore_path: str, **kwargs):
         """
         Compile the agent step with the given repository and gitignore paths.
 
@@ -55,7 +60,6 @@ class AgentTask(ABC):
         **kwargs: derived class may pass more arguments to implmented _compile(), that is,
             what **kwargs is depends on derived class
         """
-        self.summary_file_db = db
         self._compile(repo_path, gitignore_path, **kwargs)
 
     @abstractmethod
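Reading these hunks together: the summarized-files database is now injected once through the constructor and stored as `self.summarized_files_db`, instead of being handed to `compile()`. A hedged sketch of what a caller looks like under the new signature (the derived class below is hypothetical and assumes `_compile()` is the only abstract hook implied by the `compile()` body above):

```python
# Minimal sketch, not part of the package: MyCollectionLikeTask is illustrative.
from langchain_openai import ChatOpenAI

from bioguider.agents.agent_task import AgentTask


class MyCollectionLikeTask(AgentTask):
    def _compile(self, repo_path: str, gitignore_path: str, **kwargs):
        # A real task would build its LangGraph here. The summary cache is
        # already on the instance, so compile() no longer needs a `db` kwarg.
        self.graph = None  # placeholder


task = MyCollectionLikeTask(
    llm=ChatOpenAI(model="gpt-4o-mini"),   # any BaseChatOpenAI subclass
    step_callback=print,
    summarized_files_db=None,              # or a SummarizedFilesDb instance
)
task.compile(repo_path="/path/to/repo", gitignore_path="/path/to/repo/.gitignore")
```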
--- bioguider-0.2.11/bioguider/agents/agent_tools.py
+++ bioguider-0.2.13/bioguider/agents/agent_tools.py
@@ -1,4 +1,5 @@
 import os
+import logging
 from typing import Callable
 from markdownify import markdownify as md
 from langchain_openai.chat_models.base import BaseChatOpenAI
@@ -7,6 +8,8 @@ from bioguider.utils.file_utils import get_file_type
 from bioguider.agents.agent_utils import read_directory, read_file, summarize_file
 from bioguider.rag.data_pipeline import count_tokens
 
+logger = logging.getLogger(__name__)
+
 class agent_tool:
     def __init__(
         self,
@@ -53,19 +56,12 @@ Returns:
 class summarize_file_tool(agent_tool):
     """ Read a file and generate a summary according to a specified prompt.
 
-
-
-
-        Path to the file to read.
-    summarize_prompt : str, optional
-        Instruction guiding the summarization focus (default is "N/A").
-        Use this to emphasize specific aspects of the content.
+    Args:
+        file_path str: required. The file path to read.
+        summarize_prompt str: optional. A string instruction guiding the summarization focus (default is "N/A"). Use this to emphasize specific aspects of the content.
 
-    Returns
-
-    str or None
-        A summarized version of the file content.
-        Returns None if the file does not exist or cannot be read.
+    Returns:
+        str or None: A summarized version of the file content. Returns None if the file does not exist or cannot be read.
     """
     def __init__(
         self,
@@ -124,8 +120,15 @@ Returns
         if summarized_content is not None:
             return f"summarized content of file {file_path}: " + summarized_content
 
-
-
+        try:
+            file_content = read_file(abs_file_path)
+            file_content = file_content.replace("{", "{{").replace("}", "}}")
+        except UnicodeDecodeError as e:
+            logger.error(str(e))
+            return f"{file_path} is a binary, can't be summarized."
+        except Exception as e:
+            logger.error(str(e))
+            return f"Failed to read {file_path}."
         summarized_content, token_usage = summarize_file(
             self.llm, abs_file_path, file_content, self.detailed_level,
             summary_instructions=self.summarize_instruction,
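One detail worth calling out in the new `run()` body: the file content is brace-escaped before it flows into `summarize_file`, because the content is later embedded in a LangChain prompt template where a bare `{...}` is parsed as a template variable. A small self-contained illustration of the failure mode and the escape (the sample string is made up):

```python
from langchain_core.prompts import ChatPromptTemplate

raw = 'config = {"name": "bioguider"}'   # file content containing literal braces

# Un-escaped braces are treated as a template field and formatting raises
# (a KeyError for f-string style templates), so the braces are doubled first:
escaped = raw.replace("{", "{{").replace("}", "}}")

prompt = ChatPromptTemplate.from_template("Summarize this file:\n" + escaped)
print(prompt.format())   # the literal braces come back out intact
```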
--- bioguider-0.2.11/bioguider/agents/agent_utils.py
+++ bioguider-0.2.13/bioguider/agents/agent_utils.py
@@ -16,11 +16,12 @@ from langchain.tools import BaseTool
 from langchain.schema import AgentAction, AgentFinish
 from langchain.agents import AgentOutputParser
 from langgraph.prebuilt import create_react_agent
+from langchain_community.callbacks.openai_info import OpenAICallbackHandler
 import logging
 
 from pydantic import BaseModel, Field
 
-from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
+from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, MAX_FILE_LENGTH, MAX_SENTENCE_NUM
 from bioguider.utils.file_utils import get_file_type
 from ..utils.gitignore_checker import GitignoreChecker
 from ..database.summarized_file_db import SummarizedFilesDb
@@ -178,8 +179,7 @@ Here is the file content:
 Now, let's start to summarize.
 """)
 
-
-MAX_SENTENCE_NUM=20
+
 def summarize_file(
     llm: BaseChatOpenAI,
     name: str,
@@ -379,6 +379,20 @@ def escape_braces(text: str) -> str:
     text = re.sub(r'(?<!{){(?!{)', '{{', text)
     return text
 
+STRING_TO_OBJECT_SYSTEM_PROMPT = """
+You are an expert to understand data. You will be provided a text, and your task is to extracted structured data from the provided text.
+
+---
+
+### **Instructions**
+1. If no structured data can be extracted, return None
+
+---
+
+### **Input Text**
+{input_text}
+"""
+
 def try_parse_json_object(json_obj: str) -> dict | None:
     json_obj = json_obj.strip()
 
@@ -406,4 +420,26 @@ def try_parse_json_object(json_obj: str) -> dict | None:
         return None
     except Exception as e:
         logger.error(e)
-        return None
+        return None
+
+def try_parse_with_llm(llm: BaseChatOpenAI, input_text: str, schema: any):
+    system_prompt = ChatPromptTemplate.from_template(
+        STRING_TO_OBJECT_SYSTEM_PROMPT
+    ).format(input_text=input_text)
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", system_prompt)
+    ])
+    agent = prompt | llm.with_structured_output(schema)
+    callback_handler = OpenAICallbackHandler()
+
+    try:
+        res = agent.invoke(
+            input={},
+            config={
+                "callbacks": [callback_handler],
+            },
+        )
+        return res, vars(callback_handler)
+    except Exception as e:
+        logger.error(e)
+        return None
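The new `try_parse_with_llm` helper is the structured-output fallback for when `try_parse_json_object` cannot parse the model's free-form answer: it routes the text through `with_structured_output(schema)` and captures token accounting with an `OpenAICallbackHandler`. A hedged usage sketch (the `InstallationSummary` schema and the model name are illustrative, not part of the package):

```python
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI

from bioguider.agents.agent_utils import try_parse_with_llm


class InstallationSummary(BaseModel):          # illustrative schema
    install_available: bool = Field(description="Is an installation section present?")
    overall_score: str = Field(description="Poor / Fair / Good / Excellent")


llm = ChatOpenAI(model="gpt-4o-mini")
result = try_parse_with_llm(
    llm,
    input_text="**Installation Available:** Yes\n**Overall Score:** Good",
    schema=InstallationSummary,
)
if result is not None:                         # the helper returns None on any failure
    parsed, usage = result                     # (schema instance, vars(OpenAICallbackHandler))
    print(parsed.overall_score, usage.get("total_tokens"))
```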
--- bioguider-0.2.11/bioguider/agents/collection_observe_step.py
+++ bioguider-0.2.13/bioguider/agents/collection_observe_step.py
@@ -5,7 +5,7 @@ from langchain_openai.chat_models.base import BaseChatOpenAI
 from langchain_core.prompts import ChatPromptTemplate
 from bioguider.agents.agent_utils import ObservationResult
 from bioguider.agents.collection_task_utils import CollectionWorkflowState
-from bioguider.agents.common_agent_2step import CommonAgentTwoSteps
+from bioguider.agents.common_agent_2step import CommonAgentTwoChainSteps, CommonAgentTwoSteps
 from bioguider.agents.peo_common_step import PEOCommonStep
 from bioguider.agents.prompt_utils import COLLECTION_GOAL, COLLECTION_PROMPTS
 
@@ -34,11 +34,13 @@ Here is the 2-level file structure of the repository (`f` = file, `d` = director
 
 * Provide your reasoning under **Analysis**
 * Then list all relevant files and folders under **FinalAnswer**
+* **FinalAnswer** format must exactly match this format:
+  **FinalAnswer**: {{"final_answer": [<file path>, <file path>, <file path>, ...]}}
 * Be sure to include the **full relative paths** with respect to the repository root.
-* Your answer **must
+* Your answer **must exactly match the follwing format** (note: no JSON code block, no additional comments), **do not** make up anything:
 
 ```
-**Analysis**: your analysis here
+**Analysis**: your analysis here
 **FinalAnswer**: {{"final_answer": ["path/to/file1", "path/to/file2", ...]}}
 ```
 4. If you believe **more files still need to be collected**:
@@ -80,8 +82,8 @@ class CollectionObserveStep(PEOCommonStep):
         repo_structure = self.repo_structure
         intermediate_steps = self._build_intermediate_steps(state)
         prompt = ChatPromptTemplate.from_template(COLLECTION_OBSERVE_SYSTEM_PROMPT)
-        important_instructions = "N/A" if "
-            else collection_item["
+        important_instructions = "N/A" if "observe_important_instructions" not in collection_item or len(collection_item["observe_important_instructions"]) == 0 \
+            else collection_item["observe_important_instructions"]
         return prompt.format(
             goal_item_desc=goal_item_desc,
             related_file_description=collection_item["related_file_description"],
--- bioguider-0.2.11/bioguider/agents/collection_plan_step.py
+++ bioguider-0.2.13/bioguider/agents/collection_plan_step.py
@@ -8,7 +8,7 @@ from bioguider.agents.agent_utils import (
     PlanAgentResultJsonSchema,
     PlanAgentResult,
 )
-from bioguider.agents.common_agent_2step import CommonAgentTwoSteps
+from bioguider.agents.common_agent_2step import CommonAgentTwoChainSteps, CommonAgentTwoSteps
 from bioguider.agents.peo_common_step import PEOCommonStep
 from bioguider.agents.collection_task_utils import CollectionWorkflowState
 from bioguider.agents.prompt_utils import COLLECTION_GOAL, COLLECTION_PROMPTS
@@ -57,7 +57,9 @@ Here are the results from previous steps:
 
 3. You may use the `read_directory` tool to explore directory contents, but avoid using it in the first step unless necessary.
 
-4.
+4. Your plan can only use the above tools, **do not** make up any tools not in the above tools list.
+
+5. Your planned step input file or input directory must come from the above repository files structure, **do not** make up file name or directory name.
 
 ---
 
@@ -65,12 +67,12 @@ Here are the results from previous steps:
 {important_instructions}
 
 ### **Output Format**
-Your plan
+Your plan **must exactly match** a sequence of steps in the following format, **do not** make up anything:
 
-Step: <tool name> # Tool name must be one of {tool_names}
+Step: <tool name> # Tool name **must be one** of {tool_names}
 Step Input: <file or directory name>
 
-Step: <tool name>
+Step: <tool name> # Tool name **must be one** of {tool_names}
 Step Input: <file or directory name>
 ...
 """)
@@ -105,8 +107,8 @@ class CollectionPlanStep(PEOCommonStep):
         step_analysis, step_thoughts = self._build_intermediate_analysis_and_thoughts(state)
         goal = ChatPromptTemplate.from_template(COLLECTION_GOAL).format(goal_item=collection_item["goal_item"])
         related_file_description = collection_item["related_file_description"]
-        important_instructions="N/A" if "
-            else collection_item["
+        important_instructions="N/A" if "plan_important_instructions" not in collection_item or len(collection_item["plan_important_instructions"]) == 0 \
+            else collection_item["plan_important_instructions"]
         tool_names, tools_desc = get_tool_names_and_descriptions(self.custom_tools)
         system_prompt = COLLECTION_PLAN_SYSTEM_PROMPT.format(
             goal=goal,
--- bioguider-0.2.11/bioguider/agents/collection_task.py
+++ bioguider-0.2.13/bioguider/agents/collection_task.py
@@ -50,9 +50,12 @@ class CollectionTask(AgentTask):
     def __init__(
         self,
         llm: BaseChatOpenAI,
-        step_callback: Callable | None = None
+        step_callback: Callable | None = None,
+        summarize_instruction: str | None = "N/A",
+        summarized_files_db: SummarizedFilesDb | None = None,
+        provided_files: list[str] | None = None,
     ):
-        super().__init__(llm, step_callback)
+        super().__init__(llm, step_callback, summarized_files_db=summarized_files_db)
         self.repo_path: str | None = None
         self.gitignore_path: str | None = None
         self.repo_structure: str | None = None
@@ -60,6 +63,8 @@ class CollectionTask(AgentTask):
         self.steps: list[PEOCommonStep] = []
         self.tools: list[any] | None = None
         self.custom_tools: list[Tool] | None = None
+        self.summarize_instruction = summarize_instruction
+        self.provided_files = provided_files
 
     def _prepare_tools(self, related_file_goal_item_desc):
         tool_rd = read_directory_tool(repo_path=self.repo_path)
@@ -67,7 +72,8 @@ class CollectionTask(AgentTask):
             llm=self.llm,
             repo_path=self.repo_path,
             output_callback=self.step_callback,
-            db=self.
+            db=self.summarized_files_db,
+            summaize_instruction=self.summarize_instruction,
         )
         tool_rf = read_file_tool(repo_path=self.repo_path)
         tool_cf = check_file_related_tool(
@@ -75,6 +81,8 @@ class CollectionTask(AgentTask):
             repo_path=self.repo_path,
             goal_item_desc=related_file_goal_item_desc,
             output_callback=self.step_callback,
+            summarize_instruction=self.summarize_instruction,
+            summarized_files_db=self.summarized_files_db,
         )
         self.tools = [tool_rd, tool_sum, tool_rf, tool_cf]
         self.custom_tools = [
@@ -99,13 +107,15 @@ class CollectionTask(AgentTask):
                 description=tool_cf.__class__.__doc__,
             ),
         ]
-        self.custom_tools.append(CustomPythonAstREPLTool())
+        # self.custom_tools.append(CustomPythonAstREPLTool())
 
     def _initialize(self):
         # initialize the 2-level file structure of the repo
         if not os.path.exists(self.repo_path):
             raise ValueError(f"Repository path {self.repo_path} does not exist.")
-        files =
+        files = self.provided_files
+        if files is None:
+            files = read_directory(self.repo_path, os.path.join(self.repo_path, ".gitignore"))
         file_pairs = [(f, get_file_type(os.path.join(self.repo_path, f)).value) for f in files]
         self.repo_structure = ""
         for f, f_type in file_pairs:
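Taken together, a caller can now pin the candidate file list and share both the summary cache and a summarization focus across the collection tools. A hedged construction sketch based on the signatures above (paths and the file list are placeholders; `collect()` and the `goal_item` kwarg mirror the `_collect_files` usage shown later in evaluation_installation_task.py):

```python
from pathlib import Path
from langchain_openai import ChatOpenAI

from bioguider.agents.collection_task import CollectionTask
from bioguider.agents.prompt_utils import CollectionGoalItemEnum

repo_path = "/path/to/repo"                              # placeholder
task = CollectionTask(
    llm=ChatOpenAI(model="gpt-4o-mini"),
    step_callback=print,
    summarize_instruction="Focus on installation instructions.",
    summarized_files_db=None,                            # or a shared SummarizedFilesDb
    provided_files=["README.md", "docs/install.md"],     # skips read_directory() in _initialize()
)
task.compile(
    repo_path=repo_path,
    gitignore_path=str(Path(repo_path, ".gitignore")),
    goal_item=CollectionGoalItemEnum.Installation.name,
)
files = task.collect()
```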
--- bioguider-0.2.11/bioguider/agents/collection_task_utils.py
+++ bioguider-0.2.13/bioguider/agents/collection_task_utils.py
@@ -4,13 +4,17 @@ from langchain.prompts import ChatPromptTemplate
 from langchain_openai.chat_models.base import BaseChatOpenAI
 from langchain_core.messages import AIMessage
 from pydantic import BaseModel, Field
+import logging
 
 from bioguider.agents.agent_tools import agent_tool
 from bioguider.agents.agent_utils import read_file, summarize_file
 from bioguider.agents.peo_common_step import PEOWorkflowState
 from bioguider.agents.common_agent import CommonAgent
 from bioguider.agents.common_agent_2step import CommonAgentTwoSteps
+from bioguider.database.summarized_file_db import SummarizedFilesDb
+from bioguider.utils.constants import MAX_FILE_LENGTH
 
+logger = logging.getLogger(__name__)
 
 class CollectionWorkflowState(TypedDict):
     llm: Optional[BaseChatOpenAI]
@@ -46,20 +50,22 @@ Does this file appear to contain related information?
 
 ---
 
-### **Output Format:**
-Respond with
-
+### **Output Format:**
+Respond with exactly two parts:
+1. A single word: Yes or No (indicating if the file meets the goal criteria)
+2. One brief explanatory sentence.
+For example: Yes. This file is a compiled binary file, so, it is related to the compiled standalone file (goal item).
 """)
 
 class CheckFileRelatedResult(BaseModel):
-    is_related:
+    is_related: str = Field(description="A string conclusion specify if the provided file is related. The string value contains two parts:\n 1. A single word: Yes or No (indicating if the file meets the goal criteria).\n 2. One brief explanatory sentence.")
 
 class check_file_related_tool(agent_tool):
     """ Check if the file is related to the goal item
     Args:
         file_path str: file path
     Returns:
-
+        str: A string conclusion. The string conclusion contains two parts:\n 1. A single word: Yes or No (indicating if the file meets the goal criteria).\n 2. One brief explanatory sentence.
     """
     def __init__(
         self,
@@ -67,23 +73,51 @@ Returns:
         repo_path: str,
         goal_item_desc: str,
         output_callback: Callable | None = None,
+        summarize_instruction: str | None = None,
+        summarize_level: int | None = 6,
+        summarized_files_db: SummarizedFilesDb | None = None,
     ):
         super().__init__(llm=llm, output_callback=output_callback)
         self.repo_path = repo_path
         self.goal_item_desc = goal_item_desc
+        self.summarize_instruction = summarize_instruction \
+            if summarize_instruction is not None else "N/A"
+        self.summarize_level = summarize_level
+        self.summarized_files_db = summarized_files_db
 
     def run(self, file_path: str) -> str:
         if not self.repo_path in file_path:
             file_path = os.path.join(self.repo_path, file_path)
         if not os.path.isfile(file_path):
             return "Can't read file"
-
-
+
+        check_prompts = None
+        try:
+            file_content = read_file(file_path)
+        except UnicodeDecodeError as e:
+            logger.error(str(e))
+            check_prompts = "Can't summarize binary file, please decide according to file name and extension."
+        except Exception as e:
+            logger.error(str(e))
+            check_prompts = "Failed to summarize file, please decide according to file name and extension."
+        if check_prompts is None and file_content is None:
             return "Failed to read file"
-
-
-
-
+        if check_prompts is not None:
+            summarized_content = check_prompts
+        else:
+            if len(file_content) > MAX_FILE_LENGTH:
+                file_content = file_content[:MAX_FILE_LENGTH]
+            summarized_content, token_usage = summarize_file(
+                llm=self.llm,
+                name=file_path,
+                content=file_content,
+                level=self.summarize_level,
+                summary_instructions=self.summarize_instruction,
+                db=self.summarized_files_db,
+            )
+            if summarized_content is None:
+                return "Failed to summarize file"
+            self._print_token_usage(token_usage)
 
         prompt = CHECK_FILE_RELATED_USER_PROMPT.format(
             goal_item_desc=self.goal_item_desc,
@@ -102,8 +136,5 @@ Returns:
 
         self._print_step_output(step_output=reasoning)
         self._print_token_usage(token_usage)
-
-            return "Yes, the file is related to the goal item."
-        else:
-            return "No, the file **is not** related to the goal item."
+        return res.is_related
 
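With this change `check_file_related_tool` degrades gracefully on binary or unreadable files (it asks the model to judge from the file name and extension) and returns the model's own `is_related` string instead of a canned Yes/No message. A hedged usage sketch (paths, goal description, and the summary cache are placeholders):

```python
from langchain_openai import ChatOpenAI

from bioguider.agents.collection_task_utils import check_file_related_tool

tool = check_file_related_tool(
    llm=ChatOpenAI(model="gpt-4o-mini"),
    repo_path="/path/to/repo",                        # placeholder
    goal_item_desc="files describing how to install the package",
    summarize_instruction="Focus on installation steps.",
    summarize_level=6,
    summarized_files_db=None,                         # or a shared SummarizedFilesDb
)
# Expected shape of the answer: "Yes. <one sentence>" or "No. <one sentence>".
print(tool.run("docs/install.md"))
```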
--- bioguider-0.2.11/bioguider/agents/evaluation_installation_task.py
+++ bioguider-0.2.13/bioguider/agents/evaluation_installation_task.py
@@ -9,7 +9,8 @@ from pydantic import BaseModel, Field
 from markdownify import markdownify as md
 
 from bioguider.agents.agent_utils import read_file
-from bioguider.agents.
+from bioguider.agents.collection_task import CollectionTask
+from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION, CollectionGoalItemEnum
 from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
 from bioguider.rag.data_pipeline import count_tokens
 from .common_agent_2step import CommonAgentTwoSteps, CommonAgentTwoChainSteps
@@ -29,17 +30,20 @@ Your task is to analyze the provided files related to installation and generate
 
 ### **Evaluation Criteria**
 
-1. **Installation Available**: Is the installation
+1. **Installation Available**: Is the installation section in document (like README.md or INSTALLATION)?
    * Output: `Yes` or `No`
 
-2. **Installation Tutorial**: Is the installation tutorial provided?
+2. **Installation Tutorial**: Is the step-by-step installation tutorial provided?
    * Ouput: `Yes` or `No`
 
 3. **Number of required Dependencies Installation**: The number of dependencies that are required to install
    * Output: Number
    * Suggest specific improvements if necessary, such as missing dependencies
 
-4. **
+4. **Compatible Operating System**: Is the compatible operating system described?
+   * Output: `Yes` or `No`
+
+5. **Overall Score**: Give an overall quality rating of the Installation information.
    * Output: `Poor`, `Fair`, `Good`, or `Excellent`
 
 ---
@@ -53,6 +57,7 @@ Your final report must **exactly match** the following format. Do not add or omi
 **Dependency:**
 * number: [Number]
 * suggestions: <suggestion to improve **dependency information** like missing dependencies
+**Compatible Operating System:** [Yes / No]
 **Overall Score:** [Poor / Fair / Good / Excellent]
 
 ---
@@ -113,6 +118,7 @@ class StructuredEvaluationInstallationResult(BaseModel):
     install_tutorial: Optional[bool]=Field(description="A boolean value. Is the installation tutorial provided?")
     dependency_number: Optional[int]=Field(description="A number. It is the number of dependencies that are required to install.")
     dependency_suggestions: Optional[str]=Field(description="A string value. It is the specific improvements if necessary, such as missing dependencies")
+    compatible_os: Optional[bool]=Field(description="A boolean value. Is compatible operating system described?")
     overall_score: Optional[str]=Field(description="A overall scroll for the installation quality, could be `Poor`, `Fair`, `Good`, or `Excellent`")
 
 class EvaluationInstallationResult(BaseModel):
@@ -163,8 +169,9 @@ class EvaluationInstallationTask(EvaluationTask):
         gitignore_path,
         meta_data = None,
         step_callback = None,
+        summarized_files_db = None,
     ):
-        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback)
+        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback, summarized_files_db)
         self.evaluation_name = "Installation Evaluation"
 
 
@@ -235,7 +242,7 @@ class EvaluationInstallationTask(EvaluationTask):
         }
         return evaluation, token_usage
 
-    def _evaluate(self, files: list[str] | None = None) -> tuple[dict | None, dict]:
+    def _evaluate(self, files: list[str] | None = None) -> tuple[dict | None, dict, list[str]]:
         evaluation, token_usage = self._free_evaluate(files)
         structured_evaluation, structured_token_usage = self._structured_evaluate(files)
 
@@ -245,5 +252,20 @@ class EvaluationInstallationTask(EvaluationTask):
         }
         total_token_usage = increase_token_usage(token_usage, structured_token_usage)
 
-        return combined_evaluation, total_token_usage
-
+        return combined_evaluation, total_token_usage, files
+
+    def _collect_files(self):
+        task = CollectionTask(
+            llm=self.llm,
+            step_callback=self.step_callback,
+        )
+        task.compile(
+            repo_path=self.repo_path,
+            gitignore_path=Path(self.repo_path, ".gitignore"),
+            db=self.summarized_files_db,
+            goal_item=CollectionGoalItemEnum.Installation.name,
+        )
+        files = task.collect()
+        if files is None:
+            return []
+        return files
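The net effect for the installation evaluation: it can locate its own inputs via a `CollectionTask` and now reports which files it actually scored. A hedged sketch that drives the internal hooks directly, just to show the new shapes (the public entry point of `EvaluationTask` is not shown in this diff, so only `_collect_files` and `_evaluate` from the hunks above are used; paths are placeholders):

```python
from langchain_openai import ChatOpenAI

from bioguider.agents.evaluation_installation_task import EvaluationInstallationTask

task = EvaluationInstallationTask(
    ChatOpenAI(model="gpt-4o-mini"),
    "/path/to/repo",                     # repo_path placeholder
    "/path/to/repo/.gitignore",          # gitignore_path placeholder
    meta_data=None,
    step_callback=print,
    summarized_files_db=None,            # or a shared SummarizedFilesDb
)

files = task._collect_files()                                  # CollectionTask-driven search
evaluation, token_usage, used_files = task._evaluate(files)    # note the new 3-tuple return
```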
--- bioguider-0.2.11/bioguider/agents/evaluation_readme_task.py
+++ bioguider-0.2.13/bioguider/agents/evaluation_readme_task.py
@@ -7,6 +7,7 @@ from langchain_openai.chat_models.base import BaseChatOpenAI
 from pydantic import BaseModel, Field
 
 from bioguider.agents.prompt_utils import EVALUATION_INSTRUCTION
+from bioguider.utils.gitignore_checker import GitignoreChecker
 
 from ..utils.pyphen_utils import PyphenReadability
 from bioguider.agents.agent_utils import increase_token_usage, read_file, summarize_file
@@ -84,6 +85,8 @@ Your final report must **exactly match** the following format. Do not add or omi
 **License Information Included:**
 * score: [Yes / No]
 * suggestions: <suggestions to improve **License Information**>
+** Code contributor / Author information included
+* score: [Yes / No]
 **Overall Score:** [Poor / Fair / Good / Excellent]
 
 ---
@@ -257,6 +260,7 @@ class StructuredEvaluationREADMEResult(BaseModel):
     dependency_suggestions: Optional[str]=Field(description="Suggestions if dependencies are not clearly stated")
     license_score: Optional[bool]=Field(description="A boolean value, Are contributor or maintainer details provided?")
     license_suggestions: Optional[str]=Field(description="Suggestions to improve license information")
+    contributor_author_score: Optional[bool]=Field(description="A boolean value. are contributors or author included?")
     overall_score: str=Field(description="A overall scroll for the README quality, could be `Poor`, `Fair`, `Good`, or `Excellent`")
 
 class EvaluationREADMEResult(BaseModel):
@@ -300,9 +304,10 @@ class EvaluationREADMETask(EvaluationTask):
         repo_path: str,
         gitignore_path: str,
         meta_data: ProjectMetadata | None = None,
-        step_callback: Callable | None = None
+        step_callback: Callable | None = None,
+        summarized_files_db = None,
     ):
-        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback)
+        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback, summarized_files_db)
         self.evaluation_name = "README Evaluation"
 
     def _structured_evaluate(self, free_readme_evaluations: dict[str, dict]):
@@ -354,6 +359,7 @@ class EvaluationREADMETask(EvaluationTask):
             dependency_suggestions="No dependency provided",
             license_score=False,
             license_suggestions="No license information",
+            contributor_author_score=False,
             overall_score="Poor",
         ),
         "structured_reasoning_process": f"{readme_file} is an empty file.",
@@ -451,7 +457,7 @@ class EvaluationREADMETask(EvaluationTask):
         total_token_usage = increase_token_usage(total_token_usage, token_usage)
         return readme_evaluations, total_token_usage
 
-    def _evaluate(self, files: list[str]) -> tuple[dict, dict]:
+    def _evaluate(self, files: list[str]) -> tuple[dict, dict, list[str]]:
         free_readme_evaluations, free_token_usage = self._free_evaluate(files)
         structured_readme_evaluations, structured_token_usage = self._structured_evaluate(free_readme_evaluations)
 
@@ -468,6 +474,26 @@ class EvaluationREADMETask(EvaluationTask):
 
         total_token_usage = increase_token_usage(free_token_usage, structured_token_usage)
 
-        return combined_evaluations, total_token_usage
+        return combined_evaluations, total_token_usage, files
 
+    def _collect_files(self):
+        """
+        Search for a README file in the repository directory.
+        """
+        possible_readme_files = [
+            "readme.md",
+            "readme.rst",
+            "readme.txt",
+            "readme",
+        ]
+        repo_path = self.repo_path
+        gitignore_path = Path(repo_path, ".gitignore")
+        gitignore_checker = GitignoreChecker(
+            directory=repo_path, gitignore_path=gitignore_path
+        )
+        found_readme_files = gitignore_checker.check_files_and_folders(
+            check_file_cb=lambda root_dir, relative_path: Path(relative_path).name.lower() in possible_readme_files,
+        )
+
+        return found_readme_files
 
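The README search added here matches only on the file name, case-insensitively, after the repository's `.gitignore` rules are applied. As a rough standalone illustration of that matching rule (without the `GitignoreChecker` filtering the real code relies on), a hedged sketch:

```python
from pathlib import Path

# Same name-based rule as _collect_files above, minus the .gitignore filtering:
# docs/Readme.rst or README match, CHANGELOG.md does not.
POSSIBLE_README_FILES = {"readme.md", "readme.rst", "readme.txt", "readme"}


def find_readme_files(repo_path: str) -> list[str]:
    repo = Path(repo_path)
    return [
        str(p.relative_to(repo))
        for p in repo.rglob("*")
        if p.is_file() and p.name.lower() in POSSIBLE_README_FILES
    ]


print(find_readme_files("/path/to/repo"))    # placeholder path
```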