vision-agent 0.2.30__py3-none-any.whl → 0.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +2 -2
- vision_agent/agent/agent.py +1 -1
- vision_agent/agent/agent_coder.py +16 -10
- vision_agent/agent/{vision_agent_v2.py → data_interpreter.py} +12 -12
- vision_agent/agent/{vision_agent_v2_prompts.py → data_interpreter_prompts.py} +3 -3
- vision_agent/agent/easytool.py +8 -8
- vision_agent/agent/easytool_v2.py +778 -0
- vision_agent/agent/easytool_v2_prompts.py +152 -0
- vision_agent/agent/reflexion.py +8 -8
- vision_agent/agent/vision_agent.py +368 -690
- vision_agent/agent/vision_agent_prompts.py +233 -149
- vision_agent/llm/llm.py +3 -4
- vision_agent/lmm/lmm.py +6 -6
- vision_agent/tools/__init__.py +21 -22
- vision_agent/tools/easytool_tools.py +1242 -0
- vision_agent/tools/tools.py +533 -1090
- vision_agent-0.2.32.dist-info/METADATA +175 -0
- vision_agent-0.2.32.dist-info/RECORD +36 -0
- vision_agent/agent/vision_agent_v3.py +0 -394
- vision_agent/agent/vision_agent_v3_prompts.py +0 -234
- vision_agent/tools/tools_v2.py +0 -685
- vision_agent-0.2.30.dist-info/METADATA +0 -226
- vision_agent-0.2.30.dist-info/RECORD +0 -36
- {vision_agent-0.2.30.dist-info → vision_agent-0.2.32.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.30.dist-info → vision_agent-0.2.32.dist-info}/WHEEL +0 -0
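Several modules are renamed or replaced in this release (vision_agent_v2 → data_interpreter, vision_agent_v3 and tools_v2 removed, easytool_v2 and easytool_tools added), which can break downstream imports that used the old paths. The snippet below is a minimal, hypothetical post-upgrade smoke test built only from the module paths in the file list above; it is not part of the package.

```python
import importlib

# Modules added or renamed in 0.2.32, per the file list above.
for mod in [
    "vision_agent.agent.data_interpreter",     # renamed from vision_agent_v2
    "vision_agent.agent.easytool_v2",          # new in 0.2.32
    "vision_agent.agent.easytool_v2_prompts",  # new in 0.2.32
    "vision_agent.tools.easytool_tools",       # new in 0.2.32
]:
    importlib.import_module(mod)

# Modules removed in 0.2.32; importing them should now fail.
for gone in ["vision_agent.agent.vision_agent_v3", "vision_agent.tools.tools_v2"]:
    try:
        importlib.import_module(gone)
        raise AssertionError(f"{gone} should have been removed in 0.2.32")
    except ImportError:
        pass  # expected after the upgrade
```

The largest fully new file, vision_agent/agent/easytool_v2_prompts.py, is shown in full below.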
vision_agent/agent/easytool_v2_prompts.py
ADDED
@@ -0,0 +1,152 @@
+VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question, the tool usage for each of the tools used and the final answer the agent provided. You may also receive an image with the visualized bounding boxes or masks with their associated labels and scores from the tools used.
+
+Please note that:
+1. You must ONLY output parsible JSON format. If the agents output was correct set "Finish" to true, else set "Finish" to false. An example output looks like:
+{{"Finish": true, "Reflection": "The agent's answer was correct."}}
+2. You must utilize the image with the visualized bounding boxes or masks and determine if the tools were used correctly or if the tools were used incorrectly or the wrong tools were used.
+3. If the agent's answer was incorrect, you must diagnose the reason for failure and devise a new concise and concrete plan that aims to mitigate the same failure with the tools available. An example output looks like:
+{{"Finish": false, "Reflection": "I can see from the visualized bounding boxes that the agent's answer was incorrect because the grounding_dino_ tool produced false positive predictions. The agent should use the following tools with the following parameters:
+Step 1: Use 'grounding_dino_' with a 'prompt' of 'baby. bed' and a 'box_threshold' of 0.7 to reduce the false positives.
+Step 2: Use 'box_iou_' with the baby bounding box and the bed bounding box to determine if the baby is on the bed or not."}}
+4. If the task cannot be completed with the existing tools or by adjusting the parameters, set "Finish" to true.
+
+User's question: {question}
+
+Tools available:
+{tools}
+
+Tasks and tools used:
+{tool_results}
+
+Tool's used API documentation:
+{tool_usage}
+
+Final answer:
+{final_answer}
+
+Reflection: """
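VISION_AGENT_REFLECTION is a plain str.format template; the doubled braces keep the JSON examples literal. A minimal driver might look like the sketch below, where `llm` is a stand-in for whatever chat model the agent wraps (not a package API) and the model is assumed to return the JSON that rule 1 demands.

```python
import json

from vision_agent.agent.easytool_v2_prompts import VISION_AGENT_REFLECTION

def reflect(llm, question, tools, tool_results, tool_usage, final_answer):
    # Fill every placeholder; the {{...}} examples survive str.format
    # as literal JSON braces.
    prompt = VISION_AGENT_REFLECTION.format(
        question=question,
        tools=tools,
        tool_results=tool_results,
        tool_usage=tool_usage,
        final_answer=final_answer,
    )
    reflection = json.loads(llm(prompt))  # assumes the model obeys rule 1
    return reflection["Finish"], reflection["Reflection"]
```

The added file continues with the task-decomposition template: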
+TASK_DECOMPOSE = """You need to decompose a user's complex question into one or more simple subtasks and let the model execute it step by step.
+This is the user's question: {question}
+This is the tool list:
+{tools}
+
+Please note that:
+1. If the given task is simple and the answer can be provided by executing one tool, you should only use that tool to provide the answer.
+2. If the given task is complex, You should decompose this user's complex question into simple subtasks which can only be executed easily by using one single tool in the tool list.
+3. You should try to decompose the complex question into least number of subtasks.
+4. If one subtask needs the results from another subtask, you should write clearly. For example:
+{{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
+5. You must ONLY output in a parsible JSON format. An example output looks like:
+
+{{"Tasks": ["Task 1", "Task 2", ...]}}
+
+Output: """
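TASK_DECOMPOSE promises a {"Tasks": [...]} payload; a hedged parsing sketch (again with a placeholder `llm` callable):

```python
import json

from vision_agent.agent.easytool_v2_prompts import TASK_DECOMPOSE

def decompose(llm, question, tools):
    prompt = TASK_DECOMPOSE.format(question=question, tools=tools)
    return json.loads(llm(prompt))["Tasks"]  # e.g. ["Task 1", "Task 2", ...]
```

Next in the file is the reflection-aware variant: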
+TASK_DECOMPOSE_DEPENDS = """You need to decompose a user's complex question into one or more simple subtasks and let the model execute it step by step.
+This is the user's question: {question}
+
+This is the tool list:
+{tools}
+
+This is a reflection from a previous failed attempt:
+{reflections}
+
+Please note that:
+1. If the given task is simple and the answer can be provided by executing one tool, you should only use that tool to provide the answer.
+2. If the given task is complex, You should decompose this user's complex question into simple subtasks which can only be executed easily by using one single tool in the tool list.
+3. You should try to decompose the complex question into least number of subtasks.
+4. If one subtask needs the results from another subtask, you should write clearly. For example:
+{{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
+5. You must ONLY output in a parsible JSON format. An example output looks like:
+
+{{"Tasks": ["Task 1", "Task 2", ...]}}
+
+Output: """
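TASK_DECOMPOSE_DEPENDS differs from TASK_DECOMPOSE only in the extra {reflections} slot, and the same pattern repeats for the other *_DEPENDS templates below. One plausible way to wire that up is a retry that switches templates once a reflection from a failed attempt exists; this is a sketch, not package code:

```python
import json

from vision_agent.agent.easytool_v2_prompts import TASK_DECOMPOSE, TASK_DECOMPOSE_DEPENDS

def decompose_with_retry(llm, question, tools, reflections=None):
    # Fall back to the reflection-aware template only on a retry.
    if reflections:
        prompt = TASK_DECOMPOSE_DEPENDS.format(
            question=question, tools=tools, reflections=reflections
        )
    else:
        prompt = TASK_DECOMPOSE.format(question=question, tools=tools)
    return json.loads(llm(prompt))["Tasks"]
```

The file then defines the tool-selection templates: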
+CHOOSE_TOOL = """This is the user's question: {question}
+These are the tools you can select to solve the question:
+{tools}
+
+Please note that:
+1. You should only choose one tool from the Tool List to solve this question and it should have maximum chance of solving the question.
+2. You should only choose the tool whose parameters are most relevant to the user's question and are available as part of the question.
+3. You should choose the tool whose return type is most relevant to the answer of the user's question.
+4. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:
+
+Example 1: {{"ID": 1}}
+Example 2: {{"ID": 2}}
+
+Output: """
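CHOOSE_TOOL returns only a tool ID, so the caller has to resolve it back to a tool spec. In the sketch below, the assumption that each entry in `tools` is a dict carrying an integer "ID" field is not confirmed by this diff:

```python
import json

from vision_agent.agent.easytool_v2_prompts import CHOOSE_TOOL

def choose_tool(llm, question, tools):
    prompt = CHOOSE_TOOL.format(question=question, tools=tools)
    tool_id = json.loads(llm(prompt))["ID"]
    # Assumes each tool spec carries the "ID" the prompt refers to.
    return next(t for t in tools if t["ID"] == tool_id)
```

CHOOSE_TOOL_DEPENDS follows, adding only the {reflections} slot: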
+CHOOSE_TOOL_DEPENDS = """This is the user's question: {question}
+These are the tools you can select to solve the question:
+{tools}
+
+This is a reflection from a previous failed attempt:
+{reflections}
+
+Please note that:
+1. You should only choose one tool from the Tool List to solve this question and it should have maximum chance of solving the question.
+2. You should only choose the tool whose parameters are most relevant to the user's question and are available as part of the question.
+3. You should choose the tool whose return type is most relevant to the answer of the user's question.
+4. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:
+
+Example 1: {{"ID": 1}}
+Example 2: {{"ID": 2}}
+
+Output: """
+CHOOSE_PARAMETER_DEPENDS = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
+Please note that:
+1. The Example in the API tool documentation can help you better understand the use of the API. Pay attention to the examples which show how to parse the question and extract tool parameters such as prompts and visual inputs.
+2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no parameters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
+3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
+4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
+5. If you need to use this API multiple times, please set "Parameters" to a list.
+6. You must ONLY output in a parsible JSON format. Two example outputs look like:
+
+Example 1: {{"Parameters":{{"input": [1,2,3]}}}}
+Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}
+
+This is a reflection from a previous failed attempt:
+{reflections}
+
+These are logs of previous questions and answers:
+{previous_log}
+
+This is the current user's question: {question}
+This is the API tool documentation: {tool_usage}
+Output: """
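Rule 5 of CHOOSE_PARAMETER_DEPENDS allows "Parameters" to be either a single dict or a list of dicts, so a caller needs to normalize the shape. A hedged sketch with the same placeholder `llm`:

```python
import json

from vision_agent.agent.easytool_v2_prompts import CHOOSE_PARAMETER_DEPENDS

def choose_parameters(llm, question, tool_usage, previous_log, reflections=""):
    prompt = CHOOSE_PARAMETER_DEPENDS.format(
        question=question,
        tool_usage=tool_usage,
        previous_log=previous_log,
        reflections=reflections,
    )
    params = json.loads(llm(prompt))["Parameters"]
    # Rule 5: a list means the tool should be called once per entry.
    return params if isinstance(params, list) else [params]
```

The answer-generation template comes next: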
+ANSWER_GENERATE_DEPENDS = """You should answer the question based on the response output by the API tool.
+Please note that:
+1. You should try to organize the response into a natural language answer.
+2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
+3. If the API tool does not provide useful information in the response, please answer with your knowledge.
+4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers.
+
+This is a reflection from a previous failed attempt:
+{reflections}
+
+These are logs of previous questions and answers:
+{previous_log}
+
+This is the user's question: {question}
+
+This is the response output by the API tool:
+{call_results}
+
+We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
+Output: """
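ANSWER_GENERATE_DEPENDS produces free-form text rather than JSON, so the driver is just a format-and-call (sketch, placeholder `llm` again):

```python
from vision_agent.agent.easytool_v2_prompts import ANSWER_GENERATE_DEPENDS

def answer_task(llm, question, call_results, previous_log, reflections=""):
    return llm(
        ANSWER_GENERATE_DEPENDS.format(
            question=question,
            call_results=call_results,
            previous_log=previous_log,
            reflections=reflections,
        )
    )
```

The file closes with the summarization template: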
+ANSWER_SUMMARIZE_DEPENDS = """We break down a user's complex problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question
+This is the user's question: {question}
+
+These are subtasks and their answers:
+{answers}
+
+This is a reflection from a previous failed attempt:
+{reflections}
+
+Final answer: """
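Taken together, the templates describe a decompose → choose tool → choose parameters → execute → answer → summarize loop. The sketch below shows one way they could chain; `llm`, `execute_tool`, and the tool-spec shape are placeholders and assumptions, not APIs confirmed by this diff. The package's own driver presumably lives in easytool_v2.py (+778 lines above), which this diff does not expand.

```python
import json

from vision_agent.agent import easytool_v2_prompts as P

def solve(llm, execute_tool, question, tools, reflections=""):
    # 1. Decompose the question into subtasks.
    tasks = json.loads(
        llm(P.TASK_DECOMPOSE_DEPENDS.format(
            question=question, tools=tools, reflections=reflections
        ))
    )["Tasks"]

    answers, previous_log = [], ""
    for task in tasks:
        # 2. Pick a tool for this subtask (assumes an "ID" field per tool spec).
        tool_id = json.loads(
            llm(P.CHOOSE_TOOL_DEPENDS.format(
                question=task, tools=tools, reflections=reflections
            ))
        )["ID"]
        tool = next(t for t in tools if t["ID"] == tool_id)

        # 3. Extract parameters (the whole tool spec stands in for its docs here).
        params = json.loads(
            llm(P.CHOOSE_PARAMETER_DEPENDS.format(
                question=task, tool_usage=tool,
                previous_log=previous_log, reflections=reflections,
            ))
        )["Parameters"]

        # 4. Run the tool and turn its raw output into a natural-language answer.
        call_results = execute_tool(tool, params)
        answer = llm(P.ANSWER_GENERATE_DEPENDS.format(
            question=task, call_results=call_results,
            previous_log=previous_log, reflections=reflections,
        ))
        previous_log += f"Q: {task}\nA: {answer}\n"
        answers.append({"task": task, "answer": answer})

    # 5. Summarize the per-subtask answers into one final answer.
    return llm(P.ANSWER_SUMMARIZE_DEPENDS.format(
        question=question, answers=answers, reflections=reflections
    ))
```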
vision_agent/agent/reflexion.py
CHANGED
@@ -138,7 +138,7 @@ class Reflexion(Agent):
     def __call__(
         self,
         input: Union[str, List[Dict[str, str]]],
-        image: Optional[Union[str, Path]] = None,
+        media: Optional[Union[str, Path]] = None,
     ) -> str:
         """Invoke the vision agent.

@@ -151,24 +151,24 @@ class Reflexion(Agent):
         """
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        return self.chat(input, image)
+        return self.chat(input, media)

     def chat(
-        self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
+        self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
     ) -> str:
         if len(chat) == 0 or chat[0]["role"] != "user":
             raise ValueError(
                 f"Invalid chat. Should start with user and alternate between user"
                 f"and assistant and contain at least one entry {chat}"
             )
-        if image is not None and isinstance(self.action_agent, LLM):
+        if media is not None and isinstance(self.action_agent, LLM):
             raise ValueError(
                 "If image is provided, then action_agent must be an agent or LMM."
             )

         question = chat[0]["content"]
         if len(chat) == 1:
-            results = self._step(question, image=image)
+            results = self._step(question, image=media)
             self.last_scratchpad = results["scratchpad"]
             return results["action_arg"]

@@ -183,10 +183,10 @@ class Reflexion(Agent):
         self.last_scratchpad += "Answer is INCORRECT"
         chat_context = "The previous conversation was:\n" + chat_str
         reflections = self.reflect(
-            question, chat_context, self.last_scratchpad, image
+            question, chat_context, self.last_scratchpad, media
        )
         _LOGGER.info(f" {reflections}")
-        results = self._step(question, reflections, image=image)
+        results = self._step(question, reflections, image=media)
         self.last_scratchpad = results["scratchpad"]
         return results["action_arg"]

@@ -249,7 +249,7 @@ class Reflexion(Agent):
         return format_step(
             self.action_agent(
                 self._build_agent_prompt(question, reflections, scratchpad),
-                image=image,
+                media=image,
             )
         )
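The reflexion.py hunks rename the public image keyword to media on __call__ and chat, while the internal _step and action_agent calls keep their image-named locals. A caller-side sketch of the rename is below; the class name and module path come from the hunk headers, but the no-argument constructor (and any defaults it relies on) is an assumption.

```python
from vision_agent.agent.reflexion import Reflexion

# Constructor arguments are omitted here and assumed to have usable defaults.
agent = Reflexion()

# 0.2.30 and earlier:
#   answer = agent("Is the baby on the bed?", image="photo.jpg")
# 0.2.32:
answer = agent("Is the baby on the bed?", media="photo.jpg")
```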