vision-agent 0.2.56__py3-none-any.whl → 0.2.58__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,346 +0,0 @@
1
- import json
2
- import logging
3
- import sys
4
- from pathlib import Path
5
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
6
-
7
- from vision_agent.llm import LLM, OpenAILLM
8
- from vision_agent.lmm import LMM
9
- from vision_agent.tools.easytool_tools import TOOLS
10
-
11
- from .agent import Agent
12
- from .easytool_prompts import (
13
- ANSWER_GENERATE,
14
- ANSWER_SUMMARIZE,
15
- CHOOSE_PARAMETER,
16
- CHOOSE_TOOL,
17
- TASK_DECOMPOSE,
18
- TASK_TOPOLOGY,
19
- )
20
-
21
# Route log records to stdout so agent progress is visible in notebooks/CLIs.
logging.basicConfig(stream=sys.stdout)
# Module-level logger; the EasyTool constructor raises its level to INFO
# when constructed with verbose=True.
_LOGGER = logging.getLogger(__name__)
23
-
24
-
25
def parse_json(s: str) -> Any:
    """Parse JSON emitted by an LLM, tolerating common deviations.

    Models sometimes wrap output in markdown code fences and/or use
    Python-style booleans ("True"/"False"). Normalize both to strict JSON
    before parsing. (The previous version did the opposite — it rewrote
    valid JSON ``true``/``false`` into Python ``True``/``False``, which
    ``json.loads`` rejects.)

    Parameters:
        s: raw model output expected to contain a JSON document.

    Returns:
        The parsed Python object.

    Raises:
        json.JSONDecodeError: if the cleaned string is still not valid JSON.
    """
    s = (
        s.replace(": True", ": true")
        .replace(": False", ": false")
        .replace(":True", ":true")
        .replace(":False", ":false")
        .replace("```json", "")
        .replace("```", "")
        .strip()
    )
    return json.loads(s)
35
-
36
-
37
def change_name(name: str) -> str:
    """Prefix reserved or unsafe parameter names with "is_" (lower-cased).

    Keeps generated keyword names from colliding with Python keywords or
    ambiguous identifiers; all other names pass through unchanged.
    """
    reserved = ("from", "class", "return", "false", "true", "id", "and", "", "ID")
    return "is_" + name.lower() if name in reserved else name
42
-
43
-
44
def format_tools(tools: Dict[int, Any]) -> str:
    """Render the tool registry as "ID: <id> - <tool>" lines.

    Laid out this way so the model can see unambiguously which ID maps to
    which tool description.
    """
    return "".join(f"ID: {tool_id} - {tool}\n" for tool_id, tool in tools.items())
50
-
51
-
52
def topological_sort(tasks: List[Dict]) -> List[Dict]:
    """Order tasks so each task's dependencies come before it (Kahn's algorithm).

    Each task is a dict with an "id" and a "dep" list of prerequisite ids;
    dep entries that are not valid task ids (e.g. -1) are ignored. Tasks
    trapped in a dependency cycle are appended at the end in their original
    order rather than dropped.
    """
    valid_ids = {t["id"] for t in tasks}
    # Count, per task, how many of its deps refer to actual tasks.
    remaining = {t["id"]: sum(1 for d in t["dep"] if d in valid_ids) for t in tasks}

    ready = [t for t in tasks if remaining[t["id"]] == 0]
    ordered: List[Dict] = []

    while ready:
        node = ready.pop(0)
        ordered.append(node)
        # Releasing `node` may unblock any task that depends on it.
        for t in tasks:
            if node["id"] in t["dep"]:
                remaining[t["id"]] -= 1
                if remaining[t["id"]] == 0:
                    ready.append(t)

    if len(ordered) != len(tasks):
        done_ids = {t["id"] for t in ordered}
        ordered.extend(t for t in tasks if t["id"] not in done_ids)
    return ordered
77
-
78
-
79
def task_decompose(
    model: Union[LLM, LMM, Agent], question: str, tools: Dict[int, Any]
) -> Optional[Dict]:
    """Ask *model* to split *question* into subtasks given the tool list.

    Returns the parsed "Tasks" payload, or None if the model never produces
    parsable output within the retry budget.
    """
    prompt = TASK_DECOMPOSE.format(question=question, tools=format_tools(tools))
    raw = ""
    # LLM output is not guaranteed to be valid JSON; retry up to 12 times.
    for _ in range(12):
        try:
            raw = model(prompt)
            return parse_json(raw)["Tasks"]  # type: ignore
        except Exception:
            continue
    _LOGGER.error(f"Failed task_decompose on: {raw}")
    return None
96
-
97
-
98
def task_topology(
    model: Union[LLM, LMM, Agent], question: str, task_list: List[Dict]
) -> List[Dict[str, Any]]:
    """Ask *model* for dependency links between the subtasks in *task_list*.

    Each returned task's "dep" entry is normalized to a list of ints. Falls
    back to the unmodified *task_list* when the model never produces
    parsable output within the retry budget.
    """
    prompt = TASK_TOPOLOGY.format(question=question, task_list=task_list)
    raw = ""
    for _ in range(12):
        try:
            raw = model(prompt)
            tasks = parse_json(raw)["Tasks"]
            for task in tasks:
                dep = task["dep"]
                # The model may emit deps as "1,2", as a bare int, or as a
                # list; coerce every shape to a list of ints.
                if isinstance(dep, str):
                    task["dep"] = [int(d) for d in dep.split(",")]
                elif isinstance(dep, int):
                    task["dep"] = [dep]
                elif isinstance(dep, list):
                    task["dep"] = [int(d) for d in dep]
            return tasks  # type: ignore
        except Exception:
            continue
    _LOGGER.error(f"Failed task_topology on: {raw}")
    return task_list
122
-
123
-
124
def choose_tool(
    model: Union[LLM, LMM, Agent], question: str, tools: Dict[int, Any]
) -> Optional[int]:
    """Ask *model* to pick one tool ID from *tools* for *question*.

    Returns the chosen ID, or None if the model never produces parsable
    output within the retry budget.
    """
    prompt = CHOOSE_TOOL.format(question=question, tools=format_tools(tools))
    raw = ""
    for _ in range(12):
        try:
            raw = model(prompt)
            return parse_json(raw)["ID"]  # type: ignore
        except Exception:
            continue
    _LOGGER.error(f"Failed choose_tool on: {raw}")
    return None
141
-
142
-
143
def choose_parameter(
    model: Union[LLM, LMM, Agent], question: str, tool_usage: Dict, previous_log: str
) -> Optional[Any]:
    """Ask *model* for the parameters to call the chosen tool with.

    Returns the parsed "Parameters" payload (a kwargs dict, or a list of
    them for multiple calls), or None after the retry budget is exhausted.
    """
    # TODO: should format tool_usage
    prompt = CHOOSE_PARAMETER.format(
        question=question, tool_usage=tool_usage, previous_log=previous_log
    )
    raw = ""
    for _ in range(12):
        try:
            raw = model(prompt)
            return parse_json(raw)["Parameters"]
        except Exception:
            continue
    _LOGGER.error(f"Failed choose_parameter on: {raw}")
    return None
163
-
164
-
165
def answer_generate(
    model: Union[LLM, LMM, Agent], question: str, call_results: str, previous_log: str
) -> str:
    """Phrase one subtask's raw tool output as a natural-language answer."""
    return model(
        ANSWER_GENERATE.format(
            question=question, call_results=call_results, previous_log=previous_log
        )
    )
172
-
173
-
174
def answer_summarize(
    model: Union[LLM, LMM, Agent], question: str, answers: List[Dict]
) -> str:
    """Merge the per-subtask answers into one final answer for *question*."""
    return model(ANSWER_SUMMARIZE.format(question=question, answers=answers))
179
-
180
-
181
def function_call(tool: Callable, parameters: Dict[str, Any]) -> Any:
    """Instantiate *tool* and invoke the instance with *parameters*.

    Any failure (construction, bad kwargs, runtime error) is logged and
    reported as None rather than raised, so one bad call does not abort the
    whole workflow.
    """
    try:
        instance = tool()
        return instance(**parameters)
    except Exception as e:
        _LOGGER.error(f"Failed function_call on: {e}")
        return None
187
-
188
-
189
def retrieval(
    model: Union[LLM, LMM, Agent],
    question: str,
    tools: Dict[int, Any],
    previous_log: str,
) -> Tuple[List[Dict], str]:
    """Pick a tool for *question*, choose its parameters, and execute it.

    Parameters:
        model: model used for tool and parameter selection.
        question: the subtask to solve.
        tools: tool registry; each entry must provide "description", "usage",
            "name" and "class" keys (as used below).
        previous_log: stringified log of earlier subtasks/answers.

    Returns:
        (tool_results, call_results_str); returns ([{}], "") as an empty
        sentinel when no tool or no parameters could be chosen.
    """
    # Only the descriptions are offered when asking the model for a tool ID.
    tool_id = choose_tool(
        model, question, {k: v["description"] for k, v in tools.items()}
    )
    if tool_id is None:
        return [{}], ""
    _LOGGER.info(f"\t(Tool ID, name): ({tool_id}, {tools[tool_id]['name']})")

    tool_instructions = tools[tool_id]
    tool_usage = tool_instructions["usage"]
    tool_name = tool_instructions["name"]

    parameters = choose_parameter(model, question, tool_usage, previous_log)
    _LOGGER.info(f"\tParameters: {parameters} for {tool_name}")
    if parameters is None:
        return [{}], ""
    tool_results = [
        {"task": question, "tool_name": tool_name, "parameters": parameters}
    ]

    def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any:
        # "parameters" may be one kwargs dict or a list of them (the model
        # can request multiple calls); failed calls are silently skipped.
        call_results: List[Any] = []
        if isinstance(result["parameters"], Dict):
            call_result = function_call(tools[tool_id]["class"], result["parameters"])
            if call_result is None:
                return call_results
            call_results.append(call_result)
        elif isinstance(result["parameters"], List):
            for parameters in result["parameters"]:
                call_result = function_call(tools[tool_id]["class"], parameters)
                if call_result is None:
                    continue
                call_results.append(call_result)
        return call_results

    call_results = []
    for i, result in enumerate(tool_results):
        call_results.extend(parse_tool_results(result))
        # NOTE(review): every entry is assigned the same accumulating list
        # object; with the single-entry tool_results built above this is
        # equivalent to per-entry results — confirm if more entries are
        # ever added.
        tool_results[i]["call_results"] = call_results

    call_results_str = "\n\n".join([str(e) for e in call_results if e is not None])
    _LOGGER.info(f"\tCall Results: {call_results_str}")
    return tool_results, call_results_str
237
-
238
-
239
class EasyTool(Agent):
    r"""This is an implementation of the EasyTool paper https://arxiv.org/abs/2401.06201
    based on the original implementation https://github.com/microsoft/JARVIS/tree/main/easytool
    from the funcQA code.

    Example
    -------
    >>> from vision_agent.agent import EasyTool
    >>> agent = EasyTool()
    >>> resp = agent("If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?")
    >>> print(resp)
    "It will travel approximately 31.03 kilometers in 29 minutes."
    >>> resp = agent("How many cards are in this image?", media="cards.jpg")
    >>> print(resp)
    "There are 2 cards in this image."
    """

    def __init__(
        self,
        task_model: Optional[Union[LLM, LMM]] = None,
        answer_model: Optional[Union[LLM, LMM]] = None,
        verbose: bool = False,
    ):
        """Initialize the EasyTool agent.

        Parameters:
            task_model: model used for task decomposition, tool choice and
                parameter choice; defaults to an OpenAI LLM in JSON mode.
            answer_model: model used to phrase per-subtask answers and the
                final summary; defaults to a plain OpenAI LLM.
            verbose: if True, raise the module logger to INFO so per-subtask
                progress is printed.
        """
        self.task_model = (
            OpenAILLM(json_mode=True) if task_model is None else task_model
        )
        self.answer_model = OpenAILLM() if answer_model is None else answer_model

        self.retrieval_num = 3
        self.tools = TOOLS
        if verbose:
            _LOGGER.setLevel(logging.INFO)

    def __call__(
        self,
        input: Union[List[Dict[str, str]], str],
        media: Optional[Union[str, Path]] = None,
    ) -> str:
        """Invoke the vision agent.

        Parameters:
            input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
            media: the input image file referenced in the prompt parameter.

        Returns:
            A text response.
        """
        if isinstance(input, str):
            input = [{"role": "user", "content": input}]
        return self.chat(input, media=media)

    def chat_with_workflow(
        self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
    ) -> Tuple[str, List[Dict]]:
        """Answer the conversation and also return per-subtask tool results.

        Parameters:
            chat: conversation; only the first message's content is used as
                the question.
            media: optional image path appended to the question.

        Returns:
            A tuple of (final summarized answer, list of tool-result dicts).
        """
        question = chat[0]["content"]
        if media:
            question += f" Image name: {media}"
        tasks = task_decompose(
            self.task_model,
            question,
            {k: v["description"] for k, v in self.tools.items()},
        )
        _LOGGER.info(f"Tasks: {tasks}")
        if tasks is not None:
            task_list = [{"task": task, "id": i + 1} for i, task in enumerate(tasks)]
            task_list = task_topology(self.task_model, question, task_list)
            try:
                task_list = topological_sort(task_list)
            except Exception:
                # Fall back to the unsorted list rather than aborting.
                _LOGGER.error(f"Failed topological_sort on: {task_list}")
        else:
            task_list = []

        _LOGGER.info(f"Task Dependency: {task_list}")
        # task_depend is stringified into previous_log and shown to the
        # model, so its text (including this key) must read correctly.
        task_depend = {"Original Question": question}
        previous_log = ""
        answers = []
        for task in task_list:
            task_depend[task["id"]] = {"task": task["task"], "answer": ""}  # type: ignore
        all_tool_results = []
        for task in task_list:
            task_str = task["task"]
            previous_log = str(task_depend)
            _LOGGER.info(f"\tSubtask: {task_str}")
            tool_results, call_results = retrieval(
                self.task_model,
                task_str,
                self.tools,
                previous_log,
            )
            answer = answer_generate(
                self.answer_model, task_str, call_results, previous_log
            )

            for tool_result in tool_results:
                tool_result["answer"] = answer
            all_tool_results.extend(tool_results)

            _LOGGER.info(f"\tAnswer: {answer}")
            answers.append({"task": task_str, "answer": answer})
            task_depend[task["id"]]["answer"] = answer  # type: ignore
        return answer_summarize(self.answer_model, question, answers), all_tool_results

    def chat(
        self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
    ) -> str:
        """Like chat_with_workflow, but return only the final answer."""
        answer, _ = self.chat_with_workflow(chat, media=media)
        return answer
@@ -1,89 +0,0 @@
1
# Prompt for splitting a complex question into tool-sized subtasks.
# Placeholders: {question}, {tools}. Double braces render literal JSON braces.
TASK_DECOMPOSE = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
This is the user's question: {question}
This is the tool list:
{tools}

Please note that:
1. You should only decompose this complex user's question into some simple subtasks which can be executed easily by using one single tool in the tool list.
2. If one subtask needs the results from another subtask, you should write clearly. For example:
{{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
3. You must ONLY output in a parsable JSON format. An example output looks like:

{{"Tasks": ["Task 1", "Task 2", ...]}}

Output: """
15
-
16
# Prompt asking the model to add dependency ("dep") links between subtasks.
# Placeholders: {question}, {task_list}. The example JSON previously showed
# '"id", task_id' (comma instead of colon) — corrected to valid JSON.
TASK_TOPOLOGY = """Given a user's complex question, I have decomposed this question into some simple subtasks. I think there exist logical connections and order among the tasks. Thus, you need to help me output these logical connections and order.
You must ONLY output in a parsable JSON format with the following format:

{{"Tasks": [{{"task": task, "id": task_id, "dep": [dependency_task_id1, dependency_task_id2, ...]}}]}}

The "dep" field denotes the id of the previous task which generates a new resource upon which the current task depends. If there are no dependencies, set "dep" to -1.


This is the user's question: {question}

These are subtasks of this question:

{task_list}

Output: """
31
-
32
# Prompt for selecting exactly one tool ID for a question.
# Placeholders: {question}, {tools}.
CHOOSE_TOOL = """This is the user's question: {question}
These are the tools you can select to solve the question:
{tools}

Please note that:
1. You should only choose one tool from the Tool List to solve this question.
2. You must ONLY output the ID of the tool you chose in a parsable JSON format. Two example outputs look like:

Example 1: {{"ID": 1}}
Example 2: {{"ID": 2}}

Output: """
44
-
45
# Prompt for choosing the parameters to call the selected tool with.
# Placeholders: {previous_log}, {question}, {tool_usage}.
CHOOSE_PARAMETER = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
Please note that:
1. The Example in the API tool documentation can help you better understand the use of the API. Pay attention to the examples which show how to parse the question and extract tool parameters such as prompts and visual inputs.
2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no parameters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
5. If you need to use this API multiple times, please set "Parameters" to a list.
6. You must ONLY output in a parsable JSON format. Two example outputs look like:

Example 1: {{"Parameters":{{"input": [1,2,3]}}}}
Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}

These are logs of previous questions and answers:
{previous_log}

This is the current user's question: {question}
This is the API tool documentation: {tool_usage}
Output: """
63
-
64
-
65
# Prompt for phrasing one subtask's raw tool output as a natural-language
# answer. Placeholders: {previous_log}, {question}, {call_results}.
ANSWER_GENERATE = """You should answer the question based on the response output by the API tool.
Please note that:
1. Try to organize the response into a natural language answer.
2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
3. If the API tool does not provide useful information in the response, please answer with your knowledge.
4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers.

These are logs of previous questions and answers:
{previous_log}

This is the user's question: {question}

This is the response output by the API tool:
{call_results}

We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
Output: """
82
-
83
# Prompt for merging all per-subtask answers into one final answer.
# Placeholders: {question}, {answers}.
ANSWER_SUMMARIZE = """We break down a complex user's problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question.
This is the user's question: {question}

These are subtasks and their answers:
{answers}

Final answer: """