vision-agent 0.0.47__tar.gz → 0.0.50__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.0.47 → vision_agent-0.0.50}/PKG-INFO +23 -2
- {vision_agent-0.0.47 → vision_agent-0.0.50}/README.md +22 -1
- {vision_agent-0.0.47 → vision_agent-0.0.50}/pyproject.toml +1 -1
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/agent.py +7 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/easytool_prompts.py +14 -14
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/reflexion_prompts.py +1 -1
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/vision_agent.py +113 -82
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/vision_agent_prompts.py +20 -20
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/image_utils.py +1 -1
- vision_agent-0.0.50/vision_agent/llm/__init__.py +1 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/llm/llm.py +38 -3
- vision_agent-0.0.50/vision_agent/lmm/__init__.py +1 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/lmm/lmm.py +37 -2
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/tools/prompts.py +3 -3
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/tools/tools.py +115 -50
- vision_agent-0.0.47/vision_agent/llm/__init__.py +0 -1
- vision_agent-0.0.47/vision_agent/lmm/__init__.py +0 -1
- {vision_agent-0.0.47 → vision_agent-0.0.50}/LICENSE +0 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/__init__.py +0 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/easytool.py +0 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/reflexion.py +0 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/data/__init__.py +0 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/data/data.py +0 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/emb/__init__.py +0 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/emb/emb.py +0 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/tools/video.py +0 -0
{vision_agent-0.0.47 → vision_agent-0.0.50}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.0.47
+Version: 0.0.50
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

@@ -59,7 +59,8 @@ To get started, you can install the library using pip:
 pip install vision-agent
 ```
 
-Ensure you have an OpenAI API key and set it as an environment variable
+Ensure you have an OpenAI API key and set it as an environment variable (if you are
+using Azure OpenAI please see the additional setup section):
 
 ```bash
 export OPENAI_API_KEY="your-api-key"

@@ -139,3 +140,23 @@ you. For example:
 
 It also has a basic set of calculate tools such as add, subtract, multiply and divide.
 
+### Additional Setup
+If you want to use Azure OpenAI models, you can set the environment variable:
+
+```bash
+export AZURE_OPENAI_API_KEY="your-api-key"
+export AZURE_OPENAI_ENDPOINT="your-endpoint"
+```
+
+You can then run Vision Agent using the Azure OpenAI models:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.VisionAgent(
+>>>     task_model=va.llm.AzureOpenAILLM(),
+>>>     answer_model=va.lmm.AzureOpenAILMM(),
+>>>     reflection_model=va.lmm.AzureOpenAILMM(),
+>>> )
+```
+
+
{vision_agent-0.0.47 → vision_agent-0.0.50}/README.md

@@ -30,7 +30,8 @@ To get started, you can install the library using pip:
 pip install vision-agent
 ```
 
-Ensure you have an OpenAI API key and set it as an environment variable
+Ensure you have an OpenAI API key and set it as an environment variable (if you are
+using Azure OpenAI please see the additional setup section):
 
 ```bash
 export OPENAI_API_KEY="your-api-key"

@@ -109,3 +110,23 @@ you. For example:
 
 
 It also has a basic set of calculate tools such as add, subtract, multiply and divide.
+
+### Additional Setup
+If you want to use Azure OpenAI models, you can set the environment variable:
+
+```bash
+export AZURE_OPENAI_API_KEY="your-api-key"
+export AZURE_OPENAI_ENDPOINT="your-endpoint"
+```
+
+You can then run Vision Agent using the Azure OpenAI models:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.VisionAgent(
+>>>     task_model=va.llm.AzureOpenAILLM(),
+>>>     answer_model=va.lmm.AzureOpenAILMM(),
+>>>     reflection_model=va.lmm.AzureOpenAILMM(),
+>>> )
+```
+
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/agent.py

@@ -11,3 +11,10 @@ class Agent(ABC):
         image: Optional[Union[str, Path]] = None,
     ) -> str:
         pass
+
+    @abstractmethod
+    def log_progress(self, description: str) -> None:
+        """Log the progress of the agent.
+        This is a hook that is intended for reporting the progress of the agent.
+        """
+        pass
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/easytool_prompts.py

@@ -1,11 +1,11 @@
-TASK_DECOMPOSE = """You need to decompose a
+TASK_DECOMPOSE = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
 This is the user's question: {question}
-This is tool list:
+This is the tool list:
 {tools}
 
 Please note that:
 1. You should only decompose this complex user's question into some simple subtasks which can be executed easily by using one single tool in the tool list.
-2. If one subtask
+2. If one subtask needs the results from another subtask, you should write clearly. For example:
 {{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
 3. You must ONLY output in a parsible JSON format. An example output looks like:
 

@@ -13,7 +13,7 @@ Please note that:
 
 Output: """
 
-TASK_TOPOLOGY = """Given a
+TASK_TOPOLOGY = """Given a user's complex question, I have decomposed this question into some simple subtasks. I think there exist logical connections and order among the tasks. Thus, you need to help me output these logical connections and order.
 You must ONLY output in a parsible JSON format with the following format:
 
 {{"Tasks": [{{"task": task, "id", task_id, "dep": [dependency_task_id1, dependency_task_id2, ...]}}]}}

@@ -21,7 +21,7 @@ You must ONLY output in a parsible JSON format with the following format:
 The "dep" field denotes the id of the previous task which generates a new resource upon which the current task depends. If there are no dependencies, set "dep" to -1.
 
 
-This is user's question: {question}
+This is the user's question: {question}
 
 These are subtasks of this question:
 

@@ -34,7 +34,7 @@ These are the tools you can select to solve the question:
 {tools}
 
 Please note that:
-1. You should only
+1. You should only choose one tool from the Tool List to solve this question.
 2. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:
 
 Example 1: {{"ID": 1}}

@@ -42,22 +42,22 @@ Example 2: {{"ID": 2}}
 
 Output: """
 
-CHOOSE_PARAMETER = """Given a user's question and
+CHOOSE_PARAMETER = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
 Please note that:
 1. The Example in the API tool documentation can help you better understand the use of the API.
-2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
+2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
 3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
 4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
-5. If you need to use this API multiple times
-6. You must ONLY output in a parsible JSON format. Two
+5. If you need to use this API multiple times, please set "Parameters" to a list.
+6. You must ONLY output in a parsible JSON format. Two example outputs looks like:
 
 Example 1: {{"Parameters":{{"input": [1,2,3]}}}}
 Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}
 
-
+These are logs of previous questions and answers:
 {previous_log}
 This is the current user's question: {question}
-This is API tool documentation: {tool_usage}
+This is the API tool documentation: {tool_usage}
 Output: """
 
 

@@ -67,7 +67,7 @@ Please note that:
 2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
 3. If the API tool does not provide useful information in the response, please answer with your knowledge.
 4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers.
-
+These are logs of previous questions and answers:
 {previous_log}
 This is the user's question: {question}
 This is the response output by the API tool:

@@ -75,7 +75,7 @@ This is the response output by the API tool:
 We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
 Output: """
 
-ANSWER_SUMMARIZE = """We break down a complex user's problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question
+ANSWER_SUMMARIZE = """We break down a complex user's problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question.
 This is the user's question: {question}
 These are subtasks and their answers: {answers}
 Final answer: """
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/reflexion_prompts.py

@@ -9,7 +9,7 @@ Relevant Context: {context}
 Question: {question}{scratchpad}"""
 
 
-COT_REFLECT_INSTRUCTION = """You are an advanced reasoning agent that can improve based on self
+COT_REFLECT_INSTRUCTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given access to relevant context and a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[<answer>] or there is a phrasing discrepancy with your provided answer and the answer key. In a few sentences, diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
 Here are some examples:
 {examples}
 (END OF EXAMPLES)
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/vision_agent.py

@@ -244,79 +244,6 @@ def function_call(tool: Callable, parameters: Dict[str, Any]) -> Any:
         return str(e)
 
 
-def retrieval(
-    model: Union[LLM, LMM, Agent],
-    question: str,
-    tools: Dict[int, Any],
-    previous_log: str,
-    reflections: str,
-) -> Tuple[Dict, str]:
-    tool_id = choose_tool(
-        model, question, {k: v["description"] for k, v in tools.items()}, reflections
-    )
-    if tool_id is None:
-        return {}, ""
-
-    tool_instructions = tools[tool_id]
-    tool_usage = tool_instructions["usage"]
-    tool_name = tool_instructions["name"]
-
-    parameters = choose_parameter(
-        model, question, tool_usage, previous_log, reflections
-    )
-    if parameters is None:
-        return {}, ""
-    tool_results = {"task": question, "tool_name": tool_name, "parameters": parameters}
-
-    _LOGGER.info(
-        f"""Going to run the following tool(s) in sequence:
-{tabulate([tool_results], headers="keys", tablefmt="mixed_grid")}"""
-    )
-
-    def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any:
-        call_results: List[Any] = []
-        if isinstance(result["parameters"], Dict):
-            call_results.append(
-                function_call(tools[tool_id]["class"], result["parameters"])
-            )
-        elif isinstance(result["parameters"], List):
-            for parameters in result["parameters"]:
-                call_results.append(function_call(tools[tool_id]["class"], parameters))
-        return call_results
-
-    call_results = parse_tool_results(tool_results)
-    tool_results["call_results"] = call_results
-
-    call_results_str = str(call_results)
-    # _LOGGER.info(f"\tCall Results: {call_results_str}")
-    return tool_results, call_results_str
-
-
-def create_tasks(
-    task_model: Union[LLM, LMM], question: str, tools: Dict[int, Any], reflections: str
-) -> List[Dict]:
-    tasks = task_decompose(
-        task_model,
-        question,
-        {k: v["description"] for k, v in tools.items()},
-        reflections,
-    )
-    if tasks is not None:
-        task_list = [{"task": task, "id": i + 1} for i, task in enumerate(tasks)]
-        task_list = task_topology(task_model, question, task_list)
-        try:
-            task_list = topological_sort(task_list)
-        except Exception:
-            _LOGGER.error(f"Failed topological_sort on: {task_list}")
-    else:
-        task_list = []
-    _LOGGER.info(
-        f"""Planned tasks:
-{tabulate(task_list, headers="keys", tablefmt="mixed_grid")}"""
-    )
-    return task_list
-
-
 def self_reflect(
     reflect_model: Union[LLM, LMM],
     question: str,

@@ -350,7 +277,7 @@ def parse_reflect(reflect: str) -> bool:
 def visualize_result(all_tool_results: List[Dict]) -> List[str]:
     image_to_data: Dict[str, Dict] = {}
     for tool_result in all_tool_results:
-        if
+        if tool_result["tool_name"] not in ["grounding_sam_", "grounding_dino_"]:
             continue
 
         parameters = tool_result["parameters"]

@@ -368,7 +295,6 @@ def visualize_result(all_tool_results: List[Dict]) -> List[str]:
             continue
 
         for param, call_result in zip(parameters, tool_result["call_results"]):
-
             # calls can fail, so we need to check if the call was successful
             if not isinstance(call_result, dict):
                 continue

@@ -421,7 +347,18 @@ class VisionAgent(Agent):
         reflect_model: Optional[Union[LLM, LMM]] = None,
         max_retries: int = 2,
         verbose: bool = False,
+        report_progress_callback: Optional[Callable[[str], None]] = None,
     ):
+        """VisionAgent constructor.
+
+        Parameters
+            task_model: the model to use for task decomposition.
+            answer_model: the model to use for reasoning and concluding the answer.
+            reflect_model: the model to use for self reflection.
+            max_retries: maximum number of retries to attempt to complete the task.
+            verbose: whether to print more logs.
+            report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple VisionAgent instances are running in parallel. This callback ensures that the progress are not mixed up.
+        """
         self.task_model = (
             OpenAILLM(json_mode=True, temperature=0.1)
             if task_model is None

@@ -434,8 +371,8 @@ class VisionAgent(Agent):
             OpenAILMM(temperature=0.1) if reflect_model is None else reflect_model
         )
         self.max_retries = max_retries
-
         self.tools = TOOLS
+        self.report_progress_callback = report_progress_callback
         if verbose:
             _LOGGER.setLevel(logging.INFO)
 

@@ -458,6 +395,11 @@ class VisionAgent(Agent):
             input = [{"role": "user", "content": input}]
         return self.chat(input, image=image)
 
+    def log_progress(self, description: str) -> None:
+        _LOGGER.info(description)
+        if self.report_progress_callback:
+            self.report_progress_callback(description)
+
     def chat_with_workflow(
         self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
     ) -> Tuple[str, List[Dict]]:

@@ -470,7 +412,9 @@ class VisionAgent(Agent):
         all_tool_results: List[Dict] = []
 
         for _ in range(self.max_retries):
-            task_list = create_tasks(
+            task_list = self.create_tasks(
+                self.task_model, question, self.tools, reflections
+            )
 
             task_depend = {"Original Quesiton": question}
             previous_log = ""

@@ -482,7 +426,7 @@ class VisionAgent(Agent):
             for task in task_list:
                 task_str = task["task"]
                 previous_log = str(task_depend)
-                tool_results, call_results = retrieval(
+                tool_results, call_results = self.retrieval(
                     self.task_model,
                     task_str,
                     self.tools,

@@ -496,8 +440,8 @@ class VisionAgent(Agent):
                 tool_results["answer"] = answer
                 all_tool_results.append(tool_results)
 
-
-
+                self.log_progress(f"\tCall Result: {call_results}")
+                self.log_progress(f"\tAnswer: {answer}")
                 answers.append({"task": task_str, "answer": answer})
                 task_depend[task["id"]]["answer"] = answer  # type: ignore
                 task_depend[task["id"]]["call_result"] = call_results  # type: ignore

@@ -515,12 +459,15 @@ class VisionAgent(Agent):
                 final_answer,
                 visualized_images[0] if len(visualized_images) > 0 else image,
             )
-
+            self.log_progress(f"Reflection: {reflection}")
             if parse_reflect(reflection):
                 break
             else:
                 reflections += reflection
-
+        # '<ANSWER>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
+        self.log_progress(
+            f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</<ANSWER>"
+        )
         return final_answer, all_tool_results
 
     def chat(

@@ -528,3 +475,87 @@ class VisionAgent(Agent):
     ) -> str:
         answer, _ = self.chat_with_workflow(chat, image=image)
         return answer
+
+    def retrieval(
+        self,
+        model: Union[LLM, LMM, Agent],
+        question: str,
+        tools: Dict[int, Any],
+        previous_log: str,
+        reflections: str,
+    ) -> Tuple[Dict, str]:
+        tool_id = choose_tool(
+            model,
+            question,
+            {k: v["description"] for k, v in tools.items()},
+            reflections,
+        )
+        if tool_id is None:
+            return {}, ""
+
+        tool_instructions = tools[tool_id]
+        tool_usage = tool_instructions["usage"]
+        tool_name = tool_instructions["name"]
+
+        parameters = choose_parameter(
+            model, question, tool_usage, previous_log, reflections
+        )
+        if parameters is None:
+            return {}, ""
+        tool_results = {
+            "task": question,
+            "tool_name": tool_name,
+            "parameters": parameters,
+        }
+
+        self.log_progress(
+            f"""Going to run the following tool(s) in sequence:
+{tabulate([tool_results], headers="keys", tablefmt="mixed_grid")}"""
+        )
+
+        def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any:
+            call_results: List[Any] = []
+            if isinstance(result["parameters"], Dict):
+                call_results.append(
+                    function_call(tools[tool_id]["class"], result["parameters"])
+                )
+            elif isinstance(result["parameters"], List):
+                for parameters in result["parameters"]:
+                    call_results.append(
+                        function_call(tools[tool_id]["class"], parameters)
+                    )
+            return call_results
+
+        call_results = parse_tool_results(tool_results)
+        tool_results["call_results"] = call_results
+
+        call_results_str = str(call_results)
+        return tool_results, call_results_str
+
+    def create_tasks(
+        self,
+        task_model: Union[LLM, LMM],
+        question: str,
+        tools: Dict[int, Any],
+        reflections: str,
+    ) -> List[Dict]:
+        tasks = task_decompose(
+            task_model,
+            question,
+            {k: v["description"] for k, v in tools.items()},
+            reflections,
+        )
+        if tasks is not None:
+            task_list = [{"task": task, "id": i + 1} for i, task in enumerate(tasks)]
+            task_list = task_topology(task_model, question, task_list)
+            try:
+                task_list = topological_sort(task_list)
+            except Exception:
+                _LOGGER.error(f"Failed topological_sort on: {task_list}")
+        else:
+            task_list = []
+        self.log_progress(
+            f"""Planned tasks:
+{tabulate(task_list, headers="keys", tablefmt="mixed_grid")}"""
+        )
+        return task_list
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/vision_agent_prompts.py

@@ -1,4 +1,4 @@
-VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self
+VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You must determine if the agent's answer was correct or incorrect. If the agent's answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure with the tools available. Use complete sentences.
 
 User's question: {question}
 

@@ -13,14 +13,14 @@ Final answer:
 
 Reflection: """
 
-TASK_DECOMPOSE = """You need to decompose a
+TASK_DECOMPOSE = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
 This is the user's question: {question}
-This is tool list:
+This is the tool list:
 {tools}
 
 Please note that:
-1. You should only decompose this
-2. If one subtask
+1. You should only decompose this user's complex question into some simple subtasks which can be executed easily by using one single tool in the tool list.
+2. If one subtask needs the results from another subtask, you should write clearly. For example:
 {{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
 3. You must ONLY output in a parsible JSON format. An example output looks like:
 

@@ -28,18 +28,18 @@ Please note that:
 
 Output: """
 
-TASK_DECOMPOSE_DEPENDS = """You need to decompose a
+TASK_DECOMPOSE_DEPENDS = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
 This is the user's question: {question}
 
-This is tool list:
+This is the tool list:
 {tools}
 
 This is a reflection from a previous failed attempt:
 {reflections}
 
 Please note that:
-1. You should only decompose this
-2. If one subtask
+1. You should only decompose this user's complex question into some simple subtasks which can be executed easily by using one single tool in the tool list.
+2. If one subtask needs the results from another subtask, you should write clearly. For example:
 {{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
 3. You must ONLY output in a parsible JSON format. An example output looks like:
 

@@ -53,7 +53,7 @@ These are the tools you can select to solve the question:
 {tools}
 
 Please note that:
-1. You should only
+1. You should only choose one tool from the Tool List to solve this question.
 2. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:
 
 Example 1: {{"ID": 1}}

@@ -70,7 +70,7 @@ This is a reflection from a previous failed attempt:
 {reflections}
 
 Please note that:
-1. You should only
+1. You should only choose one tool from the Tool List to solve this question.
 2. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:
 
 Example 1: {{"ID": 1}}

@@ -78,14 +78,14 @@ Example 2: {{"ID": 2}}
 
 Output: """
 
-CHOOSE_PARAMETER_DEPENDS = """Given a user's question and
+CHOOSE_PARAMETER_DEPENDS = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
 Please note that:
 1. The Example in the API tool documentation can help you better understand the use of the API.
-2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
+2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
 3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
 4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
-5. If you need to use this API multiple times
-6. You must ONLY output in a parsible JSON format. Two
+5. If you need to use this API multiple times, please set "Parameters" to a list.
+6. You must ONLY output in a parsible JSON format. Two example outputs look like:
 
 Example 1: {{"Parameters":{{"input": [1,2,3]}}}}
 Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}

@@ -93,16 +93,16 @@ Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}
 This is a reflection from a previous failed attempt:
 {reflections}
 
-
+These are logs of previous questions and answers:
 {previous_log}
 
 This is the current user's question: {question}
-This is API tool documentation: {tool_usage}
+This is the API tool documentation: {tool_usage}
 Output: """
 
 ANSWER_GENERATE_DEPENDS = """You should answer the question based on the response output by the API tool.
 Please note that:
-1.
+1. You should try to organize the response into a natural language answer.
 2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
 3. If the API tool does not provide useful information in the response, please answer with your knowledge.
 4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers.

@@ -110,7 +110,7 @@ Please note that:
 This is a reflection from a previous failed attempt:
 {reflections}
 
-
+These are logs of previous questions and answers:
 {previous_log}
 
 This is the user's question: {question}

@@ -121,7 +121,7 @@ This is the response output by the API tool:
 We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
 Output: """
 
-ANSWER_SUMMARIZE_DEPENDS = """We break down a
+ANSWER_SUMMARIZE_DEPENDS = """We break down a user's complex problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question
 This is the user's question: {question}
 
 These are subtasks and their answers:
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/image_utils.py

@@ -78,7 +78,7 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
         data = Image.open(data)
     if isinstance(data, Image.Image):
         buffer = BytesIO()
-        data.save(buffer, format="
+        data.convert("RGB").save(buffer, format="JPEG")
         return base64.b64encode(buffer.getvalue()).decode("utf-8")
     else:
         arr_bytes = data.tobytes()
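
The `convert_to_b64` change routes every PIL image through `convert("RGB")` before saving it as JPEG, which matters for images that carry an alpha channel (JPEG cannot store RGBA). A small sketch of the behaviour this enables, using a synthetic image rather than anything from the package:

```python
from PIL import Image

from vision_agent.image_utils import convert_to_b64  # module path taken from the file list above

# An RGBA image; with the new convert("RGB") call it is flattened and encoded as JPEG.
rgba = Image.new("RGBA", (8, 8), (255, 0, 0, 128))
b64_jpeg = convert_to_b64(rgba)
print(b64_jpeg[:32], "...")
```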
vision_agent-0.0.50/vision_agent/llm/__init__.py

@@ -0,0 +1 @@
+from .llm import LLM, AzureOpenAILLM, OpenAILLM
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/llm/llm.py

@@ -1,8 +1,9 @@
 import json
+import os
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, List, Mapping, Union, cast
+from typing import Any, Callable, Dict, List, Mapping, Optional, Union, cast
 
-from openai import OpenAI
+from openai import AzureOpenAI, OpenAI
 
 from vision_agent.tools import (
     CHOOSE_PARAMS,

@@ -33,11 +34,16 @@ class OpenAILLM(LLM):
     def __init__(
         self,
         model_name: str = "gpt-4-turbo-preview",
+        api_key: Optional[str] = None,
         json_mode: bool = False,
         **kwargs: Any
     ):
+        if not api_key:
+            self.client = OpenAI()
+        else:
+            self.client = OpenAI(api_key=api_key)
+
         self.model_name = model_name
-        self.client = OpenAI()
         self.kwargs = kwargs
         if json_mode:
             self.kwargs["response_format"] = {"type": "json_object"}

@@ -120,3 +126,32 @@ class OpenAILLM(LLM):
         ]
 
         return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})
+
+
+class AzureOpenAILLM(OpenAILLM):
+    def __init__(
+        self,
+        model_name: str = "gpt-4-turbo-preview",
+        api_key: Optional[str] = None,
+        api_version: str = "2024-02-01",
+        azure_endpoint: Optional[str] = None,
+        json_mode: bool = False,
+        **kwargs: Any
+    ):
+        if not api_key:
+            api_key = os.getenv("AZURE_OPENAI_API_KEY")
+        if not azure_endpoint:
+            azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
+
+        if not api_key:
+            raise ValueError("Azure OpenAI API key is required.")
+        if not azure_endpoint:
+            raise ValueError("Azure OpenAI endpoint is required.")
+
+        self.client = AzureOpenAI(
+            api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
+        )
+        self.model_name = model_name
+        self.kwargs = kwargs
+        if json_mode:
+            self.kwargs["response_format"] = {"type": "json_object"}
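
For reference, the new `AzureOpenAILLM` resolves its credentials from the two environment variables documented in the README section above, or from explicit arguments. A minimal construction sketch (the endpoint value is a placeholder, not a real resource):

```python
import os

from vision_agent.llm import AzureOpenAILLM  # exported by the new llm/__init__.py above

# Either export these before starting Python, or pass api_key/azure_endpoint explicitly.
os.environ.setdefault("AZURE_OPENAI_API_KEY", "your-api-key")
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "https://your-resource.openai.azure.com")

llm = AzureOpenAILLM()                     # api_version defaults to "2024-02-01"
json_llm = AzureOpenAILLM(json_mode=True)  # same response_format handling as OpenAILLM
```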
vision_agent-0.0.50/vision_agent/lmm/__init__.py

@@ -0,0 +1 @@
+from .lmm import LMM, AzureOpenAILMM, LLaVALMM, OpenAILMM, get_lmm
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/lmm/lmm.py

@@ -1,12 +1,13 @@
 import base64
 import json
 import logging
+import os
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union, cast
 
 import requests
-from openai import OpenAI
+from openai import AzureOpenAI, OpenAI
 
 from vision_agent.tools import (
     CHOOSE_PARAMS,

@@ -99,12 +100,18 @@ class OpenAILMM(LMM):
     def __init__(
         self,
         model_name: str = "gpt-4-vision-preview",
+        api_key: Optional[str] = None,
         max_tokens: int = 1024,
         **kwargs: Any,
     ):
+        if not api_key:
+            self.client = OpenAI()
+        else:
+            self.client = OpenAI(api_key=api_key)
+
+        self.client = OpenAI(api_key=api_key)
         self.model_name = model_name
         self.max_tokens = max_tokens
-        self.client = OpenAI()
         self.kwargs = kwargs
 
     def __call__(

@@ -248,6 +255,34 @@ class OpenAILMM(LMM):
         return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})
 
 
+class AzureOpenAILMM(OpenAILMM):
+    def __init__(
+        self,
+        model_name: str = "gpt-4-vision-preview",
+        api_key: Optional[str] = None,
+        api_version: str = "2024-02-01",
+        azure_endpoint: Optional[str] = None,
+        max_tokens: int = 1024,
+        **kwargs: Any,
+    ):
+        if not api_key:
+            api_key = os.getenv("AZURE_OPENAI_API_KEY")
+        if not azure_endpoint:
+            azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
+
+        if not api_key:
+            raise ValueError("OpenAI API key is required.")
+        if not azure_endpoint:
+            raise ValueError("Azure OpenAI endpoint is required.")
+
+        self.client = AzureOpenAI(
+            api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
+        )
+        self.model_name = model_name
+        self.max_tokens = max_tokens
+        self.kwargs = kwargs
+
+
 def get_lmm(name: str) -> LMM:
     if name == "openai":
         return OpenAILMM(name)
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/tools/prompts.py

@@ -6,14 +6,14 @@ CHOOSE_PARAMS = (
     "This is the API tool documentation: {api_doc}\n"
     "Please note that: \n"
     "1. The Example in the API tool documentation can help you better understand the use of the API.\n"
-    '2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If no
+    '2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no parameters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}\n'
     "3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.\n"
     '4. If you need to use this API multiple times, please set "Parameters" to a list.\n'
-    "5. You must ONLY output in a parsible JSON format. Two
+    "5. You must ONLY output in a parsible JSON format. Two example outputs look like:\n"
     "'''\n"
     'Example 1: {{"Parameters":{{"keyword": "Artificial Intelligence", "language": "English"}}}}\n'
     'Example 2: {{"Parameters":[{{"keyword": "Artificial Intelligence", "language": "English"}}, {{"keyword": "Machine Learning", "language": "English"}}]}}\n'
     "'''\n"
-    "This is user's question: {question}\n"
+    "This is the user's question: {question}\n"
     "Output:\n"
 )
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/tools/tools.py

@@ -51,6 +51,25 @@ class Tool(ABC):
     usage: Dict
 
 
+class NoOp(Tool):
+    name = "noop_"
+    description = (
+        "'noop_' is a no-op tool that does nothing if you do not need to use a tool."
+    )
+    usage = {
+        "required_parameters": [],
+        "examples": [
+            {
+                "scenario": "If you do not want to use a tool.",
+                "parameters": {},
+            }
+        ],
+    }
+
+    def __call__(self) -> None:
+        return None
+
+
 class CLIP(Tool):
     r"""CLIP is a tool that can classify or tag any image given a set if input classes
     or tags.

@@ -59,32 +78,32 @@ class CLIP(Tool):
     -------
     >>> import vision_agent as va
     >>> clip = va.tools.CLIP()
-    >>> clip(
+    >>> clip("red line, yellow dot", "ct_scan1.jpg"))
     [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
     """
 
-    _ENDPOINT = "https://
+    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
 
     name = "clip_"
     description = "'clip_' is a tool that can classify or tag any image given a set of input classes or tags."
     usage = {
         "required_parameters": [
-            {"name": "prompt", "type": "
+            {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
         "examples": [
             {
                 "scenario": "Can you classify this image as a cat? Image name: cat.jpg",
-                "parameters": {"prompt":
+                "parameters": {"prompt": "cat", "image": "cat.jpg"},
             },
             {
                 "scenario": "Can you tag this photograph with cat or dog? Image name: cat_dog.jpg",
-                "parameters": {"prompt":
+                "parameters": {"prompt": "cat, dog", "image": "cat_dog.jpg"},
             },
             {
                 "scenario": "Can you build me a classifier that classifies red shirts, green shirts and other? Image name: shirts.jpg",
                 "parameters": {
-                    "prompt":
+                    "prompt": "red shirt, green shirt, other",
                     "image": "shirts.jpg",
                 },
             },

@@ -92,11 +111,11 @@ class CLIP(Tool):
     }
 
     # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(self, prompt:
+    def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict:
         """Invoke the CLIP model.
 
         Parameters:
-            prompt: a list of classes or tags to classify the image.
+            prompt: a string includes a list of classes or tags to classify the image.
             image: the input image to classify.
 
         Returns:

@@ -104,8 +123,9 @@ class CLIP(Tool):
         """
         image_b64 = convert_to_b64(image)
         data = {
-            "
-            "
+            "prompt": prompt,
+            "image": image_b64,
+            "tool": "closed_set_image_classification",
         }
         res = requests.post(
             self._ENDPOINT,

@@ -119,10 +139,11 @@ class CLIP(Tool):
             _LOGGER.error(f"Request failed: {resp_json}")
             raise ValueError(f"Request failed: {resp_json}")
 
-
-
-
-
+        resp_json["data"]["scores"] = [
+            round(prob, 4) for prob in resp_json["data"]["scores"]
+        ]
+
+        return resp_json["data"]  # type: ignore
 
 
 class GroundingDINO(Tool):

@@ -139,7 +160,7 @@ class GroundingDINO(Tool):
     'scores': [0.98, 0.02]}]
     """
 
-    _ENDPOINT = "https://
+    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
 
     name = "grounding_dino_"
     description = "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions."

@@ -148,6 +169,10 @@ class GroundingDINO(Tool):
             {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
+        "optional_parameters": [
+            {"name": "box_threshold", "type": "float"},
+            {"name": "iou_threshold", "type": "float"},
+        ],
         "examples": [
             {
                 "scenario": "Can you build me a car detector?",

@@ -162,32 +187,44 @@ class GroundingDINO(Tool):
                 "parameters": {
                     "prompt": "red shirt. green shirt",
                     "image": "shirts.jpg",
+                    "box_threshold": 0.20,
+                    "iou_threshold": 0.75,
                 },
             },
         ],
     }
 
     # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(
+    def __call__(
+        self,
+        prompt: str,
+        image: Union[str, Path, ImageType],
+        box_threshold: float = 0.20,
+        iou_threshold: float = 0.75,
+    ) -> Dict:
         """Invoke the Grounding DINO model.
 
         Parameters:
             prompt: one or multiple class names to detect. The classes should be separated by a period if there are multiple classes. E.g. "big dog . small cat"
             image: the input image to run against.
+            box_threshold: the threshold to filter out the bounding boxes with low scores.
+            iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold.
 
         Returns:
             A list of dictionaries containing the labels, scores, and bboxes. Each dictionary contains the detection result for an image.
         """
         image_size = get_image_size(image)
         image_b64 = convert_to_b64(image)
-
+        request_data = {
             "prompt": prompt,
-            "
+            "image": image_b64,
+            "tool": "visual_grounding",
+            "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
         }
         res = requests.post(
             self._ENDPOINT,
             headers={"Content-Type": "application/json"},
-            json=
+            json=request_data,
         )
         resp_json: Dict[str, Any] = res.json()
         if (

@@ -195,16 +232,15 @@ class GroundingDINO(Tool):
         ) or "statusCode" not in resp_json:
             _LOGGER.error(f"Request failed: {resp_json}")
             raise ValueError(f"Request failed: {resp_json}")
-
-
-
-
-
-
-
-
-
-        return cast(Dict, resp_data)
+        data: Dict[str, Any] = resp_json["data"]
+        if "bboxes" in data:
+            data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]]
+        if "scores" in data:
+            data["scores"] = [round(score, 2) for score in data["scores"]]
+        if "labels" in data:
+            data["labels"] = [label for label in data["labels"]]
+        data["size"] = (image_size[1], image_size[0])
+        return data
 
 
 class GroundingSAM(Tool):

@@ -215,7 +251,7 @@ class GroundingSAM(Tool):
     -------
     >>> import vision_agent as va
     >>> t = va.tools.GroundingSAM()
-    >>> t(
+    >>> t("red line, yellow dot", "ct_scan1.jpg"])
     [{'labels': ['yellow dot', 'red line'],
     'bboxes': [[0.38, 0.15, 0.59, 0.7], [0.48, 0.25, 0.69, 0.71]],
     'masks': [array([[0, 0, 0, ..., 0, 0, 0],

@@ -230,55 +266,71 @@ class GroundingSAM(Tool):
     [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
     """
 
-    _ENDPOINT = "https://
+    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
 
     name = "grounding_sam_"
    description = "'grounding_sam_' is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions."
     usage = {
         "required_parameters": [
-            {"name": "prompt", "type": "
+            {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
+        "optional_parameters": [
+            {"name": "box_threshold", "type": "float"},
+            {"name": "iou_threshold", "type": "float"},
+        ],
         "examples": [
             {
                 "scenario": "Can you build me a car segmentor?",
-                "parameters": {"prompt":
+                "parameters": {"prompt": "car", "image": ""},
             },
             {
                 "scenario": "Can you segment the person on the left? Image name: person.jpg",
-                "parameters": {"prompt":
+                "parameters": {"prompt": "person on the left", "image": "person.jpg"},
             },
             {
                 "scenario": "Can you build me a tool that segments red shirts and green shirts? Image name: shirts.jpg",
                 "parameters": {
-                    "prompt":
+                    "prompt": "red shirt, green shirt",
                     "image": "shirts.jpg",
+                    "box_threshold": 0.20,
+                    "iou_threshold": 0.75,
                 },
             },
         ],
     }
 
     # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(
+    def __call__(
+        self,
+        prompt: str,
+        image: Union[str, ImageType],
+        box_threshold: float = 0.2,
+        iou_threshold: float = 0.75,
+    ) -> Dict:
         """Invoke the Grounding SAM model.
 
         Parameters:
             prompt: a list of classes to segment.
             image: the input image to segment.
+            box_threshold: the threshold to filter out the bounding boxes with low scores.
+            iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold.
 
         Returns:
             A list of dictionaries containing the labels, scores, bboxes and masks. Each dictionary contains the segmentation result for an image.
         """
         image_size = get_image_size(image)
         image_b64 = convert_to_b64(image)
-
-        "
+        request_data = {
+            "prompt": prompt,
             "image": image_b64,
+            "tool": "visual_grounding_segment",
+            "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
         }
         res = requests.post(
             self._ENDPOINT,
             headers={"Content-Type": "application/json"},
-            json=
+            json=request_data,
         )
         resp_json: Dict[str, Any] = res.json()
         if (

@@ -286,14 +338,19 @@ class GroundingSAM(Tool):
         ) or "statusCode" not in resp_json:
             _LOGGER.error(f"Request failed: {resp_json}")
             raise ValueError(f"Request failed: {resp_json}")
-
+        data: Dict[str, Any] = resp_json["data"]
         ret_pred: Dict[str, List] = {"labels": [], "bboxes": [], "masks": []}
-
-
-
-
-
-        ret_pred["masks"]
+        if "bboxes" in data:
+            ret_pred["bboxes"] = [
+                normalize_bbox(box, image_size) for box in data["bboxes"]
+            ]
+        if "masks" in data:
+            ret_pred["masks"] = [
+                rle_decode(mask_rle=mask, shape=data["mask_shape"])
+                for mask in data["masks"]
+            ]
+        ret_pred["labels"] = data["labels"]
+        ret_pred["scores"] = data["scores"]
         return ret_pred
 
 

@@ -302,8 +359,14 @@ class AgentGroundingSAM(GroundingSAM):
     returns the file name. This makes it easier for agents to use.
     """
 
-    def __call__(
-
+    def __call__(
+        self,
+        prompt: str,
+        image: Union[str, ImageType],
+        box_threshold: float = 0.2,
+        iou_threshold: float = 0.75,
+    ) -> Dict:
+        rets = super().__call__(prompt, image, box_threshold, iou_threshold)
         mask_files = []
         for mask in rets["masks"]:
             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:

@@ -384,7 +447,7 @@ class BboxArea(Tool):
     name = "bbox_area_"
     description = "'bbox_area_' returns the area of the bounding box in pixels normalized to 2 decimal places."
     usage = {
-        "required_parameters": [{"name": "
+        "required_parameters": [{"name": "bboxes", "type": "List[int]"}],
         "examples": [
             {
                 "scenario": "If you want to calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",

@@ -426,7 +489,8 @@ class SegArea(Tool):
     def __call__(self, masks: Union[str, Path]) -> float:
         pil_mask = Image.open(str(masks))
         np_mask = np.array(pil_mask)
-
+        np_mask = np.clip(np_mask, 0, 1)
+        return cast(float, round(np.sum(np_mask), 2))
 
 
 class BboxIoU(Tool):

@@ -612,6 +676,7 @@ TOOLS = {
     i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c}
     for i, c in enumerate(
         [
+            NoOp,
             CLIP,
             GroundingDINO,
             AgentGroundingSAM,
vision_agent-0.0.47/vision_agent/llm/__init__.py

@@ -1 +0,0 @@
-from .llm import LLM, OpenAILLM
vision_agent-0.0.47/vision_agent/lmm/__init__.py

@@ -1 +0,0 @@
-from .lmm import LMM, LLaVALMM, OpenAILMM, get_lmm