vision-agent 0.0.48__py3-none-any.whl → 0.0.49__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/agent.py +7 -0
- vision_agent/agent/easytool_prompts.py +14 -14
- vision_agent/agent/reflexion_prompts.py +1 -1
- vision_agent/agent/vision_agent.py +113 -82
- vision_agent/agent/vision_agent_prompts.py +20 -20
- vision_agent/image_utils.py +1 -1
- vision_agent/llm/__init__.py +1 -1
- vision_agent/llm/llm.py +38 -3
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +37 -2
- vision_agent/tools/prompts.py +3 -3
- vision_agent/tools/tools.py +95 -50
- {vision_agent-0.0.48.dist-info → vision_agent-0.0.49.dist-info}/METADATA +23 -2
- vision_agent-0.0.49.dist-info/RECORD +26 -0
- vision_agent-0.0.48.dist-info/RECORD +0 -26
- {vision_agent-0.0.48.dist-info → vision_agent-0.0.49.dist-info}/LICENSE +0 -0
- {vision_agent-0.0.48.dist-info → vision_agent-0.0.49.dist-info}/WHEEL +0 -0
vision_agent/agent/agent.py
CHANGED
@@ -11,3 +11,10 @@ class Agent(ABC):
         image: Optional[Union[str, Path]] = None,
     ) -> str:
         pass
+
+    @abstractmethod
+    def log_progress(self, description: str) -> None:
+        """Log the progress of the agent.
+        This is a hook that is intended for reporting the progress of the agent.
+        """
+        pass
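The new abstract `log_progress` hook means every `Agent` subclass now has to provide a progress-reporting method. Below is a minimal sketch of a conforming subclass; the `EchoAgent` name, its echo behavior, and the first parameter of `__call__` (not visible in this hunk) are assumptions, only the `log_progress` signature comes from the diff above.

```python
from pathlib import Path
from typing import Dict, List, Optional, Union

from vision_agent.agent.agent import Agent


class EchoAgent(Agent):
    """Hypothetical minimal Agent subclass illustrating the new abstract hook."""

    def __call__(
        self,
        input: Union[List[Dict[str, str]], str],
        image: Optional[Union[str, Path]] = None,
    ) -> str:
        # A real agent would plan and call tools here; this sketch just echoes.
        self.log_progress("received a request")
        return input if isinstance(input, str) else str(input)

    def log_progress(self, description: str) -> None:
        # Required by the 0.0.49 Agent ABC; printing is the simplest possible sink.
        print(description)
```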
vision_agent/agent/easytool_prompts.py
CHANGED
@@ -1,11 +1,11 @@
-TASK_DECOMPOSE = """You need to decompose a
+TASK_DECOMPOSE = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
 This is the user's question: {question}
-This is tool list:
+This is the tool list:
 {tools}

 Please note that:
 1. You should only decompose this complex user's question into some simple subtasks which can be executed easily by using one single tool in the tool list.
-2. If one subtask
+2. If one subtask needs the results from another subtask, you should write clearly. For example:
 {{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
 3. You must ONLY output in a parsible JSON format. An example output looks like:

@@ -13,7 +13,7 @@ Please note that:

 Output: """

-TASK_TOPOLOGY = """Given a
+TASK_TOPOLOGY = """Given a user's complex question, I have decomposed this question into some simple subtasks. I think there exist logical connections and order among the tasks. Thus, you need to help me output these logical connections and order.
 You must ONLY output in a parsible JSON format with the following format:

 {{"Tasks": [{{"task": task, "id", task_id, "dep": [dependency_task_id1, dependency_task_id2, ...]}}]}}
@@ -21,7 +21,7 @@ You must ONLY output in a parsible JSON format with the following format:
 The "dep" field denotes the id of the previous task which generates a new resource upon which the current task depends. If there are no dependencies, set "dep" to -1.


-This is user's question: {question}
+This is the user's question: {question}

 These are subtasks of this question:

@@ -34,7 +34,7 @@ These are the tools you can select to solve the question:
 {tools}

 Please note that:
-1. You should only
+1. You should only choose one tool from the Tool List to solve this question.
 2. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:

 Example 1: {{"ID": 1}}
@@ -42,22 +42,22 @@ Example 2: {{"ID": 2}}

 Output: """

-CHOOSE_PARAMETER = """Given a user's question and
+CHOOSE_PARAMETER = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
 Please note that:
 1. The Example in the API tool documentation can help you better understand the use of the API.
-2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
+2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
 3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
 4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
-5. If you need to use this API multiple times
-6. You must ONLY output in a parsible JSON format. Two
+5. If you need to use this API multiple times, please set "Parameters" to a list.
+6. You must ONLY output in a parsible JSON format. Two example outputs looks like:

 Example 1: {{"Parameters":{{"input": [1,2,3]}}}}
 Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}

-
+These are logs of previous questions and answers:
 {previous_log}
 This is the current user's question: {question}
-This is API tool documentation: {tool_usage}
+This is the API tool documentation: {tool_usage}
 Output: """


@@ -67,7 +67,7 @@ Please note that:
 2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
 3. If the API tool does not provide useful information in the response, please answer with your knowledge.
 4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers.
-
+These are logs of previous questions and answers:
 {previous_log}
 This is the user's question: {question}
 This is the response output by the API tool:
@@ -75,7 +75,7 @@ This is the response output by the API tool:
 We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
 Output: """

-ANSWER_SUMMARIZE = """We break down a complex user's problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question
+ANSWER_SUMMARIZE = """We break down a complex user's problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question.
 This is the user's question: {question}
 These are subtasks and their answers: {answers}
 Final answer: """
vision_agent/agent/reflexion_prompts.py
CHANGED
@@ -9,7 +9,7 @@ Relevant Context: {context}
 Question: {question}{scratchpad}"""


-COT_REFLECT_INSTRUCTION = """You are an advanced reasoning agent that can improve based on self
+COT_REFLECT_INSTRUCTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given access to relevant context and a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[<answer>] or there is a phrasing discrepancy with your provided answer and the answer key. In a few sentences, diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
 Here are some examples:
 {examples}
 (END OF EXAMPLES)
vision_agent/agent/vision_agent.py
CHANGED
@@ -244,79 +244,6 @@ def function_call(tool: Callable, parameters: Dict[str, Any]) -> Any:
         return str(e)


-def retrieval(
-    model: Union[LLM, LMM, Agent],
-    question: str,
-    tools: Dict[int, Any],
-    previous_log: str,
-    reflections: str,
-) -> Tuple[Dict, str]:
-    tool_id = choose_tool(
-        model, question, {k: v["description"] for k, v in tools.items()}, reflections
-    )
-    if tool_id is None:
-        return {}, ""
-
-    tool_instructions = tools[tool_id]
-    tool_usage = tool_instructions["usage"]
-    tool_name = tool_instructions["name"]
-
-    parameters = choose_parameter(
-        model, question, tool_usage, previous_log, reflections
-    )
-    if parameters is None:
-        return {}, ""
-    tool_results = {"task": question, "tool_name": tool_name, "parameters": parameters}
-
-    _LOGGER.info(
-        f"""Going to run the following tool(s) in sequence:
-{tabulate([tool_results], headers="keys", tablefmt="mixed_grid")}"""
-    )
-
-    def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any:
-        call_results: List[Any] = []
-        if isinstance(result["parameters"], Dict):
-            call_results.append(
-                function_call(tools[tool_id]["class"], result["parameters"])
-            )
-        elif isinstance(result["parameters"], List):
-            for parameters in result["parameters"]:
-                call_results.append(function_call(tools[tool_id]["class"], parameters))
-        return call_results
-
-    call_results = parse_tool_results(tool_results)
-    tool_results["call_results"] = call_results
-
-    call_results_str = str(call_results)
-    # _LOGGER.info(f"\tCall Results: {call_results_str}")
-    return tool_results, call_results_str
-
-
-def create_tasks(
-    task_model: Union[LLM, LMM], question: str, tools: Dict[int, Any], reflections: str
-) -> List[Dict]:
-    tasks = task_decompose(
-        task_model,
-        question,
-        {k: v["description"] for k, v in tools.items()},
-        reflections,
-    )
-    if tasks is not None:
-        task_list = [{"task": task, "id": i + 1} for i, task in enumerate(tasks)]
-        task_list = task_topology(task_model, question, task_list)
-        try:
-            task_list = topological_sort(task_list)
-        except Exception:
-            _LOGGER.error(f"Failed topological_sort on: {task_list}")
-    else:
-        task_list = []
-    _LOGGER.info(
-        f"""Planned tasks:
-{tabulate(task_list, headers="keys", tablefmt="mixed_grid")}"""
-    )
-    return task_list
-
-
 def self_reflect(
     reflect_model: Union[LLM, LMM],
     question: str,
@@ -350,7 +277,7 @@ def parse_reflect(reflect: str) -> bool:
 def visualize_result(all_tool_results: List[Dict]) -> List[str]:
     image_to_data: Dict[str, Dict] = {}
     for tool_result in all_tool_results:
-        if
+        if tool_result["tool_name"] not in ["grounding_sam_", "grounding_dino_"]:
             continue

         parameters = tool_result["parameters"]
@@ -368,7 +295,6 @@ def visualize_result(all_tool_results: List[Dict]) -> List[str]:
             continue

         for param, call_result in zip(parameters, tool_result["call_results"]):
-
             # calls can fail, so we need to check if the call was successful
             if not isinstance(call_result, dict):
                 continue
@@ -421,7 +347,18 @@ class VisionAgent(Agent):
         reflect_model: Optional[Union[LLM, LMM]] = None,
         max_retries: int = 2,
         verbose: bool = False,
+        report_progress_callback: Optional[Callable[[str], None]] = None,
     ):
+        """VisionAgent constructor.
+
+        Parameters
+            task_model: the model to use for task decomposition.
+            answer_model: the model to use for reasoning and concluding the answer.
+            reflect_model: the model to use for self reflection.
+            max_retries: maximum number of retries to attempt to complete the task.
+            verbose: whether to print more logs.
+            report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple VisionAgent instances are running in parallel. This callback ensures that the progress are not mixed up.
+        """
         self.task_model = (
             OpenAILLM(json_mode=True, temperature=0.1)
             if task_model is None
@@ -434,8 +371,8 @@ class VisionAgent(Agent):
             OpenAILMM(temperature=0.1) if reflect_model is None else reflect_model
         )
         self.max_retries = max_retries
-
         self.tools = TOOLS
+        self.report_progress_callback = report_progress_callback
         if verbose:
             _LOGGER.setLevel(logging.INFO)

@@ -458,6 +395,11 @@ class VisionAgent(Agent):
             input = [{"role": "user", "content": input}]
         return self.chat(input, image=image)

+    def log_progress(self, description: str) -> None:
+        _LOGGER.info(description)
+        if self.report_progress_callback:
+            self.report_progress_callback(description)
+
     def chat_with_workflow(
         self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
     ) -> Tuple[str, List[Dict]]:
@@ -470,7 +412,9 @@ class VisionAgent(Agent):
         all_tool_results: List[Dict] = []

         for _ in range(self.max_retries):
-            task_list = create_tasks(
+            task_list = self.create_tasks(
+                self.task_model, question, self.tools, reflections
+            )

             task_depend = {"Original Quesiton": question}
             previous_log = ""
@@ -482,7 +426,7 @@ class VisionAgent(Agent):
             for task in task_list:
                 task_str = task["task"]
                 previous_log = str(task_depend)
-                tool_results, call_results = retrieval(
+                tool_results, call_results = self.retrieval(
                     self.task_model,
                     task_str,
                     self.tools,
@@ -496,8 +440,8 @@ class VisionAgent(Agent):
                 tool_results["answer"] = answer
                 all_tool_results.append(tool_results)

-
-
+                self.log_progress(f"\tCall Result: {call_results}")
+                self.log_progress(f"\tAnswer: {answer}")
                 answers.append({"task": task_str, "answer": answer})
                 task_depend[task["id"]]["answer"] = answer  # type: ignore
                 task_depend[task["id"]]["call_result"] = call_results  # type: ignore
@@ -515,12 +459,15 @@ class VisionAgent(Agent):
                 final_answer,
                 visualized_images[0] if len(visualized_images) > 0 else image,
             )
-
+            self.log_progress(f"Reflection: {reflection}")
             if parse_reflect(reflection):
                 break
             else:
                 reflections += reflection
-
+        # '<END>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
+        self.log_progress(
+            f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</<ANSWER>"
+        )
         return final_answer, all_tool_results

     def chat(
@@ -528,3 +475,87 @@ class VisionAgent(Agent):
     ) -> str:
         answer, _ = self.chat_with_workflow(chat, image=image)
         return answer
+
+    def retrieval(
+        self,
+        model: Union[LLM, LMM, Agent],
+        question: str,
+        tools: Dict[int, Any],
+        previous_log: str,
+        reflections: str,
+    ) -> Tuple[Dict, str]:
+        tool_id = choose_tool(
+            model,
+            question,
+            {k: v["description"] for k, v in tools.items()},
+            reflections,
+        )
+        if tool_id is None:
+            return {}, ""
+
+        tool_instructions = tools[tool_id]
+        tool_usage = tool_instructions["usage"]
+        tool_name = tool_instructions["name"]
+
+        parameters = choose_parameter(
+            model, question, tool_usage, previous_log, reflections
+        )
+        if parameters is None:
+            return {}, ""
+        tool_results = {
+            "task": question,
+            "tool_name": tool_name,
+            "parameters": parameters,
+        }
+
+        self.log_progress(
+            f"""Going to run the following tool(s) in sequence:
+{tabulate([tool_results], headers="keys", tablefmt="mixed_grid")}"""
+        )
+
+        def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any:
+            call_results: List[Any] = []
+            if isinstance(result["parameters"], Dict):
+                call_results.append(
+                    function_call(tools[tool_id]["class"], result["parameters"])
+                )
+            elif isinstance(result["parameters"], List):
+                for parameters in result["parameters"]:
+                    call_results.append(
+                        function_call(tools[tool_id]["class"], parameters)
+                    )
+            return call_results
+
+        call_results = parse_tool_results(tool_results)
+        tool_results["call_results"] = call_results
+
+        call_results_str = str(call_results)
+        return tool_results, call_results_str
+
+    def create_tasks(
+        self,
+        task_model: Union[LLM, LMM],
+        question: str,
+        tools: Dict[int, Any],
+        reflections: str,
+    ) -> List[Dict]:
+        tasks = task_decompose(
+            task_model,
+            question,
+            {k: v["description"] for k, v in tools.items()},
+            reflections,
+        )
+        if tasks is not None:
+            task_list = [{"task": task, "id": i + 1} for i, task in enumerate(tasks)]
+            task_list = task_topology(task_model, question, task_list)
+            try:
+                task_list = topological_sort(task_list)
+            except Exception:
+                _LOGGER.error(f"Failed topological_sort on: {task_list}")
+        else:
+            task_list = []
+        self.log_progress(
+            f"""Planned tasks:
+{tabulate(task_list, headers="keys", tablefmt="mixed_grid")}"""
+        )
+        return task_list
vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -1,4 +1,4 @@
-VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self
+VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You must determine if the agent's answer was correct or incorrect. If the agent's answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure with the tools available. Use complete sentences.

 User's question: {question}

@@ -13,14 +13,14 @@ Final answer:

 Reflection: """

-TASK_DECOMPOSE = """You need to decompose a
+TASK_DECOMPOSE = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
 This is the user's question: {question}
-This is tool list:
+This is the tool list:
 {tools}

 Please note that:
-1. You should only decompose this
-2. If one subtask
+1. You should only decompose this user's complex question into some simple subtasks which can be executed easily by using one single tool in the tool list.
+2. If one subtask needs the results from another subtask, you should write clearly. For example:
 {{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
 3. You must ONLY output in a parsible JSON format. An example output looks like:

@@ -28,18 +28,18 @@ Please note that:

 Output: """

-TASK_DECOMPOSE_DEPENDS = """You need to decompose a
+TASK_DECOMPOSE_DEPENDS = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
 This is the user's question: {question}

-This is tool list:
+This is the tool list:
 {tools}

 This is a reflection from a previous failed attempt:
 {reflections}

 Please note that:
-1. You should only decompose this
-2. If one subtask
+1. You should only decompose this user's complex question into some simple subtasks which can be executed easily by using one single tool in the tool list.
+2. If one subtask needs the results from another subtask, you should write clearly. For example:
 {{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
 3. You must ONLY output in a parsible JSON format. An example output looks like:

@@ -53,7 +53,7 @@ These are the tools you can select to solve the question:
 {tools}

 Please note that:
-1. You should only
+1. You should only choose one tool from the Tool List to solve this question.
 2. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:

 Example 1: {{"ID": 1}}
@@ -70,7 +70,7 @@ This is a reflection from a previous failed attempt:
 {reflections}

 Please note that:
-1. You should only
+1. You should only choose one tool from the Tool List to solve this question.
 2. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:

 Example 1: {{"ID": 1}}
@@ -78,14 +78,14 @@ Example 2: {{"ID": 2}}

 Output: """

-CHOOSE_PARAMETER_DEPENDS = """Given a user's question and
+CHOOSE_PARAMETER_DEPENDS = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
 Please note that:
 1. The Example in the API tool documentation can help you better understand the use of the API.
-2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
+2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
 3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
 4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
-5. If you need to use this API multiple times
-6. You must ONLY output in a parsible JSON format. Two
+5. If you need to use this API multiple times, please set "Parameters" to a list.
+6. You must ONLY output in a parsible JSON format. Two example outputs look like:

 Example 1: {{"Parameters":{{"input": [1,2,3]}}}}
 Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}
@@ -93,16 +93,16 @@ Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}
 This is a reflection from a previous failed attempt:
 {reflections}

-
+These are logs of previous questions and answers:
 {previous_log}

 This is the current user's question: {question}
-This is API tool documentation: {tool_usage}
+This is the API tool documentation: {tool_usage}
 Output: """

 ANSWER_GENERATE_DEPENDS = """You should answer the question based on the response output by the API tool.
 Please note that:
-1.
+1. You should try to organize the response into a natural language answer.
 2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
 3. If the API tool does not provide useful information in the response, please answer with your knowledge.
 4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers.
@@ -110,7 +110,7 @@ Please note that:
 This is a reflection from a previous failed attempt:
 {reflections}

-
+These are logs of previous questions and answers:
 {previous_log}

 This is the user's question: {question}
@@ -121,7 +121,7 @@ This is the response output by the API tool:
 We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
 Output: """

-ANSWER_SUMMARIZE_DEPENDS = """We break down a
+ANSWER_SUMMARIZE_DEPENDS = """We break down a user's complex problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question
 This is the user's question: {question}

 These are subtasks and their answers:
vision_agent/image_utils.py
CHANGED
@@ -78,7 +78,7 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
         data = Image.open(data)
     if isinstance(data, Image.Image):
         buffer = BytesIO()
-        data.save(buffer, format="
+        data.convert("RGB").save(buffer, format="JPEG")
         return base64.b64encode(buffer.getvalue()).decode("utf-8")
     else:
         arr_bytes = data.tobytes()
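For context on the `convert_to_b64` change: Pillow refuses to encode RGBA or palette images as JPEG, so converting to RGB first is what keeps the call from raising. A small standalone sketch of that behavior (plain Pillow, not the package's code):

```python
import base64
from io import BytesIO

from PIL import Image

# An RGBA image (e.g. a PNG with transparency) cannot be written as JPEG directly;
# img.save(buffer, format="JPEG") would raise OSError for this mode.
img = Image.new("RGBA", (4, 4), (255, 0, 0, 128))

buffer = BytesIO()
img.convert("RGB").save(buffer, format="JPEG")  # dropping alpha makes the encode succeed
b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
print(b64[:32], "...")
```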
vision_agent/llm/__init__.py
CHANGED
@@ -1 +1 @@
-from .llm import LLM, OpenAILLM
+from .llm import LLM, AzureOpenAILLM, OpenAILLM
vision_agent/llm/llm.py
CHANGED
@@ -1,8 +1,9 @@
 import json
+import os
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, List, Mapping, Union, cast
+from typing import Any, Callable, Dict, List, Mapping, Optional, Union, cast

-from openai import OpenAI
+from openai import AzureOpenAI, OpenAI

 from vision_agent.tools import (
     CHOOSE_PARAMS,
@@ -33,11 +34,16 @@ class OpenAILLM(LLM):
     def __init__(
         self,
         model_name: str = "gpt-4-turbo-preview",
+        api_key: Optional[str] = None,
         json_mode: bool = False,
         **kwargs: Any
     ):
+        if not api_key:
+            self.client = OpenAI()
+        else:
+            self.client = OpenAI(api_key=api_key)
+
         self.model_name = model_name
-        self.client = OpenAI()
         self.kwargs = kwargs
         if json_mode:
             self.kwargs["response_format"] = {"type": "json_object"}
@@ -120,3 +126,32 @@ class OpenAILLM(LLM):
         ]

         return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})
+
+
+class AzureOpenAILLM(OpenAILLM):
+    def __init__(
+        self,
+        model_name: str = "gpt-4-turbo-preview",
+        api_key: Optional[str] = None,
+        api_version: str = "2024-02-01",
+        azure_endpoint: Optional[str] = None,
+        json_mode: bool = False,
+        **kwargs: Any
+    ):
+        if not api_key:
+            api_key = os.getenv("AZURE_OPENAI_API_KEY")
+        if not azure_endpoint:
+            azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
+
+        if not api_key:
+            raise ValueError("Azure OpenAI API key is required.")
+        if not azure_endpoint:
+            raise ValueError("Azure OpenAI endpoint is required.")
+
+        self.client = AzureOpenAI(
+            api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
+        )
+        self.model_name = model_name
+        self.kwargs = kwargs
+        if json_mode:
+            self.kwargs["response_format"] = {"type": "json_object"}
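A short usage sketch for the new `AzureOpenAILLM`, based only on the constructor above. The deployment name, key, and endpoint are placeholders; leaving `api_key`/`azure_endpoint` unset falls back to the `AZURE_OPENAI_API_KEY` and `AZURE_OPENAI_ENDPOINT` environment variables, and on Azure the `model_name` has to match a deployment name.

```python
import vision_agent as va
from vision_agent.llm import AzureOpenAILLM

# Explicit credentials (placeholders); omit them to use the environment variables instead.
llm = AzureOpenAILLM(
    model_name="my-gpt4-deployment",                    # Azure deployment name (placeholder)
    api_key="your-azure-api-key",                       # placeholder
    azure_endpoint="https://example.openai.azure.com",  # placeholder
    json_mode=True,  # sets response_format={"type": "json_object"}, as in the constructor above
)

# The model can then be handed to VisionAgent as the task/answer model.
agent = va.agent.VisionAgent(task_model=llm, answer_model=llm)
```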
vision_agent/lmm/__init__.py
CHANGED
@@ -1 +1 @@
-from .lmm import LMM, LLaVALMM, OpenAILMM, get_lmm
+from .lmm import LMM, AzureOpenAILMM, LLaVALMM, OpenAILMM, get_lmm
vision_agent/lmm/lmm.py
CHANGED
@@ -1,12 +1,13 @@
 import base64
 import json
 import logging
+import os
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union, cast

 import requests
-from openai import OpenAI
+from openai import AzureOpenAI, OpenAI

 from vision_agent.tools import (
     CHOOSE_PARAMS,
@@ -99,12 +100,18 @@ class OpenAILMM(LMM):
     def __init__(
         self,
         model_name: str = "gpt-4-vision-preview",
+        api_key: Optional[str] = None,
         max_tokens: int = 1024,
         **kwargs: Any,
     ):
+        if not api_key:
+            self.client = OpenAI()
+        else:
+            self.client = OpenAI(api_key=api_key)
+
+        self.client = OpenAI(api_key=api_key)
         self.model_name = model_name
         self.max_tokens = max_tokens
-        self.client = OpenAI()
         self.kwargs = kwargs

     def __call__(
@@ -248,6 +255,34 @@ class OpenAILMM(LMM):
         return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})


+class AzureOpenAILMM(OpenAILMM):
+    def __init__(
+        self,
+        model_name: str = "gpt-4-vision-preview",
+        api_key: Optional[str] = None,
+        api_version: str = "2024-02-01",
+        azure_endpoint: Optional[str] = None,
+        max_tokens: int = 1024,
+        **kwargs: Any,
+    ):
+        if not api_key:
+            api_key = os.getenv("AZURE_OPENAI_API_KEY")
+        if not azure_endpoint:
+            azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
+
+        if not api_key:
+            raise ValueError("OpenAI API key is required.")
+        if not azure_endpoint:
+            raise ValueError("Azure OpenAI endpoint is required.")
+
+        self.client = AzureOpenAI(
+            api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
+        )
+        self.model_name = model_name
+        self.max_tokens = max_tokens
+        self.kwargs = kwargs
+
+
 def get_lmm(name: str) -> LMM:
     if name == "openai":
         return OpenAILMM(name)
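The multimodal counterpart works the same way. A minimal sketch, assuming `AZURE_OPENAI_API_KEY` and `AZURE_OPENAI_ENDPOINT` are already exported (the constructor above raises `ValueError` if either is missing) and using a placeholder deployment name:

```python
import os

from vision_agent.lmm import AzureOpenAILMM

# Credentials come from the environment here; both must be set.
assert os.getenv("AZURE_OPENAI_API_KEY") and os.getenv("AZURE_OPENAI_ENDPOINT")

# model_name must be the name of a GPT-4 Vision deployment in your Azure resource (placeholder).
lmm = AzureOpenAILMM(model_name="my-gpt4v-deployment", max_tokens=512)
```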
vision_agent/tools/prompts.py
CHANGED
@@ -6,14 +6,14 @@ CHOOSE_PARAMS = (
     "This is the API tool documentation: {api_doc}\n"
     "Please note that: \n"
     "1. The Example in the API tool documentation can help you better understand the use of the API.\n"
-    '2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If no
+    '2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no parameters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}\n'
     "3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.\n"
     '4. If you need to use this API multiple times, please set "Parameters" to a list.\n'
-    "5. You must ONLY output in a parsible JSON format. Two
+    "5. You must ONLY output in a parsible JSON format. Two example outputs look like:\n"
     "'''\n"
     'Example 1: {{"Parameters":{{"keyword": "Artificial Intelligence", "language": "English"}}}}\n'
     'Example 2: {{"Parameters":[{{"keyword": "Artificial Intelligence", "language": "English"}}, {{"keyword": "Machine Learning", "language": "English"}}]}}\n'
     "'''\n"
-    "This is user's question: {question}\n"
+    "This is the user's question: {question}\n"
     "Output:\n"
 )
vision_agent/tools/tools.py
CHANGED
@@ -78,32 +78,32 @@ class CLIP(Tool):
     -------
         >>> import vision_agent as va
         >>> clip = va.tools.CLIP()
-        >>> clip(
+        >>> clip("red line, yellow dot", "ct_scan1.jpg"))
         [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
     """

-    _ENDPOINT = "https://
+    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"

     name = "clip_"
     description = "'clip_' is a tool that can classify or tag any image given a set of input classes or tags."
     usage = {
         "required_parameters": [
-            {"name": "prompt", "type": "
+            {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
         "examples": [
             {
                 "scenario": "Can you classify this image as a cat? Image name: cat.jpg",
-                "parameters": {"prompt":
+                "parameters": {"prompt": "cat", "image": "cat.jpg"},
             },
             {
                 "scenario": "Can you tag this photograph with cat or dog? Image name: cat_dog.jpg",
-                "parameters": {"prompt":
+                "parameters": {"prompt": "cat, dog", "image": "cat_dog.jpg"},
             },
             {
                 "scenario": "Can you build me a classifier that classifies red shirts, green shirts and other? Image name: shirts.jpg",
                 "parameters": {
-                    "prompt":
+                    "prompt": "red shirt, green shirt, other",
                     "image": "shirts.jpg",
                 },
             },
@@ -111,11 +111,11 @@ class CLIP(Tool):
     }

     # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(self, prompt:
+    def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict:
         """Invoke the CLIP model.

         Parameters:
-            prompt: a list of classes or tags to classify the image.
+            prompt: a string includes a list of classes or tags to classify the image.
             image: the input image to classify.

         Returns:
@@ -123,8 +123,9 @@ class CLIP(Tool):
         """
         image_b64 = convert_to_b64(image)
         data = {
-            "
-            "
+            "prompt": prompt,
+            "image": image_b64,
+            "tool": "closed_set_image_classification",
         }
         res = requests.post(
             self._ENDPOINT,
@@ -138,10 +139,11 @@ class CLIP(Tool):
             _LOGGER.error(f"Request failed: {resp_json}")
             raise ValueError(f"Request failed: {resp_json}")

-
-
-
-
+        resp_json["data"]["scores"] = [
+            round(prob, 4) for prob in resp_json["data"]["scores"]
+        ]
+
+        return resp_json["data"]  # type: ignore


 class GroundingDINO(Tool):
@@ -158,7 +160,7 @@ class GroundingDINO(Tool):
         'scores': [0.98, 0.02]}]
     """

-    _ENDPOINT = "https://
+    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"

     name = "grounding_dino_"
     description = "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions."
@@ -167,6 +169,10 @@ class GroundingDINO(Tool):
             {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
+        "optional_parameters": [
+            {"name": "box_threshold", "type": "float"},
+            {"name": "iou_threshold", "type": "float"},
+        ],
         "examples": [
             {
                 "scenario": "Can you build me a car detector?",
@@ -181,32 +187,44 @@ class GroundingDINO(Tool):
                 "parameters": {
                     "prompt": "red shirt. green shirt",
                     "image": "shirts.jpg",
+                    "box_threshold": 0.20,
+                    "iou_threshold": 0.75,
                 },
             },
         ],
     }

     # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(
+    def __call__(
+        self,
+        prompt: str,
+        image: Union[str, Path, ImageType],
+        box_threshold: float = 0.20,
+        iou_threshold: float = 0.75,
+    ) -> Dict:
         """Invoke the Grounding DINO model.

         Parameters:
             prompt: one or multiple class names to detect. The classes should be separated by a period if there are multiple classes. E.g. "big dog . small cat"
             image: the input image to run against.
+            box_threshold: the threshold to filter out the bounding boxes with low scores.
+            iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold.

         Returns:
             A list of dictionaries containing the labels, scores, and bboxes. Each dictionary contains the detection result for an image.
         """
         image_size = get_image_size(image)
         image_b64 = convert_to_b64(image)
-
+        request_data = {
             "prompt": prompt,
-            "
+            "image": image_b64,
+            "tool": "visual_grounding",
+            "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
         }
         res = requests.post(
             self._ENDPOINT,
             headers={"Content-Type": "application/json"},
-            json=
+            json=request_data,
         )
         resp_json: Dict[str, Any] = res.json()
         if (
@@ -214,16 +232,15 @@ class GroundingDINO(Tool):
         ) or "statusCode" not in resp_json:
             _LOGGER.error(f"Request failed: {resp_json}")
             raise ValueError(f"Request failed: {resp_json}")
-
-
-
-
-
-
-
-
-
-        return cast(Dict, resp_data)
+        data: Dict[str, Any] = resp_json["data"]
+        if "bboxes" in data:
+            data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]]
+        if "scores" in data:
+            data["scores"] = [round(score, 2) for score in data["scores"]]
+        if "labels" in data:
+            data["labels"] = [label for label in data["labels"]]
+        data["size"] = (image_size[1], image_size[0])
+        return data


 class GroundingSAM(Tool):
@@ -234,7 +251,7 @@ class GroundingSAM(Tool):
     -------
         >>> import vision_agent as va
         >>> t = va.tools.GroundingSAM()
-        >>> t(
+        >>> t("red line, yellow dot", "ct_scan1.jpg"])
         [{'labels': ['yellow dot', 'red line'],
         'bboxes': [[0.38, 0.15, 0.59, 0.7], [0.48, 0.25, 0.69, 0.71]],
         'masks': [array([[0, 0, 0, ..., 0, 0, 0],
@@ -249,55 +266,71 @@ class GroundingSAM(Tool):
         [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
     """

-    _ENDPOINT = "https://
+    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"

     name = "grounding_sam_"
     description = "'grounding_sam_' is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions."
     usage = {
         "required_parameters": [
-            {"name": "prompt", "type": "
+            {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
+        "optional_parameters": [
+            {"name": "box_threshold", "type": "float"},
+            {"name": "iou_threshold", "type": "float"},
+        ],
         "examples": [
             {
                 "scenario": "Can you build me a car segmentor?",
-                "parameters": {"prompt":
+                "parameters": {"prompt": "car", "image": ""},
             },
             {
                 "scenario": "Can you segment the person on the left? Image name: person.jpg",
-                "parameters": {"prompt":
+                "parameters": {"prompt": "person on the left", "image": "person.jpg"},
             },
             {
                 "scenario": "Can you build me a tool that segments red shirts and green shirts? Image name: shirts.jpg",
                 "parameters": {
-                    "prompt":
+                    "prompt": "red shirt, green shirt",
                     "image": "shirts.jpg",
+                    "box_threshold": 0.20,
+                    "iou_threshold": 0.75,
                 },
             },
         ],
     }

     # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(
+    def __call__(
+        self,
+        prompt: str,
+        image: Union[str, ImageType],
+        box_threshold: float = 0.2,
+        iou_threshold: float = 0.75,
+    ) -> Dict:
         """Invoke the Grounding SAM model.

         Parameters:
             prompt: a list of classes to segment.
             image: the input image to segment.
+            box_threshold: the threshold to filter out the bounding boxes with low scores.
+            iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold.

         Returns:
             A list of dictionaries containing the labels, scores, bboxes and masks. Each dictionary contains the segmentation result for an image.
         """
         image_size = get_image_size(image)
         image_b64 = convert_to_b64(image)
-
-            "
+        request_data = {
+            "prompt": prompt,
             "image": image_b64,
+            "tool": "visual_grounding_segment",
+            "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
         }
         res = requests.post(
             self._ENDPOINT,
             headers={"Content-Type": "application/json"},
-            json=
+            json=request_data,
         )
         resp_json: Dict[str, Any] = res.json()
         if (
@@ -305,14 +338,19 @@ class GroundingSAM(Tool):
         ) or "statusCode" not in resp_json:
             _LOGGER.error(f"Request failed: {resp_json}")
             raise ValueError(f"Request failed: {resp_json}")
-
+        data: Dict[str, Any] = resp_json["data"]
         ret_pred: Dict[str, List] = {"labels": [], "bboxes": [], "masks": []}
-
-
-
-
-
-        ret_pred["masks"]
+        if "bboxes" in data:
+            ret_pred["bboxes"] = [
+                normalize_bbox(box, image_size) for box in data["bboxes"]
+            ]
+        if "masks" in data:
+            ret_pred["masks"] = [
+                rle_decode(mask_rle=mask, shape=data["mask_shape"])
+                for mask in data["masks"]
+            ]
+        ret_pred["labels"] = data["labels"]
+        ret_pred["scores"] = data["scores"]
         return ret_pred


@@ -321,8 +359,14 @@ class AgentGroundingSAM(GroundingSAM):
     returns the file name. This makes it easier for agents to use.
     """

-    def __call__(
-
+    def __call__(
+        self,
+        prompt: str,
+        image: Union[str, ImageType],
+        box_threshold: float = 0.2,
+        iou_threshold: float = 0.75,
+    ) -> Dict:
+        rets = super().__call__(prompt, image, box_threshold, iou_threshold)
         mask_files = []
         for mask in rets["masks"]:
             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
@@ -403,7 +447,7 @@ class BboxArea(Tool):
     name = "bbox_area_"
     description = "'bbox_area_' returns the area of the bounding box in pixels normalized to 2 decimal places."
     usage = {
-        "required_parameters": [{"name": "
+        "required_parameters": [{"name": "bboxes", "type": "List[int]"}],
         "examples": [
             {
                 "scenario": "If you want to calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
@@ -445,7 +489,8 @@ class SegArea(Tool):
     def __call__(self, masks: Union[str, Path]) -> float:
         pil_mask = Image.open(str(masks))
         np_mask = np.array(pil_mask)
-
+        np_mask = np.clip(np_mask, 0, 1)
+        return cast(float, round(np.sum(np_mask), 2))


 class BboxIoU(Tool):
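A usage sketch for the new detection thresholds, following the `GroundingDINO.__call__` signature added above (the same keywords apply to `GroundingSAM`). The image path is a placeholder, and `GroundingDINO` is assumed to be exported from `va.tools` like the other tools shown in the docstrings.

```python
import vision_agent as va

dino = va.tools.GroundingDINO()

# Per the docstring above: a lower box_threshold keeps more low-confidence boxes,
# and a lower iou_threshold makes NMS suppress overlapping boxes more aggressively.
result = dino(
    "car . person",       # multiple classes are separated by a period
    "street.jpg",         # placeholder image path
    box_threshold=0.25,
    iou_threshold=0.6,
)
print(result["labels"], result["scores"], result["bboxes"])
```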
{vision_agent-0.0.48.dist-info → vision_agent-0.0.49.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.0.48
+Version: 0.0.49
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -59,7 +59,8 @@ To get started, you can install the library using pip:
 pip install vision-agent
 ```

-Ensure you have an OpenAI API key and set it as an environment variable
+Ensure you have an OpenAI API key and set it as an environment variable (if you are
+using Azure OpenAI please see the additional setup section):

 ```bash
 export OPENAI_API_KEY="your-api-key"
@@ -139,3 +140,23 @@ you. For example:

 It also has a basic set of calculate tools such as add, subtract, multiply and divide.

+### Additional Setup
+If you want to use Azure OpenAI models, you can set the environment variable:
+
+```bash
+export AZURE_OPENAI_API_KEY="your-api-key"
+export AZURE_OPENAI_ENDPOINT="your-endpoint"
+```
+
+You can then run Vision Agent using the Azure OpenAI models:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.VisionAgent(
+>>>     task_model=va.llm.AzureOpenAILLM(),
+>>>     answer_model=va.lmm.AzureOpenAILMM(),
+>>>     reflection_model=va.lmm.AzureOpenAILMM(),
+>>> )
+```
+
+
vision_agent-0.0.49.dist-info/RECORD
@@ -0,0 +1,26 @@
+vision_agent/__init__.py,sha256=wD1cssVTAJ55uTViNfBGooqJUV0p9fmVAuTMHHrmUBU,229
+vision_agent/agent/__init__.py,sha256=B4JVrbY4IRVCJfjmrgvcp7h1mTUEk8MZvL0Zmej4Ka0,127
+vision_agent/agent/agent.py,sha256=X7kON-g9ePUKumCDaYfQNBX_MEFE-ax5PnRp7-Cc5Wo,529
+vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMVg,11511
+vision_agent/agent/easytool_prompts.py,sha256=dYzWa_RaiaFSQ-CowoQOcFmjZtBTTljRyA809bLgrvU,4519
+vision_agent/agent/reflexion.py,sha256=wzpptfALNZIh9Q5jgkK3imGL5LWjTW_n_Ypsvxdh07Q,10101
+vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
+vision_agent/agent/vision_agent.py,sha256=DgvRra_1e05xyo8vIwD8TwZDcd5v-KdfaGB_QJLh62o,19101
+vision_agent/agent/vision_agent_prompts.py,sha256=fYnOT6z7DmuVTfUknUuc6b_vPmO0vgCyVJRQSR5M-G8,6192
+vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
+vision_agent/data/data.py,sha256=pgtSGZdAnbQ8oGsuapLtFTMPajnCGDGekEXTnFuBwsY,5122
+vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,75
+vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
+vision_agent/image_utils.py,sha256=_hDikKa40U-2nQufKMRDgU9t-OmwCK9Rb_6O3v1U3nE,4436
+vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
+vision_agent/llm/llm.py,sha256=tgL6ZtuwZKuxSNiCxJCuP2ETjNMrosdgxXkZJb0_00E,5024
+vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
+vision_agent/lmm/lmm.py,sha256=LxwxCArp7DfnPbjf_Gl55xBxPwo2Qx8eDp1gCnGYSO0,9535
+vision_agent/tools/__init__.py,sha256=AKN-T659HpwVearRnkCd6wWNoJ6K5kW9gAZwb8IQSLE,235
+vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
+vision_agent/tools/tools.py,sha256=bYc3Xeg0wDjpfd8WGxRPCSaGQxUHRLI2PJk-SThqjHY,25644
+vision_agent/tools/video.py,sha256=40rscP8YvKN3lhZ4PDcOK4XbdFX2duCRpHY_krmBYKU,7476
+vision_agent-0.0.49.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.0.49.dist-info/METADATA,sha256=PgExhHIptlfP38agIfQIqbj0LEhjlBLcapULWU3o2YM,6142
+vision_agent-0.0.49.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.0.49.dist-info/RECORD,,
vision_agent-0.0.48.dist-info/RECORD
@@ -1,26 +0,0 @@
-vision_agent/__init__.py,sha256=wD1cssVTAJ55uTViNfBGooqJUV0p9fmVAuTMHHrmUBU,229
-vision_agent/agent/__init__.py,sha256=B4JVrbY4IRVCJfjmrgvcp7h1mTUEk8MZvL0Zmej4Ka0,127
-vision_agent/agent/agent.py,sha256=PRLItaPfMc94H6mAIPj_gBvJ8RezDEPanB6Cmu81A0M,306
-vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMVg,11511
-vision_agent/agent/easytool_prompts.py,sha256=uNp12LOFRLr3i2zLhNuLuyFms2-s8es2t6P6h76QDow,4493
-vision_agent/agent/reflexion.py,sha256=wzpptfALNZIh9Q5jgkK3imGL5LWjTW_n_Ypsvxdh07Q,10101
-vision_agent/agent/reflexion_prompts.py,sha256=UPGkt_qgHBMUY0VPVoF-BqhR0d_6WPjjrhbYLBYOtnQ,9342
-vision_agent/agent/vision_agent.py,sha256=P2melU6XQCCiiL1C_4QsxGUaWbwahuJA90eIcQJTR4U,17449
-vision_agent/agent/vision_agent_prompts.py,sha256=fSYO-6D-7rExS8tyZyZewrzAWsn2ZiqjBfoODL9m5Yk,6152
-vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
-vision_agent/data/data.py,sha256=pgtSGZdAnbQ8oGsuapLtFTMPajnCGDGekEXTnFuBwsY,5122
-vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,75
-vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
-vision_agent/image_utils.py,sha256=XiOLpHAvlk55URw6iG7hl1OY71FVRA9_25b650amZXA,4420
-vision_agent/llm/__init__.py,sha256=fBKsIjL4z08eA0QYx6wvhRe4Nkp2pJ4VrZK0-uUL5Ec,32
-vision_agent/llm/llm.py,sha256=l8ZVh6vCZOJBHfenfOoHwPySXEUQoNt_gbL14gkvu2g,3904
-vision_agent/lmm/__init__.py,sha256=I8mbeNUajTfWVNqLsuFQVOaNBDlkIhYp9DFU8H4kB7g,51
-vision_agent/lmm/lmm.py,sha256=s_A3SKCoWm2biOt-gS9PXOsa9l-zrmR6mInLjAqam-A,8438
-vision_agent/tools/__init__.py,sha256=AKN-T659HpwVearRnkCd6wWNoJ6K5kW9gAZwb8IQSLE,235
-vision_agent/tools/prompts.py,sha256=9RBbyqlNlExsGKlJ89Jkph83DAEJ8PCVGaHoNbyN7TM,1416
-vision_agent/tools/tools.py,sha256=VD80cINHyesmGAfiCMrK506Q-G9QU_Srzey5wJ3aJGQ,23884
-vision_agent/tools/video.py,sha256=40rscP8YvKN3lhZ4PDcOK4XbdFX2duCRpHY_krmBYKU,7476
-vision_agent-0.0.48.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.0.48.dist-info/METADATA,sha256=y5wDj2u8p8zlIhxBh87SRWXAlc1hcMWd_aaLyuOKTbI,5581
-vision_agent-0.0.48.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.0.48.dist-info/RECORD,,
{vision_agent-0.0.48.dist-info → vision_agent-0.0.49.dist-info}/LICENSE
File without changes
{vision_agent-0.0.48.dist-info → vision_agent-0.0.49.dist-info}/WHEEL
File without changes