vision-agent 0.0.47__tar.gz → 0.0.50__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {vision_agent-0.0.47 → vision_agent-0.0.50}/PKG-INFO +23 -2
  2. {vision_agent-0.0.47 → vision_agent-0.0.50}/README.md +22 -1
  3. {vision_agent-0.0.47 → vision_agent-0.0.50}/pyproject.toml +1 -1
  4. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/agent.py +7 -0
  5. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/easytool_prompts.py +14 -14
  6. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/reflexion_prompts.py +1 -1
  7. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/vision_agent.py +113 -82
  8. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/vision_agent_prompts.py +20 -20
  9. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/image_utils.py +1 -1
  10. vision_agent-0.0.50/vision_agent/llm/__init__.py +1 -0
  11. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/llm/llm.py +38 -3
  12. vision_agent-0.0.50/vision_agent/lmm/__init__.py +1 -0
  13. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/lmm/lmm.py +37 -2
  14. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/tools/prompts.py +3 -3
  15. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/tools/tools.py +115 -50
  16. vision_agent-0.0.47/vision_agent/llm/__init__.py +0 -1
  17. vision_agent-0.0.47/vision_agent/lmm/__init__.py +0 -1
  18. {vision_agent-0.0.47 → vision_agent-0.0.50}/LICENSE +0 -0
  19. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/__init__.py +0 -0
  20. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/__init__.py +0 -0
  21. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/easytool.py +0 -0
  22. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/reflexion.py +0 -0
  23. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/data/__init__.py +0 -0
  24. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/data/data.py +0 -0
  25. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/emb/__init__.py +0 -0
  26. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/emb/emb.py +0 -0
  27. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/tools/__init__.py +0 -0
  28. {vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/tools/video.py +0 -0
{vision_agent-0.0.47 → vision_agent-0.0.50}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.0.47
+ Version: 0.0.50
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -59,7 +59,8 @@ To get started, you can install the library using pip:
  pip install vision-agent
  ```

- Ensure you have an OpenAI API key and set it as an environment variable:
+ Ensure you have an OpenAI API key and set it as an environment variable (if you are
+ using Azure OpenAI please see the additional setup section):

  ```bash
  export OPENAI_API_KEY="your-api-key"
@@ -139,3 +140,23 @@ you. For example:

  It also has a basic set of calculate tools such as add, subtract, multiply and divide.

+ ### Additional Setup
+ If you want to use Azure OpenAI models, you can set the environment variable:
+
+ ```bash
+ export AZURE_OPENAI_API_KEY="your-api-key"
+ export AZURE_OPENAI_ENDPOINT="your-endpoint"
+ ```
+
+ You can then run Vision Agent using the Azure OpenAI models:
+
+ ```python
+ >>> import vision_agent as va
+ >>> agent = va.agent.VisionAgent(
+ >>> task_model=va.llm.AzureOpenAILLM(),
+ >>> answer_model=va.lmm.AzureOpenAILMM(),
+ >>> reflection_model=va.lmm.AzureOpenAILMM(),
+ >>> )
+ ```
+
+
{vision_agent-0.0.47 → vision_agent-0.0.50}/README.md

@@ -30,7 +30,8 @@ To get started, you can install the library using pip:
  pip install vision-agent
  ```

- Ensure you have an OpenAI API key and set it as an environment variable:
+ Ensure you have an OpenAI API key and set it as an environment variable (if you are
+ using Azure OpenAI please see the additional setup section):

  ```bash
  export OPENAI_API_KEY="your-api-key"
@@ -109,3 +110,23 @@ you. For example:


  It also has a basic set of calculate tools such as add, subtract, multiply and divide.
+
+ ### Additional Setup
+ If you want to use Azure OpenAI models, you can set the environment variable:
+
+ ```bash
+ export AZURE_OPENAI_API_KEY="your-api-key"
+ export AZURE_OPENAI_ENDPOINT="your-endpoint"
+ ```
+
+ You can then run Vision Agent using the Azure OpenAI models:
+
+ ```python
+ >>> import vision_agent as va
+ >>> agent = va.agent.VisionAgent(
+ >>> task_model=va.llm.AzureOpenAILLM(),
+ >>> answer_model=va.lmm.AzureOpenAILMM(),
+ >>> reflection_model=va.lmm.AzureOpenAILMM(),
+ >>> )
+ ```
+
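For comparison with the Azure example above, a minimal sketch of the default OpenAI setup (not taken from the README itself; it relies on the constructor defaults shown in the vision_agent.py changes below and assumes `OPENAI_API_KEY` is exported):

```python
>>> import vision_agent as va
>>> # With no models passed in, VisionAgent falls back to its OpenAILLM/OpenAILMM defaults.
>>> agent = va.agent.VisionAgent()
```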
{vision_agent-0.0.47 → vision_agent-0.0.50}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "vision-agent"
- version = "0.0.47"
+ version = "0.0.50"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/agent.py

@@ -11,3 +11,10 @@ class Agent(ABC):
  image: Optional[Union[str, Path]] = None,
  ) -> str:
  pass
+
+ @abstractmethod
+ def log_progress(self, description: str) -> None:
+ """Log the progress of the agent.
+ This is a hook that is intended for reporting the progress of the agent.
+ """
+ pass
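This hook is implemented by `VisionAgent` later in this diff: it logs the description and forwards it to an optional `report_progress_callback`. A minimal sketch of wiring a callback (assuming an OpenAI key is configured; the list name is arbitrary):

```python
>>> import vision_agent as va
>>> progress_lines = []
>>> # Each progress message (planned tasks, call results, the final <ANSWER> line)
>>> # is appended to progress_lines while the agent runs.
>>> agent = va.agent.VisionAgent(report_progress_callback=progress_lines.append)
```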
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/easytool_prompts.py

@@ -1,11 +1,11 @@
- TASK_DECOMPOSE = """You need to decompose a complex user's question into some simple subtasks and let the model execute it step by step.
+ TASK_DECOMPOSE = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
  This is the user's question: {question}
- This is tool list:
+ This is the tool list:
  {tools}

  Please note that:
  1. You should only decompose this complex user's question into some simple subtasks which can be executed easily by using one single tool in the tool list.
- 2. If one subtask need the results from other subtask, you can should write clearly. For example:
+ 2. If one subtask needs the results from another subtask, you should write clearly. For example:
  {{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
  3. You must ONLY output in a parsible JSON format. An example output looks like:

@@ -13,7 +13,7 @@ Please note that:

  Output: """

- TASK_TOPOLOGY = """Given a complex user's question, I have decompose this question into some simple subtasks. I think there exists a logical connections and order amontg the tasks. Thus you need to help me output this logical connections and order.
+ TASK_TOPOLOGY = """Given a user's complex question, I have decomposed this question into some simple subtasks. I think there exist logical connections and order among the tasks. Thus, you need to help me output these logical connections and order.
  You must ONLY output in a parsible JSON format with the following format:

  {{"Tasks": [{{"task": task, "id", task_id, "dep": [dependency_task_id1, dependency_task_id2, ...]}}]}}
@@ -21,7 +21,7 @@ You must ONLY output in a parsible JSON format with the following format:
  The "dep" field denotes the id of the previous task which generates a new resource upon which the current task depends. If there are no dependencies, set "dep" to -1.


- This is user's question: {question}
+ This is the user's question: {question}

  These are subtasks of this question:

@@ -34,7 +34,7 @@ These are the tools you can select to solve the question:
  {tools}

  Please note that:
- 1. You should only chooce one tool the Tool List to solve this question.
+ 1. You should only choose one tool from the Tool List to solve this question.
  2. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:

  Example 1: {{"ID": 1}}
@@ -42,22 +42,22 @@ Example 2: {{"ID": 2}}

  Output: """

- CHOOSE_PARAMETER = """Given a user's question and a API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
+ CHOOSE_PARAMETER = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
  Please note that:
  1. The Example in the API tool documentation can help you better understand the use of the API.
- 2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
+ 2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
  3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
  4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
- 5. If you need to use this API multiple times,, please set "Parameters" to a list.
- 6. You must ONLY output in a parsible JSON format. Two examples output looks like:
+ 5. If you need to use this API multiple times, please set "Parameters" to a list.
+ 6. You must ONLY output in a parsible JSON format. Two example outputs looks like:

  Example 1: {{"Parameters":{{"input": [1,2,3]}}}}
  Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}

- There are logs of previous questions and answers:
+ These are logs of previous questions and answers:
  {previous_log}
  This is the current user's question: {question}
- This is API tool documentation: {tool_usage}
+ This is the API tool documentation: {tool_usage}
  Output: """


@@ -67,7 +67,7 @@ Please note that:
  2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
  3. If the API tool does not provide useful information in the response, please answer with your knowledge.
  4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers.
- There are logs of previous questions and answers:
+ These are logs of previous questions and answers:
  {previous_log}
  This is the user's question: {question}
  This is the response output by the API tool:
@@ -75,7 +75,7 @@ This is the response output by the API tool:
  We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
  Output: """

- ANSWER_SUMMARIZE = """We break down a complex user's problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question
+ ANSWER_SUMMARIZE = """We break down a complex user's problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question.
  This is the user's question: {question}
  These are subtasks and their answers: {answers}
  Final answer: """
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/reflexion_prompts.py

@@ -9,7 +9,7 @@ Relevant Context: {context}
  Question: {question}{scratchpad}"""


- COT_REFLECT_INSTRUCTION = """You are an advanced reasoning agent that can improve based on self refection. You will be given a previous reasoning trial in which you were given access to relevant context and a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[<answer>] or there is a phrasing discrepancy with your provided answer and the answer key. In a few sentences, Diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
+ COT_REFLECT_INSTRUCTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given access to relevant context and a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[<answer>] or there is a phrasing discrepancy with your provided answer and the answer key. In a few sentences, diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
  Here are some examples:
  {examples}
  (END OF EXAMPLES)
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/vision_agent.py

@@ -244,79 +244,6 @@ def function_call(tool: Callable, parameters: Dict[str, Any]) -> Any:
  return str(e)


- def retrieval(
- model: Union[LLM, LMM, Agent],
- question: str,
- tools: Dict[int, Any],
- previous_log: str,
- reflections: str,
- ) -> Tuple[Dict, str]:
- tool_id = choose_tool(
- model, question, {k: v["description"] for k, v in tools.items()}, reflections
- )
- if tool_id is None:
- return {}, ""
-
- tool_instructions = tools[tool_id]
- tool_usage = tool_instructions["usage"]
- tool_name = tool_instructions["name"]
-
- parameters = choose_parameter(
- model, question, tool_usage, previous_log, reflections
- )
- if parameters is None:
- return {}, ""
- tool_results = {"task": question, "tool_name": tool_name, "parameters": parameters}
-
- _LOGGER.info(
- f"""Going to run the following tool(s) in sequence:
- {tabulate([tool_results], headers="keys", tablefmt="mixed_grid")}"""
- )
-
- def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any:
- call_results: List[Any] = []
- if isinstance(result["parameters"], Dict):
- call_results.append(
- function_call(tools[tool_id]["class"], result["parameters"])
- )
- elif isinstance(result["parameters"], List):
- for parameters in result["parameters"]:
- call_results.append(function_call(tools[tool_id]["class"], parameters))
- return call_results
-
- call_results = parse_tool_results(tool_results)
- tool_results["call_results"] = call_results
-
- call_results_str = str(call_results)
- # _LOGGER.info(f"\tCall Results: {call_results_str}")
- return tool_results, call_results_str
-
-
- def create_tasks(
- task_model: Union[LLM, LMM], question: str, tools: Dict[int, Any], reflections: str
- ) -> List[Dict]:
- tasks = task_decompose(
- task_model,
- question,
- {k: v["description"] for k, v in tools.items()},
- reflections,
- )
- if tasks is not None:
- task_list = [{"task": task, "id": i + 1} for i, task in enumerate(tasks)]
- task_list = task_topology(task_model, question, task_list)
- try:
- task_list = topological_sort(task_list)
- except Exception:
- _LOGGER.error(f"Failed topological_sort on: {task_list}")
- else:
- task_list = []
- _LOGGER.info(
- f"""Planned tasks:
- {tabulate(task_list, headers="keys", tablefmt="mixed_grid")}"""
- )
- return task_list
-
-
  def self_reflect(
  reflect_model: Union[LLM, LMM],
  question: str,
@@ -350,7 +277,7 @@ def parse_reflect(reflect: str) -> bool:
  def visualize_result(all_tool_results: List[Dict]) -> List[str]:
  image_to_data: Dict[str, Dict] = {}
  for tool_result in all_tool_results:
- if not tool_result["tool_name"] in ["grounding_sam_", "grounding_dino_"]:
+ if tool_result["tool_name"] not in ["grounding_sam_", "grounding_dino_"]:
  continue

  parameters = tool_result["parameters"]
@@ -368,7 +295,6 @@ def visualize_result(all_tool_results: List[Dict]) -> List[str]:
  continue

  for param, call_result in zip(parameters, tool_result["call_results"]):
-
  # calls can fail, so we need to check if the call was successful
  if not isinstance(call_result, dict):
  continue
@@ -421,7 +347,18 @@ class VisionAgent(Agent):
  reflect_model: Optional[Union[LLM, LMM]] = None,
  max_retries: int = 2,
  verbose: bool = False,
+ report_progress_callback: Optional[Callable[[str], None]] = None,
  ):
+ """VisionAgent constructor.
+
+ Parameters
+ task_model: the model to use for task decomposition.
+ answer_model: the model to use for reasoning and concluding the answer.
+ reflect_model: the model to use for self reflection.
+ max_retries: maximum number of retries to attempt to complete the task.
+ verbose: whether to print more logs.
+ report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple VisionAgent instances are running in parallel. This callback ensures that the progress are not mixed up.
+ """
  self.task_model = (
  OpenAILLM(json_mode=True, temperature=0.1)
  if task_model is None
@@ -434,8 +371,8 @@ class VisionAgent(Agent):
  OpenAILMM(temperature=0.1) if reflect_model is None else reflect_model
  )
  self.max_retries = max_retries
-
  self.tools = TOOLS
+ self.report_progress_callback = report_progress_callback
  if verbose:
  _LOGGER.setLevel(logging.INFO)

@@ -458,6 +395,11 @@ class VisionAgent(Agent):
  input = [{"role": "user", "content": input}]
  return self.chat(input, image=image)

+ def log_progress(self, description: str) -> None:
+ _LOGGER.info(description)
+ if self.report_progress_callback:
+ self.report_progress_callback(description)
+
  def chat_with_workflow(
  self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
  ) -> Tuple[str, List[Dict]]:
@@ -470,7 +412,9 @@ class VisionAgent(Agent):
  all_tool_results: List[Dict] = []

  for _ in range(self.max_retries):
- task_list = create_tasks(self.task_model, question, self.tools, reflections)
+ task_list = self.create_tasks(
+ self.task_model, question, self.tools, reflections
+ )

  task_depend = {"Original Quesiton": question}
  previous_log = ""
@@ -482,7 +426,7 @@ class VisionAgent(Agent):
  for task in task_list:
  task_str = task["task"]
  previous_log = str(task_depend)
- tool_results, call_results = retrieval(
+ tool_results, call_results = self.retrieval(
  self.task_model,
  task_str,
  self.tools,
@@ -496,8 +440,8 @@ class VisionAgent(Agent):
  tool_results["answer"] = answer
  all_tool_results.append(tool_results)

- _LOGGER.info(f"\tCall Result: {call_results}")
- _LOGGER.info(f"\tAnswer: {answer}")
+ self.log_progress(f"\tCall Result: {call_results}")
+ self.log_progress(f"\tAnswer: {answer}")
  answers.append({"task": task_str, "answer": answer})
  task_depend[task["id"]]["answer"] = answer # type: ignore
  task_depend[task["id"]]["call_result"] = call_results # type: ignore
@@ -515,12 +459,15 @@ class VisionAgent(Agent):
  final_answer,
  visualized_images[0] if len(visualized_images) > 0 else image,
  )
- _LOGGER.info(f"Reflection: {reflection}")
+ self.log_progress(f"Reflection: {reflection}")
  if parse_reflect(reflection):
  break
  else:
  reflections += reflection
-
+ # '<ANSWER>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
+ self.log_progress(
+ f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</<ANSWER>"
+ )
  return final_answer, all_tool_results

  def chat(
@@ -528,3 +475,87 @@ class VisionAgent(Agent):
  ) -> str:
  answer, _ = self.chat_with_workflow(chat, image=image)
  return answer
+
+ def retrieval(
+ self,
+ model: Union[LLM, LMM, Agent],
+ question: str,
+ tools: Dict[int, Any],
+ previous_log: str,
+ reflections: str,
+ ) -> Tuple[Dict, str]:
+ tool_id = choose_tool(
+ model,
+ question,
+ {k: v["description"] for k, v in tools.items()},
+ reflections,
+ )
+ if tool_id is None:
+ return {}, ""
+
+ tool_instructions = tools[tool_id]
+ tool_usage = tool_instructions["usage"]
+ tool_name = tool_instructions["name"]
+
+ parameters = choose_parameter(
+ model, question, tool_usage, previous_log, reflections
+ )
+ if parameters is None:
+ return {}, ""
+ tool_results = {
+ "task": question,
+ "tool_name": tool_name,
+ "parameters": parameters,
+ }
+
+ self.log_progress(
+ f"""Going to run the following tool(s) in sequence:
+ {tabulate([tool_results], headers="keys", tablefmt="mixed_grid")}"""
+ )
+
+ def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any:
+ call_results: List[Any] = []
+ if isinstance(result["parameters"], Dict):
+ call_results.append(
+ function_call(tools[tool_id]["class"], result["parameters"])
+ )
+ elif isinstance(result["parameters"], List):
+ for parameters in result["parameters"]:
+ call_results.append(
+ function_call(tools[tool_id]["class"], parameters)
+ )
+ return call_results
+
+ call_results = parse_tool_results(tool_results)
+ tool_results["call_results"] = call_results
+
+ call_results_str = str(call_results)
+ return tool_results, call_results_str
+
+ def create_tasks(
+ self,
+ task_model: Union[LLM, LMM],
+ question: str,
+ tools: Dict[int, Any],
+ reflections: str,
+ ) -> List[Dict]:
+ tasks = task_decompose(
+ task_model,
+ question,
+ {k: v["description"] for k, v in tools.items()},
+ reflections,
+ )
+ if tasks is not None:
+ task_list = [{"task": task, "id": i + 1} for i, task in enumerate(tasks)]
+ task_list = task_topology(task_model, question, task_list)
+ try:
+ task_list = topological_sort(task_list)
+ except Exception:
+ _LOGGER.error(f"Failed topological_sort on: {task_list}")
+ else:
+ task_list = []
+ self.log_progress(
+ f"""Planned tasks:
+ {tabulate(task_list, headers="keys", tablefmt="mixed_grid")}"""
+ )
+ return task_list
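A usage sketch of the refactored workflow (the question and image name below are placeholders): `chat_with_workflow` returns both the final answer and the per-task tool results collected by the new `retrieval` method.

```python
>>> import vision_agent as va
>>> agent = va.agent.VisionAgent(verbose=True)
>>> answer, tool_results = agent.chat_with_workflow(
>>>     [{"role": "user", "content": "Are there more cars or trucks in this image?"}],
>>>     image="street.jpg",
>>> )
```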
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/agent/vision_agent_prompts.py

@@ -1,4 +1,4 @@
- VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You must determine if the agent's answer was correct or incorrect. If the agent's answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure with the tools avilable. Use complete sentences.
+ VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You must determine if the agent's answer was correct or incorrect. If the agent's answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure with the tools available. Use complete sentences.

  User's question: {question}

@@ -13,14 +13,14 @@ Final answer:

  Reflection: """

- TASK_DECOMPOSE = """You need to decompose a complex user's question into some simple subtasks and let the model execute it step by step.
+ TASK_DECOMPOSE = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
  This is the user's question: {question}
- This is tool list:
+ This is the tool list:
  {tools}

  Please note that:
- 1. You should only decompose this complex user's question into some simple subtasks which can be executed easily by using one single tool in the tool list.
- 2. If one subtask need the results from other subtask, you can should write clearly. For example:
+ 1. You should only decompose this user's complex question into some simple subtasks which can be executed easily by using one single tool in the tool list.
+ 2. If one subtask needs the results from another subtask, you should write clearly. For example:
  {{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
  3. You must ONLY output in a parsible JSON format. An example output looks like:

@@ -28,18 +28,18 @@ Please note that:

  Output: """

- TASK_DECOMPOSE_DEPENDS = """You need to decompose a complex user's question into some simple subtasks and let the model execute it step by step.
+ TASK_DECOMPOSE_DEPENDS = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
  This is the user's question: {question}

- This is tool list:
+ This is the tool list:
  {tools}

  This is a reflection from a previous failed attempt:
  {reflections}

  Please note that:
- 1. You should only decompose this complex user's question into some simple subtasks which can be executed easily by using one single tool in the tool list.
- 2. If one subtask need the results from other subtask, you can should write clearly. For example:
+ 1. You should only decompose this user's complex question into some simple subtasks which can be executed easily by using one single tool in the tool list.
+ 2. If one subtask needs the results from another subtask, you should write clearly. For example:
  {{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
  3. You must ONLY output in a parsible JSON format. An example output looks like:

@@ -53,7 +53,7 @@ These are the tools you can select to solve the question:
  {tools}

  Please note that:
- 1. You should only chooce one tool the Tool List to solve this question.
+ 1. You should only choose one tool from the Tool List to solve this question.
  2. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:

  Example 1: {{"ID": 1}}
@@ -70,7 +70,7 @@ This is a reflection from a previous failed attempt:
  {reflections}

  Please note that:
- 1. You should only chooce one tool the Tool List to solve this question.
+ 1. You should only choose one tool from the Tool List to solve this question.
  2. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:

  Example 1: {{"ID": 1}}
@@ -78,14 +78,14 @@ Example 2: {{"ID": 2}}

  Output: """

- CHOOSE_PARAMETER_DEPENDS = """Given a user's question and a API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
+ CHOOSE_PARAMETER_DEPENDS = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
  Please note that:
  1. The Example in the API tool documentation can help you better understand the use of the API.
- 2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
+ 2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
  3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
  4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
- 5. If you need to use this API multiple times,, please set "Parameters" to a list.
- 6. You must ONLY output in a parsible JSON format. Two examples output looks like:
+ 5. If you need to use this API multiple times, please set "Parameters" to a list.
+ 6. You must ONLY output in a parsible JSON format. Two example outputs look like:

  Example 1: {{"Parameters":{{"input": [1,2,3]}}}}
  Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}
@@ -93,16 +93,16 @@ Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}
  This is a reflection from a previous failed attempt:
  {reflections}

- There are logs of previous questions and answers:
+ These are logs of previous questions and answers:
  {previous_log}

  This is the current user's question: {question}
- This is API tool documentation: {tool_usage}
+ This is the API tool documentation: {tool_usage}
  Output: """

  ANSWER_GENERATE_DEPENDS = """You should answer the question based on the response output by the API tool.
  Please note that:
- 1. Try to organize the response into a natural language answer.
+ 1. You should try to organize the response into a natural language answer.
  2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
  3. If the API tool does not provide useful information in the response, please answer with your knowledge.
  4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers.
@@ -110,7 +110,7 @@ Please note that:
  This is a reflection from a previous failed attempt:
  {reflections}

- There are logs of previous questions and answers:
+ These are logs of previous questions and answers:
  {previous_log}

  This is the user's question: {question}
@@ -121,7 +121,7 @@ This is the response output by the API tool:
  We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
  Output: """

- ANSWER_SUMMARIZE_DEPENDS = """We break down a complex user's problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question
+ ANSWER_SUMMARIZE_DEPENDS = """We break down a user's complex problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question
  This is the user's question: {question}

  These are subtasks and their answers:
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/image_utils.py

@@ -78,7 +78,7 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
  data = Image.open(data)
  if isinstance(data, Image.Image):
  buffer = BytesIO()
- data.save(buffer, format="PNG")
+ data.convert("RGB").save(buffer, format="JPEG")
  return base64.b64encode(buffer.getvalue()).decode("utf-8")
  else:
  arr_bytes = data.tobytes()
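With this change, PIL images are JPEG-encoded after an RGB conversion instead of PNG-encoded. A small sketch (in-memory image, no file needed) showing that an RGBA input now goes through the JPEG path without error:

```python
>>> from PIL import Image
>>> from vision_agent.image_utils import convert_to_b64
>>> rgba = Image.new("RGBA", (64, 64), (255, 0, 0, 128))
>>> b64 = convert_to_b64(rgba)  # base64-encoded JPEG bytes as a str
```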
vision_agent-0.0.50/vision_agent/llm/__init__.py

@@ -0,0 +1 @@
+ from .llm import LLM, AzureOpenAILLM, OpenAILLM
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/llm/llm.py

@@ -1,8 +1,9 @@
  import json
+ import os
  from abc import ABC, abstractmethod
- from typing import Any, Callable, Dict, List, Mapping, Union, cast
+ from typing import Any, Callable, Dict, List, Mapping, Optional, Union, cast

- from openai import OpenAI
+ from openai import AzureOpenAI, OpenAI

  from vision_agent.tools import (
  CHOOSE_PARAMS,
@@ -33,11 +34,16 @@ class OpenAILLM(LLM):
  def __init__(
  self,
  model_name: str = "gpt-4-turbo-preview",
+ api_key: Optional[str] = None,
  json_mode: bool = False,
  **kwargs: Any
  ):
+ if not api_key:
+ self.client = OpenAI()
+ else:
+ self.client = OpenAI(api_key=api_key)
+
  self.model_name = model_name
- self.client = OpenAI()
  self.kwargs = kwargs
  if json_mode:
  self.kwargs["response_format"] = {"type": "json_object"}
@@ -120,3 +126,32 @@ class OpenAILLM(LLM):
  ]

  return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})
+
+
+ class AzureOpenAILLM(OpenAILLM):
+ def __init__(
+ self,
+ model_name: str = "gpt-4-turbo-preview",
+ api_key: Optional[str] = None,
+ api_version: str = "2024-02-01",
+ azure_endpoint: Optional[str] = None,
+ json_mode: bool = False,
+ **kwargs: Any
+ ):
+ if not api_key:
+ api_key = os.getenv("AZURE_OPENAI_API_KEY")
+ if not azure_endpoint:
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
+
+ if not api_key:
+ raise ValueError("Azure OpenAI API key is required.")
+ if not azure_endpoint:
+ raise ValueError("Azure OpenAI endpoint is required.")
+
+ self.client = AzureOpenAI(
+ api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
+ )
+ self.model_name = model_name
+ self.kwargs = kwargs
+ if json_mode:
+ self.kwargs["response_format"] = {"type": "json_object"}
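A brief sketch of constructing the new `AzureOpenAILLM`, either with explicit credentials (the endpoint below is a placeholder) or by relying on the `AZURE_OPENAI_API_KEY`/`AZURE_OPENAI_ENDPOINT` fallbacks shown above:

```python
>>> from vision_agent.llm import AzureOpenAILLM
>>> llm = AzureOpenAILLM(api_key="...", azure_endpoint="https://<resource>.openai.azure.com")
>>> # or, with the two environment variables set:
>>> llm = AzureOpenAILLM(json_mode=True)
```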
vision_agent-0.0.50/vision_agent/lmm/__init__.py

@@ -0,0 +1 @@
+ from .lmm import LMM, AzureOpenAILMM, LLaVALMM, OpenAILMM, get_lmm
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/lmm/lmm.py

@@ -1,12 +1,13 @@
  import base64
  import json
  import logging
+ import os
  from abc import ABC, abstractmethod
  from pathlib import Path
  from typing import Any, Callable, Dict, List, Optional, Union, cast

  import requests
- from openai import OpenAI
+ from openai import AzureOpenAI, OpenAI

  from vision_agent.tools import (
  CHOOSE_PARAMS,
@@ -99,12 +100,18 @@ class OpenAILMM(LMM):
  def __init__(
  self,
  model_name: str = "gpt-4-vision-preview",
+ api_key: Optional[str] = None,
  max_tokens: int = 1024,
  **kwargs: Any,
  ):
+ if not api_key:
+ self.client = OpenAI()
+ else:
+ self.client = OpenAI(api_key=api_key)
+
+ self.client = OpenAI(api_key=api_key)
  self.model_name = model_name
  self.max_tokens = max_tokens
- self.client = OpenAI()
  self.kwargs = kwargs

  def __call__(
@@ -248,6 +255,34 @@ class OpenAILMM(LMM):
  return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})


+ class AzureOpenAILMM(OpenAILMM):
+ def __init__(
+ self,
+ model_name: str = "gpt-4-vision-preview",
+ api_key: Optional[str] = None,
+ api_version: str = "2024-02-01",
+ azure_endpoint: Optional[str] = None,
+ max_tokens: int = 1024,
+ **kwargs: Any,
+ ):
+ if not api_key:
+ api_key = os.getenv("AZURE_OPENAI_API_KEY")
+ if not azure_endpoint:
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
+
+ if not api_key:
+ raise ValueError("OpenAI API key is required.")
+ if not azure_endpoint:
+ raise ValueError("Azure OpenAI endpoint is required.")
+
+ self.client = AzureOpenAI(
+ api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
+ )
+ self.model_name = model_name
+ self.max_tokens = max_tokens
+ self.kwargs = kwargs
+
+
  def get_lmm(name: str) -> LMM:
  if name == "openai":
  return OpenAILMM(name)
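`AzureOpenAILMM` mirrors the LLM counterpart for the vision model; a minimal sketch assuming the two Azure environment variables are set:

```python
>>> from vision_agent.lmm import AzureOpenAILMM
>>> lmm = AzureOpenAILMM(max_tokens=512)
```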
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/tools/prompts.py

@@ -6,14 +6,14 @@ CHOOSE_PARAMS = (
  "This is the API tool documentation: {api_doc}\n"
  "Please note that: \n"
  "1. The Example in the API tool documentation can help you better understand the use of the API.\n"
- '2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}\n'
+ '2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no parameters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}\n'
  "3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.\n"
  '4. If you need to use this API multiple times, please set "Parameters" to a list.\n'
- "5. You must ONLY output in a parsible JSON format. Two examples output looks like:\n"
+ "5. You must ONLY output in a parsible JSON format. Two example outputs look like:\n"
  "'''\n"
  'Example 1: {{"Parameters":{{"keyword": "Artificial Intelligence", "language": "English"}}}}\n'
  'Example 2: {{"Parameters":[{{"keyword": "Artificial Intelligence", "language": "English"}}, {{"keyword": "Machine Learning", "language": "English"}}]}}\n'
  "'''\n"
- "This is user's question: {question}\n"
+ "This is the user's question: {question}\n"
  "Output:\n"
  )
{vision_agent-0.0.47 → vision_agent-0.0.50}/vision_agent/tools/tools.py

@@ -51,6 +51,25 @@ class Tool(ABC):
  usage: Dict


+ class NoOp(Tool):
+ name = "noop_"
+ description = (
+ "'noop_' is a no-op tool that does nothing if you do not need to use a tool."
+ )
+ usage = {
+ "required_parameters": [],
+ "examples": [
+ {
+ "scenario": "If you do not want to use a tool.",
+ "parameters": {},
+ }
+ ],
+ }
+
+ def __call__(self) -> None:
+ return None
+
+
  class CLIP(Tool):
  r"""CLIP is a tool that can classify or tag any image given a set if input classes
  or tags.
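The new `NoOp` tool gives the task planner an explicit "use no tool" option; calling it simply returns `None`. A tiny sketch, importing directly from the module that defines it:

```python
>>> from vision_agent.tools.tools import NoOp
>>> NoOp()()  # returns None
```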
@@ -59,32 +78,32 @@ class CLIP(Tool):
  -------
  >>> import vision_agent as va
  >>> clip = va.tools.CLIP()
- >>> clip(["red line", "yellow dot"], "ct_scan1.jpg"))
+ >>> clip("red line, yellow dot", "ct_scan1.jpg"))
  [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
  """

- _ENDPOINT = "https://rb4ii6dfacmwqfxivi4aedyyfm0endsv.lambda-url.us-east-2.on.aws"
+ _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"

  name = "clip_"
  description = "'clip_' is a tool that can classify or tag any image given a set of input classes or tags."
  usage = {
  "required_parameters": [
- {"name": "prompt", "type": "List[str]"},
+ {"name": "prompt", "type": "str"},
  {"name": "image", "type": "str"},
  ],
  "examples": [
  {
  "scenario": "Can you classify this image as a cat? Image name: cat.jpg",
- "parameters": {"prompt": ["cat"], "image": "cat.jpg"},
+ "parameters": {"prompt": "cat", "image": "cat.jpg"},
  },
  {
  "scenario": "Can you tag this photograph with cat or dog? Image name: cat_dog.jpg",
- "parameters": {"prompt": ["cat", "dog"], "image": "cat_dog.jpg"},
+ "parameters": {"prompt": "cat, dog", "image": "cat_dog.jpg"},
  },
  {
  "scenario": "Can you build me a classifier that classifies red shirts, green shirts and other? Image name: shirts.jpg",
  "parameters": {
- "prompt": ["red shirt", "green shirt", "other"],
+ "prompt": "red shirt, green shirt, other",
  "image": "shirts.jpg",
  },
  },
@@ -92,11 +111,11 @@ class CLIP(Tool):
  }

  # TODO: Add support for input multiple images, which aligns with the output type.
- def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> Dict:
+ def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict:
  """Invoke the CLIP model.

  Parameters:
- prompt: a list of classes or tags to classify the image.
+ prompt: a string includes a list of classes or tags to classify the image.
  image: the input image to classify.

  Returns:
@@ -104,8 +123,9 @@ class CLIP(Tool):
  """
  image_b64 = convert_to_b64(image)
  data = {
- "classes": prompt,
- "images": [image_b64],
+ "prompt": prompt,
+ "image": image_b64,
+ "tool": "closed_set_image_classification",
  }
  res = requests.post(
  self._ENDPOINT,
@@ -119,10 +139,11 @@ class CLIP(Tool):
  _LOGGER.error(f"Request failed: {resp_json}")
  raise ValueError(f"Request failed: {resp_json}")

- rets = []
- for elt in resp_json["data"]:
- rets.append({"labels": prompt, "scores": [round(prob, 2) for prob in elt]})
- return cast(Dict, rets[0])
+ resp_json["data"]["scores"] = [
+ round(prob, 4) for prob in resp_json["data"]["scores"]
+ ]
+
+ return resp_json["data"] # type: ignore


  class GroundingDINO(Tool):
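Note that `CLIP` now takes its classes as one comma-separated string instead of a `List[str]`; a usage sketch mirroring the updated docstring (the image name is a placeholder, and the call hits the hosted endpoint):

```python
>>> import vision_agent as va
>>> clip = va.tools.CLIP()
>>> clip("red shirt, green shirt, other", "shirts.jpg")
```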
@@ -139,7 +160,7 @@ class GroundingDINO(Tool):
  'scores': [0.98, 0.02]}]
  """

- _ENDPOINT = "https://chnicr4kes5ku77niv2zoytggq0qyqlp.lambda-url.us-east-2.on.aws"
+ _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"

  name = "grounding_dino_"
  description = "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions."
@@ -148,6 +169,10 @@ class GroundingDINO(Tool):
  {"name": "prompt", "type": "str"},
  {"name": "image", "type": "str"},
  ],
+ "optional_parameters": [
+ {"name": "box_threshold", "type": "float"},
+ {"name": "iou_threshold", "type": "float"},
+ ],
  "examples": [
  {
  "scenario": "Can you build me a car detector?",
@@ -162,32 +187,44 @@ class GroundingDINO(Tool):
  "parameters": {
  "prompt": "red shirt. green shirt",
  "image": "shirts.jpg",
+ "box_threshold": 0.20,
+ "iou_threshold": 0.75,
  },
  },
  ],
  }

  # TODO: Add support for input multiple images, which aligns with the output type.
- def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> Dict:
+ def __call__(
+ self,
+ prompt: str,
+ image: Union[str, Path, ImageType],
+ box_threshold: float = 0.20,
+ iou_threshold: float = 0.75,
+ ) -> Dict:
  """Invoke the Grounding DINO model.

  Parameters:
  prompt: one or multiple class names to detect. The classes should be separated by a period if there are multiple classes. E.g. "big dog . small cat"
  image: the input image to run against.
+ box_threshold: the threshold to filter out the bounding boxes with low scores.
+ iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold.

  Returns:
  A list of dictionaries containing the labels, scores, and bboxes. Each dictionary contains the detection result for an image.
  """
  image_size = get_image_size(image)
  image_b64 = convert_to_b64(image)
- data = {
+ request_data = {
  "prompt": prompt,
- "images": [image_b64],
+ "image": image_b64,
+ "tool": "visual_grounding",
+ "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
  }
  res = requests.post(
  self._ENDPOINT,
  headers={"Content-Type": "application/json"},
- json=data,
+ json=request_data,
  )
  resp_json: Dict[str, Any] = res.json()
  if (
@@ -195,16 +232,15 @@ class GroundingDINO(Tool):
  ) or "statusCode" not in resp_json:
  _LOGGER.error(f"Request failed: {resp_json}")
  raise ValueError(f"Request failed: {resp_json}")
- resp_data = resp_json["data"]
- for elt in resp_data:
- if "bboxes" in elt:
- elt["bboxes"] = [
- normalize_bbox(box, image_size) for box in elt["bboxes"]
- ]
- if "scores" in elt:
- elt["scores"] = [round(score, 2) for score in elt["scores"]]
- elt["size"] = (image_size[1], image_size[0])
- return cast(Dict, resp_data)
+ data: Dict[str, Any] = resp_json["data"]
+ if "bboxes" in data:
+ data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]]
+ if "scores" in data:
+ data["scores"] = [round(score, 2) for score in data["scores"]]
+ if "labels" in data:
+ data["labels"] = [label for label in data["labels"]]
+ data["size"] = (image_size[1], image_size[0])
+ return data


  class GroundingSAM(Tool):
@@ -215,7 +251,7 @@ class GroundingSAM(Tool):
  -------
  >>> import vision_agent as va
  >>> t = va.tools.GroundingSAM()
- >>> t(["red line", "yellow dot"], ct_scan1.jpg"])
+ >>> t("red line, yellow dot", "ct_scan1.jpg"])
  [{'labels': ['yellow dot', 'red line'],
  'bboxes': [[0.38, 0.15, 0.59, 0.7], [0.48, 0.25, 0.69, 0.71]],
  'masks': [array([[0, 0, 0, ..., 0, 0, 0],
@@ -230,55 +266,71 @@ class GroundingSAM(Tool):
  [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
  """

- _ENDPOINT = "https://cou5lfmus33jbddl6hoqdfbw7e0qidrw.lambda-url.us-east-2.on.aws"
+ _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"

  name = "grounding_sam_"
  description = "'grounding_sam_' is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions."
  usage = {
  "required_parameters": [
- {"name": "prompt", "type": "List[str]"},
+ {"name": "prompt", "type": "str"},
  {"name": "image", "type": "str"},
  ],
+ "optional_parameters": [
+ {"name": "box_threshold", "type": "float"},
+ {"name": "iou_threshold", "type": "float"},
+ ],
  "examples": [
  {
  "scenario": "Can you build me a car segmentor?",
- "parameters": {"prompt": ["car"], "image": ""},
+ "parameters": {"prompt": "car", "image": ""},
  },
  {
  "scenario": "Can you segment the person on the left? Image name: person.jpg",
- "parameters": {"prompt": ["person on the left"], "image": "person.jpg"},
+ "parameters": {"prompt": "person on the left", "image": "person.jpg"},
  },
  {
  "scenario": "Can you build me a tool that segments red shirts and green shirts? Image name: shirts.jpg",
  "parameters": {
- "prompt": ["red shirt", "green shirt"],
+ "prompt": "red shirt, green shirt",
  "image": "shirts.jpg",
+ "box_threshold": 0.20,
+ "iou_threshold": 0.75,
  },
  },
  ],
  }

  # TODO: Add support for input multiple images, which aligns with the output type.
- def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> Dict:
+ def __call__(
+ self,
+ prompt: str,
+ image: Union[str, ImageType],
+ box_threshold: float = 0.2,
+ iou_threshold: float = 0.75,
+ ) -> Dict:
  """Invoke the Grounding SAM model.

  Parameters:
  prompt: a list of classes to segment.
  image: the input image to segment.
+ box_threshold: the threshold to filter out the bounding boxes with low scores.
+ iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold.

  Returns:
  A list of dictionaries containing the labels, scores, bboxes and masks. Each dictionary contains the segmentation result for an image.
  """
  image_size = get_image_size(image)
  image_b64 = convert_to_b64(image)
- data = {
- "classes": prompt,
+ request_data = {
+ "prompt": prompt,
  "image": image_b64,
+ "tool": "visual_grounding_segment",
+ "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
  }
  res = requests.post(
  self._ENDPOINT,
  headers={"Content-Type": "application/json"},
- json=data,
+ json=request_data,
  )
  resp_json: Dict[str, Any] = res.json()
  if (
@@ -286,14 +338,19 @@ class GroundingSAM(Tool):
  ) or "statusCode" not in resp_json:
  _LOGGER.error(f"Request failed: {resp_json}")
  raise ValueError(f"Request failed: {resp_json}")
- resp_data = resp_json["data"]
+ data: Dict[str, Any] = resp_json["data"]
  ret_pred: Dict[str, List] = {"labels": [], "bboxes": [], "masks": []}
- for pred in resp_data["preds"]:
- encoded_mask = pred["encoded_mask"]
- mask = rle_decode(mask_rle=encoded_mask, shape=pred["mask_shape"])
- ret_pred["labels"].append(pred["label_name"])
- ret_pred["bboxes"].append(normalize_bbox(pred["bbox"], image_size))
- ret_pred["masks"].append(mask)
+ if "bboxes" in data:
+ ret_pred["bboxes"] = [
+ normalize_bbox(box, image_size) for box in data["bboxes"]
+ ]
+ if "masks" in data:
+ ret_pred["masks"] = [
+ rle_decode(mask_rle=mask, shape=data["mask_shape"])
+ for mask in data["masks"]
+ ]
+ ret_pred["labels"] = data["labels"]
+ ret_pred["scores"] = data["scores"]
  return ret_pred
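Both `GroundingDINO` above and `GroundingSAM` here now accept optional `box_threshold` and `iou_threshold` arguments that are forwarded to the service. A sketch based on the updated usage examples (image names are placeholders):

```python
>>> import vision_agent as va
>>> va.tools.GroundingDINO()("red shirt . green shirt", "shirts.jpg", box_threshold=0.2, iou_threshold=0.75)
>>> va.tools.GroundingSAM()("red shirt, green shirt", "shirts.jpg", box_threshold=0.2, iou_threshold=0.75)
```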
@@ -302,8 +359,14 @@ class AgentGroundingSAM(GroundingSAM):
  returns the file name. This makes it easier for agents to use.
  """

- def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> Dict:
- rets = super().__call__(prompt, image)
+ def __call__(
+ self,
+ prompt: str,
+ image: Union[str, ImageType],
+ box_threshold: float = 0.2,
+ iou_threshold: float = 0.75,
+ ) -> Dict:
+ rets = super().__call__(prompt, image, box_threshold, iou_threshold)
  mask_files = []
  for mask in rets["masks"]:
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
@@ -384,7 +447,7 @@ class BboxArea(Tool):
  name = "bbox_area_"
  description = "'bbox_area_' returns the area of the bounding box in pixels normalized to 2 decimal places."
  usage = {
- "required_parameters": [{"name": "bbox", "type": "List[int]"}],
+ "required_parameters": [{"name": "bboxes", "type": "List[int]"}],
  "examples": [
  {
  "scenario": "If you want to calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
@@ -426,7 +489,8 @@ class SegArea(Tool):
  def __call__(self, masks: Union[str, Path]) -> float:
  pil_mask = Image.open(str(masks))
  np_mask = np.array(pil_mask)
- return cast(float, round(np.sum(np_mask) / 255, 2))
+ np_mask = np.clip(np_mask, 0, 1)
+ return cast(float, round(np.sum(np_mask), 2))


  class BboxIoU(Tool):
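The `SegArea` change counts foreground pixels directly: the old code assumed masks saved with 255 for foreground and divided the sum by 255, while the new code clips values to 0/1 before summing, which behaves sensibly whether a mask is stored as 0/255 or 0/1. A standalone sketch of the arithmetic (not calling the tool itself):

```python
>>> import numpy as np
>>> mask = np.array([[0, 255], [255, 255]], dtype=np.uint8)
>>> float(np.sum(np.clip(mask, 0, 1)))  # 3 foreground pixels
3.0
```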
@@ -612,6 +676,7 @@ TOOLS = {
  i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c}
  for i, c in enumerate(
  [
+ NoOp,
  CLIP,
  GroundingDINO,
  AgentGroundingSAM,
vision_agent-0.0.47/vision_agent/llm/__init__.py

@@ -1 +0,0 @@
- from .llm import LLM, OpenAILLM
vision_agent-0.0.47/vision_agent/lmm/__init__.py

@@ -1 +0,0 @@
- from .lmm import LMM, LLaVALMM, OpenAILMM, get_lmm