vision-agent 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {vision_agent-0.2.2 → vision_agent-0.2.4}/PKG-INFO +34 -5
  2. {vision_agent-0.2.2 → vision_agent-0.2.4}/README.md +33 -4
  3. {vision_agent-0.2.2 → vision_agent-0.2.4}/pyproject.toml +1 -1
  4. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/agent/vision_agent.py +28 -12
  5. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/llm/llm.py +5 -0
  6. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/lmm/lmm.py +13 -4
  7. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/tools/__init__.py +4 -0
  8. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/tools/tools.py +233 -20
  9. {vision_agent-0.2.2 → vision_agent-0.2.4}/LICENSE +0 -0
  10. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/__init__.py +0 -0
  11. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/agent/__init__.py +0 -0
  12. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/agent/agent.py +0 -0
  13. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/agent/easytool.py +0 -0
  14. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/agent/easytool_prompts.py +0 -0
  15. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/agent/reflexion.py +0 -0
  16. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/agent/reflexion_prompts.py +0 -0
  17. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/agent/vision_agent_prompts.py +0 -0
  18. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/fonts/__init__.py +0 -0
  19. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  20. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/image_utils.py +0 -0
  21. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/llm/__init__.py +0 -0
  22. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/lmm/__init__.py +0 -0
  23. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/tools/prompts.py +0 -0
  24. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/tools/video.py +0 -0
  25. {vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/type_defs.py +0 -0
{vision_agent-0.2.2 → vision_agent-0.2.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
- Version: 0.2.2
+ Version: 0.2.4
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

@@ -58,7 +58,7 @@ pip install vision-agent
 ```

 Ensure you have an OpenAI API key and set it as an environment variable (if you are
- using Azure OpenAI please see the additional setup section):
+ using Azure OpenAI please see the Azure setup section):

 ```bash
 export OPENAI_API_KEY="your-api-key"

@@ -123,26 +123,55 @@ you. For example:
 }]
 ```

+ #### Custom Tools
+ You can also add your own custom tools for your vision agent to use:
+
+ ```python
+ >>> from vision_agent.tools import Tool, register_tool
+ >>> @register_tool
+ >>> class NumItems(Tool):
+ >>> name = "num_items_"
+ >>> description = "Returns the number of items in a list."
+ >>> usage = {
+ >>> "required_parameters": [{"name": "prompt", "type": "list"}],
+ >>> "examples": [
+ >>> {
+ >>> "scenario": "How many items are in this list? ['a', 'b', 'c']",
+ >>> "parameters": {"prompt": "['a', 'b', 'c']"},
+ >>> }
+ >>> ],
+ >>> }
+ >>> def __call__(self, prompt: list[str]) -> int:
+ >>> return len(prompt)
+ ```
+ This will register it with the list of tools Vision Agent has access to. It will be able
+ to pick it based on the tool description and use it based on the usage provided.
+
+ #### Tool List
 | Tool | Description |
 | --- | --- |
 | CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
+ | ImageCaption| ImageCaption is a tool that can generate a caption for an image. |
 | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
 | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
- | Counter | Counter detects and counts the number of objects in an image given an input such as a category name or referring expression. |
+ | DINOv | DINOv is a tool that can detect arbitrary objects with using a referring mask. |
+ | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
 | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
 | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
 | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
 | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
- | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
+ | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
+ | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for check if one box is contained within another box. |
 | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
 | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
+ | OCR | OCR returns the text detected in an image along with the location. |


 It also has a basic set of calculate tools such as add, subtract, multiply and divide.

- ### Additional Setup
+ ### Azure Setup
 If you want to use Azure OpenAI models, you can set the environment variable:

 ```bash
{vision_agent-0.2.2 → vision_agent-0.2.4}/README.md

@@ -31,7 +31,7 @@ pip install vision-agent
 ```

 Ensure you have an OpenAI API key and set it as an environment variable (if you are
- using Azure OpenAI please see the additional setup section):
+ using Azure OpenAI please see the Azure setup section):

 ```bash
 export OPENAI_API_KEY="your-api-key"

@@ -96,26 +96,55 @@ you. For example:
 }]
 ```

+ #### Custom Tools
+ You can also add your own custom tools for your vision agent to use:
+
+ ```python
+ >>> from vision_agent.tools import Tool, register_tool
+ >>> @register_tool
+ >>> class NumItems(Tool):
+ >>> name = "num_items_"
+ >>> description = "Returns the number of items in a list."
+ >>> usage = {
+ >>> "required_parameters": [{"name": "prompt", "type": "list"}],
+ >>> "examples": [
+ >>> {
+ >>> "scenario": "How many items are in this list? ['a', 'b', 'c']",
+ >>> "parameters": {"prompt": "['a', 'b', 'c']"},
+ >>> }
+ >>> ],
+ >>> }
+ >>> def __call__(self, prompt: list[str]) -> int:
+ >>> return len(prompt)
+ ```
+ This will register it with the list of tools Vision Agent has access to. It will be able
+ to pick it based on the tool description and use it based on the usage provided.
+
+ #### Tool List
 | Tool | Description |
 | --- | --- |
 | CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
+ | ImageCaption| ImageCaption is a tool that can generate a caption for an image. |
 | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
 | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
- | Counter | Counter detects and counts the number of objects in an image given an input such as a category name or referring expression. |
+ | DINOv | DINOv is a tool that can detect arbitrary objects with using a referring mask. |
+ | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
 | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
 | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
 | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
 | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
- | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
+ | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
+ | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for check if one box is contained within another box. |
 | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
 | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
+ | OCR | OCR returns the text detected in an image along with the location. |


 It also has a basic set of calculate tools such as add, subtract, multiply and divide.

- ### Additional Setup
+ ### Azure Setup
 If you want to use Azure OpenAI models, you can set the environment variable:

 ```bash
{vision_agent-0.2.2 → vision_agent-0.2.4}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "vision-agent"
- version = "0.2.2"
+ version = "0.2.4"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
{vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/agent/vision_agent.py

@@ -377,6 +377,7 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
 "dinov_",
 "zero_shot_counting_",
 "visual_prompt_counting_",
+ "ocr_",
 ]:
 continue

@@ -428,7 +429,7 @@ class VisionAgent(Agent):
 ):
 """VisionAgent constructor.

- Parameters
+ Parameters:
 task_model: the model to use for task decomposition.
 answer_model: the model to use for reasoning and concluding the answer.
 reflect_model: the model to use for self reflection.

@@ -504,24 +505,39 @@
 reference_data: Optional[Dict[str, str]] = None,
 visualize_output: Optional[bool] = False,
 ) -> Tuple[str, List[Dict]]:
+ """Chat with the vision agent and return the final answer and all tool results.
+
+ Parameters:
+ chat: a conversation in the format of
+ [{"role": "user", "content": "describe your task here..."}].
+ image: the input image referenced in the chat parameter.
+ reference_data: a dictionary containing the reference image and mask. in the
+ format of {"image": "image.jpg", "mask": "mask.jpg}
+ visualize_output: whether to visualize the output.
+
+ Returns:
+ A tuple where the first item is the final answer and the second item is a
+ list of all the tool results. The last item in the tool results also
+ contains the visualized output.
+ """
 question = chat[0]["content"]
 if image:
 question += f" Image name: {image}"
 if reference_data:
- if not (
- "image" in reference_data
- and ("mask" in reference_data or "bbox" in reference_data)
- ):
- raise ValueError(
- f"Reference data must contain 'image' and a visual prompt which can be 'mask' or 'bbox'. but got {reference_data}"
- )
- visual_prompt_data = (
- f"Reference mask: {reference_data['mask']}"
+ question += (
+ f" Reference image: {reference_data['image']}"
+ if "image" in reference_data
+ else ""
+ )
+ question += (
+ f" Reference mask: {reference_data['mask']}"
 if "mask" in reference_data
- else f"Reference bbox: {reference_data['bbox']}"
+ else ""
 )
 question += (
- f" Reference image: {reference_data['image']}, {visual_prompt_data}"
+ f" Reference bbox: {reference_data['bbox']}"
+ if "bbox" in reference_data
+ else ""
 )

 reflections = ""
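
Note on the reference_data change above: instead of rejecting incomplete reference data, the chat method now appends whichever of the reference image, mask, or bbox is present to the question. A minimal sketch of the new behavior (illustrative only; the standalone helper and the file names are hypothetical, not part of the package):

```python
# Illustrative sketch of the reference_data handling shown in the hunk above.
def append_reference_data(question: str, reference_data: dict) -> str:
    if "image" in reference_data:
        question += f" Reference image: {reference_data['image']}"
    if "mask" in reference_data:
        question += f" Reference mask: {reference_data['mask']}"
    if "bbox" in reference_data:
        question += f" Reference bbox: {reference_data['bbox']}"
    return question

# append_reference_data("Count the flowers.", {"image": "garden.jpg", "mask": "garden_mask.jpg"})
# -> "Count the flowers. Reference image: garden.jpg Reference mask: garden_mask.jpg"
```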

{vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/llm/llm.py

@@ -131,6 +131,11 @@ class OpenAILLM(LLM):
 def generate_zero_shot_counter(self, question: str) -> Callable:
 return lambda x: ZeroShotCounting()(**{"image": x})

+ def generate_image_qa_tool(self, question: str) -> Callable:
+ from vision_agent.tools import ImageQuestionAnswering
+
+ return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+

 class AzureOpenAILLM(OpenAILLM):
 def __init__(

{vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/lmm/lmm.py

@@ -11,11 +11,7 @@ from openai import AzureOpenAI, OpenAI

 from vision_agent.tools import (
 CHOOSE_PARAMS,
- CLIP,
 SYSTEM_PROMPT,
- GroundingDINO,
- GroundingSAM,
- ZeroShotCounting,
 )

 _LOGGER = logging.getLogger(__name__)

@@ -205,6 +201,8 @@ class OpenAILMM(LMM):
 return cast(str, response.choices[0].message.content)

 def generate_classifier(self, question: str) -> Callable:
+ from vision_agent.tools import CLIP
+
 api_doc = CLIP.description + "\n" + str(CLIP.usage)
 prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
 response = self.client.chat.completions.create(

@@ -228,6 +226,8 @@ class OpenAILMM(LMM):
 return lambda x: CLIP()(**{"prompt": params["prompt"], "image": x})

 def generate_detector(self, question: str) -> Callable:
+ from vision_agent.tools import GroundingDINO
+
 api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage)
 prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
 response = self.client.chat.completions.create(

@@ -251,6 +251,8 @@ class OpenAILMM(LMM):
 return lambda x: GroundingDINO()(**{"prompt": params["prompt"], "image": x})

 def generate_segmentor(self, question: str) -> Callable:
+ from vision_agent.tools import GroundingSAM
+
 api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage)
 prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
 response = self.client.chat.completions.create(

@@ -274,8 +276,15 @@ class OpenAILMM(LMM):
 return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})

 def generate_zero_shot_counter(self, question: str) -> Callable:
+ from vision_agent.tools import ZeroShotCounting
+
 return lambda x: ZeroShotCounting()(**{"image": x})

+ def generate_image_qa_tool(self, question: str) -> Callable:
+ from vision_agent.tools import ImageQuestionAnswering
+
+ return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+

 class AzureOpenAILMM(OpenAILMM):
 def __init__(
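
The imports moved inside the generate_* methods above are presumably there to break a circular dependency: vision_agent/tools/tools.py now imports OpenAILMM (see the tools.py hunks below), so lmm.py can no longer import the tool classes at module load time. A hedged sketch of how the new generate_image_qa_tool factory might be used (the question and file name are placeholders):

```python
from vision_agent.lmm import OpenAILMM

lmm = OpenAILMM()
# Bind the question once; the returned callable runs ImageQuestionAnswering on any image.
count_pallets = lmm.generate_image_qa_tool("How many pallets are visible?")
answer = count_pallets("warehouse.jpg")  # -> ImageQuestionAnswering()(prompt=..., image="warehouse.jpg")
```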

{vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/tools/__init__.py

@@ -1,6 +1,7 @@
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import ( # Counter,
 CLIP,
+ OCR,
 TOOLS,
 BboxArea,
 BboxIoU,

@@ -13,7 +14,10 @@ from .tools import ( # Counter,
 ImageCaption,
 ZeroShotCounting,
 VisualPromptCounting,
+ VisualQuestionAnswering,
+ ImageQuestionAnswering,
 SegArea,
 SegIoU,
 Tool,
+ register_tool,
 )

{vision_agent-0.2.2 → vision_agent-0.2.4}/vision_agent/tools/tools.py

@@ -1,8 +1,9 @@
+ import io
 import logging
 import tempfile
 from abc import ABC
 from pathlib import Path
- from typing import Any, Dict, List, Tuple, Union, cast
+ from typing import Any, Dict, List, Tuple, Type, Union, cast

 import numpy as np
 import requests

@@ -11,13 +12,14 @@ from PIL.Image import Image as ImageType

 from vision_agent.image_utils import (
 convert_to_b64,
+ denormalize_bbox,
 get_image_size,
- rle_decode,
 normalize_bbox,
- denormalize_bbox,
+ rle_decode,
 )
 from vision_agent.tools.video import extract_frames_from_video
 from vision_agent.type_defs import LandingaiAPIKey
+ from vision_agent.lmm import OpenAILMM

 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key

@@ -29,6 +31,9 @@ class Tool(ABC):
 description: str
 usage: Dict

+ def __call__(self, *args: Any, **kwargs: Any) -> Any:
+ raise NotImplementedError
+

 class NoOp(Tool):
 name = "noop_"

@@ -108,8 +113,7 @@ class CLIP(Tool):


 class ImageCaption(Tool):
- r"""ImageCaption is a tool that can caption an image based on its contents
- or tags.
+ r"""ImageCaption is a tool that can caption an image based on its contents or tags.

 Example
 -------

@@ -120,26 +124,20 @@ class ImageCaption(Tool):
 """

 name = "image_caption_"
- description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
+ description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image."
 usage = {
 "required_parameters": [
 {"name": "image", "type": "str"},
 ],
 "examples": [
 {
- "scenario": "Can you describe this image ? Image name: cat.jpg",
+ "scenario": "Can you describe this image? Image name: cat.jpg",
 "parameters": {"image": "cat.jpg"},
 },
 {
- "scenario": "Can you caption this image with their main contents ? Image name: cat_dog.jpg",
+ "scenario": "Can you caption this image with their main contents? Image name: cat_dog.jpg",
 "parameters": {"image": "cat_dog.jpg"},
 },
- {
- "scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
- "parameters": {
- "image": "shirts.jpg",
- },
- },
 ],
 }

@@ -487,15 +485,15 @@ class ZeroShotCounting(Tool):
 ],
 "examples": [
 {
- "scenario": "Can you count the lids in the image ? Image name: lids.jpg",
+ "scenario": "Can you count the lids in the image? Image name: lids.jpg",
 "parameters": {"image": "lids.jpg"},
 },
 {
- "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
+ "scenario": "Can you count the total number of objects in this image? Image name: tray.jpg",
 "parameters": {"image": "tray.jpg"},
 },
 {
- "scenario": "Can you build me an object counting tool ? Image name: shirts.jpg",
+ "scenario": "Can you build me an object counting tool? Image name: shirts.jpg",
 "parameters": {
 "image": "shirts.jpg",
 },

@@ -505,7 +503,7 @@ class ZeroShotCounting(Tool):

 # TODO: Add support for input multiple images, which aligns with the output type.
 def __call__(self, image: Union[str, ImageType]) -> Dict:
- """Invoke the Image captioning model.
+ """Invoke the Zero shot counting model.

 Parameters:
 image: the input image.

@@ -569,7 +567,7 @@ class VisualPromptCounting(Tool):

 # TODO: Add support for input multiple images, which aligns with the output type.
 def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
- """Invoke the Image captioning model.
+ """Invoke the few shot counting model.

 Parameters:
 image: the input image.

@@ -590,6 +588,144 @@
 return _send_inference_request(data, "tools")


+ class VisualQuestionAnswering(Tool):
+ r"""VisualQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
+
+ Example
+ -------
+ >>> import vision_agent as va
+ >>> vqa_tool = va.tools.VisualQuestionAnswering()
+ >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
+ {'text': "The image contains a cat sitting on a table with a bowl of milk."}
+ """
+
+ name = "visual_question_answering_"
+ description = "'visual_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image."
+
+ usage = {
+ "required_parameters": [
+ {"name": "image", "type": "str"},
+ {"name": "prompt", "type": "str"},
+ ],
+ "examples": [
+ {
+ "scenario": "Describe this image in detail. Image name: cat.jpg",
+ "parameters": {
+ "image": "cats.jpg",
+ "prompt": "Describe this image in detail",
+ },
+ },
+ {
+ "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
+ "parameters": {
+ "image": "sign.jpg",
+ "prompt": "Can you help me with this street sign ? What does it say ?",
+ },
+ },
+ {
+ "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
+ "parameters": {
+ "image": "weather.jpg",
+ "prompt": "Describe the weather in the image for me ",
+ },
+ },
+ {
+ "scenario": "Which 2 are the least frequent bins in this histogram ? Image name: chart.jpg",
+ "parameters": {
+ "image": "chart.jpg",
+ "prompt": "Which 2 are the least frequent bins in this histogram",
+ },
+ },
+ ],
+ }
+
+ def __call__(self, image: str, prompt: str) -> Dict:
+ """Invoke the visual question answering model.
+
+ Parameters:
+ image: the input image.
+
+ Returns:
+ A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
+ """
+
+ gpt = OpenAILMM()
+ return {"text": gpt(input=prompt, images=[image])}
+
+
+ class ImageQuestionAnswering(Tool):
+ r"""ImageQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
+ It is same as VisualQuestionAnswering but this tool is not used by agents. It is used when user requests a tool for VQA using generate_image_qa_tool function.
+ It is also useful if the user wants the data to be not exposed to OpenAI endpoints
+
+ Example
+ -------
+ >>> import vision_agent as va
+ >>> vqa_tool = va.tools.ImageQuestionAnswering()
+ >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
+ {'text': "The image contains a cat sitting on a table with a bowl of milk."}
+ """
+
+ name = "image_question_answering_"
+ description = "'image_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image."
+
+ usage = {
+ "required_parameters": [
+ {"name": "image", "type": "str"},
+ {"name": "prompt", "type": "str"},
+ ],
+ "examples": [
+ {
+ "scenario": "Describe this image in detail. Image name: cat.jpg",
+ "parameters": {
+ "image": "cats.jpg",
+ "prompt": "Describe this image in detail",
+ },
+ },
+ {
+ "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
+ "parameters": {
+ "image": "sign.jpg",
+ "prompt": "Can you help me with this street sign ? What does it say ?",
+ },
+ },
+ {
+ "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
+ "parameters": {
+ "image": "weather.jpg",
+ "prompt": "Describe the weather in the image for me ",
+ },
+ },
+ {
+ "scenario": "Can you generate an image question answering tool ? Image name: chart.jpg, prompt: Which 2 are the least frequent bins in this histogram",
+ "parameters": {
+ "image": "chart.jpg",
+ "prompt": "Which 2 are the least frequent bins in this histogram",
+ },
+ },
+ ],
+ }
+
+ def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
+ """Invoke the visual question answering model.
+
+ Parameters:
+ image: the input image.
+
+ Returns:
+ A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
+ """
+
+ image_b64 = convert_to_b64(image)
+ data = {
+ "image": image_b64,
+ "prompt": prompt,
+ "tool": "image_question_answering",
+ }
+
+ return _send_inference_request(data, "tools")
+
+
 class Crop(Tool):
 r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""

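The two new classes overlap on purpose: VisualQuestionAnswering answers via OpenAILMM (an OpenAI endpoint), while ImageQuestionAnswering sends the request to the hosted tools endpoint, which keeps the image data away from OpenAI. A usage sketch based on the docstrings above (the file name and output are illustrative):

```python
import vision_agent as va

# Agent-facing tool: answers by calling OpenAILMM under the hood.
vqa = va.tools.VisualQuestionAnswering()
print(vqa(image="image1.jpg", prompt="describe this image in detail"))
# {'text': 'The image contains a cat sitting on a table with a bowl of milk.'}

# Same interface, but routed through the hosted "tools" inference endpoint instead of OpenAI.
iqa = va.tools.ImageQuestionAnswering()
print(iqa(image="image1.jpg", prompt="describe this image in detail"))
```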

@@ -865,6 +1001,57 @@ class ExtractFrames(Tool):
 return result


+ class OCR(Tool):
+ name = "ocr_"
+ description = "'ocr_' extracts text from an image."
+ usage = {
+ "required_parameters": [
+ {"name": "image", "type": "str"},
+ ],
+ "examples": [
+ {
+ "scenario": "Can you extract the text from this image? Image name: image.png",
+ "parameters": {"image": "image.png"},
+ },
+ ],
+ }
+ _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
+ _URL = "https://app.landing.ai/ocr/v1/detect-text"
+
+ def __call__(self, image: str) -> dict:
+ pil_image = Image.open(image).convert("RGB")
+ image_size = pil_image.size[::-1]
+ image_buffer = io.BytesIO()
+ pil_image.save(image_buffer, format="PNG")
+ buffer_bytes = image_buffer.getvalue()
+ image_buffer.close()
+
+ res = requests.post(
+ self._URL,
+ files={"images": buffer_bytes},
+ data={"language": "en"},
+ headers={"contentType": "multipart/form-data", "apikey": self._API_KEY},
+ )
+ if res.status_code != 200:
+ _LOGGER.error(f"Request failed: {res.text}")
+ raise ValueError(f"Request failed: {res.text}")
+
+ data = res.json()
+ output: Dict[str, List] = {"labels": [], "bboxes": [], "scores": []}
+ for det in data[0]:
+ output["labels"].append(det["text"])
+ box = [
+ det["location"][0]["x"],
+ det["location"][0]["y"],
+ det["location"][2]["x"],
+ det["location"][2]["y"],
+ ]
+ box = normalize_bbox(box, image_size)
+ output["bboxes"].append(box)
+ output["scores"].append(round(det["score"], 2))
+ return output
+
+
 class Calculator(Tool):
 r"""Calculator is a tool that can perform basic arithmetic operations."""

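For reference, a hedged sketch of calling the new OCR tool and the shape of its result, inferred from the implementation above ("receipt.png" and the detected values are placeholders):

```python
from vision_agent.tools import OCR

result = OCR()(image="receipt.png")
# Per the __call__ implementation above, the output groups detections into three
# parallel lists, with bounding boxes normalized to the image size:
# {
#     "labels": ["TOTAL", "$12.50"],
#     "bboxes": [[0.12, 0.80, 0.25, 0.84], [0.70, 0.80, 0.85, 0.84]],  # x1, y1, x2, y2
#     "scores": [0.99, 0.97],
# }
```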

@@ -896,11 +1083,11 @@ TOOLS = {
 [
 NoOp,
 CLIP,
- ImageCaption,
 GroundingDINO,
 AgentGroundingSAM,
 ZeroShotCounting,
 VisualPromptCounting,
+ VisualQuestionAnswering,
 AgentDINOv,
 ExtractFrames,
 Crop,

@@ -910,6 +1097,7 @@ TOOLS = {
 SegIoU,
 BboxContains,
 BoxDistance,
+ OCR,
 Calculator,
 ]
 )

@@ -917,6 +1105,31 @@
 }


+ def register_tool(tool: Type[Tool]) -> Type[Tool]:
+ r"""Add a tool to the list of available tools.
+
+ Parameters:
+ tool: The tool to add.
+ """
+
+ if (
+ not hasattr(tool, "name")
+ or not hasattr(tool, "description")
+ or not hasattr(tool, "usage")
+ ):
+ raise ValueError(
+ "The tool must have 'name', 'description' and 'usage' attributes."
+ )
+
+ TOOLS[len(TOOLS)] = {
+ "name": tool.name,
+ "description": tool.description,
+ "usage": tool.usage,
+ "class": tool,
+ }
+ return tool
+
+
 def _send_inference_request(
 payload: Dict[str, Any], endpoint_name: str
 ) -> Dict[str, Any]:
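
The register_tool decorator above is what backs the new "Custom Tools" section in the README. A hedged sketch of its effect on the TOOLS registry, reusing the NumItems example from the README diff:

```python
from vision_agent.tools import TOOLS, Tool, register_tool

@register_tool
class NumItems(Tool):
    name = "num_items_"
    description = "Returns the number of items in a list."
    usage = {
        "required_parameters": [{"name": "prompt", "type": "list"}],
        "examples": [
            {
                "scenario": "How many items are in this list? ['a', 'b', 'c']",
                "parameters": {"prompt": "['a', 'b', 'c']"},
            }
        ],
    }

    def __call__(self, prompt: list) -> int:
        return len(prompt)

# register_tool appended the class under the next integer key with its metadata:
print(TOOLS[len(TOOLS) - 1])
# {'name': 'num_items_', 'description': 'Returns the number of items in a list.',
#  'usage': {...}, 'class': <class 'NumItems'>}
```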