vision-agent 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +12 -11
- vision_agent/llm/llm.py +5 -0
- vision_agent/lmm/lmm.py +13 -4
- vision_agent/tools/__init__.py +4 -0
- vision_agent/tools/tools.py +226 -6
- {vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/METADATA +29 -3
- {vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/RECORD +9 -9
- {vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py
CHANGED
@@ -377,6 +377,7 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
             "dinov_",
             "zero_shot_counting_",
             "visual_prompt_counting_",
+            "ocr_",
         ]:
             continue
 
@@ -523,20 +524,20 @@ class VisionAgent(Agent):
         if image:
             question += f" Image name: {image}"
         if reference_data:
-
-                "image
-
-
-
-
-
-            visual_prompt_data = (
-                f"Reference mask: {reference_data['mask']}"
+            question += (
+                f" Reference image: {reference_data['image']}"
+                if "image" in reference_data
+                else ""
+            )
+            question += (
+                f" Reference mask: {reference_data['mask']}"
                 if "mask" in reference_data
-                else
+                else ""
             )
             question += (
-                f" Reference
+                f" Reference bbox: {reference_data['bbox']}"
+                if "bbox" in reference_data
+                else ""
             )
 
         reflections = ""
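For reference, the added lines build the question string purely from whichever keys are present in `reference_data`; a minimal standalone sketch of that behavior (the question text and file paths are hypothetical):

```python
question = "Segment the object that matches my reference."
reference_data = {"image": "ref.jpg", "mask": "ref_mask.png"}  # hypothetical paths; "bbox" omitted

# Mirrors the new code above: absent keys contribute an empty string.
question += (
    f" Reference image: {reference_data['image']}" if "image" in reference_data else ""
)
question += (
    f" Reference mask: {reference_data['mask']}" if "mask" in reference_data else ""
)
question += (
    f" Reference bbox: {reference_data['bbox']}" if "bbox" in reference_data else ""
)
# question is now:
# "Segment the object that matches my reference. Reference image: ref.jpg Reference mask: ref_mask.png"
```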
vision_agent/llm/llm.py
CHANGED
@@ -131,6 +131,11 @@ class OpenAILLM(LLM):
     def generate_zero_shot_counter(self, question: str) -> Callable:
         return lambda x: ZeroShotCounting()(**{"image": x})
 
+    def generate_image_qa_tool(self, question: str) -> Callable:
+        from vision_agent.tools import ImageQuestionAnswering
+
+        return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+
 
 class AzureOpenAILLM(OpenAILLM):
     def __init__(
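For context, the new `generate_image_qa_tool` mirrors `generate_zero_shot_counter`: it closes over the question and returns a one-argument callable over the image. A minimal usage sketch; the `vision_agent.llm` re-export and the image path are assumptions, not taken from this diff:

```python
from vision_agent.llm import OpenAILLM  # assumed re-export; the class is defined in vision_agent/llm/llm.py

llm = OpenAILLM()
# Returns a callable that runs ImageQuestionAnswering with the fixed prompt on whatever image it receives.
answer_color = llm.generate_image_qa_tool("What color is the car?")
result = answer_color("car.jpg")  # hypothetical image path; result is the tool's {"text": ...}-style output
```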
vision_agent/lmm/lmm.py
CHANGED
@@ -11,11 +11,7 @@ from openai import AzureOpenAI, OpenAI
 
 from vision_agent.tools import (
     CHOOSE_PARAMS,
-    CLIP,
     SYSTEM_PROMPT,
-    GroundingDINO,
-    GroundingSAM,
-    ZeroShotCounting,
 )
 
 _LOGGER = logging.getLogger(__name__)
@@ -205,6 +201,8 @@ class OpenAILMM(LMM):
         return cast(str, response.choices[0].message.content)
 
     def generate_classifier(self, question: str) -> Callable:
+        from vision_agent.tools import CLIP
+
         api_doc = CLIP.description + "\n" + str(CLIP.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -228,6 +226,8 @@ class OpenAILMM(LMM):
         return lambda x: CLIP()(**{"prompt": params["prompt"], "image": x})
 
     def generate_detector(self, question: str) -> Callable:
+        from vision_agent.tools import GroundingDINO
+
         api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -251,6 +251,8 @@ class OpenAILMM(LMM):
         return lambda x: GroundingDINO()(**{"prompt": params["prompt"], "image": x})
 
     def generate_segmentor(self, question: str) -> Callable:
+        from vision_agent.tools import GroundingSAM
+
         api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -274,8 +276,15 @@ class OpenAILMM(LMM):
         return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})
 
     def generate_zero_shot_counter(self, question: str) -> Callable:
+        from vision_agent.tools import ZeroShotCounting
+
         return lambda x: ZeroShotCounting()(**{"image": x})
 
+    def generate_image_qa_tool(self, question: str) -> Callable:
+        from vision_agent.tools import ImageQuestionAnswering
+
+        return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+
 
 class AzureOpenAILMM(OpenAILMM):
     def __init__(
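The tool imports above move from module level into the generator methods, presumably so that `vision_agent.tools` can import `OpenAILMM` (see tools.py below) without creating a circular import. Usage is unchanged; a brief sketch with a hypothetical prompt and image path:

```python
from vision_agent.lmm import OpenAILMM

lmm = OpenAILMM()
# GroundingDINO is only imported when the generator is invoked, not when this module loads.
detect_cars = lmm.generate_detector("Detect all the cars in the image")
detections = detect_cars("street.jpg")  # hypothetical image path
```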
vision_agent/tools/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import (  # Counter,
     CLIP,
+    OCR,
     TOOLS,
     BboxArea,
     BboxIoU,
@@ -13,7 +14,10 @@ from .tools import (  # Counter,
     ImageCaption,
     ZeroShotCounting,
     VisualPromptCounting,
+    VisualQuestionAnswering,
+    ImageQuestionAnswering,
     SegArea,
     SegIoU,
     Tool,
+    register_tool,
 )
vision_agent/tools/tools.py
CHANGED
@@ -1,8 +1,9 @@
+import io
 import logging
 import tempfile
 from abc import ABC
 from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union, cast
+from typing import Any, Dict, List, Tuple, Type, Union, cast
 
 import numpy as np
 import requests
@@ -11,13 +12,14 @@ from PIL.Image import Image as ImageType
 
 from vision_agent.image_utils import (
     convert_to_b64,
+    denormalize_bbox,
     get_image_size,
-    rle_decode,
     normalize_bbox,
-
+    rle_decode,
 )
 from vision_agent.tools.video import extract_frames_from_video
 from vision_agent.type_defs import LandingaiAPIKey
+from vision_agent.lmm import OpenAILMM
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
@@ -29,6 +31,9 @@ class Tool(ABC):
     description: str
     usage: Dict
 
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        raise NotImplementedError
+
 
 class NoOp(Tool):
     name = "noop_"
@@ -498,7 +503,7 @@ class ZeroShotCounting(Tool):
 
     # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, image: Union[str, ImageType]) -> Dict:
-        """Invoke the
+        """Invoke the Zero shot counting model.
 
         Parameters:
             image: the input image.
@@ -562,7 +567,7 @@ class VisualPromptCounting(Tool):
 
     # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
-        """Invoke the
+        """Invoke the few shot counting model.
 
         Parameters:
             image: the input image.
@@ -583,6 +588,144 @@ class VisualPromptCounting(Tool):
         return _send_inference_request(data, "tools")
 
 
+class VisualQuestionAnswering(Tool):
+    r"""VisualQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
+
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> vqa_tool = va.tools.VisualQuestionAnswering()
+        >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
+        {'text': "The image contains a cat sitting on a table with a bowl of milk."}
+    """
+
+    name = "visual_question_answering_"
+    description = "'visual_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image."
+
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+            {"name": "prompt", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Describe this image in detail. Image name: cat.jpg",
+                "parameters": {
+                    "image": "cats.jpg",
+                    "prompt": "Describe this image in detail",
+                },
+            },
+            {
+                "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
+                "parameters": {
+                    "image": "sign.jpg",
+                    "prompt": "Can you help me with this street sign ? What does it say ?",
+                },
+            },
+            {
+                "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
+                "parameters": {
+                    "image": "weather.jpg",
+                    "prompt": "Describe the weather in the image for me ",
+                },
+            },
+            {
+                "scenario": "Which 2 are the least frequent bins in this histogram ? Image name: chart.jpg",
+                "parameters": {
+                    "image": "chart.jpg",
+                    "prompt": "Which 2 are the least frequent bins in this histogram",
+                },
+            },
+        ],
+    }
+
+    def __call__(self, image: str, prompt: str) -> Dict:
+        """Invoke the visual question answering model.
+
+        Parameters:
+            image: the input image.
+
+        Returns:
+            A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
+        """
+
+        gpt = OpenAILMM()
+        return {"text": gpt(input=prompt, images=[image])}
+
+
+class ImageQuestionAnswering(Tool):
+    r"""ImageQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
+    It is same as VisualQuestionAnswering but this tool is not used by agents. It is used when user requests a tool for VQA using generate_image_qa_tool function.
+    It is also useful if the user wants the data to be not exposed to OpenAI endpoints
+
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> vqa_tool = va.tools.ImageQuestionAnswering()
+        >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
+        {'text': "The image contains a cat sitting on a table with a bowl of milk."}
+    """
+
+    name = "image_question_answering_"
+    description = "'image_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image."
+
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+            {"name": "prompt", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Describe this image in detail. Image name: cat.jpg",
+                "parameters": {
+                    "image": "cats.jpg",
+                    "prompt": "Describe this image in detail",
+                },
+            },
+            {
+                "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
+                "parameters": {
+                    "image": "sign.jpg",
+                    "prompt": "Can you help me with this street sign ? What does it say ?",
+                },
+            },
+            {
+                "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
+                "parameters": {
+                    "image": "weather.jpg",
+                    "prompt": "Describe the weather in the image for me ",
+                },
+            },
+            {
+                "scenario": "Can you generate an image question answering tool ? Image name: chart.jpg, prompt: Which 2 are the least frequent bins in this histogram",
+                "parameters": {
+                    "image": "chart.jpg",
+                    "prompt": "Which 2 are the least frequent bins in this histogram",
+                },
+            },
+        ],
+    }
+
+    def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
+        """Invoke the visual question answering model.
+
+        Parameters:
+            image: the input image.
+
+        Returns:
+            A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
+        """
+
+        image_b64 = convert_to_b64(image)
+        data = {
+            "image": image_b64,
+            "prompt": prompt,
+            "tool": "image_question_answering",
+        }
+
+        return _send_inference_request(data, "tools")
+
+
 class Crop(Tool):
     r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
 
@@ -858,6 +1001,57 @@ class ExtractFrames(Tool):
         return result
 
 
+class OCR(Tool):
+    name = "ocr_"
+    description = "'ocr_' extracts text from an image."
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Can you extract the text from this image? Image name: image.png",
+                "parameters": {"image": "image.png"},
+            },
+        ],
+    }
+    _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
+    _URL = "https://app.landing.ai/ocr/v1/detect-text"
+
+    def __call__(self, image: str) -> dict:
+        pil_image = Image.open(image).convert("RGB")
+        image_size = pil_image.size[::-1]
+        image_buffer = io.BytesIO()
+        pil_image.save(image_buffer, format="PNG")
+        buffer_bytes = image_buffer.getvalue()
+        image_buffer.close()
+
+        res = requests.post(
+            self._URL,
+            files={"images": buffer_bytes},
+            data={"language": "en"},
+            headers={"contentType": "multipart/form-data", "apikey": self._API_KEY},
+        )
+        if res.status_code != 200:
+            _LOGGER.error(f"Request failed: {res.text}")
+            raise ValueError(f"Request failed: {res.text}")
+
+        data = res.json()
+        output: Dict[str, List] = {"labels": [], "bboxes": [], "scores": []}
+        for det in data[0]:
+            output["labels"].append(det["text"])
+            box = [
+                det["location"][0]["x"],
+                det["location"][0]["y"],
+                det["location"][2]["x"],
+                det["location"][2]["y"],
+            ]
+            box = normalize_bbox(box, image_size)
+            output["bboxes"].append(box)
+            output["scores"].append(round(det["score"], 2))
+        return output
+
+
 class Calculator(Tool):
     r"""Calculator is a tool that can perform basic arithmetic operations."""
 
@@ -889,11 +1083,11 @@ TOOLS = {
         [
             NoOp,
             CLIP,
-            ImageCaption,
             GroundingDINO,
             AgentGroundingSAM,
             ZeroShotCounting,
             VisualPromptCounting,
+            VisualQuestionAnswering,
             AgentDINOv,
             ExtractFrames,
             Crop,
@@ -903,6 +1097,7 @@ TOOLS = {
             SegIoU,
             BboxContains,
             BoxDistance,
+            OCR,
             Calculator,
         ]
     )
@@ -910,6 +1105,31 @@ TOOLS = {
 }
 
 
+def register_tool(tool: Type[Tool]) -> Type[Tool]:
+    r"""Add a tool to the list of available tools.
+
+    Parameters:
+        tool: The tool to add.
+    """
+
+    if (
+        not hasattr(tool, "name")
+        or not hasattr(tool, "description")
+        or not hasattr(tool, "usage")
+    ):
+        raise ValueError(
+            "The tool must have 'name', 'description' and 'usage' attributes."
+        )
+
+    TOOLS[len(TOOLS)] = {
+        "name": tool.name,
+        "description": tool.description,
+        "usage": tool.usage,
+        "class": tool,
+    }
+    return tool
+
+
 def _send_inference_request(
     payload: Dict[str, Any], endpoint_name: str
 ) -> Dict[str, Any]:
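A minimal usage sketch for the new `OCR` tool; the image path and the example output values are hypothetical, but the output keys match the class above:

```python
from vision_agent.tools import OCR

ocr_tool = OCR()
result = ocr_tool(image="sign.jpg")  # hypothetical image path
# e.g. {"labels": ["STOP"], "bboxes": [[0.12, 0.34, 0.56, 0.78]], "scores": [0.99]}
# bboxes are normalized to the image size via normalize_bbox; scores are rounded to 2 decimals.
```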
{vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.3
+Version: 0.2.4
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -58,7 +58,7 @@ pip install vision-agent
 ```
 
 Ensure you have an OpenAI API key and set it as an environment variable (if you are
-using Azure OpenAI please see the
+using Azure OpenAI please see the Azure setup section):
 
 ```bash
 export OPENAI_API_KEY="your-api-key"
@@ -123,6 +123,31 @@ you. For example:
 }]
 ```
 
+#### Custom Tools
+You can also add your own custom tools for your vision agent to use:
+
+```python
+>>> from vision_agent.tools import Tool, register_tool
+>>> @register_tool
+>>> class NumItems(Tool):
+>>>     name = "num_items_"
+>>>     description = "Returns the number of items in a list."
+>>>     usage = {
+>>>         "required_parameters": [{"name": "prompt", "type": "list"}],
+>>>         "examples": [
+>>>             {
+>>>                 "scenario": "How many items are in this list? ['a', 'b', 'c']",
+>>>                 "parameters": {"prompt": "['a', 'b', 'c']"},
+>>>             }
+>>>         ],
+>>>     }
+>>>     def __call__(self, prompt: list[str]) -> int:
+>>>         return len(prompt)
+```
+This will register it with the list of tools Vision Agent has access to. It will be able
+to pick it based on the tool description and use it based on the usage provided.
+
+#### Tool List
 | Tool | Description |
 | --- | --- |
 | CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
@@ -141,11 +166,12 @@ you. For example:
 | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
 | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
+| OCR | OCR returns the text detected in an image along with the location. |
 
 
 It also has a basic set of calculate tools such as add, subtract, multiply and divide.
 
-###
+### Azure Setup
 If you want to use Azure OpenAI models, you can set the environment variable:
 
 ```bash
{vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/RECORD
CHANGED
@@ -5,21 +5,21 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
 vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=Ehb97lyPs7lYM9ipx07yxm6c2kUqz2OnjGQsv-nMwKA,24849
 vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/image_utils.py,sha256=YvP5KE9NrWdgJKuHW2NR1glzfObkxtcXBknpmj3Gsbs,7554
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
-vision_agent/llm/llm.py,sha256=
+vision_agent/llm/llm.py,sha256=1BkrSVBWEClyqLc0Rmyw4heLhi_ZVm6JO7-i1wd1ziw,5383
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
-vision_agent/lmm/lmm.py,sha256=
-vision_agent/tools/__init__.py,sha256=
+vision_agent/lmm/lmm.py,sha256=sECjGMaGrv1QHq7OiFr-9LoBM5uRLjAqd0Ypp-zyFlw,10552
+vision_agent/tools/__init__.py,sha256=X6yJhWa8iKkQm4Mgf1KcV0_o39-Nrg3E56QAB5gWCO0,413
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=hYgRTHMCBwjT0kkT2SY5MN0FK89vuuecu-x1VqRlGbU,42779
 vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
 vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.4.dist-info/METADATA,sha256=2T1YLGMh2-n8F0gGf1P2BDhgzxmtmAiylpfW3E3Q4_c,7697
+vision_agent-0.2.4.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.4.dist-info/RECORD,,
{vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/LICENSE
File without changes
{vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/WHEEL
File without changes