vision-agent 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,12 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
8
8
  from PIL import Image
9
9
  from tabulate import tabulate
10
10
 
11
- from vision_agent.image_utils import overlay_bboxes, overlay_masks, overlay_heat_map
11
+ from vision_agent.image_utils import (
12
+ convert_to_b64,
13
+ overlay_bboxes,
14
+ overlay_heat_map,
15
+ overlay_masks,
16
+ )
12
17
  from vision_agent.llm import LLM, OpenAILLM
13
18
  from vision_agent.lmm import LMM, OpenAILMM
14
19
  from vision_agent.tools import TOOLS
@@ -423,7 +428,7 @@ class VisionAgent(Agent):
423
428
  ):
424
429
  """VisionAgent constructor.
425
430
 
426
- Parameters
431
+ Parameters:
427
432
  task_model: the model to use for task decomposition.
428
433
  answer_model: the model to use for reasoning and concluding the answer.
429
434
  reflect_model: the model to use for self reflection.
@@ -481,6 +486,17 @@ class VisionAgent(Agent):
481
486
  if self.report_progress_callback:
482
487
  self.report_progress_callback(description)
483
488
 
489
+ def _report_visualization_via_callback(
490
+ self, images: Sequence[Union[str, Path]]
491
+ ) -> None:
492
+ """This is intended for streaming the visualization images via the callback to the client side."""
493
+ if self.report_progress_callback:
494
+ self.report_progress_callback("<VIZ>")
495
+ if images:
496
+ for img in images:
497
+ self.report_progress_callback(f"<IMG>{convert_to_b64(img)}</IMG>")
498
+ self.report_progress_callback("</VIZ>")
499
+
484
500
  def chat_with_workflow(
485
501
  self,
486
502
  chat: List[Dict[str, str]],
@@ -488,6 +504,21 @@ class VisionAgent(Agent):
488
504
  reference_data: Optional[Dict[str, str]] = None,
489
505
  visualize_output: Optional[bool] = False,
490
506
  ) -> Tuple[str, List[Dict]]:
507
+ """Chat with the vision agent and return the final answer and all tool results.
508
+
509
+ Parameters:
510
+ chat: a conversation in the format of
511
+ [{"role": "user", "content": "describe your task here..."}].
512
+ image: the input image referenced in the chat parameter.
513
+ reference_data: a dictionary containing the reference image and mask. In the
514
+ format of {"image": "image.jpg", "mask": "mask.jpg"}
515
+ visualize_output: whether to visualize the output.
516
+
517
+ Returns:
518
+ A tuple where the first item is the final answer and the second item is a
519
+ list of all the tool results. The last item in the tool results also
520
+ contains the visualized output.
521
+ """
491
522
  question = chat[0]["content"]
492
523
  if image:
493
524
  question += f" Image name: {image}"
@@ -577,9 +608,12 @@ class VisionAgent(Agent):
577
608
  )
578
609
 
579
610
  if visualize_output:
580
- visualized_output = all_tool_results[-1]["visualized_output"]
581
- for image in visualized_output:
582
- Image.open(image).show()
611
+ viz_images: Sequence[Union[str, Path]] = all_tool_results[-1][
612
+ "visualized_output"
613
+ ]
614
+ self._report_visualization_via_callback(viz_images)
615
+ for img in viz_images:
616
+ Image.open(img).show()
583
617
 
584
618
  return final_answer, all_tool_results
585
619
 
@@ -4,7 +4,7 @@ import base64
4
4
  from importlib import resources
5
5
  from io import BytesIO
6
6
  from pathlib import Path
7
- from typing import Dict, Tuple, Union, List
7
+ from typing import Dict, List, Tuple, Union
8
8
 
9
9
  import numpy as np
10
10
  from PIL import Image, ImageDraw, ImageFont
@@ -108,7 +108,7 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
108
108
  data = Image.open(data)
109
109
  if isinstance(data, Image.Image):
110
110
  buffer = BytesIO()
111
- data.convert("RGB").save(buffer, format="JPEG")
111
+ data.convert("RGB").save(buffer, format="PNG")
112
112
  return base64.b64encode(buffer.getvalue()).decode("utf-8")
113
113
  else:
114
114
  arr_bytes = data.tobytes()
@@ -108,8 +108,7 @@ class CLIP(Tool):
108
108
 
109
109
 
110
110
  class ImageCaption(Tool):
111
- r"""ImageCaption is a tool that can caption an image based on its contents
112
- or tags.
111
+ r"""ImageCaption is a tool that can caption an image based on its contents or tags.
113
112
 
114
113
  Example
115
114
  -------
@@ -120,26 +119,20 @@ class ImageCaption(Tool):
120
119
  """
121
120
 
122
121
  name = "image_caption_"
123
- description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
122
+ description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image."
124
123
  usage = {
125
124
  "required_parameters": [
126
125
  {"name": "image", "type": "str"},
127
126
  ],
128
127
  "examples": [
129
128
  {
130
- "scenario": "Can you describe this image ? Image name: cat.jpg",
129
+ "scenario": "Can you describe this image? Image name: cat.jpg",
131
130
  "parameters": {"image": "cat.jpg"},
132
131
  },
133
132
  {
134
- "scenario": "Can you caption this image with their main contents ? Image name: cat_dog.jpg",
133
+ "scenario": "Can you caption this image with their main contents? Image name: cat_dog.jpg",
135
134
  "parameters": {"image": "cat_dog.jpg"},
136
135
  },
137
- {
138
- "scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
139
- "parameters": {
140
- "image": "shirts.jpg",
141
- },
142
- },
143
136
  ],
144
137
  }
145
138
 
@@ -487,15 +480,15 @@ class ZeroShotCounting(Tool):
487
480
  ],
488
481
  "examples": [
489
482
  {
490
- "scenario": "Can you count the lids in the image ? Image name: lids.jpg",
483
+ "scenario": "Can you count the lids in the image? Image name: lids.jpg",
491
484
  "parameters": {"image": "lids.jpg"},
492
485
  },
493
486
  {
494
- "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
487
+ "scenario": "Can you count the total number of objects in this image? Image name: tray.jpg",
495
488
  "parameters": {"image": "tray.jpg"},
496
489
  },
497
490
  {
498
- "scenario": "Can you build me an object counting tool ? Image name: shirts.jpg",
491
+ "scenario": "Can you build me an object counting tool? Image name: shirts.jpg",
499
492
  "parameters": {
500
493
  "image": "shirts.jpg",
501
494
  },
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -126,15 +126,18 @@ you. For example:
126
126
  | Tool | Description |
127
127
  | --- | --- |
128
128
  | CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
129
+ | ImageCaption | ImageCaption is a tool that can generate a caption for an image. |
129
130
  | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
130
131
  | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
131
- | Counter | Counter detects and counts the number of objects in an image given an input such as a category name or referring expression. |
132
+ | DINOv | DINOv is a tool that can detect arbitrary objects using a referring mask. |
133
+ | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
132
134
  | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
133
135
  | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
134
136
  | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
135
137
  | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
136
138
  | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
137
- | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
139
+ | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
140
+ | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for checking whether one box is contained within another box. |
138
141
  | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
139
142
  | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
140
143
  | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
@@ -5,21 +5,21 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
5
5
  vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
6
6
  vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
7
7
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
8
- vision_agent/agent/vision_agent.py,sha256=MTxeV5_Sghqoe2aOW9EbNgiq61sVCcF3ZndJ7BZl6x0,23588
8
+ vision_agent/agent/vision_agent.py,sha256=6AtVaEQL0ksg1QkUBn_YhytYjRfH7-M4q7G6pnds9Ds,25002
9
9
  vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
10
10
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
12
- vision_agent/image_utils.py,sha256=Cg4aKO1tQiETT1gdsZ50XzORBtJnBFfMG2cKJyjaY6Q,7555
12
+ vision_agent/image_utils.py,sha256=YvP5KE9NrWdgJKuHW2NR1glzfObkxtcXBknpmj3Gsbs,7554
13
13
  vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
14
14
  vision_agent/llm/llm.py,sha256=gwDQ9-p9wEn24xi1019e5jzTGQg4xWDSqBCsqIqGcU4,5168
15
15
  vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
16
16
  vision_agent/lmm/lmm.py,sha256=FjxCuIk0KXuWnfY4orVmdyhJW2I4C6i5QNNEXk7gybk,10197
17
17
  vision_agent/tools/__init__.py,sha256=BlfxqbYkB0oODhnSmQg1UyzQm73AvvjCjrIiOWBIYDs,328
18
18
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
19
- vision_agent/tools/tools.py,sha256=gCjHs5vJuGNBFsnJWFT7PX3wTyfHgtrgX1Eq9vqknN0,34979
19
+ vision_agent/tools/tools.py,sha256=Cwh7GNSnCYxyKKgusHlf-Cqd9NBjlbZG7d-GauQJCwI,34751
20
20
  vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
21
21
  vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
22
- vision_agent-0.2.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
23
- vision_agent-0.2.1.dist-info/METADATA,sha256=RAD8NCAo5N12sccgSC5Q0j4hKwU_rVKg5p_eLE-Njdc,6434
24
- vision_agent-0.2.1.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
25
- vision_agent-0.2.1.dist-info/RECORD,,
22
+ vision_agent-0.2.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
23
+ vision_agent-0.2.3.dist-info/METADATA,sha256=cQnQTRlWBxf0aVwsMoJS4TiiAtN3SbU00nlCrbNNb9w,6748
24
+ vision_agent-0.2.3.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
25
+ vision_agent-0.2.3.dist-info/RECORD,,