PyPI - vision-agent - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl - Mend

vision-agent 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

vision_agent/agent/vision_agent.py CHANGED Viewed

@@ -365,6 +365,7 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
             "grounding_sam_",
             "grounding_dino_",
             "extract_frames_",
+            "dinov_",
         ]:
             continue
@@ -444,6 +445,7 @@ class VisionAgent(Agent):
         self,
         input: Union[List[Dict[str, str]], str],
         image: Optional[Union[str, Path]] = None,
+        reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
     ) -> str:
         """Invoke the vision agent.
@@ -458,7 +460,12 @@ class VisionAgent(Agent):
         """
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        return self.chat(input, image=image, visualize_output=visualize_output)
+        return self.chat(
+            input,
+            image=image,
+            visualize_output=visualize_output,
+            reference_data=reference_data,
+        )
     def log_progress(self, description: str) -> None:
         _LOGGER.info(description)
@@ -469,11 +476,18 @@ class VisionAgent(Agent):
         self,
         chat: List[Dict[str, str]],
         image: Optional[Union[str, Path]] = None,
+        reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
     ) -> Tuple[str, List[Dict]]:
         question = chat[0]["content"]
         if image:
             question += f" Image name: {image}"
+        if reference_data:
+            if not ("image" in reference_data and "mask" in reference_data):
+                raise ValueError(
+                    f"Reference data must contain 'image' and 'mask'. but got {reference_data}"
+                )
+            question += f" Reference image: {reference_data['image']}, Reference mask: {reference_data['mask']}"
         reflections = ""
         final_answer = ""
@@ -555,10 +569,14 @@ class VisionAgent(Agent):
         self,
         chat: List[Dict[str, str]],
         image: Optional[Union[str, Path]] = None,
+        reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
     ) -> str:
         answer, _ = self.chat_with_workflow(
-            chat, image=image, visualize_output=visualize_output
+            chat,
+            image=image,
+            visualize_output=visualize_output,
+            reference_data=reference_data,
         )
         return answer

vision_agent/image_utils.py CHANGED Viewed

@@ -103,7 +103,9 @@ def overlay_bboxes(
     elif isinstance(image, np.ndarray):
         image = Image.fromarray(image)
-    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(bboxes["labels"])}
+    color = {
+        label: COLORS[i % len(COLORS)] for i, label in enumerate(set(bboxes["labels"]))
+    }
     width, height = image.size
     fontsize = max(12, int(min(width, height) / 40))

vision_agent/tools/__init__.py CHANGED Viewed

@@ -6,6 +6,7 @@ from .tools import (  # Counter,
     BboxIoU,
     BoxDistance,
     Crop,
+    DINOv,
     ExtractFrames,
     GroundingDINO,
     GroundingSAM,

vision_agent/tools/tools.py CHANGED Viewed

@@ -372,6 +372,104 @@ class GroundingSAM(Tool):
         return ret_pred
+class DINOv(Tool):
+    r"""DINOv is a tool that can detect and segment similar objects with the given input masks.
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> t = va.tools.DINOv()
+        >>> t(prompt=[{"mask":"balloon_mask.jpg", "image": "balloon.jpg"}], image="balloon.jpg"])
+        [{'scores': [0.512, 0.212],
+        'masks': [array([[0, 0, 0, ..., 0, 0, 0],
+           ...,
+           [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)},
+        array([[0, 0, 0, ..., 0, 0, 0],
+           ...,
+           [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
+    """
+    name = "dinov_"
+    description = "'dinov_' is a tool that can detect and segment similar objects given a reference segmentation mask."
+    usage = {
+        "required_parameters": [
+            {"name": "prompt", "type": "List[Dict[str, str]]"},
+            {"name": "image", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Can you find all the balloons in this image that is similar to the provided masked area? Image name: input.jpg Reference image: balloon.jpg Reference mask: balloon_mask.jpg",
+                "parameters": {
+                    "prompt": [
+                        {"mask": "balloon_mask.jpg", "image": "balloon.jpg"},
+                    ],
+                    "image": "input.jpg",
+                },
+            },
+            {
+                "scenario": "Detect all the objects in this image that are similar to the provided mask. Image name: original.jpg Reference image: mask.png Reference mask: background.png",
+                "parameters": {
+                    "prompt": [
+                        {"mask": "mask.png", "image": "background.png"},
+                    ],
+                    "image": "original.jpg",
+                },
+            },
+        ],
+    }
+    def __call__(
+        self, prompt: List[Dict[str, str]], image: Union[str, ImageType]
+    ) -> Dict:
+        """Invoke the DINOv model.
+        Parameters:
+            prompt: a list of visual prompts in the form of {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}.
+            image: the input image to segment.
+        Returns:
+            A dictionary of the below keys: 'scores', 'masks' and 'mask_shape', which stores a list of detected segmentation masks and its scores.
+        """
+        image_b64 = convert_to_b64(image)
+        for p in prompt:
+            p["mask"] = convert_to_b64(p["mask"])
+            p["image"] = convert_to_b64(p["image"])
+        request_data = {
+            "prompt": prompt,
+            "image": image_b64,
+            "tool": "dinov",
+        }
+        data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
+        if "bboxes" in data:
+            data["bboxes"] = [
+                normalize_bbox(box, data["mask_shape"]) for box in data["bboxes"]
+            ]
+        if "masks" in data:
+            data["masks"] = [
+                rle_decode(mask_rle=mask, shape=data["mask_shape"])
+                for mask in data["masks"]
+            ]
+        data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
+        return data
+class AgentDINOv(DINOv):
+    def __call__(
+        self,
+        prompt: List[Dict[str, str]],
+        image: Union[str, ImageType],
+    ) -> Dict:
+        rets = super().__call__(prompt, image)
+        mask_files = []
+        for mask in rets["masks"]:
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                file_name = Path(tmp.name).with_suffix(".mask.png")
+                Image.fromarray(mask * 255).save(file_name)
+                mask_files.append(str(file_name))
+        rets["masks"] = mask_files
+        return rets
 class AgentGroundingSAM(GroundingSAM):
     r"""AgentGroundingSAM is the same as GroundingSAM but it saves the masks as files
     returns the file name. This makes it easier for agents to use.
@@ -652,6 +750,7 @@ TOOLS = {
             ImageCaption,
             GroundingDINO,
             AgentGroundingSAM,
+            AgentDINOv,
             ExtractFrames,
             Crop,
             BboxArea,

{vision_agent-0.1.3.dist-info → vision_agent-0.1.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.1.3
+Version: 0.1.4
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.1.3.dist-info → vision_agent-0.1.4.dist-info}/RECORD RENAMED Viewed

@@ -5,7 +5,7 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
 vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=4-milD0iSY_vKdpAIctba04Ak_In5tMBE8gATdaGIr0,22019
+vision_agent/agent/vision_agent.py,sha256=QWIirRBB3ZPg3figWcf8-g9ltFydM1BDn75LbXWbep0,22735
 vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
 vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
 vision_agent/data/data.py,sha256=Z2l76OrT0GgyuN52OeJqDitUcP0q1rhfdXd1of3GsVo,5128
@@ -13,17 +13,17 @@ vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,
 vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
-vision_agent/image_utils.py,sha256=hFdPoRmeVU5jErFr5xaagMQ6Wy7Xbw8H8HXuLGdJIAM,4786
+vision_agent/image_utils.py,sha256=qRN_Y1XXBm9EL6V53OZUq21h0spIa1J6X9YDbe6B87o,4805
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
 vision_agent/llm/llm.py,sha256=Jty_RHdqVmIM0Mm31JNk50c882Tx7hHtkmh0WyXeJd8,5016
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
 vision_agent/lmm/lmm.py,sha256=1E7e_S_0fOKnf6mSsEdkXvsIjGmhBGl5XW4By2jvhbY,10045
-vision_agent/tools/__init__.py,sha256=lKv90gLu-mNp4uyGtJ8AUG-73xKwFEugZpe0atpsscA,269
+vision_agent/tools/__init__.py,sha256=dkzk9amNzTEKULMB1xRJspqEGpzNPGuccWeXrv1xI0U,280
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=EK9HauKZ1gq795wBZNER6-8PiDTNZwJ1sXYhDeplDZ0,25410
+vision_agent/tools/tools.py,sha256=ybhCyutEGzHPKuR0Cu--Nb--KubjYvyzLEzVQYzIMTw,29148
 vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
 vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
-vision_agent-0.1.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.1.3.dist-info/METADATA,sha256=iBoN2GBvALl6XxhxRo4o9WaqLgI-UAobSymuZ1RHd9o,6233
-vision_agent-0.1.3.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.1.3.dist-info/RECORD,,
+vision_agent-0.1.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.1.4.dist-info/METADATA,sha256=FyBYGPHgC0uV7uy7wph8yvdQpEWSACnGR96y6Jt-E6A,6233
+vision_agent-0.1.4.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.1.4.dist-info/RECORD,,

{vision_agent-0.1.3.dist-info → vision_agent-0.1.4.dist-info}/LICENSE RENAMED Viewed

File without changes

{vision_agent-0.1.3.dist-info → vision_agent-0.1.4.dist-info}/WHEEL RENAMED Viewed

File without changes

vision-agent 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

vision-agent 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl