PyPI - vision-agent - Versions diffs - 0.2.5__tar.gz → 0.2.7__tar.gz - Mend

vision-agent 0.2.5__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

{vision_agent-0.2.5 → vision_agent-0.2.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.5
+Version: 0.2.7
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.5 → vision_agent-0.2.7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.5"
+version = "0.2.7"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/vision_agent.py RENAMED Viewed

@@ -314,6 +314,7 @@ def _handle_extract_frames(
                 image_to_data[image] = {
                     "bboxes": [],
                     "masks": [],
+                    "heat_map": [],
                     "labels": [],
                     "scores": [],
                 }
@@ -340,9 +341,12 @@ def _handle_viz_tools(
             return image_to_data
     for param, call_result in zip(parameters, tool_result["call_results"]):
-        # calls can fail, so we need to check if the call was successful
+        # Calls can fail, so we need to check if the call was successful. It can either:
+        # 1. return a str or some error that's not a dictionary
+        # 2. return a dictionary but not have the necessary keys
         if not isinstance(call_result, dict) or (
-            "bboxes" not in call_result and "masks" not in call_result
+            "bboxes" not in call_result and "heat_map" not in call_result
         ):
             return image_to_data
@@ -352,6 +356,7 @@ def _handle_viz_tools(
             image_to_data[image] = {
                 "bboxes": [],
                 "masks": [],
+                "heat_map": [],
                 "labels": [],
                 "scores": [],
             }
@@ -360,6 +365,8 @@ def _handle_viz_tools(
         image_to_data[image]["labels"].extend(call_result.get("labels", []))
         image_to_data[image]["scores"].extend(call_result.get("scores", []))
         image_to_data[image]["masks"].extend(call_result.get("masks", []))
+        # only single heatmap is returned
+        image_to_data[image]["heat_map"].append(call_result.get("heat_map", []))
         if "mask_shape" in call_result:
             image_to_data[image]["mask_shape"] = call_result["mask_shape"]
@@ -480,9 +487,14 @@ class VisionAgent(Agent):
         """Invoke the vision agent.
         Parameters:
-            input: a prompt that describe the task or a conversation in the format of
+            chat: A conversation in the format of
                 [{"role": "user", "content": "describe your task here..."}].
-            image: the input image referenced in the prompt parameter.
+            image: The input image referenced in the chat parameter.
+            reference_data: A dictionary containing the reference image, mask or bounding
+                box in the format of:
+                {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
+                where the bounding box coordinates are normalized.
+            visualize_output: Whether to visualize the output.
         Returns:
             The result of the vision agent in text.
@@ -509,7 +521,9 @@ class VisionAgent(Agent):
             self.report_progress_callback("<VIZ>")
             if images:
                 for img in images:
-                    self.report_progress_callback(f"<IMG>{convert_to_b64(img)}</IMG>")
+                    self.report_progress_callback(
+                        f"<IMG>base:64{convert_to_b64(img)}</IMG>"
+                    )
             self.report_progress_callback("</VIZ>")
     def chat_with_workflow(
@@ -522,12 +536,14 @@ class VisionAgent(Agent):
         """Chat with the vision agent and return the final answer and all tool results.
         Parameters:
-            chat: a conversation in the format of
+            chat: A conversation in the format of
                 [{"role": "user", "content": "describe your task here..."}].
-            image: the input image referenced in the chat parameter.
-            reference_data: a dictionary containing the reference image and mask. in the
-                format of {"image": "image.jpg", "mask": "mask.jpg}
-            visualize_output: whether to visualize the output.
+            image: The input image referenced in the chat parameter.
+            reference_data: A dictionary containing the reference image, mask or bounding
+                box in the format of:
+                {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
+                where the bounding box coordinates are normalized.
+            visualize_output: Whether to visualize the output.
         Returns:
             A tuple where the first item is the final answer and the second item is a

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/image_utils.py RENAMED Viewed

@@ -211,7 +211,7 @@ def overlay_masks(
     }
     for label, mask in zip(masks["labels"], masks["masks"]):
-        if isinstance(mask, str):
+        if isinstance(mask, str) or isinstance(mask, Path):
             mask = np.array(Image.open(mask))
         np_mask = np.zeros((image.size[1], image.size[0], 4))
         np_mask[mask > 0, :] = color[label] + (255 * alpha,)
@@ -221,7 +221,7 @@ def overlay_masks(
 def overlay_heat_map(
-    image: Union[str, Path, np.ndarray, ImageType], masks: Dict, alpha: float = 0.8
+    image: Union[str, Path, np.ndarray, ImageType], heat_map: Dict, alpha: float = 0.8
 ) -> ImageType:
     r"""Plots heat map on to an image.
@@ -238,14 +238,12 @@ def overlay_heat_map(
     elif isinstance(image, np.ndarray):
         image = Image.fromarray(image)
-    if "masks" not in masks:
+    if "heat_map" not in heat_map:
         return image.convert("RGB")
-    # Only one heat map per image, so no need to loop through masks
     image = image.convert("L")
-    if isinstance(masks["masks"][0], str):
-        mask = b64_to_pil(masks["masks"][0])
+    # Only one heat map per image, so no need to loop through masks
+    mask = Image.fromarray(heat_map["heat_map"][0])
     overlay = Image.new("RGBA", mask.size)
     odraw = ImageDraw.Draw(overlay)

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/tools/tools.py RENAMED Viewed

@@ -11,6 +11,7 @@ from PIL import Image
 from PIL.Image import Image as ImageType
 from vision_agent.image_utils import (
+    b64_to_pil,
     convert_to_b64,
     denormalize_bbox,
     get_image_size,
@@ -516,7 +517,9 @@ class ZeroShotCounting(Tool):
             "image": image_b64,
             "tool": "zero_shot_counting",
         }
-        return _send_inference_request(data, "tools")
+        resp_data = _send_inference_request(data, "tools")
+        resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+        return resp_data
 class VisualPromptCounting(Tool):
@@ -585,7 +588,9 @@ class VisualPromptCounting(Tool):
             "prompt": prompt,
             "tool": "few_shot_counting",
         }
-        return _send_inference_request(data, "tools")
+        resp_data = _send_inference_request(data, "tools")
+        resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+        return resp_data
 class VisualQuestionAnswering(Tool):

{vision_agent-0.2.5 → vision_agent-0.2.7}/LICENSE RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/README.md RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/__init__.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/__init__.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/agent.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/easytool.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/easytool_prompts.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/reflexion.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/reflexion_prompts.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/vision_agent_prompts.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/fonts/__init__.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/fonts/default_font_ch_en.ttf RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/llm/__init__.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/llm/llm.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/lmm/__init__.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/lmm/lmm.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/tools/__init__.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/tools/prompts.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/tools/video.py RENAMED Viewed

File without changes

{vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/type_defs.py RENAMED Viewed

File without changes

vision-agent 0.2.5__tar.gz → 0.2.7__tar.gz

vision-agent 0.2.5__tar.gz → 0.2.7__tar.gz