PyPI - vision-agent - Versions diffs - 0.2.117__tar.gz → 0.2.118__tar.gz - Mend

vision-agent 0.2.117.tar.gz → 0.2.118.tar.gz

Files changed (33) hide show

{vision_agent-0.2.117 → vision_agent-0.2.118}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.117
+Version: 0.2.118
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.117 → vision_agent-0.2.118}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.117"
+version = "0.2.118"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/agent/vision_agent_coder.py RENAMED Viewed

@@ -718,7 +718,12 @@ class VisionAgentCoder(Agent):
             for chat_i in chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
-                        media = code_interpreter.upload_file(media)
+                        media = (
+                            media
+                            if type(media) is str
+                            and media.startswith(("http", "https"))
+                            else code_interpreter.upload_file(media)
+                        )
                         chat_i["content"] += f" Media name {media}"  # type: ignore
                         media_list.append(media)
@@ -744,29 +749,14 @@ class VisionAgentCoder(Agent):
             results = {"code": "", "test": "", "plan": []}
             plan = []
             success = False
-            self.log_progress(
-                {
-                    "type": "log",
-                    "log_content": "Creating plans",
-                    "status": "started",
-                }
-            )
-            plans = write_plans(
-                int_chat,
-                T.get_tool_descriptions_by_names(
-                    customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
-                ),
-                format_memory(working_memory),
-                self.planner,
+            plans = self._create_plans(
+                int_chat, customized_tool_names, working_memory, self.planner
             )
-            if self.verbosity >= 1:
-                for p in plans:
-                    # tabulate will fail if the keys are not the same for all elements
-                    p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
-                    _LOGGER.info(
-                        f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
-                    )
+            if test_multi_plan:
+                self._log_plans(plans, self.verbosity)
             tool_infos = retrieve_tools(
                 plans,
                 self.tool_recommender,
@@ -860,6 +850,39 @@ class VisionAgentCoder(Agent):
         if self.report_progress_callback is not None:
             self.report_progress_callback(data)
+    def _create_plans(
+        self,
+        int_chat: List[Message],
+        customized_tool_names: Optional[List[str]],
+        working_memory: List[Dict[str, str]],
+        planner: LMM,
+    ) -> Dict[str, Any]:
+        self.log_progress(
+            {
+                "type": "log",
+                "log_content": "Creating plans",
+                "status": "started",
+            }
+        )
+        plans = write_plans(
+            int_chat,
+            T.get_tool_descriptions_by_names(
+                customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
+            ),
+            format_memory(working_memory),
+            planner,
+        )
+        return plans
+    def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
+        if verbosity >= 1:
+            for p in plans:
+                # tabulate will fail if the keys are not the same for all elements
+                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
+                _LOGGER.info(
+                    f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                )
 class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.

{vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/lmm/lmm.py RENAMED Viewed

@@ -30,6 +30,12 @@ def encode_image_bytes(image: bytes) -> str:
 def encode_media(media: Union[str, Path]) -> str:
+    if type(media) is str and media.startswith(("http", "https")):
+        # for mp4 video url, we assume there is a same url but ends with png
+        # vision-agent-ui will upload this png when uploading the video
+        if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
+            return media[:-4] + ".png"
+        return media
     extension = "png"
     extension = Path(media).suffix
     if extension.lower() not in {
@@ -138,7 +144,11 @@ class OpenAILMM(LMM):
                         {
                             "type": "image_url",
                             "image_url": {
-                                "url": f"data:image/png;base64,{encoded_media}",
+                                "url": (
+                                    encoded_media
+                                    if encoded_media.startswith(("http", "https"))
+                                    else f"data:image/png;base64,{encoded_media}"
+                                ),
                                 "detail": "low",
                             },
                         },
@@ -390,7 +400,6 @@ class OllamaLMM(LMM):
         tmp_kwargs = self.kwargs | kwargs
         data.update(tmp_kwargs)
         if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
             json_data = json.dumps(data)
             def f() -> Iterator[Optional[str]]:
@@ -424,7 +433,6 @@ class OllamaLMM(LMM):
         media: Optional[List[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         url = f"{self.url}/generate"
         data: Dict[str, Any] = {
             "model": self.model_name,
@@ -439,7 +447,6 @@ class OllamaLMM(LMM):
         tmp_kwargs = self.kwargs | kwargs
         data.update(tmp_kwargs)
         if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
             json_data = json.dumps(data)
             def f() -> Iterator[Optional[str]]:

{vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/tools/__init__.py RENAMED Viewed

@@ -21,7 +21,7 @@ from .tools import (
     dpt_hybrid_midas,
     extract_frames,
     florence2_image_caption,
-    florence2_object_detection,
+    florence2_phrase_grounding,
     florence2_ocr,
     florence2_roberta_vqa,
     florence2_sam2_image,

{vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/tools/tools.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import os
 import io
 import json
 import logging
@@ -14,6 +15,7 @@ from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
+import urllib.request
 from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.tools.tool_utils import (
@@ -760,10 +762,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
-def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
-    """'florencev2_object_detection' is a tool that can detect and count multiple
-    objects given a text prompt such as category names or referring expressions. You
-    can optionally separate the categories in the text with commas. It returns a list
+def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_phrase_grounding' is a tool that can detect multiple
+    objects given a text prompt which can be object names or caption. You
+    can optionally separate the object names in the text with commas. It returns a list
     of bounding boxes with normalized coordinates, label names and associated
     probability scores of 1.0.
@@ -780,7 +782,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
     Example
     -------
-        >>> florence2_object_detection('person looking at a coyote', image)
+        >>> florence2_phrase_grounding('person looking at a coyote', image)
         [
             {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -792,7 +794,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
         "image": image_b64,
         "task": "<CAPTION_TO_PHRASE_GROUNDING>",
         "prompt": prompt,
-        "function_name": "florence2_object_detection",
+        "function_name": "florence2_phrase_grounding",
     }
     detections = send_inference_request(data, "florence2", v2=True)
@@ -1220,6 +1222,13 @@ def extract_frames(
             video_file_path = video.download(output_path=temp_dir)
             return extract_frames_from_video(video_file_path, fps)
+    elif str(video_uri).startswith(("http", "https")):
+        _, image_suffix = os.path.splitext(video_uri)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
+            # Download the video and save it to the temporary file
+            with urllib.request.urlopen(str(video_uri)) as response:
+                tmp_file.write(response.read())
+            return extract_frames_from_video(tmp_file.name, fps)
     return extract_frames_from_video(str(video_uri), fps)
@@ -1250,10 +1259,10 @@ def save_json(data: Any, file_path: str) -> None:
 def load_image(image_path: str) -> np.ndarray:
-    """'load_image' is a utility function that loads an image from the given file path string.
+    """'load_image' is a utility function that loads an image from the given file path string or an URL.
     Parameters:
-        image_path (str): The path to the image.
+        image_path (str): The path or URL to the image.
     Returns:
         np.ndarray: The image as a NumPy array.
@@ -1265,6 +1274,13 @@ def load_image(image_path: str) -> np.ndarray:
     # NOTE: sometimes the generated code pass in a NumPy array
     if isinstance(image_path, np.ndarray):
         return image_path
+    if image_path.startswith(("http", "https")):
+        _, image_suffix = os.path.splitext(image_path)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
+            # Download the image and save it to the temporary file
+            with urllib.request.urlopen(image_path) as response:
+                tmp_file.write(response.read())
+            image_path = tmp_file.name
     image = Image.open(image_path).convert("RGB")
     return np.array(image)
@@ -1418,6 +1434,7 @@ def overlay_segmentation_masks(
     medias: Union[np.ndarray, List[np.ndarray]],
     masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
     draw_label: bool = True,
+    secondary_label_key: str = "tracking_label",
 ) -> Union[np.ndarray, List[np.ndarray]]:
     """'overlay_segmentation_masks' is a utility function that displays segmentation
     masks.
@@ -1426,7 +1443,10 @@ def overlay_segmentation_masks(
         medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
             the masks on.
         masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
-            dictionaries containing the masks.
+            dictionaries containing the masks, labels and scores.
+        draw_label (bool, optional): If True, the labels will be displayed on the image.
+        secondary_label_key (str, optional): The key to use for the secondary
+            tracking label which is needed in videos to display tracking information.
     Returns:
         np.ndarray: The image with the masks displayed.
@@ -1471,6 +1491,7 @@ def overlay_segmentation_masks(
         for elt in masks_int[i]:
             mask = elt["mask"]
             label = elt["label"]
+            tracking_lbl = elt.get(secondary_label_key, None)
             np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
             np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
             mask_img = Image.fromarray(np_mask.astype(np.uint8))
@@ -1478,16 +1499,17 @@ def overlay_segmentation_masks(
             if draw_label:
                 draw = ImageDraw.Draw(pil_image)
-                text_box = draw.textbbox((0, 0), text=label, font=font)
+                text = tracking_lbl if tracking_lbl else label
+                text_box = draw.textbbox((0, 0), text=text, font=font)
                 x, y = _get_text_coords_from_mask(
                     mask,
                     v_gap=(text_box[3] - text_box[1]) + 10,
                     h_gap=(text_box[2] - text_box[0]) // 2,
                 )
                 if x != 0 and y != 0:
-                    text_box = draw.textbbox((x, y), text=label, font=font)
+                    text_box = draw.textbbox((x, y), text=text, font=font)
                     draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
-                    draw.text((x, y), label, fill="black", font=font)
+                    draw.text((x, y), text, fill="black", font=font)
         frame_out.append(np.array(pil_image))
     return frame_out[0] if len(frame_out) == 1 else frame_out
@@ -1663,7 +1685,7 @@ FUNCTION_TOOLS = [
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video,
-    florence2_object_detection,
+    florence2_phrase_grounding,
     ixc25_image_vqa,
     ixc25_video_vqa,
     detr_segmentation,