PyPI - vision-agent - Versions diffs - 0.2.116__py3-none-any.whl → 0.2.118__py3-none-any.whl - Mend

vision-agent 0.2.116__py3-none-any.whl → 0.2.118__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

vision_agent/agent/vision_agent_coder.py CHANGED Viewed

@@ -718,7 +718,12 @@ class VisionAgentCoder(Agent):
             for chat_i in chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
-                        media = code_interpreter.upload_file(media)
+                        media = (
+                            media
+                            if type(media) is str
+                            and media.startswith(("http", "https"))
+                            else code_interpreter.upload_file(media)
+                        )
                         chat_i["content"] += f" Media name {media}"  # type: ignore
                         media_list.append(media)
@@ -744,29 +749,14 @@ class VisionAgentCoder(Agent):
             results = {"code": "", "test": "", "plan": []}
             plan = []
             success = False
-            self.log_progress(
-                {
-                    "type": "log",
-                    "log_content": "Creating plans",
-                    "status": "started",
-                }
-            )
-            plans = write_plans(
-                int_chat,
-                T.get_tool_descriptions_by_names(
-                    customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
-                ),
-                format_memory(working_memory),
-                self.planner,
+            plans = self._create_plans(
+                int_chat, customized_tool_names, working_memory, self.planner
             )
-            if self.verbosity >= 1:
-                for p in plans:
-                    # tabulate will fail if the keys are not the same for all elements
-                    p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
-                    _LOGGER.info(
-                        f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
-                    )
+            if test_multi_plan:
+                self._log_plans(plans, self.verbosity)
             tool_infos = retrieve_tools(
                 plans,
                 self.tool_recommender,
@@ -860,6 +850,39 @@ class VisionAgentCoder(Agent):
         if self.report_progress_callback is not None:
             self.report_progress_callback(data)
+    def _create_plans(
+        self,
+        int_chat: List[Message],
+        customized_tool_names: Optional[List[str]],
+        working_memory: List[Dict[str, str]],
+        planner: LMM,
+    ) -> Dict[str, Any]:
+        self.log_progress(
+            {
+                "type": "log",
+                "log_content": "Creating plans",
+                "status": "started",
+            }
+        )
+        plans = write_plans(
+            int_chat,
+            T.get_tool_descriptions_by_names(
+                customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
+            ),
+            format_memory(working_memory),
+            planner,
+        )
+        return plans
+    def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
+        if verbosity >= 1:
+            for p in plans:
+                # tabulate will fail if the keys are not the same for all elements
+                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
+                _LOGGER.info(
+                    f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                )
 class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.

vision_agent/lmm/lmm.py CHANGED Viewed

@@ -30,6 +30,12 @@ def encode_image_bytes(image: bytes) -> str:
 def encode_media(media: Union[str, Path]) -> str:
+    if type(media) is str and media.startswith(("http", "https")):
+        # for mp4 video url, we assume there is a same url but ends with png
+        # vision-agent-ui will upload this png when uploading the video
+        if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
+            return media[:-4] + ".png"
+        return media
     extension = "png"
     extension = Path(media).suffix
     if extension.lower() not in {
@@ -138,7 +144,11 @@ class OpenAILMM(LMM):
                         {
                             "type": "image_url",
                             "image_url": {
-                                "url": f"data:image/png;base64,{encoded_media}",
+                                "url": (
+                                    encoded_media
+                                    if encoded_media.startswith(("http", "https"))
+                                    else f"data:image/png;base64,{encoded_media}"
+                                ),
                                 "detail": "low",
                             },
                         },
@@ -390,7 +400,6 @@ class OllamaLMM(LMM):
         tmp_kwargs = self.kwargs | kwargs
         data.update(tmp_kwargs)
         if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
             json_data = json.dumps(data)
             def f() -> Iterator[Optional[str]]:
@@ -424,7 +433,6 @@ class OllamaLMM(LMM):
         media: Optional[List[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         url = f"{self.url}/generate"
         data: Dict[str, Any] = {
             "model": self.model_name,
@@ -439,7 +447,6 @@ class OllamaLMM(LMM):
         tmp_kwargs = self.kwargs | kwargs
         data.update(tmp_kwargs)
         if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
             json_data = json.dumps(data)
             def f() -> Iterator[Optional[str]]:

vision_agent/tools/__init__.py CHANGED Viewed

@@ -21,7 +21,7 @@ from .tools import (
     dpt_hybrid_midas,
     extract_frames,
     florence2_image_caption,
-    florence2_object_detection,
+    florence2_phrase_grounding,
     florence2_ocr,
     florence2_roberta_vqa,
     florence2_sam2_image,

vision_agent/tools/tools.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import os
 import io
 import json
 import logging
@@ -14,6 +15,7 @@ from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
+import urllib.request
 from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.tools.tool_utils import (
@@ -760,10 +762,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
-def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
-    """'florencev2_object_detection' is a tool that can detect and count multiple
-    objects given a text prompt such as category names or referring expressions. You
-    can optionally separate the categories in the text with commas. It returns a list
+def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_phrase_grounding' is a tool that can detect multiple
+    objects given a text prompt which can be object names or caption. You
+    can optionally separate the object names in the text with commas. It returns a list
     of bounding boxes with normalized coordinates, label names and associated
     probability scores of 1.0.
@@ -780,7 +782,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
     Example
     -------
-        >>> florence2_object_detection('person looking at a coyote', image)
+        >>> florence2_phrase_grounding('person looking at a coyote', image)
         [
             {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -792,7 +794,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
         "image": image_b64,
         "task": "<CAPTION_TO_PHRASE_GROUNDING>",
         "prompt": prompt,
-        "function_name": "florence2_object_detection",
+        "function_name": "florence2_phrase_grounding",
     }
     detections = send_inference_request(data, "florence2", v2=True)
@@ -1220,6 +1222,13 @@ def extract_frames(
             video_file_path = video.download(output_path=temp_dir)
             return extract_frames_from_video(video_file_path, fps)
+    elif str(video_uri).startswith(("http", "https")):
+        _, image_suffix = os.path.splitext(video_uri)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
+            # Download the video and save it to the temporary file
+            with urllib.request.urlopen(str(video_uri)) as response:
+                tmp_file.write(response.read())
+            return extract_frames_from_video(tmp_file.name, fps)
     return extract_frames_from_video(str(video_uri), fps)
@@ -1250,10 +1259,10 @@ def save_json(data: Any, file_path: str) -> None:
 def load_image(image_path: str) -> np.ndarray:
-    """'load_image' is a utility function that loads an image from the given file path string.
+    """'load_image' is a utility function that loads an image from the given file path string or an URL.
     Parameters:
-        image_path (str): The path to the image.
+        image_path (str): The path or URL to the image.
     Returns:
         np.ndarray: The image as a NumPy array.
@@ -1265,6 +1274,13 @@ def load_image(image_path: str) -> np.ndarray:
     # NOTE: sometimes the generated code pass in a NumPy array
     if isinstance(image_path, np.ndarray):
         return image_path
+    if image_path.startswith(("http", "https")):
+        _, image_suffix = os.path.splitext(image_path)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
+            # Download the image and save it to the temporary file
+            with urllib.request.urlopen(image_path) as response:
+                tmp_file.write(response.read())
+            image_path = tmp_file.name
     image = Image.open(image_path).convert("RGB")
     return np.array(image)
@@ -1418,6 +1434,7 @@ def overlay_segmentation_masks(
     medias: Union[np.ndarray, List[np.ndarray]],
     masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
     draw_label: bool = True,
+    secondary_label_key: str = "tracking_label",
 ) -> Union[np.ndarray, List[np.ndarray]]:
     """'overlay_segmentation_masks' is a utility function that displays segmentation
     masks.
@@ -1426,7 +1443,10 @@ def overlay_segmentation_masks(
         medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
             the masks on.
         masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
-            dictionaries containing the masks.
+            dictionaries containing the masks, labels and scores.
+        draw_label (bool, optional): If True, the labels will be displayed on the image.
+        secondary_label_key (str, optional): The key to use for the secondary
+            tracking label which is needed in videos to display tracking information.
     Returns:
         np.ndarray: The image with the masks displayed.
@@ -1471,6 +1491,7 @@ def overlay_segmentation_masks(
         for elt in masks_int[i]:
             mask = elt["mask"]
             label = elt["label"]
+            tracking_lbl = elt.get(secondary_label_key, None)
             np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
             np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
             mask_img = Image.fromarray(np_mask.astype(np.uint8))
@@ -1478,16 +1499,17 @@ def overlay_segmentation_masks(
             if draw_label:
                 draw = ImageDraw.Draw(pil_image)
-                text_box = draw.textbbox((0, 0), text=label, font=font)
+                text = tracking_lbl if tracking_lbl else label
+                text_box = draw.textbbox((0, 0), text=text, font=font)
                 x, y = _get_text_coords_from_mask(
                     mask,
                     v_gap=(text_box[3] - text_box[1]) + 10,
                     h_gap=(text_box[2] - text_box[0]) // 2,
                 )
                 if x != 0 and y != 0:
-                    text_box = draw.textbbox((x, y), text=label, font=font)
+                    text_box = draw.textbbox((x, y), text=text, font=font)
                     draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
-                    draw.text((x, y), label, fill="black", font=font)
+                    draw.text((x, y), text, fill="black", font=font)
         frame_out.append(np.array(pil_image))
     return frame_out[0] if len(frame_out) == 1 else frame_out
@@ -1663,7 +1685,7 @@ FUNCTION_TOOLS = [
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video,
-    florence2_object_detection,
+    florence2_phrase_grounding,
     ixc25_image_vqa,
     ixc25_video_vqa,
     detr_segmentation,

{vision_agent-0.2.116.dist-info → vision_agent-0.2.118.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.116
+Version: 0.2.118
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -252,7 +252,7 @@ function. Make sure the documentation is in the same format above with descripti
 `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
 [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
-## Additional LLMs
+## Additional Backends
 ### Ollama
 We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
 a few models:

{vision_agent-0.2.116.dist-info → vision_agent-0.2.118.dist-info}/RECORD RENAMED Viewed

@@ -3,7 +3,7 @@ vision_agent/agent/__init__.py,sha256=FRwiux1FGvGccetyUCtY46KP01fQteqorm-JtFepov
 vision_agent/agent/agent.py,sha256=Bt8yhjCFXuRdZaHxKEesG40V09nWRt45sZluri1R3AA,575
 vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
 vision_agent/agent/vision_agent.py,sha256=5rgO-pScVOS3t4sWnLBnGYYkGftGgF4U0FpZzFVrDAY,8447
-vision_agent/agent/vision_agent_coder.py,sha256=qRSv_krY6-uHJC8exo3Nw0dPJ81jSzhKw2WTCHw1XVE,33733
+vision_agent/agent/vision_agent_coder.py,sha256=tE-15ttnDxUsEdB0XJP4AVNyOU89KS8ZvXZDPcNKA-8,34380
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=xIya1txRZM8qoQHAWTEkEFCL8L3iZD7QD09t3ZtdxSE,11305
 vision_agent/agent/vision_agent_prompts.py,sha256=ydUU_Wvw-jqdL_vObSUr-VCQvjSwA5Fd74TbbhUzyxk,6112
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -12,13 +12,13 @@ vision_agent/clients/landing_public_api.py,sha256=6L15zh5lP5JHCpGnYpHMREgrrKiJin
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
-vision_agent/lmm/lmm.py,sha256=cuXtfFb7kJwVTyHTeK_t1bYItPiNjmDI2gF8vJs4gsM,20231
+vision_agent/lmm/lmm.py,sha256=xkAxunToISzo5rCcjekqQBvm5SRW-98htieLuztKNbk,20802
 vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
-vision_agent/tools/__init__.py,sha256=Y6Y7McmdC8cm6UsJgExBLEPi4StBkqfY4y8_Mp7LlWU,2190
+vision_agent/tools/__init__.py,sha256=lUUc2HV13eSxg5KPZop1D-mB4ecmiQ5fYlBTQLNSbYg,2190
 vision_agent/tools/meta_tools.py,sha256=q6h7hZarZrsWRloVE6PbTZwW8J2N1uUM9Ac-XxsT6hk,13365
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=qMsb9d8QtpXGgF9rpPO2dA390BewKdYO68oWKDu-TGg,6504
-vision_agent/tools/tools.py,sha256=JscejDn05jpYW6psPkRDesegPtZJshNWCncGFPOpI7c,58626
+vision_agent/tools/tools.py,sha256=gAW6G9k1vzy8jwRACNnw2Vihsajm_oSlVJqd6E4JSRA,59957
 vision_agent/tools/tools_types.py,sha256=z6_XtUhWgh201yM7Z0CYtiLBEGdHPc_QUydMDHZ84EA,2216
 vision_agent/utils/__init__.py,sha256=pWk0ktvR4aUEhuEIzSLM9kSgW4WDVqptdvOTeGLkJ6M,230
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=c1LrmaHD331za8DbA1myJpgUmWoDzePaOK6-dsd
 vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.116.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.116.dist-info/METADATA,sha256=iUNOaT5grsrdL_2yCiUqhaBvXoWtuFdxGSFlsJYF-nQ,11993
-vision_agent-0.2.116.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.116.dist-info/RECORD,,
+vision_agent-0.2.118.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.118.dist-info/METADATA,sha256=4ilO7j9MOLCtaNekUUVlhMNdDKMk02ecx7ipnXT9RC8,11997
+vision_agent-0.2.118.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.118.dist-info/RECORD,,

{vision_agent-0.2.116.dist-info → vision_agent-0.2.118.dist-info}/LICENSE RENAMED Viewed

File without changes

{vision_agent-0.2.116.dist-info → vision_agent-0.2.118.dist-info}/WHEEL RENAMED Viewed

File without changes

vision-agent 0.2.116__py3-none-any.whl → 0.2.118__py3-none-any.whl

vision-agent 0.2.116__py3-none-any.whl → 0.2.118__py3-none-any.whl