vision_agent-0.2.63-py3-none-any.whl → vision_agent-0.2.65-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
vision_agent/agent/vision_agent.py CHANGED
@@ -7,8 +7,8 @@ import tempfile
  from pathlib import Path
  from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
 
- from PIL import Image
  from langsmith import traceable
+ from PIL import Image
  from rich.console import Console
  from rich.style import Style
  from rich.syntax import Syntax
@@ -43,6 +43,8 @@ class DefaultImports:
 
  common_imports = [
  "from typing import *",
+ "from pillow_heif import register_heif_opener",
+ "register_heif_opener()",
  ]
 
  @staticmethod
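
Both the agent's generated-code imports here and vision_agent/tools/tools.py (below) now register pillow_heif's HEIF opener, so Pillow can decode .heic/.heif images. A minimal sketch of what that enables, assuming pillow-heif is installed; "photo.heic" is a hypothetical example file, not something from the package:

import numpy as np
from PIL import Image
from pillow_heif import register_heif_opener

register_heif_opener()  # after this call, PIL.Image.open can decode HEIF/HEIC files

frame = np.array(Image.open("photo.heic").convert("RGB"))  # hypothetical input file
print(frame.shape)
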
vision_agent/lmm/lmm.py CHANGED
@@ -224,10 +224,10 @@ class OpenAILMM(LMM):
  return lambda x: T.grounding_sam(params["prompt"], x)
 
  def generate_zero_shot_counter(self, question: str) -> Callable:
- return T.zero_shot_counting
+ return T.loca_zero_shot_counting
 
  def generate_image_qa_tool(self, question: str) -> Callable:
- return lambda x: T.image_question_answering(question, x)
+ return lambda x: T.git_vqa_v2(question, x)
 
 
  class AzureOpenAILMM(OpenAILMM):
vision_agent/tools/__init__.py CHANGED
@@ -7,25 +7,28 @@ from .tools import (
  TOOLS,
  TOOLS_DF,
  UTILITIES_DOCSTRING,
+ blip_image_caption,
  clip,
  closest_box_distance,
  closest_mask_distance,
  extract_frames,
  get_tool_documentation,
+ git_vqa_v2,
  grounding_dino,
  grounding_sam,
- image_caption,
- image_question_answering,
  load_image,
+ loca_visual_prompt_counting,
+ loca_zero_shot_counting,
  ocr,
  overlay_bounding_boxes,
  overlay_heat_map,
  overlay_segmentation_masks,
+ owl_v2,
  save_image,
  save_json,
  save_video,
- visual_prompt_counting,
- zero_shot_counting,
+ vit_image_classification,
+ vit_nsfw_classification,
  )
 
  __new_tools__ = [
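
Based only on the names in this hunk, downstream code that imports the renamed tools would need updates along these lines (a hedged migration sketch, not from the package docs):

from vision_agent.tools import (
    blip_image_caption,           # was image_caption in 0.2.63
    git_vqa_v2,                   # was image_question_answering
    loca_visual_prompt_counting,  # was visual_prompt_counting
    loca_zero_shot_counting,      # was zero_shot_counting
    owl_v2,                       # new in 0.2.65
    vit_image_classification,     # new in 0.2.65
    vit_nsfw_classification,      # new in 0.2.65
)
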
vision_agent/tools/tools.py CHANGED
@@ -13,6 +13,7 @@ import pandas as pd
  import requests
  from moviepy.editor import ImageSequenceClip
  from PIL import Image, ImageDraw, ImageFont
+ from pillow_heif import register_heif_opener  # type: ignore
 
  from vision_agent.tools.tool_utils import _send_inference_request
  from vision_agent.utils import extract_frames_from_video
@@ -26,6 +27,8 @@ from vision_agent.utils.image_utils import (
  rle_decode,
  )
 
+ register_heif_opener()
+
  COLORS = [
  (158, 218, 229),
  (219, 219, 141),
@@ -59,6 +62,7 @@ def grounding_dino(
  image: np.ndarray,
  box_threshold: float = 0.20,
  iou_threshold: float = 0.20,
+ model_size: str = "large",
  ) -> List[Dict[str, Any]]:
  """'grounding_dino' is a tool that can detect and count multiple objects given a text
  prompt such as category names or referring expressions. The categories in text prompt
@@ -72,6 +76,7 @@ def grounding_dino(
  to 0.20.
  iou_threshold (float, optional): The threshold for the Intersection over Union
  (IoU). Defaults to 0.20.
+ model_size (str, optional): The size of the model to use.
 
  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -90,10 +95,14 @@ def grounding_dino(
  """
  image_size = image.shape[:2]
  image_b64 = convert_to_b64(image)
+ if model_size not in ["large", "tiny"]:
+ raise ValueError("model_size must be either 'large' or 'tiny'")
  request_data = {
  "prompt": prompt,
  "image": image_b64,
- "tool": "visual_grounding",
+ "tool": (
+ "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
+ ),
  "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
  }
  data: Dict[str, Any] = _send_inference_request(request_data, "tools")
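
grounding_dino now accepts a model_size argument that selects between the "visual_grounding" and "visual_grounding_tiny" backends and rejects any other value. A hedged usage sketch, not from the package docs; it calls the hosted tools endpoint, so network access and any required credentials are assumed, and the image is a placeholder:

import numpy as np
from vision_agent.tools import grounding_dino

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image

# "large" is the default; any value other than "large" or "tiny" raises ValueError
detections = grounding_dino("car. person", image, model_size="tiny")
for det in detections:
    print(det["label"], det["score"], det["bbox"])  # bbox is normalized (xmin, ymin, xmax, ymax)
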
@@ -109,6 +118,62 @@ grounding_dino(
  return return_data
 
 
+ def owl_v2(
+ prompt: str,
+ image: np.ndarray,
+ box_threshold: float = 0.10,
+ iou_threshold: float = 0.10,
+ ) -> List[Dict[str, Any]]:
+ """'owl_v2' is a tool that can detect and count multiple objects given a text
+ prompt such as category names or referring expressions. The categories in text prompt
+ are separated by commas or periods. It returns a list of bounding boxes with
+ normalized coordinates, label names and associated probability scores.
+
+ Parameters:
+ prompt (str): The prompt to ground to the image.
+ image (np.ndarray): The image to ground the prompt to.
+ box_threshold (float, optional): The threshold for the box detection. Defaults
+ to 0.10.
+ iou_threshold (float, optional): The threshold for the Intersection over Union
+ (IoU). Defaults to 0.10.
+ model_size (str, optional): The size of the model to use.
+
+ Returns:
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+ bounding box of the detected objects with normalized coordinates between 0
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
+ bounding box.
+
+ Example
+ -------
+ >>> owl_v2("car. dinosaur", image)
+ [
+ {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
+ ]
+ """
+ image_size = image.shape[:2]
+ image_b64 = convert_to_b64(image)
+ request_data = {
+ "prompt": prompt,
+ "image": image_b64,
+ "tool": "open_vocab_detection",
+ "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+ }
+ data: Dict[str, Any] = _send_inference_request(request_data, "tools")
+ return_data = []
+ for i in range(len(data["bboxes"])):
+ return_data.append(
+ {
+ "score": round(data["scores"][i], 2),
+ "label": data["labels"][i].strip(),
+ "bbox": normalize_bbox(data["bboxes"][i], image_size),
+ }
+ )
+ return return_data
+
+
  def grounding_sam(
  prompt: str,
  image: np.ndarray,
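
The new owl_v2 tool mirrors grounding_dino's interface but uses lower default thresholds (0.10) and the "open_vocab_detection" backend, and it replaces grounding_dino in the default TOOLS list (see the final tools.py hunk below). A hedged usage sketch under the same assumptions as the grounding_dino example above:

import numpy as np
from vision_agent.tools import owl_v2

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image
detections = owl_v2("car. dinosaur", image)
# each entry: {"score": float, "label": str, "bbox": [xmin, ymin, xmax, ymax]}
# with coordinates normalized to [0, 1]
print(detections)
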
@@ -253,8 +318,8 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
  return ocr_results
 
 
- def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
- """'zero_shot_counting' is a tool that counts the dominant foreground object given
+ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
+ """'loca_zero_shot_counting' is a tool that counts the dominant foreground object given
  an image and no other information about the content. It returns only the count of
  the objects in the image.
 
@@ -267,7 +332,7 @@ def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
  Example
  -------
- >>> zero_shot_counting(image)
+ >>> loca_zero_shot_counting(image)
  {'count': 45},
  """
 
@@ -281,10 +346,10 @@ def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
  return resp_data
 
 
- def visual_prompt_counting(
+ def loca_visual_prompt_counting(
  image: np.ndarray, visual_prompt: Dict[str, List[float]]
  ) -> Dict[str, Any]:
- """'visual_prompt_counting' is a tool that counts the dominant foreground object
+ """'loca_visual_prompt_counting' is a tool that counts the dominant foreground object
  given an image and a visual prompt which is a bounding box describing the object.
  It returns only the count of the objects in the image.
 
@@ -297,7 +362,7 @@ def visual_prompt_counting(
 
  Example
  -------
- >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
+ >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
  {'count': 45},
  """
 
@@ -316,8 +381,8 @@ def visual_prompt_counting(
  return resp_data
 
 
- def image_question_answering(prompt: str, image: np.ndarray) -> str:
- """'image_question_answering_' is a tool that can answer questions about the visual
+ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
+ """'git_vqa_v2' is a tool that can answer questions about the visual
  contents of an image given a question and an image. It returns an answer to the
  question
 
@@ -331,7 +396,7 @@ def image_question_answering(prompt: str, image: np.ndarray) -> str:
 
  Example
  -------
- >>> image_question_answering('What is the cat doing ?', image)
+ >>> git_vqa_v2('What is the cat doing ?', image)
  'drinking milk'
  """
 
@@ -376,8 +441,62 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
  return resp_data
 
 
- def image_caption(image: np.ndarray) -> str:
- """'image_caption' is a tool that can caption an image based on its contents. It
+ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
+ """'vit_image_classification' is a tool that can classify an image. It returns a
+ list of classes and their probability scores based on image content.
+
+ Parameters:
+ image (np.ndarray): The image to classify or tag
+
+ Returns:
+ Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+ contains a list of labels and other a list of scores.
+
+ Example
+ -------
+ >>> vit_image_classification(image)
+ {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
+ """
+
+ image_b64 = convert_to_b64(image)
+ data = {
+ "image": image_b64,
+ "tool": "image_classification",
+ }
+ resp_data = _send_inference_request(data, "tools")
+ resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
+ return resp_data
+
+
+ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
+ """'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
+ It returns the predicted label and their probability scores based on image content.
+
+ Parameters:
+ image (np.ndarray): The image to classify or tag
+
+ Returns:
+ Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+ contains a list of labels and other a list of scores.
+
+ Example
+ -------
+ >>> vit_nsfw_classification(image)
+ {"labels": "normal", "scores": 0.68},
+ """
+
+ image_b64 = convert_to_b64(image)
+ data = {
+ "image": image_b64,
+ "tool": "nsfw_image_classification",
+ }
+ resp_data = _send_inference_request(data, "tools")
+ resp_data["scores"] = round(resp_data["scores"], 4)
+ return resp_data
+
+
+ def blip_image_caption(image: np.ndarray) -> str:
+ """'blip_image_caption' is a tool that can caption an image based on its contents. It
  returns a text describing the image.
 
  Parameters:
@@ -388,7 +507,7 @@ def image_caption(image: np.ndarray) -> str:
 
  Example
  -------
- >>> image_caption(image)
+ >>> blip_image_caption(image)
  'This image contains a cat sitting on a table with a bowl of milk.'
  """
 
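
The two new ViT classifiers and the renamed BLIP captioner all take a single np.ndarray image and return either a caption string or a labels/scores dictionary. A hedged usage sketch, with the same hosted-endpoint and placeholder-image assumptions as above:

import numpy as np
from vision_agent.tools import (
    blip_image_caption,
    vit_image_classification,
    vit_nsfw_classification,
)

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image

print(blip_image_caption(image))        # -> caption string
print(vit_image_classification(image))  # -> {"labels": [...], "scores": [...]}
print(vit_nsfw_classification(image))   # -> {"labels": "normal" or "nsfw", "scores": float}
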
@@ -543,7 +662,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
  """
  from IPython.display import display
 
- pil_image = Image.fromarray(image.astype(np.uint8))
+ pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
  display(pil_image)
  pil_image.save(file_path)
 
@@ -792,15 +911,17 @@ def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
 
 
  TOOLS = [
- grounding_dino,
+ owl_v2,
  grounding_sam,
  extract_frames,
  ocr,
  clip,
- zero_shot_counting,
- visual_prompt_counting,
- image_question_answering,
- image_caption,
+ vit_image_classification,
+ vit_nsfw_classification,
+ loca_zero_shot_counting,
+ loca_visual_prompt_counting,
+ git_vqa_v2,
+ blip_image_caption,
  closest_mask_distance,
  closest_box_distance,
  save_json,
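
The default registry now leads with owl_v2 instead of grounding_dino and exposes the renamed tools; grounding_dino itself remains importable, it is just no longer in TOOLS. A quick way to confirm what an agent can select, assuming TOOLS stays a plain list of functions as shown in this hunk:

from vision_agent.tools import TOOLS

print([tool.__name__ for tool in TOOLS])  # expect owl_v2, grounding_sam, ..., blip_image_caption, ...
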
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.63
+ Version: 0.2.65
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -21,6 +21,7 @@ Requires-Dist: openai (>=1.0.0,<2.0.0)
  Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
  Requires-Dist: pandas (>=2.0.0,<3.0.0)
  Requires-Dist: pillow (>=10.0.0,<11.0.0)
+ Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
  Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
  Requires-Dist: requests (>=2.0.0,<3.0.0)
  Requires-Dist: rich (>=13.7.1,<14.0.0)
@@ -1,23 +1,23 @@
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
  vision_agent/agent/__init__.py,sha256=IUwfbPMcT8X_rnXMLmI8gJ4ltsHy_XSs9eLiKURJxeY,81
  vision_agent/agent/agent.py,sha256=ZK-5lOtd9-eD9aWcXssJpnOyvZuO7_5hAmnb-6sWVe8,569
- vision_agent/agent/vision_agent.py,sha256=TVODnpLVlAtqnvSMUQ0wC5YyDxt2U9KRK5V13dxhUA4,25194
+ vision_agent/agent/vision_agent.py,sha256=HC63BP4jPiR4lJLEkKQ-zMV5C5JwjnuZvc7hVjjS2uk,25284
  vision_agent/agent/vision_agent_prompts.py,sha256=bMXdZYf6kbikHn__tCGrYE1QvXC88EmpMpM_97V6szA,8472
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
  vision_agent/lmm/__init__.py,sha256=3ro5lCIoS3DgEghOy0SPFrEhYvFnWZpVC5S5kSnIx6A,57
- vision_agent/lmm/lmm.py,sha256=UDyGjMRG_CHhcyTnsmvowRE38zHJATy5cbg1UIbdIjs,8954
- vision_agent/tools/__init__.py,sha256=inKVLRUATQA9oi83l0NluC8Gm-LJU2-AjA6rL1j12Q8,1532
+ vision_agent/lmm/lmm.py,sha256=ihmLYL_291HnELyMtfFKTCnPWnmuoEH2DDFmc4ynMG8,8945
+ vision_agent/tools/__init__.py,sha256=aE1O8cMeLDPO50Sc-CuAQ_Akh0viz7vBxDcVeZNqsA0,1604
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
  vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
- vision_agent/tools/tools.py,sha256=o9ojTfhu8KCSXfW4UPUNOhmki6A-l3jtVi0rPEnELjc,26944
+ vision_agent/tools/tools.py,sha256=Qzwm_wu6KJh-3DSoNmZ4Lv8jCCNJMwKIPBFxxN6FmDo,31397
  vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
  vision_agent/utils/execute.py,sha256=GqoAodxtwTPBr1nujPTsWiZO2rBGvWVXTe8lgxY4d_g,20603
  vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
  vision_agent/utils/sim.py,sha256=ci6Eta73dDgLP1Ajtknbgmf1g8aAvBHqlVQvBuLMKXQ,4427
  vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
- vision_agent-0.2.63.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.63.dist-info/METADATA,sha256=clb-wEt_PcXS2I27fGICOau8hbsrkQLuhDVD0pnH1QQ,8317
- vision_agent-0.2.63.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.63.dist-info/RECORD,,
+ vision_agent-0.2.65.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.65.dist-info/METADATA,sha256=MnlqbmIs4PRO4Y1qaR2abmD0RueZnIYUEnGGcuJ1wHA,8363
+ vision_agent-0.2.65.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.65.dist-info/RECORD,,