vision-agent 0.2.24__py3-none-any.whl → 0.2.25__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -8,7 +8,7 @@ from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
-_LND_API_URL = "https://api.dev.landing.ai/v1/agent"
+_LND_API_URL = "https://api.staging.landing.ai/v1/agent"
 
 
 def _send_inference_request(
@@ -53,7 +53,7 @@ class NoOp(Tool):
 
 
 class CLIP(Tool):
-    r"""CLIP is a tool that can classify or tag any image given a set if input classes
+    r"""CLIP is a tool that can classify or tag any image given a set of input classes
     or tags.
 
     Example
@@ -15,7 +15,14 @@ from scipy.spatial import distance  # type: ignore
 
 from vision_agent.tools.tool_utils import _send_inference_request
 from vision_agent.utils import extract_frames_from_video
-from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox, rle_decode
+from vision_agent.utils.image_utils import (
+    convert_to_b64,
+    normalize_bbox,
+    rle_decode,
+    b64_to_pil,
+    get_image_size,
+    denormalize_bbox,
+)
 
 COLORS = [
     (158, 218, 229),
@@ -49,7 +56,7 @@ def grounding_dino(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.20,
-    iou_threshold: float = 0.75,
+    iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
     """'grounding_dino' is a tool that can detect and count objects given a text prompt
     such as category names or referring expressions. It returns a list and count of
@@ -61,12 +68,13 @@ def grounding_dino(
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.20.
         iou_threshold (float, optional): The threshold for the Intersection over Union
-            (IoU). Defaults to 0.75.
+            (IoU). Defaults to 0.20.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
             bounding box of the detected objects with normalized coordinates
-            (x1, y1, x2, y2).
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
+            xmax and ymax are the coordinates of the bottom-right of the bounding box.
 
     Example
     -------
@@ -77,7 +85,7 @@ def grounding_dino(
     ]
     """
    image_size = image.shape[:2]
-    image_b64 = convert_to_b64(Image.fromarray(image))
+    image_b64 = convert_to_b64(image)
    request_data = {
        "prompt": prompt,
        "image": image_b64,
@@ -101,7 +109,7 @@ def grounding_sam(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.20,
-    iou_threshold: float = 0.75,
+    iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
     """'grounding_sam' is a tool that can detect and segment objects given a text
     prompt such as category names or referring expressions. It returns a list of
@@ -113,12 +121,15 @@ def grounding_sam(
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.20.
         iou_threshold (float, optional): The threshold for the Intersection over Union
-            (IoU). Defaults to 0.75.
+            (IoU). Defaults to 0.20.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
             bounding box, and mask of the detected objects with normalized coordinates
-            (x1, y1, x2, y2).
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
+            xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
 
     Example
     -------
@@ -137,7 +148,7 @@ def grounding_sam(
     ]
     """
    image_size = image.shape[:2]
-    image_b64 = convert_to_b64(Image.fromarray(image))
+    image_b64 = convert_to_b64(image)
    request_data = {
        "prompt": prompt,
        "image": image_b64,
@@ -235,6 +246,152 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     return output
 
 
+def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
+    """'zero_shot_counting' is a tool that counts the dominant foreground object given an image and no other information about the content.
+    It returns only the count of the objects in the image.
+
+    Parameters:
+        image (np.ndarray): The image that contains lot of instances of a single object
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}.
+
+    Example
+    -------
+    >>> zero_shot_counting(image)
+    {'count': 45},
+
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "zero_shot_counting",
+    }
+    resp_data = _send_inference_request(data, "tools")
+    resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+    return resp_data
+
+
+def visual_prompt_counting(
+    image: np.ndarray, visual_prompt: Dict[str, List[float]]
+) -> Dict[str, Any]:
+    """'visual_prompt_counting' is a tool that counts the dominant foreground object given an image and a visual prompt which is a bounding box describing the object.
+    It returns only the count of the objects in the image.
+
+    Parameters:
+        image (np.ndarray): The image that contains lot of instances of a single object
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}.
+
+    Example
+    -------
+    >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
+    {'count': 45},
+
+    """
+
+    image_size = get_image_size(image)
+    bbox = visual_prompt["bbox"]
+    bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
+    image_b64 = convert_to_b64(image)
+
+    data = {
+        "image": image_b64,
+        "prompt": bbox_str,
+        "tool": "few_shot_counting",
+    }
+    resp_data = _send_inference_request(data, "tools")
+    resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+    return resp_data
+
+
+def image_question_answering(image: np.ndarray, prompt: str) -> str:
+    """'image_question_answering_' is a tool that can answer questions about the visual contents of an image given a question and an image.
+    It returns an answer to the question
+
+    Parameters:
+        image (np.ndarray): The reference image used for the question
+        prompt (str): The question about the image
+
+    Returns:
+        str: A string which is the answer to the given prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}.
+
+    Example
+    -------
+    >>> image_question_answering(image, 'What is the cat doing ?')
+    'drinking milk'
+
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "prompt": prompt,
+        "tool": "image_question_answering",
+    }
+
+    answer = _send_inference_request(data, "tools")
+    return answer["text"][0]  # type: ignore
+
+
+def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
+    """'clip' is a tool that can classify an image given a list of input classes or tags.
+    It returns the same list of the input classes along with their probability scores based on image content.
+
+    Parameters:
+        image (np.ndarray): The image to classify or tag
+        classes (List[str]): The list of classes or tags that is associated with the image
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary contains a list of given labels and other a list of scores.
+
+    Example
+    -------
+    >>> clip(image, ['dog', 'cat', 'bird'])
+    {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
+
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "prompt": ",".join(classes),
+        "image": image_b64,
+        "tool": "closed_set_image_classification",
+    }
+    resp_data = _send_inference_request(data, "tools")
+    resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
+    return resp_data
+
+
+def image_caption(image: np.ndarray) -> str:
+    """'image_caption' is a tool that can caption an image based on its contents.
+    It returns a text describing the image.
+
+    Parameters:
+        image (np.ndarray): The image to caption
+
+    Returns:
+        str: A string which is the caption for the given image.
+
+    Example
+    -------
+    >>> image_caption(image)
+    'This image contains a cat sitting on a table with a bowl of milk.'
+
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "image_captioning",
+    }
+
+    answer = _send_inference_request(data, "tools")
+    return answer["text"][0]  # type: ignore
+
+
 def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
     """'closest_mask_distance' calculates the closest distance between two masks.
 
@@ -504,6 +661,11 @@ TOOLS = [
     grounding_sam,
     extract_frames,
     ocr,
+    clip,
+    zero_shot_counting,
+    visual_prompt_counting,
+    image_question_answering,
+    image_caption,
     closest_mask_distance,
     closest_box_distance,
     save_json,
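
For orientation, here is a minimal sketch of how the tools added to tools_v2.py in this release might be called directly. It assumes an RGB image loaded as a numpy array and a valid LANDINGAI_API_KEY in the environment; the import path is inferred from the RECORD entries below, and the image path is hypothetical.

import numpy as np
from PIL import Image

from vision_agent.tools.tools_v2 import clip, image_caption, zero_shot_counting

# Load any RGB image as a numpy array (path is illustrative only).
image = np.array(Image.open("shelf.jpg"))

print(image_caption(image))                    # short text description of the image
print(clip(image, ["bottle", "can", "box"]))   # {"labels": [...], "scores": [...]}
print(zero_shot_counting(image)["count"])      # count of the dominant foreground object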
@@ -104,15 +104,20 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
     """
     if data is None:
         raise ValueError(f"Invalid input image: {data}. Input image can't be None.")
+
     if isinstance(data, (str, Path)):
         data = Image.open(data)
+    elif isinstance(data, np.ndarray):
+        data = Image.fromarray(data)
+
     if isinstance(data, Image.Image):
         buffer = BytesIO()
         data.convert("RGB").save(buffer, format="PNG")
         return base64.b64encode(buffer.getvalue()).decode("utf-8")
     else:
-        arr_bytes = data.tobytes()
-        return base64.b64encode(arr_bytes).decode("utf-8")
+        raise ValueError(
+            f"Invalid input image: {data}. Input image must be a PIL Image or a numpy array."
+        )
 
 
 def denormalize_bbox(
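
A short sketch of the revised convert_to_b64 behaviour, assuming it is imported from vision_agent.utils.image_utils as listed in the RECORD: numpy arrays are now routed through PIL and PNG-encoded like every other input, while unsupported types raise a ValueError instead of being base64-encoded as raw bytes.

import numpy as np

from vision_agent.utils.image_utils import convert_to_b64

frame = np.zeros((64, 64, 3), dtype=np.uint8)   # any HxWx3 uint8 array
b64_png = convert_to_b64(frame)                 # base64 of the PNG-encoded image

try:
    convert_to_b64(1234)  # type: ignore  # not a path, numpy array, or PIL image
except ValueError as err:
    print(err)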
@@ -12,7 +12,7 @@ class LandingaiAPIKey(BaseSettings):
     """
 
     api_key: str = Field(
-        default="land_sk_PCRPYKqB3cq0JWGY83hjEk33SWSDOwdNoyUjTgCDMZO4NxeCXW",
+        default="land_sk_IJrojHarPXRjqDj1Fng76mX7yCbzVm1s5rZYxaNXu5v0cNLn0w",
         alias="LANDINGAI_API_KEY",
         description="The API key of LandingAI.",
     )
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.24
+Version: 0.2.25
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -19,16 +19,16 @@ vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,
 vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
 vision_agent/tools/__init__.py,sha256=dRHXGpjhItXZRQs0r_l3Z3bQIreaZaYP0CJrl8mOJxM,452
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=mK6QfbYr6oo9ci979-_6R1DrxU2i8HGhwosADyvciI0,865
-vision_agent/tools/tools.py,sha256=sVxN7SpDkz_XTc_SKwkoRF4EwaMTuHvTsCHwtR942Fc,47373
-vision_agent/tools/tools_v2.py,sha256=iO-ochdLq73xdCRUY1MKixHyVAk6UIUrY648MtjlHno,16201
+vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
+vision_agent/tools/tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
+vision_agent/tools/tools_v2.py,sha256=Tdam-cWBI4ipXWwGyxim-SK07zP97_hcdUtYd1a4CnI,21404
 vision_agent/utils/__init__.py,sha256=xsHFyJSDbLdonB9Dh74cwZnVTiT__2OQF3Brd3Nmglc,116
 vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
-vision_agent/utils/image_utils.py,sha256=1dggPBhW8_hUXDItCRLa23h-hdBwS50cjL4v1hsoUbg,7586
+vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
 vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
-vision_agent/utils/type_defs.py,sha256=ijFAd7D0y8JOg0Ib063rqsDcrFtZfQbdqpaRPTmp2hY,1792
+vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
 vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
-vision_agent-0.2.24.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.24.dist-info/METADATA,sha256=G4bq69V2-eRKNSWwx0skCfU60iiCUQf5l37B9O49Bkk,9212
-vision_agent-0.2.24.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.24.dist-info/RECORD,,
+vision_agent-0.2.25.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.25.dist-info/METADATA,sha256=5bycdwOp0pnRpUBQo_JM1c1Abq2fmWJcVYE_7YgtoUY,9212
+vision_agent-0.2.25.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.25.dist-info/RECORD,,