vision-agent 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
vision_agent/__init__.py CHANGED
@@ -1,5 +1,3 @@
1
1
  from .agent import Agent
2
- from .data import DataStore, build_data_store
3
- from .emb import Embedder, OpenAIEmb, SentenceTransformerEmb, get_embedder
4
2
  from .llm import LLM, OpenAILLM
5
3
  from .lmm import LMM, LLaVALMM, OpenAILMM, get_lmm
vision_agent/agent/vision_agent.py CHANGED
@@ -8,7 +8,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
8
8
  from PIL import Image
9
9
  from tabulate import tabulate
10
10
 
11
- from vision_agent.image_utils import overlay_bboxes, overlay_masks
11
+ from vision_agent.image_utils import overlay_bboxes, overlay_masks, overlay_heat_map
12
12
  from vision_agent.llm import LLM, OpenAILLM
13
13
  from vision_agent.lmm import LMM, OpenAILMM
14
14
  from vision_agent.tools import TOOLS
@@ -336,7 +336,9 @@ def _handle_viz_tools(
336
336
 
337
337
  for param, call_result in zip(parameters, tool_result["call_results"]):
338
338
  # calls can fail, so we need to check if the call was successful
339
- if not isinstance(call_result, dict) or "bboxes" not in call_result:
339
+ if not isinstance(call_result, dict) or (
340
+ "bboxes" not in call_result and "masks" not in call_result
341
+ ):
340
342
  return image_to_data
341
343
 
342
344
  # if the call was successful, then we can add the image data
@@ -349,11 +351,12 @@ def _handle_viz_tools(
349
351
  "scores": [],
350
352
  }
351
353
 
352
- image_to_data[image]["bboxes"].extend(call_result["bboxes"])
353
- image_to_data[image]["labels"].extend(call_result["labels"])
354
- image_to_data[image]["scores"].extend(call_result["scores"])
355
- if "masks" in call_result:
356
- image_to_data[image]["masks"].extend(call_result["masks"])
354
+ image_to_data[image]["bboxes"].extend(call_result.get("bboxes", []))
355
+ image_to_data[image]["labels"].extend(call_result.get("labels", []))
356
+ image_to_data[image]["scores"].extend(call_result.get("scores", []))
357
+ image_to_data[image]["masks"].extend(call_result.get("masks", []))
358
+ if "mask_shape" in call_result:
359
+ image_to_data[image]["mask_shape"] = call_result["mask_shape"]
357
360
 
358
361
  return image_to_data
359
362
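For orientation, a minimal sketch (illustrative values, not taken from the diff) of the accumulator this helper returns after the change: every key now falls back to an empty list when a tool call did not produce it, and "mask_shape" is only set when the call reports one.

```python
# Hypothetical contents of image_to_data after _handle_viz_tools runs on one image.
image_to_data = {
    "lids.jpg": {
        "bboxes": [[0.1, 0.2, 0.4, 0.5]],  # boxes as returned by the detection call
        "labels": ["lid"],
        "scores": [0.92],
        "masks": [],                        # empty if the call returned no masks
        # "mask_shape": (300, 500),         # only present when the call reports it
    }
}
```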
 
@@ -367,6 +370,8 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
367
370
  "grounding_dino_",
368
371
  "extract_frames_",
369
372
  "dinov_",
373
+ "zero_shot_counting_",
374
+ "visual_prompt_counting_",
370
375
  ]:
371
376
  continue
372
377
 
@@ -379,8 +384,11 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
379
384
  for image_str in image_to_data:
380
385
  image_path = Path(image_str)
381
386
  image_data = image_to_data[image_str]
382
- image = overlay_masks(image_path, image_data)
383
- image = overlay_bboxes(image, image_data)
387
+ if "_counting_" in tool_result["tool_name"]:
388
+ image = overlay_heat_map(image_path, image_data)
389
+ else:
390
+ image = overlay_masks(image_path, image_data)
391
+ image = overlay_bboxes(image, image_data)
384
392
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
385
393
  image.save(f.name)
386
394
  visualized_images.append(f.name)
@@ -484,11 +492,21 @@ class VisionAgent(Agent):
484
492
  if image:
485
493
  question += f" Image name: {image}"
486
494
  if reference_data:
487
- if not ("image" in reference_data and "mask" in reference_data):
495
+ if not (
496
+ "image" in reference_data
497
+ and ("mask" in reference_data or "bbox" in reference_data)
498
+ ):
488
499
  raise ValueError(
489
- f"Reference data must contain 'image' and 'mask'. but got {reference_data}"
500
+ f"Reference data must contain 'image' and a visual prompt which can be 'mask' or 'bbox'. but got {reference_data}"
490
501
  )
491
- question += f" Reference image: {reference_data['image']}, Reference mask: {reference_data['mask']}"
502
+ visual_prompt_data = (
503
+ f"Reference mask: {reference_data['mask']}"
504
+ if "mask" in reference_data
505
+ else f"Reference bbox: {reference_data['bbox']}"
506
+ )
507
+ question += (
508
+ f" Reference image: {reference_data['image']}, {visual_prompt_data}"
509
+ )
492
510
 
493
511
  reflections = ""
494
512
  final_answer = ""
@@ -531,7 +549,6 @@ class VisionAgent(Agent):
531
549
  final_answer = answer_summarize(
532
550
  self.answer_model, question, answers, reflections
533
551
  )
534
-
535
552
  visualized_output = visualize_result(all_tool_results)
536
553
  all_tool_results.append({"visualized_output": visualized_output})
537
554
  if len(visualized_output) > 0:
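The hunk above only shows the validation and prompt construction, not the surrounding method signature, so the value formats below are illustrative; it sketches the reference_data shapes that now pass versus fail.

```python
# Accepted: a reference image plus a mask ...
reference_data = {"image": "shirt.jpg", "mask": "shirt_mask.png"}

# ... or, new in this version, a reference image plus a bounding box.
reference_data = {"image": "shirt.jpg", "bbox": "0.1, 0.1, 0.4, 0.42"}

# Still rejected: an image with no visual prompt raises ValueError.
reference_data = {"image": "shirt.jpg"}
```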
vision_agent/image_utils.py CHANGED
@@ -4,7 +4,7 @@ import base64
4
4
  from importlib import resources
5
5
  from io import BytesIO
6
6
  from pathlib import Path
7
- from typing import Dict, Tuple, Union
7
+ from typing import Dict, Tuple, Union, List
8
8
 
9
9
  import numpy as np
10
10
  from PIL import Image, ImageDraw, ImageFont
@@ -34,6 +34,35 @@ COLORS = [
34
34
  ]
35
35
 
36
36
 
37
+ def normalize_bbox(
38
+ bbox: List[Union[int, float]], image_size: Tuple[int, ...]
39
+ ) -> List[float]:
40
+ r"""Normalize the bounding box coordinates to be between 0 and 1."""
41
+ x1, y1, x2, y2 = bbox
42
+ x1 = round(x1 / image_size[1], 2)
43
+ y1 = round(y1 / image_size[0], 2)
44
+ x2 = round(x2 / image_size[1], 2)
45
+ y2 = round(y2 / image_size[0], 2)
46
+ return [x1, y1, x2, y2]
47
+
48
+
49
+ def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray:
50
+ r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background.
51
+
52
+ Parameters:
53
+ mask_rle: Run-length as a string formatted as (start length) pairs
54
+ shape: The (height, width) of array to return
55
+ """
56
+ s = mask_rle.split()
57
+ starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])]
58
+ starts -= 1
59
+ ends = starts + lengths
60
+ img = np.zeros(shape[0] * shape[1], dtype=np.uint8)
61
+ for lo, hi in zip(starts, ends):
62
+ img[lo:hi] = 1
63
+ return img.reshape(shape)
64
+
65
+
37
66
  def b64_to_pil(b64_str: str) -> ImageType:
38
67
  r"""Convert a base64 string to a PIL Image.
39
68
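These two helpers were moved here from tools.py (see the removal further down). A quick worked example, assuming the 0.2.1 module layout; note that image_size is (height, width) and the run-length starts are 1-based over the flattened, row-major mask.

```python
from vision_agent.image_utils import normalize_bbox, rle_decode

# xyxy pixel box on a 400x600 (height x width) image -> normalized, rounded to 2 decimals
print(normalize_bbox([50, 100, 150, 200], (400, 600)))  # [0.08, 0.25, 0.25, 0.5]

# "start length" pairs: pixels 1-3 and 10-11 (1-based) become 1s in a 4x4 mask
print(rle_decode("1 3 10 2", (4, 4)))
# [[1 1 1 0]
#  [0 0 0 0]
#  [0 1 1 0]
#  [0 0 0 0]]
```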
 
@@ -86,6 +115,26 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
86
115
  return base64.b64encode(arr_bytes).decode("utf-8")
87
116
 
88
117
 
118
+ def denormalize_bbox(
119
+ bbox: List[Union[int, float]], image_size: Tuple[int, ...]
120
+ ) -> List[float]:
121
+ r"""DeNormalize the bounding box coordinates so that they are in absolute values."""
122
+
123
+ if len(bbox) != 4:
124
+ raise ValueError("Bounding box must be of length 4.")
125
+
126
+ arr = np.array(bbox)
127
+ if np.all((arr >= 0) & (arr <= 1)):
128
+ x1, y1, x2, y2 = bbox
129
+ x1 = round(x1 * image_size[1])
130
+ y1 = round(y1 * image_size[0])
131
+ x2 = round(x2 * image_size[1])
132
+ y2 = round(y2 * image_size[0])
133
+ return [x1, y1, x2, y2]
134
+ else:
135
+ return bbox
136
+
137
+
89
138
  def overlay_bboxes(
90
139
  image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
91
140
  ) -> ImageType:
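A small companion example for the helper above (illustrative values): already-normalized boxes are scaled back to pixels against the (height, width) image size, while boxes not entirely within [0, 1] are returned untouched.

```python
from vision_agent.image_utils import denormalize_bbox

print(denormalize_bbox([0.1, 0.1, 0.4, 0.42], (300, 500)))  # [50, 30, 200, 126]
print(denormalize_bbox([50, 30, 200, 126], (300, 500)))     # unchanged: already absolute
```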
@@ -103,6 +152,9 @@ def overlay_bboxes(
103
152
  elif isinstance(image, np.ndarray):
104
153
  image = Image.fromarray(image)
105
154
 
155
+ if "bboxes" not in bboxes:
156
+ return image.convert("RGB")
157
+
106
158
  color = {
107
159
  label: COLORS[i % len(COLORS)] for i, label in enumerate(set(bboxes["labels"]))
108
160
  }
@@ -114,8 +166,6 @@ def overlay_bboxes(
114
166
  str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
115
167
  fontsize,
116
168
  )
117
- if "bboxes" not in bboxes:
118
- return image.convert("RGB")
119
169
 
120
170
  for label, box, scores in zip(bboxes["labels"], bboxes["bboxes"], bboxes["scores"]):
121
171
  box = [
@@ -150,11 +200,15 @@ def overlay_masks(
150
200
  elif isinstance(image, np.ndarray):
151
201
  image = Image.fromarray(image)
152
202
 
203
+ if "masks" not in masks:
204
+ return image.convert("RGB")
205
+
206
+ if "labels" not in masks:
207
+ masks["labels"] = [""] * len(masks["masks"])
208
+
153
209
  color = {
154
210
  label: COLORS[i % len(COLORS)] for i, label in enumerate(set(masks["labels"]))
155
211
  }
156
- if "masks" not in masks:
157
- return image.convert("RGB")
158
212
 
159
213
  for label, mask in zip(masks["labels"], masks["masks"]):
160
214
  if isinstance(mask, str):
@@ -164,3 +218,40 @@ def overlay_masks(
164
218
  mask_img = Image.fromarray(np_mask.astype(np.uint8))
165
219
  image = Image.alpha_composite(image.convert("RGBA"), mask_img)
166
220
  return image.convert("RGB")
221
+
222
+
223
+ def overlay_heat_map(
224
+ image: Union[str, Path, np.ndarray, ImageType], masks: Dict, alpha: float = 0.8
225
+ ) -> ImageType:
226
+ r"""Plots heat map on to an image.
227
+
228
+ Parameters:
229
+ image: the input image
230
+ masks: the heatmap to overlay
231
+ alpha: the transparency of the overlay
232
+
233
+ Returns:
234
+ The image with the heat map overlaid
235
+ """
236
+ if isinstance(image, (str, Path)):
237
+ image = Image.open(image)
238
+ elif isinstance(image, np.ndarray):
239
+ image = Image.fromarray(image)
240
+
241
+ if "masks" not in masks:
242
+ return image.convert("RGB")
243
+
244
+ # Only one heat map per image, so no need to loop through masks
245
+ image = image.convert("L")
246
+
247
+ if isinstance(masks["masks"][0], str):
248
+ mask = b64_to_pil(masks["masks"][0])
249
+
250
+ overlay = Image.new("RGBA", mask.size)
251
+ odraw = ImageDraw.Draw(overlay)
252
+ odraw.bitmap(
253
+ (0, 0), mask, fill=(255, 0, 0, round(alpha * 255))
254
+ ) # fill=(R, G, B, Alpha)
255
+ combined = Image.alpha_composite(image.convert("RGBA"), overlay.resize(image.size))
256
+
257
+ return combined.convert("RGB")
vision_agent/llm/llm.py CHANGED
@@ -11,6 +11,7 @@ from vision_agent.tools import (
11
11
  SYSTEM_PROMPT,
12
12
  GroundingDINO,
13
13
  GroundingSAM,
14
+ ZeroShotCounting,
14
15
  )
15
16
 
16
17
 
@@ -127,6 +128,9 @@ class OpenAILLM(LLM):
127
128
 
128
129
  return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})
129
130
 
131
+ def generate_zero_shot_counter(self, question: str) -> Callable:
132
+ return lambda x: ZeroShotCounting()(**{"image": x})
133
+
130
134
 
131
135
  class AzureOpenAILLM(OpenAILLM):
132
136
  def __init__(
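A minimal sketch of the new helper, assuming OpenAILLM can be constructed with its defaults and that an OpenAI API key is configured; the returned callable simply runs ZeroShotCounting on whichever image it is given.

```python
from vision_agent.llm import OpenAILLM

llm = OpenAILLM()
counter = llm.generate_zero_shot_counter("how many lids are there?")
result = counter("lids.jpg")  # e.g. {"count": 45}
```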
vision_agent/lmm/lmm.py CHANGED
@@ -15,6 +15,7 @@ from vision_agent.tools import (
15
15
  SYSTEM_PROMPT,
16
16
  GroundingDINO,
17
17
  GroundingSAM,
18
+ ZeroShotCounting,
18
19
  )
19
20
 
20
21
  _LOGGER = logging.getLogger(__name__)
@@ -272,6 +273,9 @@ class OpenAILMM(LMM):
272
273
 
273
274
  return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})
274
275
 
276
+ def generate_zero_shot_counter(self, question: str) -> Callable:
277
+ return lambda x: ZeroShotCounting()(**{"image": x})
278
+
275
279
 
276
280
  class AzureOpenAILMM(OpenAILMM):
277
281
  def __init__(
vision_agent/tools/__init__.py CHANGED
@@ -11,6 +11,8 @@ from .tools import ( # Counter,
11
11
  GroundingDINO,
12
12
  GroundingSAM,
13
13
  ImageCaption,
14
+ ZeroShotCounting,
15
+ VisualPromptCounting,
14
16
  SegArea,
15
17
  SegIoU,
16
18
  Tool,
vision_agent/tools/tools.py CHANGED
@@ -9,7 +9,13 @@ import requests
9
9
  from PIL import Image
10
10
  from PIL.Image import Image as ImageType
11
11
 
12
- from vision_agent.image_utils import convert_to_b64, get_image_size
12
+ from vision_agent.image_utils import (
13
+ convert_to_b64,
14
+ get_image_size,
15
+ rle_decode,
16
+ normalize_bbox,
17
+ denormalize_bbox,
18
+ )
13
19
  from vision_agent.tools.video import extract_frames_from_video
14
20
  from vision_agent.type_defs import LandingaiAPIKey
15
21
 
@@ -18,35 +24,6 @@ _LND_API_KEY = LandingaiAPIKey().api_key
18
24
  _LND_API_URL = "https://api.dev.landing.ai/v1/agent"
19
25
 
20
26
 
21
- def normalize_bbox(
22
- bbox: List[Union[int, float]], image_size: Tuple[int, ...]
23
- ) -> List[float]:
24
- r"""Normalize the bounding box coordinates to be between 0 and 1."""
25
- x1, y1, x2, y2 = bbox
26
- x1 = round(x1 / image_size[1], 2)
27
- y1 = round(y1 / image_size[0], 2)
28
- x2 = round(x2 / image_size[1], 2)
29
- y2 = round(y2 / image_size[0], 2)
30
- return [x1, y1, x2, y2]
31
-
32
-
33
- def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray:
34
- r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background.
35
-
36
- Parameters:
37
- mask_rle: Run-length as string formated (start length)
38
- shape: The (height, width) of array to return
39
- """
40
- s = mask_rle.split()
41
- starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])]
42
- starts -= 1
43
- ends = starts + lengths
44
- img = np.zeros(shape[0] * shape[1], dtype=np.uint8)
45
- for lo, hi in zip(starts, ends):
46
- img[lo:hi] = 1
47
- return img.reshape(shape)
48
-
49
-
50
27
  class Tool(ABC):
51
28
  name: str
52
29
  description: str
@@ -489,6 +466,130 @@ class AgentGroundingSAM(GroundingSAM):
489
466
  return rets
490
467
 
491
468
 
469
+ class ZeroShotCounting(Tool):
470
+ r"""ZeroShotCounting is a tool that can count total number of instances of an object
471
+ present in an image, all belonging to the same class, without a text or visual prompt.
472
+
473
+ Example
474
+ -------
475
+ >>> import vision_agent as va
476
+ >>> zshot_count = va.tools.ZeroShotCounting()
477
+ >>> zshot_count("image1.jpg")
478
+ {'count': 45}
479
+ """
480
+
481
+ name = "zero_shot_counting_"
482
+ description = "'zero_shot_counting_' is a tool that counts and returns the total number of instances of an object present in an image belonging to the same class without a text or visual prompt."
483
+
484
+ usage = {
485
+ "required_parameters": [
486
+ {"name": "image", "type": "str"},
487
+ ],
488
+ "examples": [
489
+ {
490
+ "scenario": "Can you count the lids in the image ? Image name: lids.jpg",
491
+ "parameters": {"image": "lids.jpg"},
492
+ },
493
+ {
494
+ "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
495
+ "parameters": {"image": "tray.jpg"},
496
+ },
497
+ {
498
+ "scenario": "Can you build me an object counting tool ? Image name: shirts.jpg",
499
+ "parameters": {
500
+ "image": "shirts.jpg",
501
+ },
502
+ },
503
+ ],
504
+ }
505
+
506
+ # TODO: Add support for multiple input images, which aligns with the output type.
507
+ def __call__(self, image: Union[str, ImageType]) -> Dict:
508
+ """Invoke the Image captioning model.
509
+
510
+ Parameters:
511
+ image: the input image.
512
+
513
+ Returns:
514
+ A dictionary containing the key 'count' and the count as its value, e.g. {'count': 12}
515
+ """
516
+ image_b64 = convert_to_b64(image)
517
+ data = {
518
+ "image": image_b64,
519
+ "tool": "zero_shot_counting",
520
+ }
521
+ return _send_inference_request(data, "tools")
522
+
523
+
524
+ class VisualPromptCounting(Tool):
525
+ r"""VisualPromptCounting is a tool that can count total number of instances of an object
526
+ present in an image, all belonging to the same class, with the help of a visual prompt given as a bounding box.
527
+
528
+ Example
529
+ -------
530
+ >>> import vision_agent as va
531
+ >>> prompt_count = va.tools.VisualPromptCounting()
532
+ >>> prompt_count(image="image1.jpg", prompt="0.1, 0.1, 0.4, 0.42")
533
+ {'count': 23}
534
+ """
535
+
536
+ name = "visual_prompt_counting_"
537
+ description = "'visual_prompt_counting_' is a tool that can count and return total number of instances of an object present in an image belonging to the same class given an example bounding box."
538
+
539
+ usage = {
540
+ "required_parameters": [
541
+ {"name": "image", "type": "str"},
542
+ {"name": "prompt", "type": "str"},
543
+ ],
544
+ "examples": [
545
+ {
546
+ "scenario": "Here is an example of a lid '0.1, 0.1, 0.14, 0.2', Can you count the lids in the image ? Image name: lids.jpg",
547
+ "parameters": {"image": "lids.jpg", "prompt": "0.1, 0.1, 0.14, 0.2"},
548
+ },
549
+ {
550
+ "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
551
+ "parameters": {"image": "tray.jpg", "prompt": "0.1, 0.1, 0.2, 0.25"},
552
+ },
553
+ {
554
+ "scenario": "Can you build me a few shot object counting tool ? Image name: shirts.jpg",
555
+ "parameters": {
556
+ "image": "shirts.jpg",
557
+ "prompt": "0.1, 0.15, 0.2, 0.2",
558
+ },
559
+ },
560
+ {
561
+ "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg",
562
+ "parameters": {
563
+ "image": "shoes.jpg",
564
+ "prompt": "0.1, 0.1, 0.6, 0.65",
565
+ },
566
+ },
567
+ ],
568
+ }
569
+
570
+ # TODO: Add support for multiple input images, which aligns with the output type.
571
+ def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
572
+ """Invoke the Image captioning model.
573
+
574
+ Parameters:
575
+ image: the input image.
+ prompt: the visual prompt, a bounding box given as a comma-separated string of normalized coordinates.
576
+
577
+ Returns:
578
+ A dictionary containing the key 'count' and the count as its value, e.g. {'count': 12}
579
+ """
580
+ image_size = get_image_size(image)
581
+ bbox = [float(x) for x in prompt.split(",")]
582
+ prompt = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
583
+ image_b64 = convert_to_b64(image)
584
+
585
+ data = {
586
+ "image": image_b64,
587
+ "prompt": prompt,
588
+ "tool": "few_shot_counting",
589
+ }
590
+ return _send_inference_request(data, "tools")
591
+
592
+
492
593
  class Crop(Tool):
493
594
  r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
494
595
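Putting the two new tools together, a usage sketch taken from their docstring examples (counts are illustrative); the visual prompt is a normalized "x1, y1, x2, y2" string that the tool denormalizes against the image size before calling the inference endpoint.

```python
import vision_agent as va

zshot_count = va.tools.ZeroShotCounting()
zshot_count("image1.jpg")  # {'count': 45}

prompt_count = va.tools.VisualPromptCounting()
prompt_count(image="image1.jpg", prompt="0.1, 0.1, 0.4, 0.42")  # {'count': 23}
```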
 
@@ -798,6 +899,8 @@ TOOLS = {
798
899
  ImageCaption,
799
900
  GroundingDINO,
800
901
  AgentGroundingSAM,
902
+ ZeroShotCounting,
903
+ VisualPromptCounting,
801
904
  AgentDINOv,
802
905
  ExtractFrames,
803
906
  Crop,
vision_agent-0.1.5.dist-info/METADATA → vision_agent-0.2.1.dist-info/METADATA CHANGED
@@ -1,15 +1,14 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.1.5
3
+ Version: 0.2.1
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
7
- Requires-Python: >=3.9,<3.12
7
+ Requires-Python: >=3.9
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: Programming Language :: Python :: 3.9
10
10
  Classifier: Programming Language :: Python :: 3.10
11
11
  Classifier: Programming Language :: Python :: 3.11
12
- Requires-Dist: faiss-cpu (>=1.0.0,<2.0.0)
13
12
  Requires-Dist: moviepy (>=1.0.0,<2.0.0)
14
13
  Requires-Dist: numpy (>=1.21.0,<2.0.0)
15
14
  Requires-Dist: openai (>=1.0.0,<2.0.0)
@@ -18,9 +17,7 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0)
18
17
  Requires-Dist: pillow (>=10.0.0,<11.0.0)
19
18
  Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
20
19
  Requires-Dist: requests (>=2.0.0,<3.0.0)
21
- Requires-Dist: sentence-transformers (>=2.0.0,<3.0.0)
22
20
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
23
- Requires-Dist: torch (>=2.1.0,<2.2.0)
24
21
  Requires-Dist: tqdm (>=4.64.0,<5.0.0)
25
22
  Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
26
23
  Project-URL: Homepage, https://landing.ai
@@ -41,7 +38,7 @@ Description-Content-Type: text/markdown
41
38
 
42
39
  Vision Agent is a library that helps you utilize agent frameworks for your vision tasks.
43
40
  Many current vision problems can easily take hours or days to solve, you need to find the
44
- right model, figure out how to use it, possibly write programming logic around it to
41
+ right model, figure out how to use it, possibly write programming logic around it to
45
42
  accomplish the task you want or even more expensive, train your own model. Vision Agent
46
43
  aims to provide an in-seconds experience by allowing users to describe their problem in
47
44
  text and utilizing agent frameworks to solve the task for them. Check out our discord
@@ -138,6 +135,9 @@ you. For example:
138
135
  | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
139
136
  | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
140
137
  | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
139
+ | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
140
+ | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
141
141
 
142
142
 
143
143
  It also has a basic set of calculate tools such as add, subtract, multiply and divide.
vision_agent-0.1.5.dist-info/RECORD → vision_agent-0.2.1.dist-info/RECORD CHANGED
@@ -1,29 +1,25 @@
1
- vision_agent/__init__.py,sha256=wD1cssVTAJ55uTViNfBGooqJUV0p9fmVAuTMHHrmUBU,229
1
+ vision_agent/__init__.py,sha256=GVLHCeK_R-zgldpbcPmOzJat-BkadvkuRCMxDvTIcXs,108
2
2
  vision_agent/agent/__init__.py,sha256=B4JVrbY4IRVCJfjmrgvcp7h1mTUEk8MZvL0Zmej4Ka0,127
3
3
  vision_agent/agent/agent.py,sha256=X7kON-g9ePUKumCDaYfQNBX_MEFE-ax5PnRp7-Cc5Wo,529
4
4
  vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMVg,11511
5
5
  vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
6
6
  vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
7
7
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
8
- vision_agent/agent/vision_agent.py,sha256=Deuj28hqRq4wHnD08pU_7fok_EicvlGnDoINYh5hw1k,22853
8
+ vision_agent/agent/vision_agent.py,sha256=MTxeV5_Sghqoe2aOW9EbNgiq61sVCcF3ZndJ7BZl6x0,23588
9
9
  vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
10
- vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
11
- vision_agent/data/data.py,sha256=Z2l76OrT0GgyuN52OeJqDitUcP0q1rhfdXd1of3GsVo,5128
12
- vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,75
13
- vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
14
10
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
11
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
16
- vision_agent/image_utils.py,sha256=qRN_Y1XXBm9EL6V53OZUq21h0spIa1J6X9YDbe6B87o,4805
12
+ vision_agent/image_utils.py,sha256=Cg4aKO1tQiETT1gdsZ50XzORBtJnBFfMG2cKJyjaY6Q,7555
17
13
  vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
18
- vision_agent/llm/llm.py,sha256=Jty_RHdqVmIM0Mm31JNk50c882Tx7hHtkmh0WyXeJd8,5016
14
+ vision_agent/llm/llm.py,sha256=gwDQ9-p9wEn24xi1019e5jzTGQg4xWDSqBCsqIqGcU4,5168
19
15
  vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
20
- vision_agent/lmm/lmm.py,sha256=1E7e_S_0fOKnf6mSsEdkXvsIjGmhBGl5XW4By2jvhbY,10045
21
- vision_agent/tools/__init__.py,sha256=dkzk9amNzTEKULMB1xRJspqEGpzNPGuccWeXrv1xI0U,280
16
+ vision_agent/lmm/lmm.py,sha256=FjxCuIk0KXuWnfY4orVmdyhJW2I4C6i5QNNEXk7gybk,10197
17
+ vision_agent/tools/__init__.py,sha256=BlfxqbYkB0oODhnSmQg1UyzQm73AvvjCjrIiOWBIYDs,328
22
18
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
23
- vision_agent/tools/tools.py,sha256=WIodfggPkz_2LSWn_Kqm9uvQUtCgKy3jmMoPVTwf1bA,31181
19
+ vision_agent/tools/tools.py,sha256=gCjHs5vJuGNBFsnJWFT7PX3wTyfHgtrgX1Eq9vqknN0,34979
24
20
  vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
25
21
  vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
26
- vision_agent-0.1.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
27
- vision_agent-0.1.5.dist-info/METADATA,sha256=ubzhbZW7oT9sIaIkuM6QObXINZGz5Zcvgjdp7sUcsJE,6233
28
- vision_agent-0.1.5.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
29
- vision_agent-0.1.5.dist-info/RECORD,,
22
+ vision_agent-0.2.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
23
+ vision_agent-0.2.1.dist-info/METADATA,sha256=RAD8NCAo5N12sccgSC5Q0j4hKwU_rVKg5p_eLE-Njdc,6434
24
+ vision_agent-0.2.1.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
25
+ vision_agent-0.2.1.dist-info/RECORD,,
vision_agent/data/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .data import DataStore, build_data_store
vision_agent/data/data.py DELETED
@@ -1,142 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import uuid
4
- from pathlib import Path
5
- from typing import Callable, Dict, List, Optional, Union, cast
6
-
7
- import faiss
8
- import numpy as np
9
- import numpy.typing as npt
10
- import pandas as pd
11
- from faiss import read_index, write_index
12
- from tqdm import tqdm
13
- from typing_extensions import Self
14
-
15
- from vision_agent.emb import Embedder
16
- from vision_agent.lmm import LMM
17
-
18
- tqdm.pandas()
19
-
20
-
21
- class DataStore:
22
- r"""A class to store and manage image data along with its generated metadata from an LMM."""
23
-
24
- def __init__(self, df: pd.DataFrame):
25
- r"""Initializes the DataStore with a DataFrame containing image paths and image
26
- IDs. If the image IDs are not present, they are generated using UUID4. The
27
- DataFrame must contain an 'image_paths' column.
28
-
29
- Args:
30
- df: The DataFrame containing "image_paths" and "image_id" columns.
31
- """
32
- self.df = df
33
- self.lmm: Optional[LMM] = None
34
- self.emb: Optional[Embedder] = None
35
- self.index: Optional[faiss.IndexFlatIP] = None # type: ignore
36
- if "image_paths" not in self.df.columns:
37
- raise ValueError("image_paths column must be present in DataFrame")
38
- if "image_id" not in self.df.columns:
39
- self.df["image_id"] = [str(uuid.uuid4()) for _ in range(len(df))]
40
-
41
- def add_embedder(self, emb: Embedder) -> Self:
42
- self.emb = emb
43
- return self
44
-
45
- def add_lmm(self, lmm: LMM) -> Self:
46
- self.lmm = lmm
47
- return self
48
-
49
- def add_column(
50
- self, name: str, prompt: str, func: Optional[Callable[[str], str]] = None
51
- ) -> Self:
52
- r"""Adds a new column to the DataFrame containing the generated metadata from
53
- the LMM.
54
-
55
- Args:
56
- name: The name of the column to be added.
57
- prompt: The prompt to be used to generate the metadata.
58
- func: A Python function to be applied on the output of `lmm.generate`.
59
- Defaults to None.
60
- """
61
- if self.lmm is None:
62
- raise ValueError("LMM not set yet")
63
-
64
- self.df[name] = self.df["image_paths"].progress_apply( # type: ignore
65
- lambda x: (
66
- func(self.lmm.generate(prompt, images=[x]))
67
- if func
68
- else self.lmm.generate(prompt, images=[x])
69
- )
70
- )
71
- return self
72
-
73
- def build_index(self, target_col: str) -> Self:
74
- r"""This will generate embeddings for the `target_col` and build a searchable
75
- index over them, so next time you run search it will search over this index.
76
-
77
- Args:
78
- target_col: The column name containing the data to be indexed."""
79
- if self.emb is None:
80
- raise ValueError("Embedder not set yet")
81
-
82
- embeddings: pd.Series = self.df[target_col].progress_apply(lambda x: self.emb.embed(x)) # type: ignore
83
- embeddings_np = np.array(embeddings.tolist()).astype(np.float32)
84
- self.index = faiss.IndexFlatIP(embeddings_np.shape[1])
85
- self.index.add(embeddings_np)
86
- return self
87
-
88
- def get_embeddings(self) -> npt.NDArray[np.float32]:
89
- if self.index is None:
90
- raise ValueError("Index not built yet")
91
-
92
- ntotal = self.index.ntotal
93
- d: int = self.index.d
94
- return cast(
95
- npt.NDArray[np.float32],
96
- faiss.rev_swig_ptr(self.index.get_xb(), ntotal * d).reshape(ntotal, d),
97
- )
98
-
99
- def search(self, query: str, top_k: int = 10) -> List[Dict]:
100
- r"""Searches the index for the most similar images to the query and returns
101
- the top_k results.
102
-
103
- Args:
104
- query: The query to search for.
105
- top_k: The number of results to return. Defaults to 10."""
106
- if self.index is None:
107
- raise ValueError("Index not built yet")
108
- if self.emb is None:
109
- raise ValueError("Embedder not set yet")
110
-
111
- query_embedding: npt.NDArray[np.float32] = self.emb.embed(query)
112
- _, idx = self.index.search(query_embedding.reshape(1, -1), top_k)
113
- return cast(List[Dict], self.df.iloc[idx[0]].to_dict(orient="records"))
114
-
115
- def save(self, path: Union[str, Path]) -> None:
116
- path = Path(path)
117
- path.mkdir(parents=True)
118
- self.df.to_csv(path / "data.csv")
119
- if self.index is not None:
120
- write_index(self.index, str(path / "data.index"))
121
-
122
- @classmethod
123
- def load(cls, path: Union[str, Path]) -> DataStore:
124
- path = Path(path)
125
- df = pd.read_csv(path / "data.csv", index_col=0)
126
- ds = DataStore(df)
127
- if Path(path / "data.index").exists():
128
- ds.index = read_index(str(path / "data.index"))
129
- return ds
130
-
131
-
132
- def build_data_store(data: Union[str, Path, list[Union[str, Path]]]) -> DataStore:
133
- if isinstance(data, Path) or isinstance(data, str):
134
- data = Path(data)
135
- data_files = list(Path(data).glob("*"))
136
- elif isinstance(data, list):
137
- data_files = [Path(d) for d in data]
138
-
139
- df = pd.DataFrame()
140
- df["image_paths"] = data_files
141
- df["image_id"] = [uuid.uuid4() for _ in range(len(data_files))]
142
- return DataStore(df)
vision_agent/emb/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .emb import Embedder, OpenAIEmb, SentenceTransformerEmb, get_embedder
vision_agent/emb/emb.py DELETED
@@ -1,47 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from typing import cast
3
-
4
- import numpy as np
5
- import numpy.typing as npt
6
-
7
-
8
- class Embedder(ABC):
9
- @abstractmethod
10
- def embed(self, text: str) -> npt.NDArray[np.float32]:
11
- pass
12
-
13
-
14
- class SentenceTransformerEmb(Embedder):
15
- def __init__(self, model_name: str = "BAAI/bge-small-en-v1.5"):
16
- from sentence_transformers import SentenceTransformer
17
-
18
- self.model = SentenceTransformer(model_name)
19
-
20
- def embed(self, text: str) -> npt.NDArray[np.float32]:
21
- return cast(
22
- npt.NDArray[np.float32],
23
- self.model.encode([text]).flatten().astype(np.float32),
24
- )
25
-
26
-
27
- class OpenAIEmb(Embedder):
28
- def __init__(self, model_name: str = "text-embedding-3-small"):
29
- from openai import OpenAI
30
-
31
- self.client = OpenAI()
32
- self.model_name = model_name
33
-
34
- def embed(self, text: str) -> npt.NDArray[np.float32]:
35
- response = self.client.embeddings.create(input=text, model=self.model_name)
36
- return np.array(response.data[0].embedding).astype(np.float32)
37
-
38
-
39
- def get_embedder(name: str) -> Embedder:
40
- if name == "sentence-transformer":
41
- return SentenceTransformerEmb()
42
- elif name == "openai":
43
- return OpenAIEmb()
44
- else:
45
- raise ValueError(
46
- f"Unknown embedder name: {name}, currently support sentence-transformer, openai."
47
- )