vision-agent 0.0.40__tar.gz → 0.0.41__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {vision_agent-0.0.40 → vision_agent-0.0.41}/PKG-INFO +1 -1
  2. {vision_agent-0.0.40 → vision_agent-0.0.41}/pyproject.toml +1 -1
  3. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/agent/vision_agent.py +73 -19
  4. vision_agent-0.0.41/vision_agent/image_utils.py +152 -0
  5. vision_agent-0.0.41/vision_agent/tools/__init__.py +15 -0
  6. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/tools/tools.py +123 -60
  7. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/tools/video.py +8 -4
  8. vision_agent-0.0.40/vision_agent/image_utils.py +0 -62
  9. vision_agent-0.0.40/vision_agent/tools/__init__.py +0 -2
  10. {vision_agent-0.0.40 → vision_agent-0.0.41}/LICENSE +0 -0
  11. {vision_agent-0.0.40 → vision_agent-0.0.41}/README.md +0 -0
  12. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/__init__.py +0 -0
  13. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/agent/__init__.py +0 -0
  14. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/agent/agent.py +0 -0
  15. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/agent/easytool.py +0 -0
  16. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/agent/easytool_prompts.py +0 -0
  17. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/agent/reflexion.py +0 -0
  18. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/agent/reflexion_prompts.py +0 -0
  19. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/agent/vision_agent_prompts.py +0 -0
  20. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/data/__init__.py +0 -0
  21. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/data/data.py +0 -0
  22. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/emb/__init__.py +0 -0
  23. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/emb/emb.py +0 -0
  24. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/llm/__init__.py +0 -0
  25. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/llm/llm.py +0 -0
  26. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/lmm/__init__.py +0 -0
  27. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/lmm/lmm.py +0 -0
  28. {vision_agent-0.0.40 → vision_agent-0.0.41}/vision_agent/tools/prompts.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.0.40
+ Version: 0.0.41
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "vision-agent"
- version = "0.0.40"
+ version = "0.0.41"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
@@ -1,11 +1,13 @@
  import json
  import logging
  import sys
+ import tempfile
  from pathlib import Path
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union

  from tabulate import tabulate

+ from vision_agent.image_utils import overlay_bboxes, overlay_masks
  from vision_agent.llm import LLM, OpenAILLM
  from vision_agent.lmm import LMM, OpenAILMM
  from vision_agent.tools import TOOLS
@@ -248,12 +250,12 @@ def retrieval(
      tools: Dict[int, Any],
      previous_log: str,
      reflections: str,
- ) -> Tuple[List[Dict], str]:
+ ) -> Tuple[Dict, str]:
      tool_id = choose_tool(
          model, question, {k: v["description"] for k, v in tools.items()}, reflections
      )
      if tool_id is None:
-         return [{}], ""
+         return {}, ""
      _LOGGER.info(f"\t(Tool ID, name): ({tool_id}, {tools[tool_id]['name']})")

      tool_instructions = tools[tool_id]
@@ -265,14 +267,12 @@ def retrieval(
      )
      _LOGGER.info(f"\tParameters: {parameters} for {tool_name}")
      if parameters is None:
-         return [{}], ""
-     tool_results = [
-         {"task": question, "tool_name": tool_name, "parameters": parameters}
-     ]
+         return {}, ""
+     tool_results = {"task": question, "tool_name": tool_name, "parameters": parameters}

      _LOGGER.info(
-         f"""Going to run the following {len(tool_results)} tool(s) in sequence:
- {tabulate(tool_results, headers="keys", tablefmt="mixed_grid")}"""
+         f"""Going to run the following tool(s) in sequence:
+ {tabulate([tool_results], headers="keys", tablefmt="mixed_grid")}"""
      )

      def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any:
@@ -286,12 +286,10 @@ def retrieval(
          call_results.append(function_call(tools[tool_id]["class"], parameters))
          return call_results

-     call_results = []
-     for i, result in enumerate(tool_results):
-         call_results.extend(parse_tool_results(result))
-         tool_results[i]["call_results"] = call_results
+     call_results = parse_tool_results(tool_results)
+     tool_results["call_results"] = call_results

-     call_results_str = "\n\n".join([str(e) for e in call_results if e is not None])
+     call_results_str = str(call_results)
      _LOGGER.info(f"\tCall Results: {call_results_str}")
      return tool_results, call_results_str

@@ -335,7 +333,11 @@ def self_reflect(
          tool_results=str(tool_result),
          final_answer=final_answer,
      )
-     if issubclass(type(reflect_model), LMM):
+     if (
+         issubclass(type(reflect_model), LMM)
+         and image is not None
+         and Path(image).suffix in [".jpg", ".jpeg", ".png"]
+     ):
          return reflect_model(prompt, image=image)  # type: ignore
      return reflect_model(prompt)

@@ -345,6 +347,56 @@ def parse_reflect(reflect: str) -> bool:
      return "finish" in reflect.lower() and len(reflect) < 100


+ def visualize_result(all_tool_results: List[Dict]) -> List[str]:
+     image_to_data: Dict[str, Dict] = {}
+     for tool_result in all_tool_results:
+         if not tool_result["tool_name"] in ["grounding_sam_", "grounding_dino_"]:
+             continue
+
+         parameters = tool_result["parameters"]
+         # parameters can either be a dictionary or list, parameters can also be malformed
+         # because the LLM builds them
+         if isinstance(parameters, dict):
+             if "image" not in parameters:
+                 continue
+             parameters = [parameters]
+         elif isinstance(tool_result["parameters"], list):
+             if (
+                 len(tool_result["parameters"]) < 1
+                 and "image" not in tool_result["parameters"][0]
+             ):
+                 continue
+
+         for param, call_result in zip(parameters, tool_result["call_results"]):
+
+             # calls can fail, so we need to check if the call was successful
+             if not isinstance(call_result, dict):
+                 continue
+             if "bboxes" not in call_result:
+                 continue
+
+             # if the call was successful, then we can add the image data
+             image = param["image"]
+             if image not in image_to_data:
+                 image_to_data[image] = {"bboxes": [], "masks": [], "labels": []}
+
+             image_to_data[image]["bboxes"].extend(call_result["bboxes"])
+             image_to_data[image]["labels"].extend(call_result["labels"])
+             if "masks" in call_result:
+                 image_to_data[image]["masks"].extend(call_result["masks"])
+
+     visualized_images = []
+     for image in image_to_data:
+         image_path = Path(image)
+         image_data = image_to_data[image]
+         image = overlay_masks(image_path, image_data)
+         image = overlay_bboxes(image, image_data)
+         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
+             image.save(f.name)
+             visualized_images.append(f.name)
+     return visualized_images
+
+
  class VisionAgent(Agent):
      r"""Vision Agent is an agent framework that utilizes tools as well as self
      reflection to accomplish tasks, in particular vision tasks. Vision Agent is based
@@ -389,7 +441,8 @@ class VisionAgent(Agent):
          """Invoke the vision agent.

          Parameters:
-             input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+             input: a prompt that describe the task or a conversation in the format of
+                 [{"role": "user", "content": "describe your task here..."}].
              image: the input image referenced in the prompt parameter.

          Returns:
@@ -436,9 +489,8 @@ class VisionAgent(Agent):
                  self.answer_model, task_str, call_results, previous_log, reflections
              )

-             for tool_result in tool_results:
-                 tool_result["answer"] = answer
-             all_tool_results.extend(tool_results)
+             tool_results["answer"] = answer
+             all_tool_results.append(tool_results)

              _LOGGER.info(f"\tAnswer: {answer}")
              answers.append({"task": task_str, "answer": answer})
@@ -448,13 +500,15 @@ class VisionAgent(Agent):
              self.answer_model, question, answers, reflections
          )

+         visualized_images = visualize_result(all_tool_results)
+         all_tool_results.append({"visualized_images": visualized_images})
          reflection = self_reflect(
              self.reflect_model,
              question,
              self.tools,
              all_tool_results,
              final_answer,
-             image,
+             visualized_images[0] if len(visualized_images) > 0 else image,
          )
          _LOGGER.info(f"\tReflection: {reflection}")
          if parse_reflect(reflection):
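
Note: after this change, each task produces a single tool_results record (a dict rather than a one-element list). A minimal sketch of the record that visualize_result() consumes; the field names come from the hunks above, while the values are hypothetical:

    tool_results = {
        "task": "Count the dogs in the image",                 # hypothetical task
        "tool_name": "grounding_dino_",
        "parameters": {"prompt": "dog", "image": "dogs.png"},  # hypothetical inputs
        "call_results": [
            {
                "labels": ["dog", "dog"],
                "bboxes": [[0.1, 0.2, 0.4, 0.5], [0.5, 0.1, 0.9, 0.6]],  # normalized xyxy
            }
        ],
        "answer": "There are 2 dogs.",
    }

visualize_result() groups such records by image, overlays the masks and bounding boxes, and the first rendered image is passed to self_reflect() in place of the raw input image.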
@@ -0,0 +1,152 @@
+ """Utility functions for image processing."""
+
+ import base64
+ from io import BytesIO
+ from pathlib import Path
+ from typing import Dict, Tuple, Union
+
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+ from PIL.Image import Image as ImageType
+
+ COLORS = [
+     (158, 218, 229),
+     (219, 219, 141),
+     (23, 190, 207),
+     (188, 189, 34),
+     (199, 199, 199),
+     (247, 182, 210),
+     (127, 127, 127),
+     (227, 119, 194),
+     (196, 156, 148),
+     (197, 176, 213),
+     (140, 86, 75),
+     (148, 103, 189),
+     (255, 152, 150),
+     (152, 223, 138),
+     (214, 39, 40),
+     (44, 160, 44),
+     (255, 187, 120),
+     (174, 199, 232),
+     (255, 127, 14),
+     (31, 119, 180),
+ ]
+
+
+ def b64_to_pil(b64_str: str) -> ImageType:
+     r"""Convert a base64 string to a PIL Image.
+
+     Parameters:
+         b64_str: the base64 encoded image
+
+     Returns:
+         The decoded PIL Image
+     """
+     # , can't be encoded in b64 data so must be part of prefix
+     if "," in b64_str:
+         b64_str = b64_str.split(",")[1]
+     return Image.open(BytesIO(base64.b64decode(b64_str)))
+
+
+ def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]:
+     r"""Get the size of an image.
+
+     Parameters:
+         data: the input image
+
+     Returns:
+         The size of the image in the form (height, width)
+     """
+     if isinstance(data, (str, Path)):
+         data = Image.open(data)
+
+     return data.size[::-1] if isinstance(data, Image.Image) else data.shape[:2]
+
+
+ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
+     r"""Convert an image to a base64 string.
+
+     Parameters:
+         data: the input image
+
+     Returns:
+         The base64 encoded image
+     """
+     if data is None:
+         raise ValueError(f"Invalid input image: {data}. Input image can't be None.")
+     if isinstance(data, (str, Path)):
+         data = Image.open(data)
+     if isinstance(data, Image.Image):
+         buffer = BytesIO()
+         data.save(buffer, format="PNG")
+         return base64.b64encode(buffer.getvalue()).decode("utf-8")
+     else:
+         arr_bytes = data.tobytes()
+         return base64.b64encode(arr_bytes).decode("utf-8")
+
+
+ def overlay_bboxes(
+     image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
+ ) -> ImageType:
+     r"""Plots bounding boxes on to an image.
+
+     Parameters:
+         image: the input image
+         bboxes: the bounding boxes to overlay
+
+     Returns:
+         The image with the bounding boxes overlayed
+     """
+     if isinstance(image, (str, Path)):
+         image = Image.open(image)
+     elif isinstance(image, np.ndarray):
+         image = Image.fromarray(image)
+
+     color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(bboxes["labels"])}
+
+     draw = ImageDraw.Draw(image)
+     font = ImageFont.load_default()
+     width, height = image.size
+     if "bboxes" not in bboxes:
+         return image.convert("RGB")
+
+     for label, box in zip(bboxes["labels"], bboxes["bboxes"]):
+         box = [box[0] * width, box[1] * height, box[2] * width, box[3] * height]
+         draw.rectangle(box, outline=color[label], width=3)
+         label = f"{label}"
+         text_box = draw.textbbox((box[0], box[1]), text=label, font=font)
+         draw.rectangle(text_box, fill=color[label])
+         draw.text((text_box[0], text_box[1]), label, fill="black", font=font)
+     return image.convert("RGB")
+
+
+ def overlay_masks(
+     image: Union[str, Path, np.ndarray, ImageType], masks: Dict, alpha: float = 0.5
+ ) -> ImageType:
+     r"""Plots masks on to an image.
+
+     Parameters:
+         image: the input image
+         masks: the masks to overlay
+         alpha: the transparency of the overlay
+
+     Returns:
+         The image with the masks overlayed
+     """
+     if isinstance(image, (str, Path)):
+         image = Image.open(image)
+     elif isinstance(image, np.ndarray):
+         image = Image.fromarray(image)
+
+     color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(masks["labels"])}
+     if "masks" not in masks:
+         return image.convert("RGB")
+
+     for label, mask in zip(masks["labels"], masks["masks"]):
+         if isinstance(mask, str):
+             mask = np.array(Image.open(mask))
+         np_mask = np.zeros((image.size[1], image.size[0], 4))
+         np_mask[mask > 0, :] = color[label] + (255 * alpha,)
+         mask_img = Image.fromarray(np_mask.astype(np.uint8))
+         image = Image.alpha_composite(image.convert("RGBA"), mask_img)
+     return image.convert("RGB")
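
Note: a hypothetical usage sketch of the new helpers (not part of the package); the file names and the detections dict are illustrative stand-ins:

    from vision_agent.image_utils import overlay_bboxes, overlay_masks

    detections = {
        "labels": ["cat"],
        "bboxes": [[0.1, 0.1, 0.6, 0.8]],  # normalized xyxy; scaled by the image size internally
        "masks": [],                       # optional mask file paths or arrays
    }
    img = overlay_masks("image.png", detections)  # masks first, as visualize_result() does
    img = overlay_bboxes(img, detections)
    img.save("image_annotated.png")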
@@ -0,0 +1,15 @@
+ from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
+ from .tools import (
+     CLIP,
+     TOOLS,
+     BboxArea,
+     BboxIoU,
+     Counter,
+     Crop,
+     ExtractFrames,
+     GroundingDINO,
+     GroundingSAM,
+     SegArea,
+     SegIoU,
+     Tool,
+ )
@@ -92,7 +92,7 @@ class CLIP(Tool):
      }

      # TODO: Add support for input multiple images, which aligns with the output type.
-     def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]:
+     def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> Dict:
          """Invoke the CLIP model.

          Parameters:
@@ -122,7 +122,7 @@ class CLIP(Tool):
          rets = []
          for elt in resp_json["data"]:
              rets.append({"labels": prompt, "scores": [round(prob, 2) for prob in elt]})
-         return cast(List[Dict], rets)
+         return cast(Dict, rets[0])


  class GroundingDINO(Tool):
@@ -168,7 +168,7 @@ class GroundingDINO(Tool):
      }

      # TODO: Add support for input multiple images, which aligns with the output type.
-     def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> List[Dict]:
+     def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> Dict:
          """Invoke the Grounding DINO model.

          Parameters:
@@ -204,7 +204,7 @@ class GroundingDINO(Tool):
              if "scores" in elt:
                  elt["scores"] = [round(score, 2) for score in elt["scores"]]
              elt["size"] = (image_size[1], image_size[0])
-         return cast(List[Dict], resp_data)
+         return cast(Dict, resp_data)


  class GroundingSAM(Tool):
@@ -259,7 +259,7 @@ class GroundingSAM(Tool):
      }

      # TODO: Add support for input multiple images, which aligns with the output type.
-     def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]:
+     def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> Dict:
          """Invoke the Grounding SAM model.

          Parameters:
@@ -294,7 +294,7 @@ class GroundingSAM(Tool):
              ret_pred["labels"].append(pred["label_name"])
              ret_pred["bboxes"].append(normalize_bbox(pred["bbox"], image_size))
              ret_pred["masks"].append(mask)
-         return [ret_pred]
+         return ret_pred


  class AgentGroundingSAM(GroundingSAM):
@@ -302,15 +302,14 @@ class AgentGroundingSAM(GroundingSAM):
      returns the file name. This makes it easier for agents to use.
      """

-     def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]:
+     def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> Dict:
          rets = super().__call__(prompt, image)
-         for ret in rets:
-             mask_files = []
-             for mask in ret["masks"]:
-                 with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
-                     Image.fromarray(mask * 255).save(tmp)
-                     mask_files.append(tmp.name)
-             ret["masks"] = mask_files
+         mask_files = []
+         for mask in rets["masks"]:
+             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                 Image.fromarray(mask * 255).save(tmp)
+                 mask_files.append(tmp.name)
+         rets["masks"] = mask_files
          return rets


@@ -363,7 +362,7 @@ class Crop(Tool):
          ],
      }

-     def __call__(self, bbox: List[float], image: Union[str, Path]) -> str:
+     def __call__(self, bbox: List[float], image: Union[str, Path]) -> Dict:
          pil_image = Image.open(image)
          width, height = pil_image.size
          bbox = [
@@ -373,10 +372,10 @@ class Crop(Tool):
              int(bbox[3] * height),
          ]
          cropped_image = pil_image.crop(bbox)  # type: ignore
-         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
+         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
              cropped_image.save(tmp.name)

-         return tmp.name
+         return {"image": tmp.name}


  class BboxArea(Tool):
  class BboxArea(Tool):
@@ -388,7 +387,7 @@ class BboxArea(Tool):
388
387
  "required_parameters": [{"name": "bbox", "type": "List[int]"}],
389
388
  "examples": [
390
389
  {
391
- "scenario": "If you want to calculate the area of the bounding box [0, 0, 100, 100]",
390
+ "scenario": "If you want to calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
392
391
  "parameters": {"bboxes": [0.2, 0.21, 0.34, 0.42]},
393
392
  }
394
393
  ],
@@ -430,6 +429,109 @@ class SegArea(Tool):
          return cast(float, round(np.sum(np_mask) / 255, 2))


+ class BboxIoU(Tool):
+     name = "bbox_iou_"
+     description = (
+         "'bbox_iou_' returns the intersection over union of two bounding boxes."
+     )
+     usage = {
+         "required_parameters": [
+             {"name": "bbox1", "type": "List[int]"},
+             {"name": "bbox2", "type": "List[int]"},
+         ],
+         "examples": [
+             {
+                 "scenario": "If you want to calculate the intersection over union of the bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
+                 "parameters": {
+                     "bbox1": [0.2, 0.21, 0.34, 0.42],
+                     "bbox2": [0.3, 0.31, 0.44, 0.52],
+                 },
+             }
+         ],
+     }
+
+     def __call__(self, bbox1: List[int], bbox2: List[int]) -> float:
+         x1, y1, x2, y2 = bbox1
+         x3, y3, x4, y4 = bbox2
+         xA = max(x1, x3)
+         yA = max(y1, y3)
+         xB = min(x2, x4)
+         yB = min(y2, y4)
+         inter_area = max(0, xB - xA) * max(0, yB - yA)
+         boxa_area = (x2 - x1) * (y2 - y1)
+         boxb_area = (x4 - x3) * (y4 - y3)
+         iou = inter_area / float(boxa_area + boxb_area - inter_area)
+         return round(iou, 2)
+
+
+ class SegIoU(Tool):
+     name = "seg_iou_"
+     description = "'seg_iou_' returns the intersection over union of two segmentation masks given their segmentation mask files."
+     usage = {
+         "required_parameters": [
+             {"name": "mask1", "type": "str"},
+             {"name": "mask2", "type": "str"},
+         ],
+         "examples": [
+             {
+                 "scenario": "If you want to calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg",
+                 "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
+             }
+         ],
+     }
+
+     def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float:
+         pil_mask1 = Image.open(str(mask1))
+         pil_mask2 = Image.open(str(mask2))
+         np_mask1 = np.clip(np.array(pil_mask1), 0, 1)
+         np_mask2 = np.clip(np.array(pil_mask2), 0, 1)
+         intersection = np.logical_and(np_mask1, np_mask2)
+         union = np.logical_or(np_mask1, np_mask2)
+         iou = np.sum(intersection) / np.sum(union)
+         return cast(float, round(iou, 2))
+
+
+ class ExtractFrames(Tool):
+     r"""Extract frames from a video."""
+
+     name = "extract_frames_"
+     description = "'extract_frames_' extracts frames where there is motion detected in a video, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
+     usage = {
+         "required_parameters": [{"name": "video_uri", "type": "str"}],
+         "examples": [
+             {
+                 "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
+                 "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
+             },
+             {
+                 "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
+                 "parameters": {"video_uri": "tests/data/test.mp4"},
+             },
+         ],
+     }
+
+     def __call__(self, video_uri: str) -> List[Tuple[str, float]]:
+         """Extract frames from a video.
+
+
+         Parameters:
+             video_uri: the path to the video file or a url points to the video data
+
+         Returns:
+             a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
+         """
+         frames = extract_frames_from_video(video_uri)
+         result = []
+         _LOGGER.info(
+             f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
+         )
+         for frame, ts in frames:
+             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                 Image.fromarray(frame).save(tmp)
+             result.append((tmp.name, ts))
+         return result
+
+
  class Add(Tool):
      r"""Add returns the sum of all the arguments passed to it, normalized to 2 decimal places."""

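Note: a quick sanity check of the new bbox_iou_ math (illustrative, not from the package). IoU = intersection / (area1 + area2 - intersection); for the example boxes in the usage above the overlap is 0.04 x 0.11 = 0.0044 against a union of 0.0544, so:

    from vision_agent.tools import BboxIoU

    iou = BboxIoU()(bbox1=[0.2, 0.21, 0.34, 0.42], bbox2=[0.3, 0.31, 0.44, 0.52])
    print(iou)  # 0.08
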
@@ -506,47 +608,6 @@ class Divide(Tool):
      return round(input[0] / input[1], 2)


- class ExtractFrames(Tool):
-     r"""Extract frames from a video."""
-
-     name = "extract_frames_"
-     description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video, the frame is a local image file path that stores the frame."
-     usage = {
-         "required_parameters": [{"name": "video_uri", "type": "str"}],
-         "examples": [
-             {
-                 "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
-                 "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
-             },
-             {
-                 "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
-                 "parameters": {"video_uri": "tests/data/test.mp4"},
-             },
-         ],
-     }
-
-     def __call__(self, video_uri: str) -> list[tuple[str, float]]:
-         """Extract frames from a video.
-
-
-         Parameters:
-             video_uri: the path to the video file or a url points to the video data
-
-         Returns:
-             a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
-         """
-         frames = extract_frames_from_video(video_uri)
-         result = []
-         _LOGGER.info(
-             f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
-         )
-         for frame, ts in frames:
-             with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
-                 Image.fromarray(frame).save(tmp)
-             result.append((tmp.name, ts))
-         return result
-
-
  TOOLS = {
      i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c}
      for i, c in enumerate(
@@ -554,15 +615,17 @@ TOOLS = {
          CLIP,
          GroundingDINO,
          AgentGroundingSAM,
+         ExtractFrames,
          Counter,
          Crop,
          BboxArea,
          SegArea,
+         BboxIoU,
+         SegIoU,
          Add,
          Subtract,
          Multiply,
          Divide,
-         ExtractFrames,
      ]
  )
  if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))
@@ -22,12 +22,16 @@ def extract_frames_from_video(
      Parameters:
          video_uri: the path to the video file or a video file url
          fps: the frame rate per second to extract the frames
-         motion_detection_threshold: The threshold to detect motion between changes/frames.
-             A value between 0-1, which represents the percentage change required for the frames to be considered in motion.
-             For example, a lower value means more frames will be extracted.
+         motion_detection_threshold: The threshold to detect motion between
+             changes/frames. A value between 0-1, which represents the percentage change
+             required for the frames to be considered in motion. For example, a lower
+             value means more frames will be extracted.

      Returns:
-         a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
+         a list of tuples containing the extracted frame and the timestamp in seconds.
+         E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds
+         from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
+         the video. The frames are sorted by the timestamp in ascending order.
      """
      with VideoFileClip(video_uri) as video:
          video_duration: float = video.duration
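
Note: a hypothetical call of the relocated extract_frames_ tool (illustrative; "tests/data/test.mp4" is the path from the tool's own usage examples, and the returned temp-file names are made up):

    from vision_agent.tools import ExtractFrames

    frames = ExtractFrames()("tests/data/test.mp4")
    # e.g. [("/tmp/tmpab12cd34.png", 0.0), ("/tmp/tmpef56gh78.png", 0.5), ...]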
@@ -1,62 +0,0 @@
- """Utility functions for image processing."""
-
- import base64
- from io import BytesIO
- from pathlib import Path
- from typing import Tuple, Union
-
- import numpy as np
- from PIL import Image
- from PIL.Image import Image as ImageType
-
-
- def b64_to_pil(b64_str: str) -> ImageType:
-     """Convert a base64 string to a PIL Image.
-
-     Parameters:
-         b64_str: the base64 encoded image
-
-     Returns:
-         The decoded PIL Image
-     """
-     # , can't be encoded in b64 data so must be part of prefix
-     if "," in b64_str:
-         b64_str = b64_str.split(",")[1]
-     return Image.open(BytesIO(base64.b64decode(b64_str)))
-
-
- def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]:
-     """Get the size of an image.
-
-     Parameters:
-         data: the input image
-
-     Returns:
-         The size of the image in the form (height, width)
-     """
-     if isinstance(data, (str, Path)):
-         data = Image.open(data)
-
-     return data.size[::-1] if isinstance(data, Image.Image) else data.shape[:2]
-
-
- def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
-     """Convert an image to a base64 string.
-
-     Parameters:
-         data: the input image
-
-     Returns:
-         The base64 encoded image
-     """
-     if data is None:
-         raise ValueError(f"Invalid input image: {data}. Input image can't be None.")
-     if isinstance(data, (str, Path)):
-         data = Image.open(data)
-     if isinstance(data, Image.Image):
-         buffer = BytesIO()
-         data.save(buffer, format="PNG")
-         return base64.b64encode(buffer.getvalue()).decode("utf-8")
-     else:
-         arr_bytes = data.tobytes()
-         return base64.b64encode(arr_bytes).decode("utf-8")
@@ -1,2 +0,0 @@
- from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
- from .tools import CLIP, TOOLS, Counter, Crop, GroundingDINO, GroundingSAM, Tool