vision-agent 0.2.192__py3-none-any.whl → 0.2.195__py3-none-any.whl
- vision_agent/.sim_tools/df.csv +640 -0
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/__init__.py +2 -0
- vision_agent/agent/agent_utils.py +211 -3
- vision_agent/agent/vision_agent_coder.py +5 -113
- vision_agent/agent/vision_agent_coder_prompts_v2.py +119 -0
- vision_agent/agent/vision_agent_coder_v2.py +341 -0
- vision_agent/agent/vision_agent_planner.py +2 -2
- vision_agent/agent/vision_agent_planner_prompts.py +1 -1
- vision_agent/agent/vision_agent_planner_prompts_v2.py +748 -0
- vision_agent/agent/vision_agent_planner_v2.py +432 -0
- vision_agent/lmm/lmm.py +4 -0
- vision_agent/tools/__init__.py +2 -1
- vision_agent/tools/planner_tools.py +246 -0
- vision_agent/tools/tool_utils.py +65 -1
- vision_agent/tools/tools.py +98 -35
- vision_agent/utils/image_utils.py +12 -6
- vision_agent/utils/sim.py +65 -14
- {vision_agent-0.2.192.dist-info → vision_agent-0.2.195.dist-info}/METADATA +1 -1
- vision_agent-0.2.195.dist-info/RECORD +42 -0
- vision_agent-0.2.192.dist-info/RECORD +0 -35
- {vision_agent-0.2.192.dist-info → vision_agent-0.2.195.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.192.dist-info → vision_agent-0.2.195.dist-info}/WHEEL +0 -0
vision_agent/tools/planner_tools.py
ADDED
@@ -0,0 +1,246 @@
+import logging
+import shutil
+import tempfile
+from typing import Any, Callable, Dict, List, Optional, Tuple, cast
+
+import numpy as np
+from PIL import Image
+
+import vision_agent.tools as T
+from vision_agent.agent.agent_utils import (
+    DefaultImports,
+    extract_code,
+    extract_json,
+    extract_tag,
+)
+from vision_agent.agent.vision_agent_planner_prompts_v2 import (
+    CATEGORIZE_TOOL_REQUEST,
+    FINALIZE_PLAN,
+    PICK_TOOL,
+    TEST_TOOLS,
+    TEST_TOOLS_EXAMPLE1,
+    TEST_TOOLS_EXAMPLE2,
+)
+from vision_agent.lmm import AnthropicLMM
+from vision_agent.utils.execute import CodeInterpreterFactory
+from vision_agent.utils.image_utils import convert_to_b64
+from vision_agent.utils.sim import load_cached_sim
+
+TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS}
+TOOL_RECOMMENDER = load_cached_sim(T.TOOLS_DF)
+
+_LOGGER = logging.getLogger(__name__)
+EXAMPLES = f"\n{TEST_TOOLS_EXAMPLE1}\n{TEST_TOOLS_EXAMPLE2}\n"
+
+
+def extract_tool_info(
+    tool_choice_context: Dict[str, Any]
+) -> Tuple[Optional[Callable], str, str, str]:
+    tool_thoughts = tool_choice_context.get("thoughts", "")
+    tool_docstring = ""
+    tool = tool_choice_context.get("best_tool", None)
+    if tool in TOOL_FUNCTIONS:
+        tool = TOOL_FUNCTIONS[tool]
+        tool_docstring = T.TOOLS_INFO[tool.__name__]
+
+    return tool, tool_thoughts, tool_docstring, ""
+
+
+def get_tool_for_task(
+    task: str, images: List[np.ndarray], exclude_tools: Optional[List[str]] = None
+) -> None:
+    """Given a task and one or more images this function will find a tool to accomplish
+    the jobs. It prints the tool documentation and thoughts on why it chose the tool.
+
+    It can produce tools for the following types of tasks:
+        - Object detection and counting
+        - Classification
+        - Segmentation
+        - OCR
+        - VQA
+        - Depth and pose estimation
+        - Video object tracking
+
+    Wait until the documentation is printed to use the function so you know what the
+    input and output signatures are.
+
+    Parameters:
+        task: str: The task to accomplish.
+        images: List[np.ndarray]: The images to use for the task.
+        exclude_tools: Optional[List[str]]: A list of tool names to exclude from the
+            recommendations. This is helpful if you are calling get_tool_for_task twice
+            and do not want the same tool recommended.
+
+    Returns:
+        The tool to use for the task is printed to stdout
+
+    Examples
+    --------
+        >>> get_tool_for_task("Give me an OCR model that can find 'hot chocolate' in the image", [image])
+    """
+    lmm = AnthropicLMM()
+
+    with (
+        tempfile.TemporaryDirectory() as tmpdirname,
+        CodeInterpreterFactory.new_instance() as code_interpreter,
+    ):
+        image_paths = []
+        for i, image in enumerate(images[:3]):
+            image_path = f"{tmpdirname}/image_{i}.png"
+            Image.fromarray(image).save(image_path)
+            image_paths.append(image_path)
+
+        query = lmm.generate(CATEGORIZE_TOOL_REQUEST.format(task=task))
+        category = extract_tag(query, "category")  # type: ignore
+        if category is None:
+            category = task
+        else:
+            category = (
+                f"I need models from the {category.strip()} category of tools. {task}"
+            )
+
+        tool_docs = TOOL_RECOMMENDER.top_k(category, k=10, thresh=0.2)
+        if exclude_tools is not None and len(exclude_tools) > 0:
+            cleaned_tool_docs = []
+            for tool_doc in tool_docs:
+                if not tool_doc["name"] in exclude_tools:
+                    cleaned_tool_docs.append(tool_doc)
+            tool_docs = cleaned_tool_docs
+        tool_docs_str = "\n".join([e["doc"] for e in tool_docs])
+
+        prompt = TEST_TOOLS.format(
+            tool_docs=tool_docs_str,
+            previous_attempts="",
+            user_request=task,
+            examples=EXAMPLES,
+            media=str(image_paths),
+        )
+
+        response = lmm.generate(prompt, media=image_paths)
+        code = extract_tag(response, "code")  # type: ignore
+        if code is None:
+            raise ValueError(f"Could not extract code from response: {response}")
+        tool_output = code_interpreter.exec_isolation(
+            DefaultImports.prepend_imports(code)
+        )
+        tool_output_str = tool_output.text(include_results=False).strip()
+
+        count = 1
+        while (
+            not tool_output.success
+            or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0)
+        ) and count <= 3:
+            if tool_output_str.strip() == "":
+                tool_output_str = "EMPTY"
+            prompt = TEST_TOOLS.format(
+                tool_docs=tool_docs_str,
+                previous_attempts=f"<code>\n{code}\n</code>\nTOOL OUTPUT\n{tool_output_str}",
+                user_request=task,
+                examples=EXAMPLES,
+                media=str(image_paths),
+            )
+            code = extract_code(lmm.generate(prompt, media=image_paths))  # type: ignore
+            tool_output = code_interpreter.exec_isolation(
+                DefaultImports.prepend_imports(code)
+            )
+            tool_output_str = tool_output.text(include_results=False).strip()
+
+        error_message = ""
+        prompt = PICK_TOOL.format(
+            tool_docs=tool_docs_str,
+            user_request=task,
+            context=f"<code>\n{code}\n</code>\n<tool_output>\n{tool_output_str}\n</tool_output>",
+            previous_attempts=error_message,
+        )
+
+        response = lmm.generate(prompt, media=image_paths)
+        tool_choice_context = extract_tag(response, "json")  # type: ignore
+        tool_choice_context_dict = extract_json(tool_choice_context)  # type: ignore
+
+        tool, tool_thoughts, tool_docstring, error_message = extract_tool_info(
+            tool_choice_context_dict
+        )
+
+        count = 1
+        while tool is None and count <= 3:
+            prompt = PICK_TOOL.format(
+                tool_docs=tool_docs_str,
+                user_request=task,
+                context=f"<code>\n{code}\n</code>\n<tool_output>\n{tool_output_str}\n</tool_output>",
+                previous_attempts=error_message,
+            )
+            tool_choice_context_dict = extract_json(lmm.generate(prompt, media=image_paths))  # type: ignore
+            tool, tool_thoughts, tool_docstring, error_message = extract_tool_info(
+                tool_choice_context_dict
+            )
+        try:
+            shutil.rmtree(tmpdirname)
+        except Exception as e:
+            _LOGGER.error(f"Error removing temp directory: {e}")
+
+    print(
+        f"[get_tool_for_task output]\n{tool_thoughts}\n\nTool Documentation:\n{tool_docstring}\n[end of get_tool_for_task output]\n"
+    )
+
+
+def finalize_plan(user_request: str, chain_of_thoughts: str) -> str:
+    """Finalizes the plan by taking the user request and the chain of thoughts that
+    represent the plan and returns the finalized plan.
+    """
+    lmm = AnthropicLMM()
+    prompt = FINALIZE_PLAN.format(
+        user_request=user_request, chain_of_thoughts=chain_of_thoughts
+    )
+    finalized_plan = cast(str, lmm.generate(prompt))
+    return finalized_plan
+
+
+def claude35_vqa(prompt: str, medias: List[np.ndarray]) -> None:
+    """Asks the Claude-3.5 model a question about the given media and returns an answer.
+
+    Parameters:
+        prompt: str: The question to ask the model.
+        medias: List[np.ndarray]: The images to ask the question about, it could also
+            be frames from a video. You can send up to 5 frames from a video.
+    """
+    lmm = AnthropicLMM()
+    if isinstance(medias, np.ndarray):
+        medias = [medias]
+    if isinstance(medias, list) and len(medias) > 5:
+        medias = medias[:5]
+    all_media_b64 = [
+        "data:image/png;base64," + convert_to_b64(media) for media in medias
+    ]
+
+    response = cast(str, lmm.generate(prompt, media=all_media_b64))
+    print(f"[claude35_vqa output]\n{response}\n[end of claude35_vqa output]")
+
+
+def suggestion(prompt: str, medias: List[np.ndarray]) -> None:
+    """Given your problem statement and the images, this will provide you with a
+    suggested plan on how to proceed. Always call suggestion when starting to solve
+    a problem.
+
+    Parameters:
+        prompt: str: The problem statement.
+        medias: List[np.ndarray]: The images to use for the problem
+    """
+    try:
+        from .suggestion import suggestion_impl  # type: ignore
+
+        suggestion = suggestion_impl(prompt, medias)
+        print(suggestion)
+    except ImportError:
+        print("")
+
+
+PLANNER_TOOLS = [
+    claude35_vqa,
+    suggestion,
+    get_tool_for_task,
+    T.load_image,
+    T.save_image,
+    T.extract_frames_and_timestamps,
+    T.save_video,
+]
+PLANNER_DOCSTRING = T.get_tool_documentation(PLANNER_TOOLS)  # type: ignore
vision_agent/tools/tool_utils.py
CHANGED
@@ -4,6 +4,7 @@ import os
 from base64 import b64encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
+import numpy as np
 import pandas as pd
 from IPython.display import display
 from pydantic import BaseModel
@@ -14,6 +15,7 @@ from urllib3.util.retry import Retry
 from vision_agent.tools.tools_types import BoundingBoxes
 from vision_agent.utils.exceptions import RemoteToolCallFailed
 from vision_agent.utils.execute import Error, MimeType
+from vision_agent.utils.image_utils import normalize_bbox
 from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
@@ -170,7 +172,7 @@ def get_tool_descriptions_by_names(
 
 
 def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
-    data: Dict[str, List[str]] = {"desc": [], "doc": []}
+    data: Dict[str, List[str]] = {"desc": [], "doc": [], "name": []}
 
     for func in funcs:
         desc = func.__doc__
@@ -182,6 +184,7 @@ def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
         doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
         data["desc"].append(desc)
         data["doc"].append(doc)
+        data["name"].append(func.__name__)
 
     return pd.DataFrame(data)  # type: ignore
 
@@ -256,3 +259,64 @@ def filter_bboxes_by_threshold(
     bboxes: BoundingBoxes, threshold: float
 ) -> BoundingBoxes:
     return list(filter(lambda bbox: bbox.score >= threshold, bboxes))
+
+
+def add_bboxes_from_masks(
+    all_preds: List[List[Dict[str, Any]]],
+) -> List[List[Dict[str, Any]]]:
+    for frame_preds in all_preds:
+        for preds in frame_preds:
+            if np.sum(preds["mask"]) == 0:
+                preds["bbox"] = []
+            else:
+                rows, cols = np.where(preds["mask"])
+                bbox = [
+                    float(np.min(cols)),
+                    float(np.min(rows)),
+                    float(np.max(cols)),
+                    float(np.max(rows)),
+                ]
+                bbox = normalize_bbox(bbox, preds["mask"].shape)
+                preds["bbox"] = bbox
+
+    return all_preds
+
+
+def calculate_iou(bbox1: List[float], bbox2: List[float]) -> float:
+    x1, y1, x2, y2 = bbox1
+    x3, y3, x4, y4 = bbox2
+
+    x_overlap = max(0, min(x2, x4) - max(x1, x3))
+    y_overlap = max(0, min(y2, y4) - max(y1, y3))
+    intersection = x_overlap * y_overlap
+
+    area1 = (x2 - x1) * (y2 - y1)
+    area2 = (x4 - x3) * (y4 - y3)
+    union = area1 + area2 - intersection
+
+    return intersection / union if union > 0 else 0
+
+
+def single_nms(
+    preds: List[Dict[str, Any]], iou_threshold: float
+) -> List[Dict[str, Any]]:
+    for i in range(len(preds)):
+        for j in range(i + 1, len(preds)):
+            if calculate_iou(preds[i]["bbox"], preds[j]["bbox"]) > iou_threshold:
+                if preds[i]["score"] > preds[j]["score"]:
+                    preds[j]["score"] = 0
+                else:
+                    preds[i]["score"] = 0
+
+    return [pred for pred in preds if pred["score"] > 0]
+
+
+def nms(
+    all_preds: List[List[Dict[str, Any]]], iou_threshold: float
+) -> List[List[Dict[str, Any]]]:
+    return_preds = []
+    for frame_preds in all_preds:
+        frame_preds = single_nms(frame_preds, iou_threshold)
+        return_preds.append(frame_preds)
+
+    return return_preds
vision_agent/tools/tools.py
CHANGED
@@ -17,15 +17,18 @@ from pillow_heif import register_heif_opener # type: ignore
 from pytube import YouTube  # type: ignore
 
 from vision_agent.clients.landing_public_api import LandingPublicAPI
-from vision_agent.lmm.lmm import OpenAILMM
+from vision_agent.lmm.lmm import AnthropicLMM, OpenAILMM
 from vision_agent.tools.tool_utils import (
+    add_bboxes_from_masks,
     filter_bboxes_by_threshold,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
     get_tools_info,
+    nms,
     send_inference_request,
     send_task_inference_request,
+    single_nms,
 )
 from vision_agent.tools.tools_types import JobStatus, ODResponseData
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
@@ -260,8 +263,8 @@ def owl_v2_video(
         ...
     ]
     """
-    if len(frames) == 0:
-        raise ValueError("
+    if len(frames) == 0 or not isinstance(frames, List):
+        raise ValueError("Must provide a list of numpy arrays for frames")
 
     image_size = frames[0].shape[:2]
     buffer_bytes = frames_to_bytes(frames)
@@ -455,7 +458,7 @@ def florence2_sam2_image(
 def florence2_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] =
+    chunk_length: Optional[int] = 10,
     fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
@@ -473,11 +476,11 @@ def florence2_sam2_video_tracking(
         fine-tuned model ID here to use it.
 
     Returns:
-        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
-         and
-        the entities per frame. The label contains the object ID
-         name. The objects are only identified in the first framed
-         throughout the video.
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+            label,segment mask and bounding boxes. The outer list represents each frame and
+            the inner list is the entities per frame. The label contains the object ID
+            followed by the label name. The objects are only identified in the first framed
+            and tracked throughout the video.
 
     Example
     -------
@@ -486,6 +489,7 @@ def florence2_sam2_video_tracking(
         [
             {
                 'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
                 'mask': array([[0, 0, 0, ..., 0, 0, 0],
                     [0, 0, 0, ..., 0, 0, 0],
                     ...,
@@ -496,8 +500,8 @@ def florence2_sam2_video_tracking(
         ...
     ]
     """
-    if len(frames) == 0:
-        raise ValueError("
+    if len(frames) == 0 or not isinstance(frames, List):
+        raise ValueError("Must provide a list of numpy arrays for frames")
 
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
@@ -535,7 +539,8 @@ def florence2_sam2_video_tracking(
             label = str(detection["id"]) + ": " + detection["label"]
             return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
         return_data.append(return_frame_data)
-
+    return_data = add_bboxes_from_masks(return_data)
+    return nms(return_data, iou_threshold=0.95)
 
 
 def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
@@ -677,8 +682,9 @@ def countgd_counting(
     image: np.ndarray,
     box_threshold: float = 0.23,
 ) -> List[Dict[str, Any]]:
-    """'countgd_counting' is a tool that can
-
+    """'countgd_counting' is a tool that can detect multiple instances of an object
+    given a text prompt. It is particularly useful when trying to detect and count a
+    large number of objects. It returns a list of bounding boxes with normalized
     coordinates, label names and associated confidence scores.
 
     Parameters:
@@ -711,7 +717,7 @@ def countgd_counting(
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     payload = {
-        "prompts": [prompt.replace(", ", "
+        "prompts": [prompt.replace(", ", ". ")],
         "confidence": box_threshold,  # still not being used in the API
         "model": "countgd",
     }
@@ -733,7 +739,8 @@ def countgd_counting(
     ]
     # TODO: remove this once we start to use the confidence on countgd
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
-
+    return_data = [bbox.model_dump() for bbox in filtered_bboxes]
+    return single_nms(return_data, iou_threshold=0.80)
 
 
 def countgd_example_based_counting(
@@ -864,9 +871,10 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
 
 
 def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
-    """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
-    including regular images or images of documents or presentations. It
-    as an answer to
+    """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
+    images including regular images or images of documents or presentations. It can be
+    very useful for document QA or OCR text extraction. It returns text as an answer to
+    the question.
 
     Parameters:
         prompt (str): The question about the document image
@@ -880,6 +888,9 @@ def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
         >>> qwen2_vl_images_vqa('Give a summary of the document', images)
         'The document talks about the history of the United States of America and its...'
     """
+    if isinstance(images, np.ndarray):
+        images = [images]
+
     for image in images:
         if image.shape[0] < 1 or image.shape[1] < 1:
             raise ValueError(f"Image is empty, image shape: {image.shape}")
@@ -896,6 +907,30 @@ def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
     return cast(str, data)
 
 
+def claude35_text_extraction(image: np.ndarray) -> str:
+    """'claude35_text_extraction' is a tool that can extract text from an image. It
+    returns the extracted text as a string and can be used as an alternative to OCR if
+    you do not need to know the exact bounding box of the text.
+
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+
+    Returns:
+        str: The extracted text from the image.
+    """
+
+    lmm = AnthropicLMM()
+    buffer = io.BytesIO()
+    Image.fromarray(image).save(buffer, format="PNG")
+    image_bytes = buffer.getvalue()
+    image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+    text = lmm.generate(
+        "Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
+        [image_b64],
+    )
+    return cast(str, text)
+
+
 def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
     including regular videos or videos of documents or presentations. It returns text
@@ -944,6 +979,9 @@ def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
         'Lionel Messi'
     """
 
+    if len(frames) == 0 or not isinstance(frames, List):
+        raise ValueError("Must provide a list of numpy arrays for frames")
+
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
     payload = {
@@ -1798,24 +1836,33 @@ def flux_image_inpainting(
     ... )
     >>> save_image(result, "inpainted_room.png")
     """
-    if (
-        image.shape[0] < 8
-        or image.shape[1] < 8
-        or mask.shape[0] < 8
-        or mask.shape[1] < 8
-    ):
-        raise ValueError("The image or mask does not have enough size for inpainting")
 
-
-
-
-
-
+    min_dim = 8
+
+    if any(dim < min_dim for dim in image.shape[:2] + mask.shape[:2]):
+        raise ValueError(f"Image and mask must be at least {min_dim}x{min_dim} pixels")
+
+    max_size = (512, 512)
+
+    if image.shape[0] > max_size[0] or image.shape[1] > max_size[1]:
+        scaling_factor = min(max_size[0] / image.shape[0], max_size[1] / image.shape[1])
+        new_size = (
+            int(image.shape[1] * scaling_factor),
+            int(image.shape[0] * scaling_factor),
+        )
+        new_size = ((new_size[0] // 8) * 8, (new_size[1] // 8) * 8)
+        image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
+        mask = cv2.resize(mask, new_size, interpolation=cv2.INTER_NEAREST)
+
+    elif image.shape[0] % 8 != 0 or image.shape[1] % 8 != 0:
+        new_size = ((image.shape[1] // 8) * 8, (image.shape[0] // 8) * 8)
+        image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
+        mask = cv2.resize(mask, new_size, interpolation=cv2.INTER_NEAREST)
 
     if np.array_equal(mask, mask.astype(bool).astype(int)):
         mask = np.where(mask > 0, 255, 0).astype(np.uint8)
     else:
-        raise ValueError("
+        raise ValueError("Mask should contain only binary values (0 or 1)")
 
     image_file = numpy_to_bytes(image)
     mask_file = numpy_to_bytes(mask)
@@ -2148,7 +2195,8 @@ def overlay_bounding_boxes(
         bboxes = bbox_int[i]
         bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
 
-        if
+        # if more than 50 boxes use small boxes to indicate objects else use regular boxes
+        if len(bboxes) > 50:
             pil_image = _plot_counting(pil_image, bboxes, color)
         else:
             width, height = pil_image.size
@@ -2179,7 +2227,14 @@ def overlay_bounding_boxes(
             draw.text((box[0], box[1]), text, fill="black", font=font)
 
         frame_out.append(np.array(pil_image))
-
+    return_frame = frame_out[0] if len(frame_out) == 1 else frame_out
+
+    if isinstance(return_frame, np.ndarray):
+        from IPython.display import display
+
+        display(Image.fromarray(return_frame))
+
+    return return_frame  # type: ignore
 
 
 def _get_text_coords_from_mask(
@@ -2291,7 +2346,14 @@ def overlay_segmentation_masks(
             draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
             draw.text((x, y), text, fill="black", font=font)
         frame_out.append(np.array(pil_image))
-
+    return_frame = frame_out[0] if len(frame_out) == 1 else frame_out
+
+    if isinstance(return_frame, np.ndarray):
+        from IPython.display import display
+
+        display(Image.fromarray(return_frame))
+
+    return return_frame  # type: ignore
 
 
 def overlay_heat_map(
@@ -2399,6 +2461,7 @@ FUNCTION_TOOLS = [
     florence2_sam2_image,
     florence2_sam2_video_tracking,
     florence2_phrase_grounding,
+    claude35_text_extraction,
     detr_segmentation,
     depth_anything_v2,
     generate_pose_image,
vision_agent/utils/image_utils.py
CHANGED
@@ -42,10 +42,10 @@ def normalize_bbox(
 ) -> List[float]:
     r"""Normalize the bounding box coordinates to be between 0 and 1."""
     x1, y1, x2, y2 = bbox
-    x1 = round(x1 / image_size[1], 2)
-    y1 = round(y1 / image_size[0], 2)
-    x2 = round(x2 / image_size[1], 2)
-    y2 = round(y2 / image_size[0], 2)
+    x1 = max(round(x1 / image_size[1], 2), 0)
+    y1 = max(round(y1 / image_size[0], 2), 0)
+    x2 = min(round(x2 / image_size[1], 2), image_size[1])
+    y2 = min(round(y2 / image_size[0], 2), image_size[0])
     return [x1, y1, x2, y2]
 
 
@@ -175,9 +175,15 @@ def encode_media(media: Union[str, Path], resize: Optional[int] = None) -> str:
             return media[:-4] + ".png"
         return media
 
-    # if media is
+    # if media is in base64 ensure it's the correct resize
     if isinstance(media, str) and media.startswith("data:image/"):
-
+        image_pil = b64_to_pil(media)
+        if resize is not None:
+            if image_pil.size[0] > resize or image_pil.size[1] > resize:
+                image_pil.thumbnail((resize, resize))
+        buffer = io.BytesIO()
+        image_pil.save(buffer, format="PNG")
+        return base64.b64encode(buffer.getvalue()).decode("utf-8")
 
     extension = "png"
     extension = Path(media).suffix