PyPI - vision-agent - Versions diffs - 0.2.208__tar.gz → 0.2.209__tar.gz - Mend

vision-agent 0.2.208tar.gz → 0.2.209tar.gz

Files changed (46) hide show

{vision_agent-0.2.208 → vision_agent-0.2.209}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.208
+Version: 0.2.209
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -31,6 +31,7 @@ Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
 Requires-Dist: pytube (==15.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
 Requires-Dist: rich (>=13.7.1,<14.0.0)
+Requires-Dist: scikit-learn (>=1.5.2,<2.0.0)
 Requires-Dist: scipy (>=1.13.0,<1.14.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: tenacity (>=8.3.0,<9.0.0)

{vision_agent-0.2.208 → vision_agent-0.2.209}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.208"
+version = "0.2.209"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -46,6 +46,7 @@ pydantic = "2.7.4"
 av = "^11.0.0"
 libcst = "^1.5.0"
 matplotlib = "^3.9.2"
+scikit-learn = "^1.5.2"
 [tool.poetry.group.dev.dependencies]
 autoflake = "1.*"

{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/.sim_tools/df.csv RENAMED Viewed

@@ -112,10 +112,11 @@ desc,doc,name
         >>> vit_nsfw_classification(image)
         {""label"": ""normal"", ""scores"": 0.68},
     ",vit_nsfw_classification
-"'countgd_counting' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores.","countgd_counting(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
-'countgd_counting' is a tool that can detect multiple instances of an object
-    given a text prompt. It is particularly useful when trying to detect and count a
-    large number of objects. It returns a list of bounding boxes with normalized
+"'countgd_object_detection' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores.","countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
+'countgd_object_detection' is a tool that can detect multiple instances of an
+    object given a text prompt. It is particularly useful when trying to detect and
+    count a large number of objects. You can optionally separate object names in the
+    prompt with commas. It returns a list of bounding boxes with normalized
     coordinates, label names and associated confidence scores.
     Parameters:
@@ -133,14 +134,51 @@ desc,doc,name
     Example
     -------
-        >>> countgd_counting(""flower"", image)
+        >>> countgd_object_detection(""flower"", image)
         [
             {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5]},
             {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52]},
             {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
         ]
-    ",countgd_counting
+    ",countgd_object_detection
+"'countgd_sam2_object_detection' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names, masks associated confidence scores.","countgd_sam2_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
+'countgd_sam2_object_detection' is a tool that can detect multiple instances of
+    an object given a text prompt. It is particularly useful when trying to detect and
+    count a large number of objects. You can optionally separate object names in the
+    prompt with commas. It returns a list of bounding boxes with normalized coordinates,
+    label names, masks and associated confidence scores.
+    Parameters:
+        prompt (str): The object that needs to be counted.
+        image (np.ndarray): The image that contains multiple instances of the object.
+        box_threshold (float, optional): The threshold for detection. Defaults
+            to 0.23.
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+    Example
+    -------
+        >>> countgd_sam2_object_detection(""flower"", image)
+        [
+            {
+                'score': 0.49,
+                'label': 'flower',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
+    ",countgd_sam2_object_detection
 "'florence2_ocr' is a tool that can detect text and text regions in an image. Each text region contains one line of text. It returns a list of detected text, the text region as a bounding box with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","florence2_ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
 'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
@@ -214,8 +252,8 @@ desc,doc,name
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
-        label,segment mask and bounding boxes. The outer list represents each frame and
-        the inner list is the entities per frame. The label contains the object ID
+        label, segment mask and bounding boxes. The outer list represents each frame
+        and the inner list is the entities per frame. The label contains the object ID
         followed by the label name. The objects are only identified in the first frame
         and tracked throughout the video.
@@ -237,12 +275,12 @@ desc,doc,name
             ...
         ]
     ",florence2_sam2_video_tracking
-"'florence2_phrase_grounding' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores of 1.0.","florence2_phrase_grounding(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+"'florence2_phrase_grounding' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_phrase_grounding(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
 'florence2_phrase_grounding' is a tool that can detect multiple
     objects given a text prompt which can be object names or caption. You
     can optionally separate the object names in the text with commas. It returns a list
     of bounding boxes with normalized coordinates, label names and associated
-    probability scores of 1.0.
+    confidence scores of 1.0.
     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -353,37 +391,24 @@ desc,doc,name
                 [10, 11, 15, ..., 202, 202, 205],
                 [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
     ",generate_pose_image
-'closest_mask_distance' calculates the closest distance between two masks.,"closest_mask_distance(mask1: numpy.ndarray, mask2: numpy.ndarray) -> float:
-'closest_mask_distance' calculates the closest distance between two masks.
-    Parameters:
-        mask1 (np.ndarray): The first mask.
-        mask2 (np.ndarray): The second mask.
-    Returns:
-        float: The closest distance between the two masks.
-    Example
-    -------
-        >>> closest_mask_distance(mask1, mask2)
-        0.5
-    ",closest_mask_distance
-'closest_box_distance' calculates the closest distance between two bounding boxes.,"closest_box_distance(box1: List[float], box2: List[float], image_size: Tuple[int, int]) -> float:
-'closest_box_distance' calculates the closest distance between two bounding boxes.
+"'minimum_distance' calculates the minimum distance between two detections which can include bounding boxes and or masks. This will return the closest distance between the objects, not the distance between the centers of the objects.","minimum_distance(det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]) -> float:
+'minimum_distance' calculates the minimum distance between two detections which
+    can include bounding boxes and/or masks. This will return the closest distance
+    between the objects, not the distance between the centers of the objects.
     Parameters:
-        box1 (List[float]): The first bounding box.
-        box2 (List[float]): The second bounding box.
+        det1 (Dict[str, Any]): The first detection of boxes or masks.
+        det2 (Dict[str, Any]): The second detection of boxes or masks.
         image_size (Tuple[int, int]): The size of the image given as (height, width).
     Returns:
-        float: The closest distance between the two bounding boxes.
+        float: The closest distance between the two detections.
     Example
     -------
-        >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
+        >>> minimum_distance(det1, det2, image_size)
         141.42
-    ",closest_box_distance
+    ",minimum_distance
 'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen2_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
 'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
     images including regular images or images of documents or presentations. It can be
@@ -561,9 +586,9 @@ desc,doc,name
         >>> save_video(frames)
         ""/tmp/tmpvideo123.mp4""
     ",save_video
-'overlay_bounding_boxes' is a utility function that displays bounding boxes on an image.,"overlay_bounding_boxes(medias: Union[numpy.ndarray, List[numpy.ndarray]], bboxes: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]) -> Union[numpy.ndarray, List[numpy.ndarray]]:
+'overlay_bounding_boxes' is a utility function that displays bounding boxes on an image. It will draw a box around the detected object with the label and score.,"overlay_bounding_boxes(medias: Union[numpy.ndarray, List[numpy.ndarray]], bboxes: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]) -> Union[numpy.ndarray, List[numpy.ndarray]]:
 'overlay_bounding_boxes' is a utility function that displays bounding boxes on
-    an image.
+    an image. It will draw a box around the detected object with the label and score.
     Parameters:
         medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display the
@@ -581,9 +606,9 @@ desc,doc,name
             image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
         )
     ",overlay_bounding_boxes
-'overlay_segmentation_masks' is a utility function that displays segmentation masks.,"overlay_segmentation_masks(medias: Union[numpy.ndarray, List[numpy.ndarray]], masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]], draw_label: bool = True, secondary_label_key: str = 'tracking_label') -> Union[numpy.ndarray, List[numpy.ndarray]]:
+'overlay_segmentation_masks' is a utility function that displays segmentation masks. It will overlay a colored mask on the detected object with the label.,"overlay_segmentation_masks(medias: Union[numpy.ndarray, List[numpy.ndarray]], masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]], draw_label: bool = True, secondary_label_key: str = 'tracking_label') -> Union[numpy.ndarray, List[numpy.ndarray]]:
 'overlay_segmentation_masks' is a utility function that displays segmentation
-    masks.
+    masks. It will overlay a colored mask on the detected object with the label.
     Parameters:
         medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display

{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/.sim_tools/embs.npy RENAMED Viewed

Binary file

{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_planner_prompts_v2.py RENAMED Viewed

@@ -1,7 +1,7 @@
 PLAN = """
 **Role**: You are an expert planning agent that can understand the user request and search for a plan to accomplish it.
-**Task**: As a planning agent you are required to understand the user's request and search for a plan to accomplish it. Use Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Esnure your response is clear, concise, andhelpful. You can use an interactive Pyton (Jupyter Notebok) environment, executing code with <execute_python>, each execution is a new cell so old code and outputs are saved.
+**Task**: As a planning agent you are required to understand the user's request and search for a plan to accomplish it. Use Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>, each execution is a new cell so old code and outputs are saved.
 **Documentation**: this is the documentation for the functions you can use to accomplish the task:
 {tool_desc}
@@ -18,7 +18,7 @@ PLAN = """
 1. Read over the user request and context provided and output <thinking> tags to indicate your thought process. You can <count> number of turns to complete the user's request.
 2. You can execute python code in the ipython notebook using <execute_python> tags. Only output one <execute_python> tag at a time.
 3. Only output <finalize_plan> when you are done planning and want to end the planning process. DO NOT output <finalize_plan> with <execute_python> tags, only after OBSERVATION's.
-4. Only load/save files from {media_list} unless you specifically saved the previously.
+4. Only load/save files from {media_list} unless you specifically saved the file previously.
 5. Ensure you always call `suggestion` initially and `get_tool_for_task` to get the right tool for the subtask.
 6. Calling `plt.imshow` or `save_image` will display the image to you, use this to visually check your results.
 7. DO NOT hard code the answer into your code, it should be dynamic and work for any similar request.
@@ -130,13 +130,14 @@ In these aerial images, I can see approximately 5-6 pedestrians walking in vario
 [end of claude35_vqa_output]
 [get_tool_for_task output]
-After examining the image, I can see it's an aerial view of a busy urban intersection with multiple lanes of traffic. There are numerous cars visible, each likely containing at least one person (the driver). While it's difficult to count exact individuals, I can estimate based on the number of vehicles. The countgd_counting tool returned an empty list, which is incorrect given the visible evidence in the image. This suggests the tool may have failed to detect people inside vehicles or had difficulty with the aerial perspective. Despite this tool being specifically requested, its output is clearly inaccurate for this task.
+After examining the image, I can see it's an aerial view of a busy urban intersection with multiple lanes of traffic. There are numerous cars visible, each likely containing at least one person (the driver). While it's difficult to count exact individuals, I can estimate based on the number of vehicles. The countgd_object_detection tool is the best choice for this task as it can detect and count multiple instances of an object given a text prompt. I will use this tool to count the number of pedestrians in the image.
 Tool Documentation:
-countgd_counting(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
-'countgd_counting' is a tool that can precisely count multiple instances of an
-    object given a text prompt. It returns a list of bounding boxes with normalized
-    coordinates, label names and associated confidence scores.
+countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
+    'countgd_object_detection' is a tool that can detect multiple instances of an
+    object given a text prompt. It is particularly useful when trying to detect and
+    count a large number of objects. It returns a list of bounding boxes with
+    normalized coordinates, label names and associated confidence scores.
     Parameters:
         prompt (str): The object that needs to be counted.
@@ -153,7 +154,7 @@ countgd_counting(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23)
     Example
     -------
-        >>> countgd_counting("flower", image)
+        >>> countgd_object_detection("flower", image)
         [
             {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
         ]
@@ -195,7 +196,7 @@ def merge_bounding_box_list(bboxes):
 detections = []
 for region in subdivide_image(image):
-    detections.extend(countgd_counting("pedestrian", region))
+    detections.extend(countgd_object_detection("pedestrian", region))
 detections = merge_bounding_box_list(detections)
 print(len(detections))
 </execute_python>
@@ -203,8 +204,8 @@ print(len(detections))
 OBSERVATION: 24
 <count>7</count>
-AGENT: <thinking>countgd_counting detected 24 pedestrians in the image by subdividing the image into smaller regions, this differs from claude35_vqa's answer of 5-6 pedestrians. I will use countgd_counting's answer since it is specifically designed to count objects.</thinking>
-<finalize_plan>This plan successfully detected 24 pedestrians in the image by subdividing the image into smaller regions and using the countgd_counting tool on each region. The bounding boxes were then merged to count the unique pedestrians.</finalize_plan>
+AGENT: <thinking>countgd_object_detection detected 24 pedestrians in the image by subdividing the image into smaller regions, this differs from claude35_vqa's answer of 5-6 pedestrians. I will use countgd_object_detection's answer since it is specifically designed to count objects.</thinking>
+<finalize_plan>This plan successfully detected 24 pedestrians in the image by subdividing the image into smaller regions and using the countgd_object_detection tool on each region. The bounding boxes were then merged to count the unique pedestrians.</finalize_plan>
 --- END EXAMPLE1 ---
 """
@@ -448,7 +449,7 @@ You are given a task: {task} from the user. Your task is to extract the type of
 - "video object tracking" - tracking objects in a video.
 - "depth and pose estimation" - estimating the depth or pose of objects in an image.
-Return the category inside tags <category># your categories here</category>.
+Return the category or categories (comma separated) inside tags <category># your categories here</category>.
 """
 TEST_TOOLS = """
@@ -492,7 +493,7 @@ Count the number of pedestrians across all the images.
 <code>
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from vision_agent.tools import load_image, owl_v2_image, florence2_phrase_grounding, countgd_counting
+from vision_agent.tools import load_image, owl_v2_image, florence2_phrase_grounding, countgd_object_detection
 # process functions in a try catch so that if it fails it doesn't cause `as_completed` to hang
 def process_owl_v2(image_paths):
@@ -520,7 +521,7 @@ def process_countgd(image_paths):
         results = []
         for image_path in image_paths:
             image = load_image(image_path)
-            results.extend(countgd_counting("person", image))
+            results.extend(countgd_object_detection("person", image))
     except Exception as e:
         results = f"Encountered error when executing process_countgd: {str(e)}"
     return results
@@ -531,7 +532,7 @@ with ThreadPoolExecutor() as executor:
     futures = {{
         executor.submit(process_owl_v2, image_paths): "owl_v2_image",
         executor.submit(process_florence2, image_paths): "florence2_phrase_grounding",
-        executor.submit(process_countgd, image_paths): "countgd_counting",
+        executor.submit(process_countgd, image_paths): "countgd_object_detection",
     }}
     final_results = {{}}

{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_planner_v2.py RENAMED Viewed

@@ -49,6 +49,7 @@ UTIL_DOCSTRING = T.get_tool_documentation(
         T.save_video,
         T.overlay_bounding_boxes,
         T.overlay_segmentation_masks,
+        T.minimum_distance,
     ]
 )
 PLANNING_TOOLS_DOCSTRING = UTIL_DOCSTRING + "\n" + pt.PLANNER_DOCSTRING

{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/tools/__init__.py RENAMED Viewed

@@ -28,8 +28,9 @@ from .tools import (
     clip,
     closest_box_distance,
     closest_mask_distance,
-    countgd_counting,
     countgd_example_based_counting,
+    countgd_object_detection,
+    countgd_sam2_object_detection,
     depth_anything_v2,
     detr_segmentation,
     dpt_hybrid_midas,
@@ -56,6 +57,7 @@ from .tools import (
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
+    minimum_distance,
     ocr,
     overlay_bounding_boxes,
     overlay_heat_map,
@@ -64,6 +66,7 @@ from .tools import (
     owl_v2_video,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
+    sam2,
     save_image,
     save_json,
     save_video,

{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/tools/planner_tools.py RENAMED Viewed

@@ -375,9 +375,5 @@ PLANNER_TOOLS = [
     claude35_vqa,
     suggestion,
     get_tool_for_task,
-    T.load_image,
-    T.save_image,
-    T.extract_frames_and_timestamps,
-    T.save_video,
 ]
 PLANNER_DOCSTRING = T.get_tool_documentation(PLANNER_TOOLS)  # type: ignore

{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/tools/tools.py RENAMED Viewed

@@ -4,6 +4,7 @@ import logging
 import os
 import tempfile
 import urllib.request
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from functools import lru_cache
 from importlib import resources
 from pathlib import Path
@@ -484,8 +485,8 @@ def florence2_sam2_video_tracking(
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
-        label,segment mask and bounding boxes. The outer list represents each frame and
-        the inner list is the entities per frame. The label contains the object ID
+        label, segment mask and bounding boxes. The outer list represents each frame
+        and the inner list is the entities per frame. The label contains the object ID
         followed by the label name. The objects are only identified in the first frame
         and tracked throughout the video.
@@ -684,14 +685,15 @@ def loca_visual_prompt_counting(
     return resp_data
-def countgd_counting(
+def countgd_object_detection(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.23,
 ) -> List[Dict[str, Any]]:
-    """'countgd_counting' is a tool that can detect multiple instances of an object
-    given a text prompt. It is particularly useful when trying to detect and count a
-    large number of objects. It returns a list of bounding boxes with normalized
+    """'countgd_object_detection' is a tool that can detect multiple instances of an
+    object given a text prompt. It is particularly useful when trying to detect and
+    count a large number of objects. You can optionally separate object names in the
+    prompt with commas. It returns a list of bounding boxes with normalized
     coordinates, label names and associated confidence scores.
     Parameters:
@@ -709,7 +711,7 @@ def countgd_counting(
     Example
     -------
-        >>> countgd_counting("flower", image)
+        >>> countgd_object_detection("flower", image)
         [
             {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5]},
@@ -723,19 +725,28 @@ def countgd_counting(
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
-    payload = {
-        "prompts": [prompt.replace(", ", ". ")],
-        "confidence": box_threshold,  # still not being used in the API
-        "model": "countgd",
-    }
-    metadata = {"function_name": "countgd_counting"}
+    prompts = [p.strip() for p in prompt.split(", ")]
-    detections = send_task_inference_request(
-        payload, "text-to-object-detection", files=files, metadata=metadata
-    )
+    def _run_countgd(prompt: str) -> List[Dict[str, Any]]:
+        payload = {
+            "prompts": [prompt],
+            "confidence": box_threshold,  # still not being used in the API
+            "model": "countgd",
+        }
+        metadata = {"function_name": "countgd_counting"}
+        detections = send_task_inference_request(
+            payload, "text-to-object-detection", files=files, metadata=metadata
+        )
+        # get the first frame
+        return detections[0]  # type: ignore
+    bboxes = []
+    with ThreadPoolExecutor() as executor:
+        futures = [executor.submit(_run_countgd, prompt) for prompt in prompts]
+        for future in as_completed(futures):
+            bboxes.extend(future.result())
-    # get the first frame
-    bboxes = detections[0]
     bboxes_formatted = [
         ODResponseData(
             label=bbox["label"],
@@ -750,6 +761,131 @@ def countgd_counting(
     return single_nms(return_data, iou_threshold=0.80)
+def sam2(
+    image: np.ndarray,
+    detections: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """'sam2' is a tool that can segment multiple objects given an input bounding box,
+    label and score. It returns a set of masks along with the corresponding bounding
+    boxes and labels.
+    Parameters:
+        image (np.ndarray): The image that contains multiple instances of the object.
+        detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
+            label, and bounding box of the detected objects with normalized coordinates
+            between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+            of the top-left and xmax and ymax are the coordinates of the bottom-right of
+            the bounding box.
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+    Example
+    -------
+        >>> sam2(image, [
+                {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            ])
+        [
+            {
+                'score': 0.49,
+                'label': 'flower',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
+    """
+    image_size = image.shape[:2]
+    files = [("images", numpy_to_bytes(image))]
+    payload = {
+        "model": "sam2",
+        "bboxes": json.dumps(
+            [
+                {
+                    "labels": [d["label"] for d in detections],
+                    "bboxes": [
+                        denormalize_bbox(d["bbox"], image_size) for d in detections
+                    ],
+                }
+            ]
+        ),
+    }
+    metadata = {"function_name": "sam2"}
+    pred_detections = send_task_inference_request(
+        payload, "sam2", files=files, metadata=metadata
+    )
+    frame = pred_detections[0]
+    return_data = []
+    for inp_detection, detection in zip(detections, frame):
+        mask = rle_decode_array(detection["mask"])
+        label = detection["label"]
+        bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
+        return_data.append(
+            {
+                "label": label,
+                "bbox": bbox,
+                "mask": mask,
+                "score": inp_detection["score"],
+            }
+        )
+    return return_data
+def countgd_sam2_object_detection(
+    prompt: str,
+    image: np.ndarray,
+    box_threshold: float = 0.23,
+) -> List[Dict[str, Any]]:
+    """'countgd_sam2_object_detection' is a tool that can detect multiple instances of
+    an object given a text prompt. It is particularly useful when trying to detect and
+    count a large number of objects. You can optionally separate object names in the
+    prompt with commas. It returns a list of bounding boxes with normalized coordinates,
+    label names, masks and associated confidence scores.
+    Parameters:
+        prompt (str): The object that needs to be counted.
+        image (np.ndarray): The image that contains multiple instances of the object.
+        box_threshold (float, optional): The threshold for detection. Defaults
+            to 0.23.
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+    Example
+    -------
+        >>> countgd_sam2_object_detection("flower", image)
+        [
+            {
+                'score': 0.49,
+                'label': 'flower',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
+    """
+    detections = countgd_object_detection(prompt, image, box_threshold)
+    detections_with_masks = sam2(image, detections)
+    return detections_with_masks
 def countgd_example_based_counting(
     visual_prompts: List[List[float]],
     image: np.ndarray,
@@ -1299,7 +1435,7 @@ def florence2_phrase_grounding(
     objects given a text prompt which can be object names or caption. You
     can optionally separate the object names in the text with commas. It returns a list
     of bounding boxes with normalized coordinates, label names and associated
-    probability scores of 1.0.
+    confidence scores of 1.0.
     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -1732,6 +1868,35 @@ def template_match(
     return return_data
+def minimum_distance(
+    det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
+) -> float:
+    """'minimum_distance' calculates the minimum distance between two detections which
+    can include bounding boxes and/or masks. This will return the closest distance
+    between the objects, not the distance between the centers of the objects.
+    Parameters:
+        det1 (Dict[str, Any]): The first detection of boxes or masks.
+        det2 (Dict[str, Any]): The second detection of boxes or masks.
+        image_size (Tuple[int, int]): The size of the image given as (height, width).
+    Returns:
+        float: The closest distance between the two detections.
+    Example
+    -------
+        >>> minimum_distance(det1, det2, image_size)
+        141.42
+    """
+    if "mask" in det1 and "mask" in det2:
+        return closest_mask_distance(det1["mask"], det2["mask"])
+    elif "bbox" in det1 and "bbox" in det2:
+        return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
+    else:
+        raise ValueError("Both detections must have either bbox or mask")
 def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
     """'closest_mask_distance' calculates the closest distance between two masks.
@@ -2156,7 +2321,7 @@ def overlay_bounding_boxes(
     bboxes: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
 ) -> Union[np.ndarray, List[np.ndarray]]:
     """'overlay_bounding_boxes' is a utility function that displays bounding boxes on
-    an image.
+    an image. It will draw a box around the detected object with the label and score.
     Parameters:
         medias (Union[np.ndarray, List[np.ndarra]]): The image or frames to display the
@@ -2270,7 +2435,7 @@ def overlay_segmentation_masks(
     secondary_label_key: str = "tracking_label",
 ) -> Union[np.ndarray, List[np.ndarray]]:
     """'overlay_segmentation_masks' is a utility function that displays segmentation
-    masks.
+    masks. It will overlay a colored mask on the detected object with the label.
     Parameters:
         medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
@@ -2329,11 +2494,25 @@ def overlay_segmentation_masks(
             mask = elt["mask"]
             label = elt["label"]
             tracking_lbl = elt.get(secondary_label_key, None)
+            # Create semi-transparent mask overlay
             np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
-            np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
+            np_mask[mask > 0, :] = color[label] + (255 * 0.7,)
             mask_img = Image.fromarray(np_mask.astype(np.uint8))
             pil_image = Image.alpha_composite(pil_image, mask_img)
+            # Draw contour border
+            mask_uint8 = mask.astype(np.uint8) * 255
+            contours, _ = cv2.findContours(
+                mask_uint8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+            )
+            border_mask = np.zeros(
+                (pil_image.size[1], pil_image.size[0], 4), dtype=np.uint8
+            )
+            cv2.drawContours(border_mask, contours, -1, color[label] + (255,), 8)
+            border_img = Image.fromarray(border_mask)
+            pil_image = Image.alpha_composite(pil_image, border_img)
             if draw_label:
                 draw = ImageDraw.Draw(pil_image)
                 text = tracking_lbl if tracking_lbl else label
@@ -2452,7 +2631,8 @@ FUNCTION_TOOLS = [
     ocr,
     vit_image_classification,
     vit_nsfw_classification,
-    countgd_counting,
+    countgd_object_detection,
+    countgd_sam2_object_detection,
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
@@ -2461,8 +2641,7 @@ FUNCTION_TOOLS = [
     detr_segmentation,
     depth_anything_v2,
     generate_pose_image,
-    closest_mask_distance,
-    closest_box_distance,
+    minimum_distance,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
     video_temporal_localization,

{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/utils/image_utils.py RENAMED Viewed

@@ -42,10 +42,10 @@ def normalize_bbox(
 ) -> List[float]:
     r"""Normalize the bounding box coordinates to be between 0 and 1."""
     x1, y1, x2, y2 = bbox
-    x1 = max(round(x1 / image_size[1], 2), 0)
-    y1 = max(round(y1 / image_size[0], 2), 0)
-    x2 = min(round(x2 / image_size[1], 2), image_size[1])
-    y2 = min(round(y2 / image_size[0], 2), image_size[0])
+    x1 = max(round(x1 / image_size[1], 3), 0)
+    y1 = max(round(y1 / image_size[0], 3), 0)
+    x2 = min(round(x2 / image_size[1], 3), image_size[1])
+    y2 = min(round(y2 / image_size[0], 3), image_size[0])
     return [x1, y1, x2, y2]

{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/utils/sim.py RENAMED Viewed

@@ -109,7 +109,10 @@ class Sim:
     @lru_cache(maxsize=256)
     def top_k(
-        self, query: str, k: int = 5, thresh: Optional[float] = None
+        self,
+        query: str,
+        k: int = 5,
+        thresh: Optional[float] = None,
     ) -> Sequence[Dict]:
         """Returns the top k most similar items to the query.