vision-agent 0.2.221__py3-none-any.whl → 0.2.223__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +253 -244
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/vision_agent_planner_prompts_v2.py +28 -23
- vision_agent/tools/__init__.py +6 -10
- vision_agent/tools/tools.py +639 -787
- vision_agent/utils/sim.py +24 -1
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.223.dist-info}/METADATA +1 -1
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.223.dist-info}/RECORD +10 -10
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.223.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.223.dist-info}/WHEEL +0 -0
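Most of this diff is a rename and regeneration of the tool registry in .sim_tools/df.csv: the OWLv2, CountGD and Florence2 entries take explicit *_object_detection / *_sam2_instance_segmentation / *_sam2_video_tracking names, and document_extraction, document_qa and ocr entries are added ahead of the classification tools. The sketch below shows what calling code against 0.2.223 might look like. It is derived only from the signatures visible in this diff; the vision_agent.tools import path is an assumption (the diff touches vision_agent/tools/__init__.py but does not show its export list), and the image is a placeholder.

    # Sketch only: calling the renamed/new tools in 0.2.223, based on the
    # signatures shown in df.csv below. Assumes these functions are exported
    # from vision_agent.tools; verify against the installed wheel.
    import numpy as np
    from vision_agent.tools import owlv2_object_detection, document_qa  # assumed exports

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image array

    # Comma-separated categories; returns normalized [xmin, ymin, xmax, ymax] boxes.
    detections = owlv2_object_detection("car, dinosaur", image, box_threshold=0.1)
    for det in detections:
        print(det["score"], det["label"], det["bbox"])

    # New df.csv entry in this release: question answering over a document image.
    answer = document_qa("What is the report title?", image)
    print(answer)

Nothing in the visible hunks keeps the 0.2.221 names as aliases, so treat the renames as breaking until confirmed against tools/__init__.py.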
    
vision_agent/.sim_tools/df.csv CHANGED
@@ -1,9 +1,9 @@
 desc,doc,name
-"'…
-'…
-    …prompt such as category names or referring expressions on images. The…
-    …text prompt are separated by commas. It returns a list of bounding…
-    …normalized coordinates, label names and associated probability scores.
+"'owlv2_object_detection' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions on images. The categories in text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores.","owlv2_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+'owlv2_object_detection' is a tool that can detect and count multiple objects
+    given a text prompt such as category names or referring expressions on images. The
+    categories in text prompt are separated by commas. It returns a list of bounding
+    boxes with normalized coordinates, label names and associated probability scores.

     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -22,96 +22,87 @@ desc,doc,name

     Example
     -------
-        >>> …
+        >>> owlv2_object_detection(""car, dinosaur"", image)
         [
             {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
         ]
-    ",…
-"'…
-'…
-    …
-    …
-    …
-    …
+    ",owlv2_object_detection
+"'owlv2_sam2_instance_segmentation' is a tool that can detect and count multiple instances of objects given a text prompt such as category names or referring expressions on images. The categories in text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names, masks and associated probability scores.","owlv2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1) -> List[Dict[str, Any]]:
+'owlv2_sam2_instance_segmentation' is a tool that can detect and count multiple
+    instances of objects given a text prompt such as category names or referring
+    expressions on images. The categories in text prompt are separated by commas. It
+    returns a list of bounding boxes with normalized coordinates, label names, masks
+    and associated probability scores.

     Parameters:
-        prompt (str): The …
-        …
-        box_threshold (float, optional): The threshold for …
-            to 0.…
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
+        prompt (str): The object that needs to be counted.
+        image (np.ndarray): The image that contains multiple instances of the object.
+        box_threshold (float, optional): The threshold for detection. Defaults
+            to 0.10.

     Returns:
-        List[…
-            …
-            …
-            …
-            …
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.

     Example
     -------
-        >>> …
+        >>> owlv2_sam2_instance_segmentation(""flower"", image)
         [
-            …
-                …
-                …
-…
-…
+            {
+                'score': 0.49,
+                'label': 'flower',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
         ]
-    ",…
-"'…
-'…
-    …
-    …
+    ",owlv2_sam2_instance_segmentation
+"'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.

     Parameters:
-        …
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.

     Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the …
-            …
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.

     Example
     -------
-        >>> …
+        >>> owlv2_sam2_video_tracking(""car, dinosaur"", frames)
         [
-            …
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
         ]
-    ",…
-'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
-'vit_image_classification' is a tool that can classify an image. It returns a
-    list of classes and their probability scores based on image content.
-
-    Parameters:
-        image (np.ndarray): The image to classify or tag
-
-    Returns:
-        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
-            contains a list of labels and other a list of scores.
-
-    Example
-    -------
-        >>> vit_image_classification(image)
-        {""labels"": [""leopard"", ""lemur, otter"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
-    ",vit_image_classification
-'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'. It returns the predicted label and their probability scores based on image content.,"vit_nsfw_classification(image: numpy.ndarray) -> Dict[str, Any]:
-'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
-    It returns the predicted label and their probability scores based on image content.
-
-    Parameters:
-        image (np.ndarray): The image to classify or tag
-
-    Returns:
-        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
-            contains a list of labels and other a list of scores.
-
-    Example
-    -------
-        >>> vit_nsfw_classification(image)
-        {""label"": ""normal"", ""scores"": 0.68},
-    ",vit_nsfw_classification
+    ",owlv2_sam2_video_tracking
 "'countgd_object_detection' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores.","countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
 'countgd_object_detection' is a tool that can detect multiple instances of an
     object given a text prompt. It is particularly useful when trying to detect and
@@ -142,12 +133,12 @@ desc,doc,name
             {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
         ]
     ",countgd_object_detection
-"'…
-'…
-    …an object given a text prompt. It is particularly useful when trying…
-    …count a large number of objects. You can optionally separate object…
-    …prompt with commas. It returns a list of bounding boxes with…
-    …label names, masks associated confidence scores.
+"'countgd_sam2_instance_segmentation' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names, masks associated confidence scores.","countgd_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
+'countgd_sam2_instance_segmentation' is a tool that can detect multiple
+    instances of an object given a text prompt. It is particularly useful when trying
+    to detect and count a large number of objects. You can optionally separate object
+    names in the prompt with commas. It returns a list of bounding boxes with
+    normalized coordinates, label names, masks associated confidence scores.

     Parameters:
         prompt (str): The object that needs to be counted.
@@ -165,7 +156,7 @@ desc,doc,name

     Example
     -------
-        >>> …
+        >>> countgd_sam2_instance_segmentation(""flower"", image)
         [
             {
                 'score': 0.49,
@@ -178,7 +169,45 @@ desc,doc,name
                 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
             },
         ]
-    ",…
+    ",countgd_sam2_instance_segmentation
+"'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
+'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+            new objects.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+        >>> countgd_sam2_video_tracking(""car, dinosaur"", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    ",countgd_sam2_video_tracking
 "'florence2_ocr' is a tool that can detect text and text regions in an image. Each text region contains one line of text. It returns a list of detected text, the text region as a bounding box with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","florence2_ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
 'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
@@ -199,11 +228,12 @@ desc,doc,name
             {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
         ]
     ",florence2_ocr
-"'…
-'…
-    …prompt such as category names or referring expressions. The…
-    …prompt are separated by commas. It returns a list of…
-    …mask file names and associated probability scores of…
+"'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+'florence2_sam2_instance_segmentation' is a tool that can segment multiple
+    objects given a text prompt such as category names or referring expressions. The
+    categories in the text prompt are separated by commas. It returns a list of
+    bounding boxes, label names, mask file names and associated probability scores of
+    1.0.

     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -221,7 +251,7 @@ desc,doc,name

     Example
     -------
-        >>> …
+        >>> florence2_sam2_instance_segmentation(""car, dinosaur"", image)
         [
             {
                 'score': 1.0,
@@ -234,7 +264,7 @@ desc,doc,name
                 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
             },
         ]
-    ",…
+    ",florence2_sam2_instance_segmentation
 'florence2_sam2_video_tracking' is a tool that can segment and track multiple entities in a video given a text prompt such as category names or referring expressions. You can optionally separate the categories in the text with commas. It can find new objects every 'chunk_length' frames and is useful for tracking and counting without duplicating counts and always outputs scores of 1.0.,"florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
 'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
@@ -259,7 +289,7 @@ desc,doc,name

     Example
     -------
-        >>> …
+        >>> florence2_sam2_video_tracking(""car, dinosaur"", frames)
         [
             [
                 {
@@ -275,8 +305,8 @@ desc,doc,name
             ...
         ]
     ",florence2_sam2_video_tracking
-"'…
-'…
+"'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+'florence2_object_detection' is a tool that can detect multiple
     objects given a text prompt which can be object names or caption. You
     can optionally separate the object names in the text with commas. It returns a list
     of bounding boxes with normalized coordinates, label names and associated
@@ -297,12 +327,12 @@ desc,doc,name

     Example
     -------
-        >>> …
+        >>> florence2_object_detection('person looking at a coyote', image)
         [
             {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
         ]
-    ",…
+    ",florence2_object_detection
 'claude35_text_extraction' is a tool that can extract text from an image. It returns the extracted text as a string and can be used as an alternative to OCR if you do not need to know the exact bounding box of the text.,"claude35_text_extraction(image: numpy.ndarray) -> str:
 'claude35_text_extraction' is a tool that can extract text from an image. It
     returns the extracted text as a string and can be used as an alternative to OCR if
@@ -314,6 +344,107 @@ desc,doc,name
     Returns:
         str: The extracted text from the image.
     ",claude35_text_extraction
+"'document_extraction' is a tool that can extract structured information out of documents with different layouts. It returns the extracted data in a structured hierarchical format containing text, tables, pictures, charts, and other information.","document_extraction(image: numpy.ndarray) -> Dict[str, Any]:
+'document_extraction' is a tool that can extract structured information out of
+    documents with different layouts. It returns the extracted data in a structured
+    hierarchical format containing text, tables, pictures, charts, and other
+    information.
+
+    Parameters:
+        image (np.ndarray): The document image to analyze
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the extracted information.
+
+    Example
+    -------
+        >>> document_analysis(image)
+        {'pages':
+            [{'bbox': [0, 0, 1.0, 1.0],
+                    'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
+                                'label': 'page_header',
+                                'order': 75
+                                'caption': 'Annual Report 2024',
+                                'summary': 'This annual report summarizes ...' },
+                               {'bbox': [0.2, 0.9, 0.9, 1.0],
+                                'label': 'table',
+                                'order': 1119,
+                                'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
+                                'summary': 'This table illustrates a trend of ...'},
+                    ],
+    ",document_extraction
+"'document_qa' is a tool that can answer any questions about arbitrary documents, presentations, or tables. It's very useful for document QA tasks, you can ask it a specific question or ask it to return a JSON object answering multiple questions about the document.","document_qa(prompt: str, image: numpy.ndarray) -> str:
+'document_qa' is a tool that can answer any questions about arbitrary documents,
+    presentations, or tables. It's very useful for document QA tasks, you can ask it a
+    specific question or ask it to return a JSON object answering multiple questions
+    about the document.
+
+    Parameters:
+        prompt (str): The question to be answered about the document image.
+        image (np.ndarray): The document image to analyze.
+
+    Returns:
+        str: The answer to the question based on the document's context.
+
+    Example
+    -------
+        >>> document_qa(image, question)
+        'The answer to the question ...'
+    ",document_qa
+"'ocr' extracts text from an image. It returns a list of detected text, bounding boxes with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
+'ocr' extracts text from an image. It returns a list of detected text, bounding
+    boxes with normalized coordinates, and confidence scores. The results are sorted
+    from top-left to bottom right.
+
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
+            with normalized coordinates, and confidence score.
+
+    Example
+    -------
+        >>> ocr(image)
+        [
+            {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+        ]
+    ",ocr
+'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen2_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
+'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
+    images including regular images or images of documents or presentations. It can be
+    very useful for document QA or OCR text extraction. It returns text as an answer to
+    the question.
+
+    Parameters:
+        prompt (str): The question about the document image
+        images (List[np.ndarray]): The reference images used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> qwen2_vl_images_vqa('Give a summary of the document', images)
+        'The document talks about the history of the United States of America and its...'
+    ",qwen2_vl_images_vqa
+'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos including regular videos or videos of documents or presentations. It returns text as an answer to the question.,"qwen2_vl_video_vqa(prompt: str, frames: List[numpy.ndarray]) -> str:
+'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
+        'Lionel Messi'
+    ",qwen2_vl_video_vqa
 "'detr_segmentation' is a tool that can segment common objects in an image without any text prompt. It returns a list of detected objects as labels, their regions as masks and their scores.","detr_segmentation(image: numpy.ndarray) -> List[Dict[str, Any]]:
 'detr_segmentation' is a tool that can segment common objects in an
     image without any text prompt. It returns a list of detected objects
@@ -391,106 +522,38 @@ desc,doc,name
             [10, 11, 15, ..., 202, 202, 205],
             [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
     ",generate_pose_image
-…
-'…
-    …
-    between the objects, not the distance between the centers of the objects.
-
-    Parameters:
-        det1 (Dict[str, Any]): The first detection of boxes or masks.
-        det2 (Dict[str, Any]): The second detection of boxes or masks.
-        image_size (Tuple[int, int]): The size of the image given as (height, width).
-
-    Returns:
-        float: The closest distance between the two detections.
-
-    Example
-    -------
-        >>> closest_distance(det1, det2, image_size)
-        141.42
-    ",minimum_distance
-'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen2_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
-'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
-    images including regular images or images of documents or presentations. It can be
-    very useful for document QA or OCR text extraction. It returns text as an answer to
-    the question.
-
-    Parameters:
-        prompt (str): The question about the document image
-        images (List[np.ndarray]): The reference images used for the question
-
-    Returns:
-        str: A string which is the answer to the given prompt.
-
-    Example
-    -------
-        >>> qwen2_vl_images_vqa('Give a summary of the document', images)
-        'The document talks about the history of the United States of America and its...'
-    ",qwen2_vl_images_vqa
-'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos including regular videos or videos of documents or presentations. It returns text as an answer to the question.,"qwen2_vl_video_vqa(prompt: str, frames: List[numpy.ndarray]) -> str:
-'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
-    including regular videos or videos of documents or presentations. It returns text
-    as an answer to the question.
-
-    Parameters:
-        prompt (str): The question about the video
-        frames (List[np.ndarray]): The reference frames used for the question
-
-    Returns:
-        str: A string which is the answer to the given prompt.
-
-    Example
-    -------
-        >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
-        'Lionel Messi'
-    ",qwen2_vl_video_vqa
-"'document_extraction' is a tool that can extract structured information out of documents with different layouts. It returns the extracted data in a structured hierarchical format containing text, tables, pictures, charts, and other information.","document_extraction(image: numpy.ndarray) -> Dict[str, Any]:
-'document_extraction' is a tool that can extract structured information out of
-    documents with different layouts. It returns the extracted data in a structured
-    hierarchical format containing text, tables, pictures, charts, and other
-    information.
+'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
+'vit_image_classification' is a tool that can classify an image. It returns a
+    list of classes and their probability scores based on image content.

     Parameters:
-        image (np.ndarray): The …
+        image (np.ndarray): The image to classify or tag

     Returns:
-        Dict[str, Any]: A dictionary containing the …
+        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+            contains a list of labels and other a list of scores.

     Example
     -------
-        >>> …
-        {…
-…
-…
-…
-…
-                                'caption': 'Annual Report 2024',
-                                'summary': 'This annual report summarizes ...' },
-                               {'bbox': [0.2, 0.9, 0.9, 1.0],
-                                'label': table',
-                                'order': 1119,
-                                'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
-                                'summary': 'This table illustrates a trend of ...'},
-                    ],
-    ",document_extraction
-"'document_qa' is a tool that can answer any questions about arbitrary documents, presentations, or tables. It's very useful for document QA tasks, you can ask it a specific question or ask it to return a JSON object answering multiple questions about the document.","document_qa(prompt: str, image: numpy.ndarray) -> str:
-'document_qa' is a tool that can answer any questions about arbitrary documents,
-    presentations, or tables. It's very useful for document QA tasks, you can ask it a
-    specific question or ask it to return a JSON object answering multiple questions
-    about the document.
+        >>> vit_image_classification(image)
+        {""labels"": [""leopard"", ""lemur, otter"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
+    ",vit_image_classification
+'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'. It returns the predicted label and their probability scores based on image content.,"vit_nsfw_classification(image: numpy.ndarray) -> Dict[str, Any]:
+'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
+    It returns the predicted label and their probability scores based on image content.

     Parameters:
-        …
-        image (np.ndarray): The document image to analyze.
+        image (np.ndarray): The image to classify or tag

     Returns:
-        str: …
+        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+            contains a list of labels and other a list of scores.

     Example
     -------
-        >>> …
-        …
-    ",…
+        >>> vit_nsfw_classification(image)
+        {""label"": ""normal"", ""scores"": 0.68},
+    ",vit_nsfw_classification
 'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
 'video_temporal_localization' will run qwen2vl on each chunk_length_frames
     value selected for the video. It can detect multiple objects independently per
@@ -560,78 +623,24 @@ desc,doc,name
         >>> siglip_classification(image, ['dog', 'cat', 'bird'])
         {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
     ",siglip_classification
-"'…
-'…
-    …
-    …
-    mask file names and associated probability scores.
-
-    Parameters:
-        prompt (str): The prompt to ground to the image.
-        image (np.ndarray): The image to ground the prompt to.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-            bounding box, and mask of the detected objects with normalized coordinates
-            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-            the background.
-
-    Example
-    -------
-        >>> countgd_sam2_video_tracking(""car, dinosaur"", frames)
-        [
-            [
-                {
-                    'label': '0: dinosaur',
-                    'bbox': [0.1, 0.11, 0.35, 0.4],
-                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                        [0, 0, 0, ..., 0, 0, 0],
-                        ...,
-                        [0, 0, 0, ..., 0, 0, 0],
-                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-                },
-            ],
-            ...
-        ]
-    ",owlv2_sam2_video_tracking
-"'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
-'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
-    prompt such as category names or referring expressions. The categories in the text
-    prompt are separated by commas. It returns a list of bounding boxes, label names,
-    mask file names and associated probability scores.
+"'minimum_distance' calculates the minimum distance between two detections which can include bounding boxes and or masks. This will return the closest distance between the objects, not the distance between the centers of the objects.","minimum_distance(det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]) -> float:
+'minimum_distance' calculates the minimum distance between two detections which
+    can include bounding boxes and or masks. This will return the closest distance
+    between the objects, not the distance between the centers of the objects.

     Parameters:
-        …
-        …
+        det1 (Dict[str, Any]): The first detection of boxes or masks.
+        det2 (Dict[str, Any]): The second detection of boxes or masks.
+        image_size (Tuple[int, int]): The size of the image given as (height, width).

     Returns:
-        …
-            bounding box, and mask of the detected objects with normalized coordinates
-            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-            the background.
+        float: The closest distance between the two detections.

     Example
     -------
-        >>> …
-        …
-…
-                {
-                    'label': '0: dinosaur',
-                    'bbox': [0.1, 0.11, 0.35, 0.4],
-                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                        [0, 0, 0, ..., 0, 0, 0],
-                        ...,
-                        [0, 0, 0, ..., 0, 0, 0],
-                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-                },
-            ],
-            ...
-        ]
-    ",countgd_sam2_video_tracking
+        >>> closest_distance(det1, det2, image_size)
+        141.42
+    ",minimum_distance
 "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
 'extract_frames_and_timestamps' extracts frames and timestamps from a video
     which can be a file path, url or youtube link, returns a list of dictionaries