vision-agent 0.2.215__py3-none-any.whl → 0.2.217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +101 -0
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/tools/__init__.py +2 -1
- vision_agent/tools/planner_tools.py +9 -1
- vision_agent/tools/tools.py +331 -213
- {vision_agent-0.2.215.dist-info → vision_agent-0.2.217.dist-info}/METADATA +1 -1
- {vision_agent-0.2.215.dist-info → vision_agent-0.2.217.dist-info}/RECORD +9 -9
- {vision_agent-0.2.215.dist-info → vision_agent-0.2.217.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.215.dist-info → vision_agent-0.2.217.dist-info}/WHEEL +0 -0
    
vision_agent/.sim_tools/df.csv
CHANGED

@@ -444,6 +444,35 @@ desc,doc,name
         >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
         'Lionel Messi'
     ",qwen2_vl_video_vqa
+"'document_extraction' is a tool that can extract structured information out of documents with different layouts. It returns the extracted data in a structured hierarchical format containing text, tables, pictures, charts, and other information.","document_extraction(image: numpy.ndarray) -> Dict[str, Any]:
+'document_extraction' is a tool that can extract structured information out of
+    documents with different layouts. It returns the extracted data in a structured
+    hierarchical format containing text, tables, pictures, charts, and other
+    information.
+
+    Parameters:
+        image (np.ndarray): The document image to analyze
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the extracted information.
+
+    Example
+    -------
+        >>> document_analysis(image)
+        {'pages':
+            [{'bbox': [0, 0, 1700, 2200],
+                    'chunks': [{'bbox': [1371, 75, 1503, 112],
+                                'label': 'page_header',
+                                'order': 75
+                                'caption': 'Annual Report 2024',
+                                'summary': 'This annual report summarizes ...' },
+                               {'bbox': [201, 1119, 1497, 1647],
+                                'label': table',
+                                'order': 1119,
+                                'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
+                                'summary': 'This table illustrates a trend of ...'},
+                    ],
+    ",document_extraction
 'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
 'video_temporal_localization' will run qwen2vl on each chunk_length_frames
     value selected for the video. It can detect multiple objects independently per
@@ -513,6 +542,78 @@ desc,doc,name
         >>> siglip_classification(image, ['dog', 'cat', 'bird'])
         {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
     ",siglip_classification
+"'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+        >>> countgd_sam2_video_tracking(""car, dinosaur"", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    ",owlv2_sam2_video_tracking
+"'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
+'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+        >>> countgd_sam2_video_tracking(""car, dinosaur"", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    ",countgd_sam2_video_tracking
 "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
 'extract_frames_and_timestamps' extracts frames and timestamps from a video
     which can be a file path, url or youtube link, returns a list of dictionaries
    
vision_agent/.sim_tools/embs.npy
CHANGED
Binary file
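Why both .sim_tools files change together: df.csv holds one desc/doc/name row per tool and embs.npy holds a row-aligned embedding per tool, so registering new tools (document_extraction and the two video-tracking wrappers) touches both. A minimal sketch of how such a pair can be consumed for similarity-based tool retrieval; the loading code below is an assumption for illustration, not vision-agent's exact retrieval implementation:

    # Sketch: rank tools by cosine similarity between a query embedding and embs.npy.
    # Assumes df.csv rows and embs.npy rows are aligned by index.
    import numpy as np
    import pandas as pd

    def top_k_tools(query_emb: np.ndarray, df_path: str, embs_path: str, k: int = 5) -> list:
        df = pd.read_csv(df_path)   # columns: desc, doc, name
        embs = np.load(embs_path)   # shape (num_tools, dim), one row per df row
        sims = embs @ query_emb / (
            np.linalg.norm(embs, axis=1) * np.linalg.norm(query_emb)
        )
        return df["name"].iloc[np.argsort(-sims)[:k]].tolist()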
    
vision_agent/tools/planner_tools.py
CHANGED

@@ -143,7 +143,14 @@ def run_tool_testing(
     code = extract_tag(response, "code")  # type: ignore
     if code is None:
         raise ValueError(f"Could not extract code from response: {response}")
-
+
+    # If there's a syntax error with the code, process_code can crash. Executing the
+    # code and then sending the error to the LLM should correct it.
+    try:
+        code = process_code(code)
+    except Exception as e:
+        _LOGGER.error(f"Error processing code: {e}")
+
     tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
     tool_output_str = tool_output.text(include_results=False).strip()

@@ -167,6 +174,7 @@ def run_tool_testing(
             DefaultImports.prepend_imports(code)
         )
         tool_output_str = tool_output.text(include_results=False).strip()
+        count += 1

     return code, tool_docs_str, tool_output
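The first hunk makes run_tool_testing tolerate a process_code crash so a syntax error surfaces when the code is actually executed and the error text can be fed back to the LLM; the second hunk fixes the retry loop, which previously never incremented its counter. A runnable sketch of that execute-and-correct pattern; run() and the fix callable are stand-ins for code_interpreter.exec_isolation and the planner's LLM repair step, not the library's API:

    # Sketch: execute candidate code, and on failure hand the traceback to a
    # repair function (an LLM call in the real code), bounded by a retry counter.
    import traceback
    from typing import Callable

    MAX_RETRIES = 3

    def run(code: str) -> str:
        try:
            exec(compile(code, "<tool_test>", "exec"), {})
            return "ok"
        except Exception:
            return traceback.format_exc()

    def test_with_retries(code: str, fix: Callable[[str, str], str]) -> str:
        count = 0
        output = run(code)
        while output != "ok" and count < MAX_RETRIES:
            code = fix(code, output)  # the repair call sees the error text
            output = run(code)
            count += 1                # the added line: without it the loop never advances
        return code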
    
vision_agent/tools/tools.py
CHANGED

@@ -119,6 +119,120 @@ def _display_tool_trace(
     display({MimeType.APPLICATION_JSON: tool_call_trace.model_dump()}, raw=True)


+class ODModels(str, Enum):
+    COUNTGD = "countgd"
+    FLORENCE2 = "florence2"
+    OWLV2 = "owlv2"
+
+
+def od_sam2_video_tracking(
+    od_model: ODModels,
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
+
+    if chunk_length is None:
+        step = 1  # Process every frame
+    elif chunk_length <= 0:
+        raise ValueError("chunk_length must be a positive integer or None.")
+    else:
+        step = chunk_length  # Process frames with the specified step size
+
+    for idx in range(0, len(frames), step):
+        if od_model == ODModels.COUNTGD:
+            results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
+            function_name = "countgd_object_detection"
+        elif od_model == ODModels.OWLV2:
+            results[idx] = owl_v2_image(
+                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            )
+            function_name = "owl_v2_image"
+        elif od_model == ODModels.FLORENCE2:
+            results[idx] = florence2_sam2_image(
+                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            )
+            function_name = "florence2_sam2_image"
+        else:
+            raise NotImplementedError(
+                f"Object detection model '{od_model}' is not implemented."
+            )
+
+    image_size = frames[0].shape[:2]
+
+    def _transform_detections(
+        input_list: List[Optional[List[Dict[str, Any]]]]
+    ) -> List[Optional[Dict[str, Any]]]:
+        output_list: List[Optional[Dict[str, Any]]] = []
+
+        for _, frame in enumerate(input_list):
+            if frame is not None:
+                labels = [detection["label"] for detection in frame]
+                bboxes = [
+                    denormalize_bbox(detection["bbox"], image_size)
+                    for detection in frame
+                ]
+
+                output_list.append(
+                    {
+                        "labels": labels,
+                        "bboxes": bboxes,
+                    }
+                )
+            else:
+                output_list.append(None)
+
+        return output_list
+
+    output = _transform_detections(results)
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
+    metadata = {"function_name": function_name}
+
+    detections = send_task_inference_request(
+        payload,
+        "sam2",
+        files=files,
+        metadata=metadata,
+    )
+
+    return_data = []
+    for frame in detections:
+        return_frame_data = []
+        for detection in frame:
+            mask = rle_decode_array(detection["mask"])
+            label = str(detection["id"]) + ": " + detection["label"]
+            return_frame_data.append(
+                {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
+            )
+        return_data.append(return_frame_data)
+    return_data = add_bboxes_from_masks(return_data)
+    return_data = nms(return_data, iou_threshold=0.95)
+
+    # We save the RLE for display purposes, re-calculting RLE can get very expensive.
+    # Deleted here because we are returning the numpy masks instead
+    display_data = []
+    for frame in return_data:
+        display_frame_data = []
+        for obj in frame:
+            display_frame_data.append(
+                {
+                    "label": obj["label"],
+                    "score": obj["score"],
+                    "bbox": denormalize_bbox(obj["bbox"], image_size),
+                    "mask": obj["rle"],
+                }
+            )
+            del obj["rle"]
+        display_data.append(display_frame_data)
+
+    return {"files": files, "return_data": return_data, "display_data": detections}
+
+
 def owl_v2_image(
     prompt: str,
     image: np.ndarray,
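The sampling rule above is the heart of the new helper: the chosen detector runs only on every step-th frame, and those detections (denormalized back to pixel coordinates) are sent to the sam2 endpoint as seeds to track through the intervening frames. A small sketch of just that rule, lifted from the code:

    # Sketch: which frame indices get a fresh detection pass for a given chunk_length.
    from typing import List, Optional

    def seed_indices(num_frames: int, chunk_length: Optional[int]) -> List[int]:
        if chunk_length is None:
            step = 1  # detect on every frame
        elif chunk_length <= 0:
            raise ValueError("chunk_length must be a positive integer or None.")
        else:
            step = chunk_length
        return list(range(0, num_frames, step))

    assert seed_indices(25, 10) == [0, 10, 20]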
@@ -302,6 +416,64 @@ def owl_v2_video(
     return bboxes_formatted


+def owlv2_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+    """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+        >>> countgd_sam2_video_tracking("car, dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    """
+
+    ret = od_sam2_video_tracking(
+        ODModels.OWLV2,
+        prompt=prompt,
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=fine_tune_id,
+    )
+    _display_tool_trace(
+        owlv2_sam2_video_tracking.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
 def florence2_sam2_image(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
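A usage sketch for the new wrapper. It assumes the function is exported from vision_agent.tools (the __init__.py change in this release) and that frames come from extract_frames_and_timestamps, documented elsewhere in this file:

    from vision_agent.tools import extract_frames_and_timestamps, owlv2_sam2_video_tracking

    frames = [f["frame"] for f in extract_frames_and_timestamps("video.mp4", fps=1)]
    tracks = owlv2_sam2_video_tracking("car, person", frames, chunk_length=10)
    for frame_idx, detections in enumerate(tracks):
        for det in detections:
            # 'label' is '<track_id>: <name>', 'bbox' is normalized, 'mask' is uint8
            print(frame_idx, det["label"], det["bbox"], det["mask"].shape)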
@@ -834,6 +1006,59 @@ def countgd_sam2_object_detection(
     return seg_ret["return_data"]  # type: ignore


+def countgd_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+) -> List[List[Dict[str, Any]]]:
+    """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+        >>> countgd_sam2_video_tracking("car, dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    """
+
+    ret = od_sam2_video_tracking(
+        ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
+    )
+    _display_tool_trace(
+        countgd_sam2_video_tracking.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
 def countgd_example_based_counting(
     visual_prompts: List[List[float]],
     image: np.ndarray,
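The countgd variant is identical apart from the detector (and the absence of fine_tune_id). Since track identity is encoded in the '<id>: <label>' prefix assigned by od_sam2_video_tracking, distinct instances can be counted across the whole clip; a sketch under the same export assumption as above:

    from vision_agent.tools import countgd_sam2_video_tracking, extract_frames_and_timestamps

    frames = [f["frame"] for f in extract_frames_and_timestamps("video.mp4", fps=1)]
    tracks = countgd_sam2_video_tracking("dinosaur", frames, chunk_length=10)
    unique_ids = {det["label"].split(":")[0] for frame in tracks for det in frame}
    print(f"{len(unique_ids)} distinct dinosaurs across {len(tracks)} frames")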
@@ -1879,11 +2104,11 @@ def closest_box_distance(
     return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))


-def document_analysis(image: np.ndarray) -> Dict[str, Any]:
-    """'
-
-
-
+def document_extraction(image: np.ndarray) -> Dict[str, Any]:
+    """'document_extraction' is a tool that can extract structured information out of
+    documents with different layouts. It returns the extracted data in a structured
+    hierarchical format containing text, tables, pictures, charts, and other
+    information.

     Parameters:
         image (np.ndarray): The document image to analyze
| 1894 2119 | 
             
                Example
         | 
| 1895 2120 | 
             
                -------
         | 
| 1896 2121 | 
             
                    >>> document_analysis(image)
         | 
| 1897 | 
            -
                    {'pages': | 
| 1898 | 
            -
             | 
| 1899 | 
            -
             | 
| 2122 | 
            +
                    {'pages':
         | 
| 2123 | 
            +
                        [{'bbox': [0, 0, 1.0, 1.0],
         | 
| 2124 | 
            +
                                'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
         | 
| 1900 2125 | 
             
                                            'label': 'page_header',
         | 
| 1901 | 
            -
                                            ' | 
| 1902 | 
            -
             | 
| 1903 | 
            -
                                            ' | 
| 1904 | 
            -
             | 
| 1905 | 
            -
             | 
| 1906 | 
            -
             | 
| 1907 | 
            -
             | 
| 1908 | 
            -
             | 
| 1909 | 
            -
                                            'label': 'picture',
         | 
| 1910 | 
            -
                                            'summary': 'This bar chart illustrates the trend of ...'},
         | 
| 2126 | 
            +
                                            'order': 75
         | 
| 2127 | 
            +
                                            'caption': 'Annual Report 2024',
         | 
| 2128 | 
            +
                                            'summary': 'This annual report summarizes ...' },
         | 
| 2129 | 
            +
                                           {'bbox': [0.2, 0.9, 0.9, 1.0],
         | 
| 2130 | 
            +
                                            'label': table',
         | 
| 2131 | 
            +
                                            'order': 1119,
         | 
| 2132 | 
            +
                                            'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
         | 
| 2133 | 
            +
                                            'summary': 'This table illustrates a trend of ...'},
         | 
| 1911 2134 | 
             
                                ],
         | 
| 1912 2135 | 
             
                """
         | 
| 1913 2136 |  | 
@@ -1919,7 +2142,7 @@ def document_analysis(image: np.ndarray) -> Dict[str, Any]:
         "model": "document-analysis",
     }

-
+    data: Dict[str, Any] = send_inference_request(
         payload=payload,
         endpoint_name="document-analysis",
         files=files,
@@ -1927,14 +2150,99 @@ def document_analysis(image: np.ndarray) -> Dict[str, Any]:
         metadata_payload={"function_name": "document_analysis"},
     )

+    # don't display normalized bboxes
     _display_tool_trace(
-        document_analysis.__name__,
+        document_extraction.__name__,
         payload,
-
+        data,
         files,
     )

-    return data
+    def normalize(data: Any) -> Dict[str, Any]:
+        if isinstance(data, Dict):
+            if "bbox" in data:
+                data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
+            for key in data:
+                data[key] = normalize(data[key])
+        elif isinstance(data, List):
+            for i in range(len(data)):
+                data[i] = normalize(data[i])
+        return data  # type: ignore
+
+    data = normalize(data)
+
+    return data
+
+
+def document_qa(
+    prompt: str,
+    image: np.ndarray,
+) -> str:
+    """'document_qa' is a tool that can answer any questions about arbitrary
+    images of documents or presentations. It answers by analyzing the contextual document data
+    and then using a model to answer specific questions. It returns text as an answer to the question.
+
+    Parameters:
+        prompt (str): The question to be answered about the document image
+        image (np.ndarray): The document image to analyze
+
+    Returns:
+        str: The answer to the question based on the document's context.
+
+    Example
+    -------
+        >>> document_qa(image, question)
+        'The answer to the question ...'
+    """
+
+    image_file = numpy_to_bytes(image)
+
+    files = [("image", image_file)]
+
+    payload = {
+        "model": "document-analysis",
+    }
+
+    data: dict[str, Any] = send_inference_request(
+        payload=payload,
+        endpoint_name="document-analysis",
+        files=files,
+        v2=True,
+        metadata_payload={"function_name": "document_qa"},
+    )
+
+    def normalize(data: Any) -> Dict[str, Any]:
+        if isinstance(data, Dict):
+            if "bbox" in data:
+                data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
+            for key in data:
+                data[key] = normalize(data[key])
+        elif isinstance(data, List):
+            for i in range(len(data)):
+                data[i] = normalize(data[i])
+        return data  # type: ignore
+
+    data = normalize(data)
+
+    prompt = f"""
+    Document Context:
+    {data}\n
+    Question: {prompt}\n
+    Please provide a clear, concise answer using only the information from the document. If the answer is not definitively contained in the document, say "I cannot find the answer in the provided document."
+    """
+
+    lmm = AnthropicLMM()
+    llm_output = lmm.generate(prompt=prompt)
+    llm_output = cast(str, llm_output)
+
+    _display_tool_trace(
+        document_qa.__name__,
+        payload,
+        llm_output,
+        files,
+    )
+
+    return llm_output


 # Utility and visualization functions
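document_extraction and document_qa share the same inner normalize helper: every 'bbox' found anywhere in the nested document-analysis response is rescaled from pixel to 0-1 coordinates (which is also why the docstring example above switched from pixel to normalized bboxes). A standalone sketch with normalize_bbox inlined, assuming image.shape[:2] is (height, width); document_qa then formats the normalized data into a prompt for AnthropicLMM and returns the model's answer:

    # Sketch: recursively walk a nested dict/list response and normalize every bbox.
    from typing import Any

    def normalize(data: Any, height: int, width: int) -> Any:
        if isinstance(data, dict):
            if "bbox" in data:
                x1, y1, x2, y2 = data["bbox"]
                data["bbox"] = [x1 / width, y1 / height, x2 / width, y2 / height]
            for key in data:
                data[key] = normalize(data[key], height, width)
        elif isinstance(data, list):
            for i in range(len(data)):
                data[i] = normalize(data[i], height, width)
        return data

    page = {"pages": [{"bbox": [0, 0, 1700, 2200],
                       "chunks": [{"bbox": [1371, 75, 1503, 112]}]}]}
    print(normalize(page, height=2200, width=1700))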
| @@ -2453,197 +2761,6 @@ def _plot_counting( | |
| 2453 2761 | 
             
                return image
         | 
| 2454 2762 |  | 
| 2455 2763 |  | 
| 2456 | 
            -
            class ODModels(str, Enum):
         | 
| 2457 | 
            -
                COUNTGD = "countgd"
         | 
| 2458 | 
            -
                FLORENCE2 = "florence2"
         | 
| 2459 | 
            -
                OWLV2 = "owlv2"
         | 
| 2460 | 
            -
             | 
| 2461 | 
            -
             | 
| 2462 | 
            -
            def od_sam2_video_tracking(
         | 
| 2463 | 
            -
                od_model: ODModels,
         | 
| 2464 | 
            -
                prompt: str,
         | 
| 2465 | 
            -
                frames: List[np.ndarray],
         | 
| 2466 | 
            -
                chunk_length: Optional[int] = 10,
         | 
| 2467 | 
            -
                fine_tune_id: Optional[str] = None,
         | 
| 2468 | 
            -
            ) -> List[List[Dict[str, Any]]]:
         | 
| 2469 | 
            -
             | 
| 2470 | 
            -
                results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
         | 
| 2471 | 
            -
             | 
| 2472 | 
            -
                if chunk_length is None:
         | 
| 2473 | 
            -
                    step = 1  # Process every frame
         | 
| 2474 | 
            -
                elif chunk_length <= 0:
         | 
| 2475 | 
            -
                    raise ValueError("chunk_length must be a positive integer or None.")
         | 
| 2476 | 
            -
                else:
         | 
| 2477 | 
            -
                    step = chunk_length  # Process frames with the specified step size
         | 
| 2478 | 
            -
             | 
| 2479 | 
            -
                for idx in range(0, len(frames), step):
         | 
| 2480 | 
            -
                    if od_model == ODModels.COUNTGD:
         | 
| 2481 | 
            -
                        results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
         | 
| 2482 | 
            -
                        function_name = "countgd_object_detection"
         | 
| 2483 | 
            -
                    elif od_model == ODModels.OWLV2:
         | 
| 2484 | 
            -
                        results[idx] = owl_v2_image(
         | 
| 2485 | 
            -
                            prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
         | 
| 2486 | 
            -
                        )
         | 
| 2487 | 
            -
                        function_name = "owl_v2_image"
         | 
| 2488 | 
            -
                    elif od_model == ODModels.FLORENCE2:
         | 
| 2489 | 
            -
                        results[idx] = florence2_sam2_image(
         | 
| 2490 | 
            -
                            prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
         | 
| 2491 | 
            -
                        )
         | 
| 2492 | 
            -
                        function_name = "florence2_sam2_image"
         | 
| 2493 | 
            -
                    else:
         | 
| 2494 | 
            -
                        raise NotImplementedError(
         | 
| 2495 | 
            -
                            f"Object detection model '{od_model}' is not implemented."
         | 
| 2496 | 
            -
                        )
         | 
| 2497 | 
            -
             | 
| 2498 | 
            -
                image_size = frames[0].shape[:2]
         | 
| 2499 | 
            -
             | 
| 2500 | 
            -
                def _transform_detections(
         | 
| 2501 | 
            -
                    input_list: List[Optional[List[Dict[str, Any]]]]
         | 
| 2502 | 
            -
                ) -> List[Optional[Dict[str, Any]]]:
         | 
| 2503 | 
            -
                    output_list: List[Optional[Dict[str, Any]]] = []
         | 
| 2504 | 
            -
             | 
| 2505 | 
            -
                    for idx, frame in enumerate(input_list):
         | 
| 2506 | 
            -
                        if frame is not None:
         | 
| 2507 | 
            -
                            labels = [detection["label"] for detection in frame]
         | 
| 2508 | 
            -
                            bboxes = [
         | 
| 2509 | 
            -
                                denormalize_bbox(detection["bbox"], image_size)
         | 
| 2510 | 
            -
                                for detection in frame
         | 
| 2511 | 
            -
                            ]
         | 
| 2512 | 
            -
             | 
| 2513 | 
            -
                            output_list.append(
         | 
| 2514 | 
            -
                                {
         | 
| 2515 | 
            -
                                    "labels": labels,
         | 
| 2516 | 
            -
                                    "bboxes": bboxes,
         | 
| 2517 | 
            -
                                }
         | 
| 2518 | 
            -
                            )
         | 
| 2519 | 
            -
                        else:
         | 
| 2520 | 
            -
                            output_list.append(None)
         | 
| 2521 | 
            -
             | 
| 2522 | 
            -
                    return output_list
         | 
| 2523 | 
            -
             | 
| 2524 | 
            -
                output = _transform_detections(results)
         | 
| 2525 | 
            -
             | 
| 2526 | 
            -
                buffer_bytes = frames_to_bytes(frames)
         | 
| 2527 | 
            -
                files = [("video", buffer_bytes)]
         | 
| 2528 | 
            -
                payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
         | 
| 2529 | 
            -
                metadata = {"function_name": function_name}
         | 
| 2530 | 
            -
             | 
| 2531 | 
            -
                detections = send_task_inference_request(
         | 
| 2532 | 
            -
                    payload,
         | 
| 2533 | 
            -
                    "sam2",
         | 
| 2534 | 
            -
                    files=files,
         | 
| 2535 | 
            -
                    metadata=metadata,
         | 
| 2536 | 
            -
                )
         | 
| 2537 | 
            -
             | 
| 2538 | 
            -
                return_data = []
         | 
| 2539 | 
            -
                for frame in detections:
         | 
| 2540 | 
            -
                    return_frame_data = []
         | 
| 2541 | 
            -
                    for detection in frame:
         | 
| 2542 | 
            -
                        mask = rle_decode_array(detection["mask"])
         | 
| 2543 | 
            -
                        label = str(detection["id"]) + ": " + detection["label"]
         | 
| 2544 | 
            -
                        return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
         | 
| 2545 | 
            -
                    return_data.append(return_frame_data)
         | 
| 2546 | 
            -
                return_data = add_bboxes_from_masks(return_data)
         | 
| 2547 | 
            -
                return nms(return_data, iou_threshold=0.95)
         | 
| 2548 | 
            -
             | 
| 2549 | 
            -
             | 
| 2550 | - def countgd_sam2_video_tracking(
| 2551 | -     prompt: str,
| 2552 | -     frames: List[np.ndarray],
| 2553 | -     chunk_length: Optional[int] = 10,
| 2554 | - ) -> List[List[Dict[str, Any]]]:
| 2555 | -     """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
| 2556 | -     prompt such as category names or referring expressions. The categories in the text
| 2557 | -     prompt are separated by commas. It returns a list of bounding boxes, label names,
| 2558 | -     masks and associated probability scores.
| 2559 | -
| 2560 | -     Parameters:
| 2561 | -         prompt (str): The prompt to ground to the frames.
| 2562 | -         frames (List[np.ndarray]): The list of frames to ground the prompt to.
| 2563 | -
| 2564 | -     Returns:
| 2565 | -         List[List[Dict[str, Any]]]: A list, one inner list per frame, of dictionaries containing the score, label,
| 2566 | -             bounding box, and mask of the detected objects with normalized coordinates
| 2567 | -             (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
| 2568 | -             and xmax and ymax are the coordinates of the bottom-right of the bounding box.
| 2569 | -             The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates
| 2570 | -             the background.
| 2571 | -
| 2572 | -     Example
| 2573 | -     -------
| 2574 | -         >>> countgd_sam2_video_tracking("car, dinosaur", frames)
| 2575 | -         [
| 2576 | -             [
| 2577 | -                 {
| 2578 | -                     'label': '0: dinosaur',
| 2579 | -                     'bbox': [0.1, 0.11, 0.35, 0.4],
| 2580 | -                     'mask': array([[0, 0, 0, ..., 0, 0, 0],
| 2581 | -                         [0, 0, 0, ..., 0, 0, 0],
| 2582 | -                         ...,
| 2583 | -                         [0, 0, 0, ..., 0, 0, 0],
| 2584 | -                         [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
| 2585 | -                 },
| 2586 | -             ],
| 2587 | -             ...
| 2588 | -         ]
| 2589 | -     """
| 2590 | -
| 2591 | -     return od_sam2_video_tracking(
| 2592 | -         ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
| 2593 | -     )
| 2594 | -
| 2595 | -
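A usage sketch for the tool as documented above. The video path, the OpenCV-based frame decoding, the top-level vision_agent.tools import, and the printed fields are illustrative assumptions, not part of the diff.

```python
import cv2
from vision_agent.tools import countgd_sam2_video_tracking

# Decode a video into a list of RGB numpy arrays (OpenCV reads BGR).
cap = cv2.VideoCapture("video.mp4")
frames = []
while True:
    ok, frame_bgr = cap.read()
    if not ok:
        break
    frames.append(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
cap.release()

# One inner list of detections per frame; labels carry the track id ("0: car", ...).
tracks = countgd_sam2_video_tracking("car, dinosaur", frames, chunk_length=10)
for frame_idx, detections in enumerate(tracks):
    for det in detections:
        print(frame_idx, det["label"], det["bbox"], det["score"])
```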
            -
            def owlv2_sam2_video_tracking(
         | 
| 2597 | 
            -
                prompt: str,
         | 
| 2598 | 
            -
                frames: List[np.ndarray],
         | 
| 2599 | 
            -
                chunk_length: Optional[int] = 10,
         | 
| 2600 | 
            -
                fine_tune_id: Optional[str] = None,
         | 
| 2601 | 
            -
            ) -> List[List[Dict[str, Any]]]:
         | 
| 2602 | 
            -
                """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
         | 
| 2603 | 
            -
                prompt such as category names or referring expressions. The categories in the text
         | 
| 2604 | 
            -
                prompt are separated by commas. It returns a list of bounding boxes, label names,
         | 
| 2605 | 
            -
                mask file names and associated probability scores.
         | 
| 2606 | 
            -
             | 
| 2607 | 
            -
                Parameters:
         | 
| 2608 | 
            -
                    prompt (str): The prompt to ground to the image.
         | 
| 2609 | 
            -
                    image (np.ndarray): The image to ground the prompt to.
         | 
| 2610 | 
            -
             | 
| 2611 | 
            -
                Returns:
         | 
| 2612 | 
            -
                    List[Dict[str, Any]]: A list of dictionaries containing the score, label,
         | 
| 2613 | 
            -
                        bounding box, and mask of the detected objects with normalized coordinates
         | 
| 2614 | 
            -
                        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
         | 
| 2615 | 
            -
                        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
         | 
| 2616 | 
            -
                        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
         | 
| 2617 | 
            -
                        the background.
         | 
| 2618 | 
            -
             | 
| 2619 | 
            -
                Example
         | 
| 2620 | 
            -
                -------
         | 
| 2621 | 
            -
                    >>> countgd_sam2_video_tracking("car, dinosaur", frames)
         | 
| 2622 | 
            -
                    [
         | 
| 2623 | 
            -
                        [
         | 
| 2624 | 
            -
                            {
         | 
| 2625 | 
            -
                                'label': '0: dinosaur',
         | 
| 2626 | 
            -
                                'bbox': [0.1, 0.11, 0.35, 0.4],
         | 
| 2627 | 
            -
                                'mask': array([[0, 0, 0, ..., 0, 0, 0],
         | 
| 2628 | 
            -
                                    [0, 0, 0, ..., 0, 0, 0],
         | 
| 2629 | 
            -
                                    ...,
         | 
| 2630 | 
            -
                                    [0, 0, 0, ..., 0, 0, 0],
         | 
| 2631 | 
            -
                                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
         | 
| 2632 | 
            -
                            },
         | 
| 2633 | 
            -
                        ],
         | 
| 2634 | 
            -
                        ...
         | 
| 2635 | 
            -
                    ]
         | 
| 2636 | 
            -
                """
         | 
| 2637 | 
            -
             | 
| 2638 | 
            -
                return od_sam2_video_tracking(
         | 
| 2639 | 
            -
                    ODModels.OWLV2,
         | 
| 2640 | 
            -
                    prompt=prompt,
         | 
| 2641 | 
            -
                    frames=frames,
         | 
| 2642 | 
            -
                    chunk_length=chunk_length,
         | 
| 2643 | 
            -
                    fine_tune_id=fine_tune_id,
         | 
| 2644 | 
            -
                )
         | 
| 2645 | 
            -
             | 
| 2646 | 
            -
             | 
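Both removed wrappers are thin adapters over od_sam2_video_tracking, selecting the detector via an ODModels enum member. A minimal sketch of that dispatch pattern follows; the enum values, the stub detector, and the chunking logic are stand-ins for illustration, not the package's internals (the real tool propagates masks with SAM2 between re-detections).

```python
from enum import Enum
from typing import Any, Dict, List

class ODModels(str, Enum):
    COUNTGD = "countgd"
    OWLV2 = "owlv2"

def detect_stub(model: ODModels, prompt: str, frame: Any) -> List[Dict[str, Any]]:
    # Stand-in for running the chosen open-vocabulary detector on one frame.
    first_category = prompt.split(",")[0].strip()
    return [{"label": "0: " + first_category, "score": 1.0, "model": model.value}]

def od_sam2_video_tracking_sketch(
    od_model: ODModels,
    prompt: str,
    frames: List[Any],
    chunk_length: int = 10,
) -> List[List[Dict[str, Any]]]:
    results: List[List[Dict[str, Any]]] = []
    detections: List[Dict[str, Any]] = []
    for i, frame in enumerate(frames):
        # Re-detect at chunk boundaries to pick up newly appearing objects;
        # between boundaries a tracker would carry existing masks forward.
        if i % chunk_length == 0:
            detections = detect_stub(od_model, prompt, frame)
        results.append(list(detections))
    return results

print(od_sam2_video_tracking_sketch(ODModels.OWLV2, "car, dinosaur", frames=[None] * 3))
```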
| 2647 2764 |   FUNCTION_TOOLS = [
| 2648 2765 |       owl_v2_image,
| 2649 2766 |       owl_v2_video,
| @@ -2663,6 +2780,7 @@ FUNCTION_TOOLS = [ |
| 2663 2780 |       minimum_distance,
| 2664 2781 |       qwen2_vl_images_vqa,
| 2665 2782 |       qwen2_vl_video_vqa,
| 2783 | +     document_extraction,
| 2666 2784 |       video_temporal_localization,
| 2667 2785 |       flux_image_inpainting,
| 2668 2786 |       siglip_classification,
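This hunk's only change is registering document_extraction in FUNCTION_TOOLS, which is a plain list of callables. As a hedged illustration of why registration alone suffices, here is one way such a registry can be turned into agent-facing tool records; the consumption logic and the toy stand-in function are assumptions, not the package's code.

```python
import inspect
from typing import Callable, Dict, List

def index_tools(tools: List[Callable]) -> List[Dict[str, str]]:
    # Build (name, signature, description) records from a list of callables,
    # using each function's docstring as its tool description.
    records = []
    for fn in tools:
        doc = inspect.getdoc(fn) or ""
        records.append({
            "name": fn.__name__,
            "signature": f"{fn.__name__}{inspect.signature(fn)}",
            "desc": doc.split("\n\n")[0],  # first docstring paragraph
        })
    return records

def minimum_distance(det1: dict, det2: dict) -> float:
    """Toy stand-in with a docstring, so the example is self-contained."""
    return 0.0

print(index_tools([minimum_distance])[0]["name"])  # -> minimum_distance
```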
| @@ -1,5 +1,5 @@ |
| 1 | - vision_agent/.sim_tools/df.csv,sha256=
| 2 | - vision_agent/.sim_tools/embs.npy,sha256=
| 1 | + vision_agent/.sim_tools/df.csv,sha256=nHhcCD55RO9XTiWq_uQ8pHKkVxLXciCHH-SbGPAQEy0,41969
| 2 | + vision_agent/.sim_tools/embs.npy,sha256=UmnXd2Zv1xBu4a7pxHHf4wOhTLKub629rVX9fAusTxY,393344
| 3 3 |   vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
| 4 4 |   vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
| 5 5 |   vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
| @@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r |
| 26 26 |   vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
| 27 27 |   vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
| 28 28 |   vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
| 29 | - vision_agent/tools/__init__.py,sha256=
| 29 | + vision_agent/tools/__init__.py,sha256=Jdq34jMw_KuYwk4Wexqm4DRjuLLoL1Q8wukm0NBv1Tc,2812
| 30 30 |   vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
| 31 | - vision_agent/tools/planner_tools.py,sha256=
| 31 | + vision_agent/tools/planner_tools.py,sha256=tU1qz_VIQM_yPKDmuxjMWu68ZlAZ7ePWI1g7zswyWhI,13540
| 32 32 |   vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
| 33 33 |   vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
| 34 | - vision_agent/tools/tools.py,sha256=
| 34 | + vision_agent/tools/tools.py,sha256=Xcm_9EQdDCR9X5FhIm7VJaTL0qWqhnJUVTRVrRtETrA,96112
| 35 35 |   vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
| 36 36 |   vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
| 37 37 |   vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
| @@ -40,7 +40,7 @@ vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50 |
| 40 40 |   vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
| 41 41 |   vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
| 42 42 |   vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
| 43 | - vision_agent-0.2.
| 44 | - vision_agent-0.2.
| 45 | - vision_agent-0.2.
| 46 | - vision_agent-0.2.
| 43 | + vision_agent-0.2.217.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
| 44 | + vision_agent-0.2.217.dist-info/METADATA,sha256=xl9AmXP9RBpC5frlASsiG7YktdIOTRuJgv8WZdRV_bA,19071
| 45 | + vision_agent-0.2.217.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
| 46 | + vision_agent-0.2.217.dist-info/RECORD,,
File without changes
File without changes
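Each RECORD row follows the wheel spec's path,sha256=&lt;urlsafe-base64 digest, unpadded&gt;,&lt;size&gt; format, which is why the hashes above change in lockstep with file sizes. A small sketch for recomputing an entry locally against an unpacked wheel; the example path is illustrative.

```python
import base64
import hashlib
from pathlib import Path

def record_entry(path: Path) -> str:
    # Reproduce one RECORD line: urlsafe base64 of the sha256 digest,
    # with trailing "=" padding stripped, followed by the byte size.
    data = path.read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"{path.as_posix()},sha256={digest},{len(data)}"

# e.g. record_entry(Path("vision_agent/tools/tools.py"))
# -> "vision_agent/tools/tools.py,sha256=Xcm_9EQd...,96112" if the contents match
```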