vision-agent 0.2.126__py3-none-any.whl → 0.2.128__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent_coder_prompts.py +41 -7
- vision_agent/tools/__init__.py +3 -2
- vision_agent/tools/tools.py +128 -39
- vision_agent/utils/video.py +24 -5
- {vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/METADATA +2 -1
- {vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/RECORD +8 -8
- {vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/WHEEL +0 -0
 
vision_agent/agent/vision_agent_coder_prompts.py CHANGED

@@ -70,30 +70,64 @@ This is the documentation for the functions you have access to. You may call any
     2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
     3. Your test case MUST run only on the given images which are {media}
     4. Print this final dictionary.
+    5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
 
 **Example**:
+--- EXAMPLE1 ---
 plan1:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '…
+- Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
 plan2:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '…
+- Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
 - Count the number of detected objects labeled as 'person'.
 plan3:
 - Load the image from the provided file path 'image.jpg'.
 - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
 
 ```python
-from vision_agent.tools import load_image, …
+from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
 image = load_image("image.jpg")
-owl_v2_out = …
+owl_v2_out = owl_v2_image("person", image)
 
-…
-…
+f2s2_out = florence2_sam2_image("person", image)
+# strip out the masks from the output because they don't provide useful information when printed
+f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
 
 cgd_out = countgd_counting(image)
 
-final_out = {{"…
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_counting": cgd_out}}
+print(final_out)
+
+--- EXAMPLE2 ---
+plan1:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Use the 'owl_v2_image' tool with the prompt 'person' to detect where the people are in the video.
+plan2:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
+plan3:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Use the 'countgd_counting' tool with the prompt 'person' to detect where the people are in the video.
+
+
+```python
+from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, countgd_counting
+
+# sample at 1 FPS and use the first 10 frames to reduce processing time
+frames = extract_frames("video.mp4", 1)
+frames = [f[0] for f in frames][:10]
+
+# plan1
+owl_v2_out = [owl_v2_image("person", f) for f in frames]
+
+# plan2
+florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
+
+# plan3
+countgd_out = [countgd_counting(f) for f in frames]
+
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "countgd_counting": countgd_out}}
 print(final_out)
 ```
 """
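Note: guideline 2 above asks the generated test to drop numpy arrays before printing, which the prompt's example does by filtering out the "mask" key. A more general sketch of the same idea (the strip_arrays helper below is hypothetical, not part of the package):

```python
import numpy as np

def strip_arrays(obj):
    """Recursively replace numpy arrays with a short placeholder so the
    printed dictionary stays readable."""
    if isinstance(obj, np.ndarray):
        return f"<ndarray shape={obj.shape}>"
    if isinstance(obj, dict):
        return {k: strip_arrays(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [strip_arrays(v) for v in obj]
    return obj

tool_outputs = {"florence2_sam2_image": [{"label": "person", "mask": np.zeros((4, 4))}]}
print(strip_arrays(tool_outputs))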
    
vision_agent/tools/__init__.py CHANGED

@@ -27,7 +27,7 @@ from .tools import (
     florence2_phrase_grounding,
     florence2_roberta_vqa,
     florence2_sam2_image,
-    florence2_sam2_video,
+    florence2_sam2_video_tracking,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,

@@ -46,7 +46,8 @@ from .tools import (
     overlay_counting_results,
     overlay_heat_map,
     overlay_segmentation_masks,
-    owl_v2,
+    owl_v2_image,
+    owl_v2_video,
     save_image,
     save_json,
     save_video,
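For callers upgrading from 0.2.126, these are straight renames plus one new export; a sketch of the updated imports, assuming nothing else about the call sites:

```python
# before (0.2.126)
# from vision_agent.tools import owl_v2, florence2_sam2_video

# after (0.2.128)
from vision_agent.tools import (
    florence2_sam2_video_tracking,  # was florence2_sam2_video
    owl_v2_image,                   # was owl_v2
    owl_v2_video,                   # new: frame-by-frame detection
)
```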
    
vision_agent/tools/tools.py CHANGED

@@ -145,15 +145,15 @@ def grounding_dino(
     return return_data
 
 
-def owl_v2(
+def owl_v2_image(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
-    """'owl_v2' is a tool that can detect and count multiple objects given a text
-    prompt such as category names or referring expressions. The categories in
-    prompt are separated by commas. It returns a list of bounding boxes with
-    coordinates, label names and associated probability scores.
+    """'owl_v2_image' is a tool that can detect and count multiple objects given a text
+    prompt such as category names or referring expressions on images. The categories in
+    text prompt are separated by commas. It returns a list of bounding boxes with
+    normalized coordinates, label names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.

@@ -170,32 +170,103 @@ def owl_v2(
 
     Example
     -------
-        >>> owl_v2("car, dinosaur", image)
+        >>> owl_v2_image("car, dinosaur", image)
         [
             {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
         ]
     """
     image_size = image.shape[:2]
-    …
-    …
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
         "prompts": [s.strip() for s in prompt.split(",")],
-        "…
-        "…
-        "function_name": "owl_v2",
+        "model": "owlv2",
+        "function_name": "owl_v2_image",
     }
-    …
-    …
+    resp_data = send_inference_request(
+        payload, "text-to-object-detection", files=files, v2=True
+    )
+    bboxes = resp_data[0]
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],
+            bbox=normalize_bbox(bbox["bounding_box"], image_size),
+            score=round(bbox["score"], 2),
+        )
+        for bbox in bboxes
+    ]
+    filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
+    return [bbox.model_dump() for bbox in filtered_bboxes]
+
+
+def owl_v2_video(
+    prompt: str,
+    frames: List[np.ndarray],
+    box_threshold: float = 0.10,
+) -> List[List[Dict[str, Any]]]:
+    """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
+    objects per frame given a text prompt such as a category name or referring
+    expression. The categories in text prompt are separated by commas. It returns a list
+    of lists where each inner list contains the score, label, and bounding box of the
+    detections for that frame.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.10.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
+            score, label, and bounding box of the detected objects with normalized
+            coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the
+            coordinates of the top-left and xmax and ymax are the coordinates of the
+            bottom-right of the bounding box.
+
+    Example
+    -------
+        >>> owl_v2_video("car, dinosaur", frames)
+        [
+            [
+                {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+                {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+            ],
+            ...
+        ]
+    """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
+
+    image_size = frames[0].shape[:2]
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "model": "owlv2",
+        "function_name": "owl_v2_video",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "text-to-object-detection", files=files, v2=True
+    )
+    bboxes_formatted = []
     if data is not None:
-        for …
-            …
+        for frame_data in data:
+            bboxes_formatted_frame = []
+            for elt in frame_data:
+                bboxes_formatted_frame.append(
+                    ODResponseData(
+                        label=elt["label"],  # type: ignore
+                        bbox=normalize_bbox(elt["bounding_box"], image_size),  # type: ignore
+                        score=round(elt["score"], 2),  # type: ignore
+                    )
+                )
+            bboxes_formatted.append(bboxes_formatted_frame)
+
+    filtered_bboxes = [
+        filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted
+    ]
+    return [[bbox.model_dump() for bbox in frame] for frame in filtered_bboxes]
 
 
 def grounding_sam(
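A minimal usage sketch of the renamed detector and its new video counterpart; the zero image is a stand-in for a real RGB frame (both calls go through the hosted inference endpoint, so API access is assumed):

```python
import numpy as np
from vision_agent.tools import owl_v2_image, owl_v2_video

image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a real RGB frame

# single image: flat list of detections
detections = owl_v2_image("person, car", image, box_threshold=0.10)
for det in detections:
    print(det["label"], det["score"], det["bbox"])  # bbox is normalized [xmin, ymin, xmax, ymax]

# video: one inner list of detections per frame
frames = [image] * 3
per_frame = owl_v2_video("person", frames)
print([len(dets) for dets in per_frame])
```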
@@ -317,14 +388,14 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
     return return_data
 
 
-def florence2_sam2_video(
+def florence2_sam2_video_tracking(
     prompt: str, frames: List[np.ndarray]
 ) -> List[List[Dict[str, Any]]]:
-    """'florence2_sam2_video' is a tool that can segment and track multiple entities
-    in a video given a text prompt such as category names or referring expressions. You
-    can optionally separate the categories in the text with commas. It only tracks
-    entities present in the first frame and only returns segmentation masks. It is
-    useful for tracking and counting without duplicating counts.
+    """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
+    entities in a video given a text prompt such as category names or referring
+    expressions. You can optionally separate the categories in the text with commas. It
+    only tracks entities present in the first frame and only returns segmentation
+    masks. It is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the video.

@@ -351,14 +422,15 @@ def florence2_sam2_video(
                     [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
             },
         ],
+        ...
     ]
     """
 
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
     payload = {
-        "prompts": prompt.split(","),
-        "function_name": "florence2_sam2_video",
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "function_name": "florence2_sam2_video_tracking",
     }
     data: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
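Because the tracker only follows entities present in the first frame, the first inner list already gives a de-duplicated count. A consumption sketch, assuming the frames come from extract_frames on a local file:

```python
from vision_agent.tools import extract_frames, florence2_sam2_video_tracking

frames = [frame for frame, _ in extract_frames("video.mp4", 1)][:10]
tracks = florence2_sam2_video_tracking("person", frames)

# entities are seeded from frame 0 and re-identified in later frames, so
# counting the first frame avoids counting the same person twice
print("unique people:", len(tracks[0]))
```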
@@ -549,7 +621,14 @@ def countgd_counting(
         payload, "text-to-object-detection", files=files, metadata=metadata
     )
     bboxes_per_frame = resp_data[0]
-    bboxes_formatted = […
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+            score=round(bbox["score"], 2),
+        )
+        for bbox in bboxes_per_frame
+    ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]
 

@@ -601,7 +680,14 @@ def countgd_example_based_counting(
         payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
     )
     bboxes_per_frame = resp_data[0]
-    bboxes_formatted = […
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+            score=round(bbox["score"], 2),
+        )
+        for bbox in bboxes_per_frame
+    ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]
 
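ODResponseData and filter_bboxes_by_threshold are internal helpers of tools.py; with plain dicts, the formatting-plus-filtering pattern both countgd functions now share amounts to something like this sketch (not the package's code, values illustrative):

```python
raw = [
    {"label": "person", "bounding_box": [0.1234, 0.2, 0.5, 0.9], "score": 0.87},
    {"label": "person", "bounding_box": [0.6, 0.1, 0.8, 0.7], "score": 0.04},
]
box_threshold = 0.23

formatted = [
    {
        "label": b["label"],
        "bbox": [round(x, 2) for x in b["bounding_box"]],
        "score": round(b["score"], 2),
    }
    for b in raw
]
# keep only detections at or above the threshold
filtered = [b for b in formatted if b["score"] >= box_threshold]
print(filtered)  # only the 0.87 detection survives
```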
@@ -1374,12 +1460,12 @@ def closest_box_distance(
 def extract_frames(
     video_uri: Union[str, Path], fps: float = 1
 ) -> List[Tuple[np.ndarray, float]]:
-    """'extract_frames' extracts frames from a video which can be a file path or youtube
-    link, returns a list of tuples (frame, timestamp), where timestamp is the
-    time in seconds where the frame was captured. The frame is a numpy array.
+    """'extract_frames' extracts frames from a video which can be a file path, url or
+    youtube link, returns a list of tuples (frame, timestamp), where timestamp is the
+    relative time in seconds where the frame was captured. The frame is a numpy array.
 
     Parameters:
-        video_uri (Union[str, Path]): The path to the video file or youtube link
+        video_uri (Union[str, Path]): The path to the video file, url or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
             to 1.
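Usage is unchanged; only the docstring was broadened to mention plain URLs. Combined with the new coder-prompt guideline, a typical call looks like this (the path is illustrative):

```python
from vision_agent.tools import extract_frames

# sample at 1 FPS; each element is (frame, timestamp_in_seconds)
frames_with_ts = extract_frames("video.mp4", fps=1)
frames = [frame for frame, _ in frames_with_ts][:10]  # cap work at 10 frames
print(len(frames), frames_with_ts[0][1])
```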
@@ -1518,7 +1604,9 @@ def save_video(
         raise ValueError(f"fps must be greater than 0 got {fps}")
 
     if output_video_path is None:
-        output_video_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
+        output_video_path = tempfile.NamedTemporaryFile(
+            delete=False, suffix=".mp4"
+        ).name
 
     output_video_path = video_writer(frames, fps, output_video_path)
     _save_video_to_result(output_video_path)
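This hunk only reflows the call, but the delete=False it wraps is what matters: the temporary file must outlive the handle so video_writer can reopen the path. A standalone sketch of the same pattern:

```python
import tempfile

# delete=False keeps the file on disk after the handle closes, so the path
# can be handed to another writer (also required on Windows, where an open
# NamedTemporaryFile cannot be reopened by its name)
output_video_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
print(output_video_path)
```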
@@ -1818,7 +1906,8 @@ def overlay_counting_results(
 
 
 FUNCTION_TOOLS = [
-    owl_v2,
+    owl_v2_image,
+    owl_v2_video,
     ocr,
     clip,
     vit_image_classification,

@@ -1827,7 +1916,7 @@ FUNCTION_TOOLS = [
     florence2_image_caption,
     florence2_ocr,
     florence2_sam2_image,
-    florence2_sam2_video,
+    florence2_sam2_video_tracking,
     florence2_phrase_grounding,
     ixc25_image_vqa,
     ixc25_video_vqa,
    
vision_agent/utils/video.py CHANGED

@@ -4,6 +4,7 @@ import tempfile
 from functools import lru_cache
 from typing import List, Optional, Tuple
 
+import av  # type: ignore
 import cv2
 import numpy as np
 from decord import VideoReader  # type: ignore

@@ -43,18 +44,36 @@ def play_video(video_base64: str) -> None:
         cv2.destroyAllWindows()
 
 
+def _resize_frame(frame: np.ndarray) -> np.ndarray:
+    height, width = frame.shape[:2]
+    new_width = width - (width % 2)
+    new_height = height - (height % 2)
+    return cv2.resize(frame, (new_width, new_height))
+
+
 def video_writer(
     frames: List[np.ndarray], fps: float = 1.0, filename: Optional[str] = None
 ) -> str:
     if filename is None:
         filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
-    …
-    …
+    container = av.open(filename, mode="w")
+    stream = container.add_stream("h264", rate=fps)
     height, width = frames[0].shape[:2]
-    …
+    stream.height = height - (height % 2)
+    stream.width = width - (width % 2)
+    stream.pix_fmt = "yuv420p"
     for frame in frames:
-        …
-    …
+        # Remove the alpha channel (convert RGBA to RGB)
+        frame_rgb = frame[:, :, :3]
+        # Resize the frame to make dimensions divisible by 2
+        frame_rgb = _resize_frame(frame_rgb)
+        av_frame = av.VideoFrame.from_ndarray(frame_rgb, format="rgb24")
+        for packet in stream.encode(av_frame):
+            container.mux(packet)
+
+    for packet in stream.encode():
+        container.mux(packet)
+    container.close()
     return filename
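The writer now encodes through PyAV with h264/yuv420p, which needs even frame dimensions; _resize_frame trims odd widths and heights before encoding. A quick usage sketch with synthetic frames, assuming PyAV's bundled ffmpeg provides the h264 encoder (the default wheels do):

```python
import numpy as np
from vision_agent.utils.video import video_writer

# 101x75 is deliberately odd-sized; the writer trims it to 100x74 so that
# yuv420p's 2x2 chroma subsampling gets whole blocks to work on
frames = [np.random.randint(0, 255, (75, 101, 3), dtype=np.uint8) for _ in range(24)]
path = video_writer(frames, fps=24)
print("wrote", path)
```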
{vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.126
+Version: 0.2.128
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: anthropic (>=0.31.0,<0.32.0)
+Requires-Dist: av (>=11.0.0,<12.0.0)
 Requires-Dist: e2b (>=0.17.2a50,<0.18.0)
 Requires-Dist: e2b-code-interpreter (==0.0.11a37)
 Requires-Dist: eva-decord (>=0.6.1,<0.7.0)
{vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/RECORD CHANGED

@@ -4,7 +4,7 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
 vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
 vision_agent/agent/vision_agent.py,sha256=WM1_o0VAQokAKlDr-0lpFxCRwUm_eFfFNWP-wSNjo7s,11180
 vision_agent/agent/vision_agent_coder.py,sha256=_2QQd_nTGojkk2ZOiMevVCY6-eUA9q1QdCWH7-Noq4w,34237
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=…
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=nj4iRRSAWYHjKqyUSp12aTCV1D5iUVCHeezVXoozS4M,12687
 vision_agent/agent/vision_agent_prompts.py,sha256=K1nLo3XKQ-IqCom1TRwh3cMoGZNxNwEgZqf3uJ6eL18,7221
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009

@@ -14,11 +14,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
 vision_agent/lmm/lmm.py,sha256=092oefI65_QSRvQm2znXkjTdzlZTh-Ni_38610kfbJg,16836
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=…
+vision_agent/tools/__init__.py,sha256=nx60_hujcnLz3d2wQlCbcerUmT6R2vxRy66IsQjdB3M,2364
 vision_agent/tools/meta_tools.py,sha256=KeGiw2OtY8ARpGbtWjoNAoO1dwevt7LbCupaJX61MkE,18929
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=62NVlojPMf9MuJ-3yJEcrB3mzmOxN2HrNQzzjVa-FZg,7527
-vision_agent/tools/tools.py,sha256=…
+vision_agent/tools/tools.py,sha256=p6QUo7V03UZOKBAGfabVWdPm9vUT9tyP_utCv0yKfcY,68659
 vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052

@@ -26,8 +26,8 @@ vision_agent/utils/execute.py,sha256=gc4R_0BKUrZyhiKvIxOpYuzQPYVWQEqxr3ANy1lJAw4
 vision_agent/utils/image_utils.py,sha256=zTTOJFOieMzwIquTFnW7T6ssx9o6XfoZ0Unqyk7GJrg,10746
 vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
-vision_agent/utils/video.py,sha256=…
-vision_agent-0.2.126.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.126.dist-info/METADATA,sha256=…
-vision_agent-0.2.126.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.126.dist-info/RECORD,,
+vision_agent/utils/video.py,sha256=GmJqu_3WhBMEwP4HToMMp8EwgftliHSpv5nd-QEDOcs,4528
+vision_agent-0.2.128.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.128.dist-info/METADATA,sha256=4E1im4aLvJnSR-tKxWUtKyJ0ZbkHxYMYxfqGz_0Layw,12295
+vision_agent-0.2.128.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.128.dist-info/RECORD,,

{vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/LICENSE

File without changes

{vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/WHEEL

File without changes