vision-agent 0.2.182__py3-none-any.whl → 0.2.183__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +33 -2
- {vision_agent-0.2.182.dist-info → vision_agent-0.2.183.dist-info}/METADATA +1 -1
- {vision_agent-0.2.182.dist-info → vision_agent-0.2.183.dist-info}/RECORD +6 -6
- {vision_agent-0.2.182.dist-info → vision_agent-0.2.183.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.182.dist-info → vision_agent-0.2.183.dist-info}/WHEEL +0 -0
    
        vision_agent/tools/__init__.py
    CHANGED
    
    
    
        vision_agent/tools/tools.py
    CHANGED
    
    | @@ -930,6 +930,37 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str: | |
| 930 930 | 
             
                return cast(str, data["answer"])
         | 
| 931 931 |  | 
| 932 932 |  | 
| 933 | 
            +
            def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
         | 
| 934 | 
            +
                """'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
         | 
| 935 | 
            +
                including regular videos or videos of documents or presentations. It returns text
         | 
| 936 | 
            +
                as an answer to the question.
         | 
| 937 | 
            +
             | 
| 938 | 
            +
                Parameters:
         | 
| 939 | 
            +
                    prompt (str): The question about the video
         | 
| 940 | 
            +
                    frames (List[np.ndarray]): The reference frames used for the question
         | 
| 941 | 
            +
             | 
| 942 | 
            +
                Returns:
         | 
| 943 | 
            +
                    str: A string which is the answer to the given prompt.
         | 
| 944 | 
            +
             | 
| 945 | 
            +
                Example
         | 
| 946 | 
            +
                -------
         | 
| 947 | 
            +
                    >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
         | 
| 948 | 
            +
                    'Lionel Messi'
         | 
| 949 | 
            +
                """
         | 
| 950 | 
            +
             | 
| 951 | 
            +
                buffer_bytes = frames_to_bytes(frames)
         | 
| 952 | 
            +
                files = [("video", buffer_bytes)]
         | 
| 953 | 
            +
                payload = {
         | 
| 954 | 
            +
                    "prompt": prompt,
         | 
| 955 | 
            +
                    "model": "qwen2vl",
         | 
| 956 | 
            +
                    "function_name": "qwen2_vl_video_vqa",
         | 
| 957 | 
            +
                }
         | 
| 958 | 
            +
                data: Dict[str, Any] = send_inference_request(
         | 
| 959 | 
            +
                    payload, "image-to-text", files=files, v2=True
         | 
| 960 | 
            +
                )
         | 
| 961 | 
            +
                return cast(str, data)
         | 
| 962 | 
            +
             | 
| 963 | 
            +
             | 
| 933 964 | 
             
            def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
         | 
| 934 965 | 
             
                """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
         | 
| 935 966 | 
             
                including regular images or images of documents or presentations. It returns text
         | 
| @@ -2238,13 +2269,13 @@ FUNCTION_TOOLS = [ | |
| 2238 2269 | 
             
                florence2_sam2_image,
         | 
| 2239 2270 | 
             
                florence2_sam2_video_tracking,
         | 
| 2240 2271 | 
             
                florence2_phrase_grounding,
         | 
| 2241 | 
            -
                ixc25_image_vqa,
         | 
| 2242 | 
            -
                ixc25_video_vqa,
         | 
| 2243 2272 | 
             
                detr_segmentation,
         | 
| 2244 2273 | 
             
                depth_anything_v2,
         | 
| 2245 2274 | 
             
                generate_pose_image,
         | 
| 2246 2275 | 
             
                closest_mask_distance,
         | 
| 2247 2276 | 
             
                closest_box_distance,
         | 
| 2277 | 
            +
                qwen2_vl_images_vqa,
         | 
| 2278 | 
            +
                qwen2_vl_video_vqa,
         | 
| 2248 2279 | 
             
            ]
         | 
| 2249 2280 |  | 
| 2250 2281 | 
             
            UTIL_TOOLS = [
         | 
| @@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r | |
| 16 16 | 
             
            vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
         | 
| 17 17 | 
             
            vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
         | 
| 18 18 | 
             
            vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
         | 
| 19 | 
            -
            vision_agent/tools/__init__.py,sha256= | 
| 19 | 
            +
            vision_agent/tools/__init__.py,sha256=17wZ4ZsoSTZZaiqBTi6pqAKUr-qf58_T_zH2GXOi1KU,2771
         | 
| 20 20 | 
             
            vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
         | 
| 21 21 | 
             
            vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
         | 
| 22 22 | 
             
            vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
         | 
| 23 | 
            -
            vision_agent/tools/tools.py,sha256= | 
| 23 | 
            +
            vision_agent/tools/tools.py,sha256=vc0T940b-rRiGAOJttn7BsuCpVh9rJaivOmorpE41AA,81134
         | 
| 24 24 | 
             
            vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
         | 
| 25 25 | 
             
            vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
         | 
| 26 26 | 
             
            vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
         | 
| @@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd | |
| 29 29 | 
             
            vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
         | 
| 30 30 | 
             
            vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
         | 
| 31 31 | 
             
            vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
         | 
| 32 | 
            -
            vision_agent-0.2.182.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357 | 
| 33 | 
            -
            vision_agent-0.2. | 
| 34 | 
            -
            vision_agent-0.2.182.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88 | 
| 35 | 
            -
            vision_agent-0.2.182.dist-info/RECORD,, | 
| 32 | 
            +
            vision_agent-0.2.183.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
         | 
| 33 | 
            +
            vision_agent-0.2.183.dist-info/METADATA,sha256=9V38VymRic0fe2uqCIjl3nhuVJYx49ZQox69izWD8k8,18330
         | 
| 34 | 
            +
            vision_agent-0.2.183.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
         | 
| 35 | 
            +
            vision_agent-0.2.183.dist-info/RECORD,,
         | 
| 
            File without changes
         | 
| 
            File without changes
         |