vision-agent 0.2.182__py3-none-any.whl → 0.2.184__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +2 -0
- vision_agent/tools/tools.py +109 -2
- {vision_agent-0.2.182.dist-info → vision_agent-0.2.184.dist-info}/METADATA +1 -1
- {vision_agent-0.2.182.dist-info → vision_agent-0.2.184.dist-info}/RECORD +6 -6
- {vision_agent-0.2.182.dist-info → vision_agent-0.2.184.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.182.dist-info → vision_agent-0.2.184.dist-info}/WHEEL +0 -0
    
        vision_agent/tools/__init__.py
    CHANGED
    
    
    
        vision_agent/tools/tools.py
    CHANGED
    
    | @@ -930,6 +930,37 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str: | |
| 930 930 | 
             
                return cast(str, data["answer"])
         | 
| 931 931 |  | 
| 932 932 |  | 
| 933 | 
            +
            def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
         | 
| 934 | 
            +
                """'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
         | 
| 935 | 
            +
                including regular videos or videos of documents or presentations. It returns text
         | 
| 936 | 
            +
                as an answer to the question.
         | 
| 937 | 
            +
             | 
| 938 | 
            +
                Parameters:
         | 
| 939 | 
            +
                    prompt (str): The question about the video
         | 
| 940 | 
            +
                    frames (List[np.ndarray]): The reference frames used for the question
         | 
| 941 | 
            +
             | 
| 942 | 
            +
                Returns:
         | 
| 943 | 
            +
                    str: A string which is the answer to the given prompt.
         | 
| 944 | 
            +
             | 
| 945 | 
            +
                Example
         | 
| 946 | 
            +
                -------
         | 
| 947 | 
            +
                    >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
         | 
| 948 | 
            +
                    'Lionel Messi'
         | 
| 949 | 
            +
                """
         | 
| 950 | 
            +
             | 
| 951 | 
            +
                buffer_bytes = frames_to_bytes(frames)
         | 
| 952 | 
            +
                files = [("video", buffer_bytes)]
         | 
| 953 | 
            +
                payload = {
         | 
| 954 | 
            +
                    "prompt": prompt,
         | 
| 955 | 
            +
                    "model": "qwen2vl",
         | 
| 956 | 
            +
                    "function_name": "qwen2_vl_video_vqa",
         | 
| 957 | 
            +
                }
         | 
| 958 | 
            +
                data: Dict[str, Any] = send_inference_request(
         | 
| 959 | 
            +
                    payload, "image-to-text", files=files, v2=True
         | 
| 960 | 
            +
                )
         | 
| 961 | 
            +
                return cast(str, data)
         | 
| 962 | 
            +
             | 
| 963 | 
            +
             | 
| 933 964 | 
             
            def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
         | 
| 934 965 | 
             
                """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
         | 
| 935 966 | 
             
                including regular images or images of documents or presentations. It returns text
         | 
| @@ -1742,6 +1773,82 @@ def closest_box_distance( | |
| 1742 1773 | 
             
                return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
         | 
| 1743 1774 |  | 
| 1744 1775 |  | 
| 1776 | 
            +
            def flux_image_inpainting(
         | 
| 1777 | 
            +
                prompt: str,
         | 
| 1778 | 
            +
                image: np.ndarray,
         | 
| 1779 | 
            +
                mask: np.ndarray,
         | 
| 1780 | 
            +
            ) -> np.ndarray:
         | 
| 1781 | 
            +
                """'flux_image_inpainting' performs image inpainting to fill the masked regions,
         | 
| 1782 | 
            +
                given by mask, of the given image, based on the text prompt and the surrounding image context.
         | 
| 1783 | 
            +
                It can be used to edit regions of an image according to the prompt given.
         | 
| 1784 | 
            +
             | 
| 1785 | 
            +
                Parameters:
         | 
| 1786 | 
            +
                    prompt (str): A detailed text description guiding what should be generated
         | 
| 1787 | 
            +
                        in the masked area. More detailed and specific prompts typically yield better results.
         | 
| 1788 | 
            +
                    image (np.ndarray): The source image to be inpainted.
         | 
| 1789 | 
            +
                        The image will serve as the base context for the inpainting process.
         | 
| 1790 | 
            +
                    mask (np.ndarray): A binary mask image with 0's and 1's,
         | 
| 1791 | 
            +
                        where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
         | 
| 1792 | 
            +
             | 
| 1793 | 
            +
                Returns:
         | 
| 1794 | 
            +
                    np.ndarray:
         | 
| 1795 | 
            +
                        The generated image as a numpy array in RGB format
         | 
| 1796 | 
            +
                        with values ranging from 0 to 255.
         | 
| 1797 | 
            +
             | 
| 1798 | 
            +
                -------
         | 
| 1799 | 
            +
                Example:
         | 
| 1800 | 
            +
                    >>> # Generate inpainting
         | 
| 1801 | 
            +
                    >>> result = flux_image_inpainting(
         | 
| 1802 | 
            +
                    ...     prompt="a modern black leather sofa with white pillows",
         | 
| 1803 | 
            +
                    ...     image=image,
         | 
| 1804 | 
            +
                    ...     mask=mask,
         | 
| 1805 | 
            +
                    ... )
         | 
| 1806 | 
            +
                    >>> save_image(result, "inpainted_room.png")
         | 
| 1807 | 
            +
                """
         | 
| 1808 | 
            +
                if (
         | 
| 1809 | 
            +
                    image.shape[0] < 8
         | 
| 1810 | 
            +
                    or image.shape[1] < 8
         | 
| 1811 | 
            +
                    or mask.shape[0] < 8
         | 
| 1812 | 
            +
                    or mask.shape[1] < 8
         | 
| 1813 | 
            +
                ):
         | 
| 1814 | 
            +
                    raise ValueError("The image or mask does not have enough size for inpainting")
         | 
| 1815 | 
            +
             | 
| 1816 | 
            +
                if np.array_equal(mask, mask.astype(bool).astype(int)):
         | 
| 1817 | 
            +
                    mask = np.where(mask > 0, 255, 0).astype(np.uint8)
         | 
| 1818 | 
            +
                else:
         | 
| 1819 | 
            +
                    raise ValueError("The mask should be a binary mask with 0's and 1's")
         | 
| 1820 | 
            +
             | 
| 1821 | 
            +
                image_file = numpy_to_bytes(image)
         | 
| 1822 | 
            +
                mask_file = numpy_to_bytes(mask)
         | 
| 1823 | 
            +
             | 
| 1824 | 
            +
                files = [
         | 
| 1825 | 
            +
                    ("image", image_file),
         | 
| 1826 | 
            +
                    ("mask_image", mask_file),
         | 
| 1827 | 
            +
                ]
         | 
| 1828 | 
            +
             | 
| 1829 | 
            +
                payload = {
         | 
| 1830 | 
            +
                    "prompt": prompt,
         | 
| 1831 | 
            +
                    "task": "inpainting",
         | 
| 1832 | 
            +
                    "height": image.shape[0],
         | 
| 1833 | 
            +
                    "width": image.shape[1],
         | 
| 1834 | 
            +
                    "strength": 0.99,
         | 
| 1835 | 
            +
                    "guidance_scale": 18,
         | 
| 1836 | 
            +
                    "num_inference_steps": 20,
         | 
| 1837 | 
            +
                    "seed": None,
         | 
| 1838 | 
            +
                }
         | 
| 1839 | 
            +
             | 
| 1840 | 
            +
                response = send_inference_request(
         | 
| 1841 | 
            +
                    payload=payload,
         | 
| 1842 | 
            +
                    endpoint_name="flux1",
         | 
| 1843 | 
            +
                    files=files,
         | 
| 1844 | 
            +
                    v2=True,
         | 
| 1845 | 
            +
                    metadata_payload={"function_name": "flux_image_inpainting"},
         | 
| 1846 | 
            +
                )
         | 
| 1847 | 
            +
             | 
| 1848 | 
            +
                output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
         | 
| 1849 | 
            +
                return output_image
         | 
| 1850 | 
            +
             | 
| 1851 | 
            +
             | 
| 1745 1852 | 
             
            # Utility and visualization functions
         | 
| 1746 1853 |  | 
| 1747 1854 |  | 
| @@ -2238,13 +2345,13 @@ FUNCTION_TOOLS = [ | |
| 2238 2345 | 
             
                florence2_sam2_image,
         | 
| 2239 2346 | 
             
                florence2_sam2_video_tracking,
         | 
| 2240 2347 | 
             
                florence2_phrase_grounding,
         | 
| 2241 | 
            -
                ixc25_image_vqa,
         | 
| 2242 | 
            -
                ixc25_video_vqa,
         | 
| 2243 2348 | 
             
                detr_segmentation,
         | 
| 2244 2349 | 
             
                depth_anything_v2,
         | 
| 2245 2350 | 
             
                generate_pose_image,
         | 
| 2246 2351 | 
             
                closest_mask_distance,
         | 
| 2247 2352 | 
             
                closest_box_distance,
         | 
| 2353 | 
            +
                qwen2_vl_images_vqa,
         | 
| 2354 | 
            +
                qwen2_vl_video_vqa,
         | 
| 2248 2355 | 
             
            ]
         | 
| 2249 2356 |  | 
| 2250 2357 | 
             
            UTIL_TOOLS = [
         | 
| @@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r | |
| 16 16 | 
             
            vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
         | 
| 17 17 | 
             
            vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
         | 
| 18 18 | 
             
            vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
         | 
| 19 | 
            -
            vision_agent/tools/__init__.py,sha256= | 
| 19 | 
            +
            vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-xo,2798
         | 
| 20 20 | 
             
            vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
         | 
| 21 21 | 
             
            vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
         | 
| 22 22 | 
             
            vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
         | 
| 23 | 
            -
            vision_agent/tools/tools.py,sha256= | 
| 23 | 
            +
            vision_agent/tools/tools.py,sha256=kHeBjiVvncQJeL_Gni84bgHOCgxko4XO7otpt8IyWU4,83610
         | 
| 24 24 | 
             
            vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
         | 
| 25 25 | 
             
            vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
         | 
| 26 26 | 
             
            vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
         | 
| @@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd | |
| 29 29 | 
             
            vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
         | 
| 30 30 | 
             
            vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
         | 
| 31 31 | 
             
            vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
         | 
| 32 | 
            -
            vision_agent-0.2. | 
| 33 | 
            -
            vision_agent-0.2. | 
| 34 | 
            -
            vision_agent-0.2. | 
| 35 | 
            -
            vision_agent-0.2. | 
| 32 | 
            +
            vision_agent-0.2.184.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
         | 
| 33 | 
            +
            vision_agent-0.2.184.dist-info/METADATA,sha256=n8BeCLsPCBXDsr0FCmRBtScseMyJ8TuR68MWlqeO9Is,18330
         | 
| 34 | 
            +
            vision_agent-0.2.184.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
         | 
| 35 | 
            +
            vision_agent-0.2.184.dist-info/RECORD,,
         | 
| 
            File without changes
         | 
| 
            File without changes
         |