vision-agent 1.0.11__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +137 -14
- {vision_agent-1.0.11.dist-info → vision_agent-1.1.2.dist-info}/METADATA +2 -1
- {vision_agent-1.0.11.dist-info → vision_agent-1.1.2.dist-info}/RECORD +6 -6
- {vision_agent-1.0.11.dist-info → vision_agent-1.1.2.dist-info}/LICENSE +0 -0
- {vision_agent-1.0.11.dist-info → vision_agent-1.1.2.dist-info}/WHEEL +0 -0
    
        vision_agent/tools/__init__.py
    CHANGED
    
    | @@ -8,6 +8,7 @@ from .planner_tools import judge_od_results | |
| 8 8 | 
             
            from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
         | 
| 9 9 | 
             
            from .tools import (
         | 
| 10 10 | 
             
                activity_recognition,
         | 
| 11 | 
            +
                agentic_document_extraction,
         | 
| 11 12 | 
             
                agentic_object_detection,
         | 
| 12 13 | 
             
                agentic_sam2_instance_segmentation,
         | 
| 13 14 | 
             
                agentic_sam2_video_tracking,
         | 
    
        vision_agent/tools/tools.py
    CHANGED
    
    | @@ -9,6 +9,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed | |
| 9 9 | 
             
            from importlib import resources
         | 
| 10 10 | 
             
            from pathlib import Path
         | 
| 11 11 | 
             
            from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
         | 
| 12 | 
            +
            from warnings import warn
         | 
| 12 13 |  | 
| 13 14 | 
             
            import cv2
         | 
| 14 15 | 
             
            import numpy as np
         | 
| @@ -18,6 +19,7 @@ from IPython.display import display | |
| 18 19 | 
             
            from PIL import Image, ImageDraw, ImageFont
         | 
| 19 20 | 
             
            from pillow_heif import register_heif_opener  # type: ignore
         | 
| 20 21 | 
             
            from pytube import YouTube  # type: ignore
         | 
| 22 | 
            +
            import pymupdf  # type: ignore
         | 
| 21 23 |  | 
| 22 24 | 
             
            from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
         | 
| 23 25 | 
             
            from vision_agent.utils.execute import FileSerializer, MimeType
         | 
| @@ -2143,6 +2145,11 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]: | |
| 2143 2145 | 
             
                                            'summary': 'This table illustrates a trend of ...'},
         | 
| 2144 2146 | 
             
                                ],
         | 
| 2145 2147 | 
             
                """
         | 
| 2148 | 
            +
                warning = (
         | 
| 2149 | 
            +
                    "This function is deprecated. For document extraction please use the agentic-doc python package on "
         | 
| 2150 | 
            +
                    "https://pypi.org/project/agentic-doc/ or the agentic_document_extraction function."
         | 
| 2151 | 
            +
                )
         | 
| 2152 | 
            +
                warn(warning, DeprecationWarning, stacklevel=2)
         | 
| 2146 2153 |  | 
| 2147 2154 | 
             
                image_file = numpy_to_bytes(image)
         | 
| 2148 2155 |  | 
| @@ -2184,6 +2191,76 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]: | |
| 2184 2191 | 
             
                return data
         | 
| 2185 2192 |  | 
| 2186 2193 |  | 
def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
    """'agentic_document_extraction' is a tool that can extract structured information out of
    documents with different layouts. It returns the extracted data in a structured
    hierarchical format containing text, tables, figures, charts, and other
    information.

    Parameters:
        image (np.ndarray): The document image to analyze

    Returns:
        Dict[str, Any]: A dictionary containing the extracted information. Each
            grounding "box" is returned as a list in [left, top, right, bottom]
            order.

    Example
    -------
        >>> agentic_document_extraction(image)
        {
            "markdown": "# Document title\n\n## Document subtitle\n\nThis is a sample document.",
            "chunks": [
                {
                    "text": "# Document title",
                    "grounding": [
                        {
                            "box": [0.06125, 0.019355758266818696, 0.17375, 0.03290478905359179],
                            "page": 0
                        }
                    ],
                    "chunk_type": "page_header",
                    "chunk_id": "622e0374-c50e-4960-a013-650138b42528"
                },
            ...
            ]
        }
    """

    image_file = numpy_to_bytes(image)

    files = [("image", image_file)]

    payload = {
        "model": "agentic-document-analysis",
    }

    data: Dict[str, Any] = send_inference_request(
        payload=payload,
        endpoint_name="agentic-document-analysis",
        files=files,
        v2=True,
        metadata_payload={"function_name": "agentic_document_analysis"},
    )

    # The tool trace records the raw service response, i.e. before the
    # grounding boxes are reshaped below.
    _display_tool_trace(
        agentic_document_extraction.__name__,
        payload,
        data,
        files,
    )

    def transform_boxes(data: Dict[str, Any]) -> Dict[str, Any]:
        # The response carries each box as a mapping with "l", "t", "r", "b"
        # keys; flatten it in place to an [l, t, r, b] list.
        for chunk in data["chunks"]:
            for grounding in chunk["grounding"]:
                box = grounding["box"]
                grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]
        return data

    data = transform_boxes(data)

    return data
| 2262 | 
            +
             | 
| 2263 | 
            +
             | 
| 2187 2264 | 
             
            def document_qa(
         | 
| 2188 2265 | 
             
                prompt: str,
         | 
| 2189 2266 | 
             
                image: np.ndarray,
         | 
| @@ -2211,29 +2288,25 @@ def document_qa( | |
| 2211 2288 | 
             
                files = [("image", image_file)]
         | 
| 2212 2289 |  | 
| 2213 2290 | 
             
                payload = {
         | 
| 2214 | 
            -
                    "model": "document-analysis",
         | 
| 2291 | 
            +
                    "model": "agentic-document-analysis",
         | 
| 2215 2292 | 
             
                }
         | 
| 2216 2293 |  | 
| 2217 2294 | 
             
                data: Dict[str, Any] = send_inference_request(
         | 
| 2218 2295 | 
             
                    payload=payload,
         | 
| 2219 | 
            -
                    endpoint_name="document-analysis",
         | 
| 2296 | 
            +
                    endpoint_name="agentic-document-analysis",
         | 
| 2220 2297 | 
             
                    files=files,
         | 
| 2221 2298 | 
             
                    v2=True,
         | 
| 2222 2299 | 
             
                    metadata_payload={"function_name": "document_qa"},
         | 
| 2223 2300 | 
             
                )
         | 
| 2224 2301 |  | 
| 2225 | 
            -
                def  | 
| 2226 | 
            -
                     | 
| 2227 | 
            -
                         | 
| 2228 | 
            -
                             | 
| 2229 | 
            -
             | 
| 2230 | 
            -
             | 
| 2231 | 
            -
                    elif isinstance(data, List):
         | 
| 2232 | 
            -
                        for i in range(len(data)):
         | 
| 2233 | 
            -
                            data[i] = normalize(data[i])
         | 
| 2234 | 
            -
                    return data  # type: ignore
         | 
| 2302 | 
            +
                def transform_boxes(data: Dict[str, Any]) -> Dict[str, Any]:
         | 
| 2303 | 
            +
                    for chunk in data["chunks"]:
         | 
| 2304 | 
            +
                        for grounding in chunk["grounding"]:
         | 
| 2305 | 
            +
                            box = grounding["box"]
         | 
| 2306 | 
            +
                            grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]
         | 
| 2307 | 
            +
                    return data
         | 
| 2235 2308 |  | 
| 2236 | 
            -
                data =  | 
| 2309 | 
            +
                data = transform_boxes(data)
         | 
| 2237 2310 |  | 
| 2238 2311 | 
             
                prompt = f"""
         | 
| 2239 2312 | 
             
            Document Context:
         | 
| @@ -3075,6 +3148,56 @@ def save_image(image: np.ndarray, file_path: str) -> None: | |
| 3075 3148 | 
             
                pil_image.save(file_path)
         | 
| 3076 3149 |  | 
| 3077 3150 |  | 
def load_pdf(pdf_path: str) -> List[np.ndarray]:
    """'load_pdf' is a utility function that loads a PDF from the given file path string
    or http(s) URL and converts each page to an image.

    Parameters:
        pdf_path (str): The path (or http/https URL) to the PDF file.

    Returns:
        List[np.ndarray]: A list of images as NumPy arrays, one for each page of the PDF.

    Example
    -------
        >>> load_pdf("path/to/document.pdf")
    """
    from urllib.parse import urlparse

    # Path of the downloaded copy, if any. Tracked separately from `pdf_path`
    # so the cleanup below still knows a temporary file was created after
    # `pdf_path` has been redirected to it.
    tmp_path: Optional[str] = None
    images: List[np.ndarray] = []

    try:
        # Handle URL case. Require the scheme separator so local files whose
        # names merely start with "http" are not treated as URLs.
        if pdf_path.startswith(("http://", "https://")):
            # Take the suffix from the URL path only, so a query string or
            # fragment never ends up in the temporary file name.
            pdf_suffix = os.path.splitext(urlparse(pdf_path).path)[1] or ".pdf"
            with tempfile.NamedTemporaryFile(
                delete=False, suffix=pdf_suffix
            ) as tmp_file:
                tmp_path = tmp_file.name
                # Download the PDF and save it to the temporary file
                with urllib.request.urlopen(pdf_path) as response:
                    tmp_file.write(response.read())
            pdf_path = tmp_path

        # Open the PDF
        doc = pymupdf.open(pdf_path)
        try:
            # Convert each page to an image
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)

                # Render page to an image at 2x zoom on both axes
                pix = page.get_pixmap(matrix=pymupdf.Matrix(2, 2))

                # Convert to PIL Image, then to a numpy array
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                images.append(np.array(img))
        finally:
            # Close the document even if rendering a page fails
            doc.close()
    finally:
        # Clean up the temporary file if the PDF was downloaded from a URL
        if tmp_path is not None:
            os.unlink(tmp_path)

    return images
| 3199 | 
            +
             | 
| 3200 | 
            +
             | 
| 3078 3201 | 
             
            def save_video(
         | 
| 3079 3202 | 
             
                frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 5
         | 
| 3080 3203 | 
             
            ) -> str:
         | 
| @@ -3488,7 +3611,7 @@ FUNCTION_TOOLS = [ | |
| 3488 3611 | 
             
                florence2_sam2_instance_segmentation,
         | 
| 3489 3612 | 
             
                florence2_sam2_video_tracking,
         | 
| 3490 3613 | 
             
                claude35_text_extraction,
         | 
| 3491 | 
            -
                 | 
| 3614 | 
            +
                agentic_document_extraction,
         | 
| 3492 3615 | 
             
                document_qa,
         | 
| 3493 3616 | 
             
                ocr,
         | 
| 3494 3617 | 
             
                qwen25_vl_images_vqa,
         | 
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            Metadata-Version: 2.3
         | 
| 2 2 | 
             
            Name: vision-agent
         | 
| 3 | 
            -
            Version: 1. | 
| 3 | 
            +
            Version: 1.1.2
         | 
| 4 4 | 
             
            Summary: Toolset for Vision Agent
         | 
| 5 5 | 
             
            Author: Landing AI
         | 
| 6 6 | 
             
            Author-email: dev@landing.ai
         | 
| @@ -28,6 +28,7 @@ Requires-Dist: pandas (==2.*) | |
| 28 28 | 
             
            Requires-Dist: pillow (==10.*)
         | 
| 29 29 | 
             
            Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
         | 
| 30 30 | 
             
            Requires-Dist: pydantic (>=2.0.0,<3.0.0)
         | 
| 31 | 
            +
            Requires-Dist: pymupdf (>=1.23.0,<2.0.0)
         | 
| 31 32 | 
             
            Requires-Dist: pytube (==15.0.0)
         | 
| 32 33 | 
             
            Requires-Dist: requests (==2.*)
         | 
| 33 34 | 
             
            Requires-Dist: rich (>=13.7.1,<14.0.0)
         | 
| @@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c | |
| 26 26 | 
             
            vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
         | 
| 27 27 | 
             
            vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
         | 
| 28 28 | 
             
            vision_agent/sim/sim.py,sha256=8y060Ck7qOFJDw9k9Vl2xQYbLkNaTd6lP1YzbcwkkXc,9944
         | 
| 29 | 
            -
            vision_agent/tools/__init__.py,sha256= | 
| 29 | 
            +
            vision_agent/tools/__init__.py,sha256=o9lfWBVopT_qSoSi26WcgQJTKQYNgbXv7r4z_o5j2Eg,2467
         | 
| 30 30 | 
             
            vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
         | 
| 31 31 | 
             
            vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
         | 
| 32 32 | 
             
            vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
         | 
| 33 | 
            -
            vision_agent/tools/tools.py,sha256= | 
| 33 | 
            +
            vision_agent/tools/tools.py,sha256=dKKrfKxqQYVDFRsLjMMpp1z4_5k68pkaoZUMf1BMc_Q,125694
         | 
| 34 34 | 
             
            vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
         | 
| 35 35 | 
             
            vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
         | 
| 36 36 | 
             
            vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
         | 
| @@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=XbH5wuD1nlaKe6zBeVPqUbQDsK6D-eCskKND3rRHOzo,8 | |
| 40 40 | 
             
            vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
         | 
| 41 41 | 
             
            vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
         | 
| 42 42 | 
             
            vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
         | 
| 43 | 
            -
            vision_agent-1. | 
| 44 | 
            -
            vision_agent-1. | 
| 45 | 
            -
            vision_agent-1. | 
| 46 | 
            -
            vision_agent-1. | 
| 43 | 
            +
            vision_agent-1.1.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
         | 
| 44 | 
            +
            vision_agent-1.1.2.dist-info/METADATA,sha256=JxWPwfrAwtWx0Fpqq9b9Se7LZi22Ddqiw-YxX6nHe0A,12573
         | 
| 45 | 
            +
            vision_agent-1.1.2.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
         | 
| 46 | 
            +
            vision_agent-1.1.2.dist-info/RECORD,,
         | 
| 
            File without changes
         | 
| 
            File without changes
         |