PyPI - vision-agent - Versions diffs - 1.0.11__py3-none-any.whl → 1.1.2__py3-none-any.whl - Mend

vision-agent 1.0.11__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

vision_agent/tools/__init__.py CHANGED Viewed

@@ -8,6 +8,7 @@ from .planner_tools import judge_od_results
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import (
     activity_recognition,
+    agentic_document_extraction,
     agentic_object_detection,
     agentic_sam2_instance_segmentation,
     agentic_sam2_video_tracking,

vision_agent/tools/tools.py CHANGED Viewed

@@ -9,6 +9,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from importlib import resources
 from pathlib import Path
 from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
+from warnings import warn
 import cv2
 import numpy as np
@@ -18,6 +19,7 @@ from IPython.display import display
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
+import pymupdf  # type: ignore
 from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
 from vision_agent.utils.execute import FileSerializer, MimeType
@@ -2143,6 +2145,11 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
                                 'summary': 'This table illustrates a trend of ...'},
                     ],
     """
+    warning = (
+        "This function is deprecated. For document extraction please use the agentic-doc python package on "
+        "https://pypi.org/project/agentic-doc/ or the agentic_document_extraction function."
+    )
+    warn(warning, DeprecationWarning, stacklevel=2)
     image_file = numpy_to_bytes(image)
@@ -2184,6 +2191,76 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
     return data
+def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
+    """'agentic_document_extraction' is a tool that can extract structured information out of
+    documents with different layouts. It returns the extracted data in a structured
+    hierarchical format containing text, tables, figures, charts, and other
+    information.
+    Parameters:
+        image (np.ndarray): The document image to analyze
+    Returns:
+        Dict[str, Any]: A dictionary containing the extracted information.
+    Example
+    -------
+        >>> agentic_document_analysis(image)
+        {
+            "markdown": "# Document title\n\n## Document subtitle\n\nThis is a sample document.",
+            "chunks": [
+                {
+                    "text": "# Document title",
+                    "grounding": [
+                        {
+                            "box": [0.06125, 0.019355758266818696, 0.17375, 0.03290478905359179],
+                            "page": 0
+                        }
+                    ],
+                    "chunk_type": "page_header",
+                    "chunk_id": "622e0374-c50e-4960-a013-650138b42528"
+                },
+            ...
+            ]
+        }
+    """
+    image_file = numpy_to_bytes(image)
+    files = [("image", image_file)]
+    payload = {
+        "model": "agentic-document-analysis",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload=payload,
+        endpoint_name="agentic-document-analysis",
+        files=files,
+        v2=True,
+        metadata_payload={"function_name": "agentic_document_analysis"},
+    )
+    # don't display normalized bboxes
+    _display_tool_trace(
+        agentic_document_extraction.__name__,
+        payload,
+        data,
+        files,
+    )
+    def transform_boxes(data: Dict[str, Any]) -> Dict[str, Any]:
+        for chunk in data["chunks"]:
+            for grounding in chunk["grounding"]:
+                box = grounding["box"]
+                grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]
+        return data
+    data = transform_boxes(data)
+    return data
 def document_qa(
     prompt: str,
     image: np.ndarray,
@@ -2211,29 +2288,25 @@ def document_qa(
     files = [("image", image_file)]
     payload = {
-        "model": "document-analysis",
+        "model": "agentic-document-analysis",
     }
     data: Dict[str, Any] = send_inference_request(
         payload=payload,
-        endpoint_name="document-analysis",
+        endpoint_name="agentic-document-analysis",
         files=files,
         v2=True,
         metadata_payload={"function_name": "document_qa"},
     )
-    def normalize(data: Any) -> Dict[str, Any]:
-        if isinstance(data, Dict):
-            if "bbox" in data:
-                data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
-            for key in data:
-                data[key] = normalize(data[key])
-        elif isinstance(data, List):
-            for i in range(len(data)):
-                data[i] = normalize(data[i])
-        return data  # type: ignore
+    def transform_boxes(data: Dict[str, Any]) -> Dict[str, Any]:
+        for chunk in data["chunks"]:
+            for grounding in chunk["grounding"]:
+                box = grounding["box"]
+                grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]
+        return data
-    data = normalize(data)
+    data = transform_boxes(data)
     prompt = f"""
 Document Context:
@@ -3075,6 +3148,56 @@ def save_image(image: np.ndarray, file_path: str) -> None:
     pil_image.save(file_path)
+def load_pdf(pdf_path: str) -> List[np.ndarray]:
+    """'load_pdf' is a utility function that loads a PDF from the given file path string and converts each page to an image.
+    Parameters:
+        pdf_path (str): The path to the PDF file.
+    Returns:
+        List[np.ndarray]: A list of images as NumPy arrays, one for each page of the PDF.
+    Example
+    -------
+        >>> load_pdf("path/to/document.pdf")
+    """
+    # Handle URL case
+    if pdf_path.startswith(("http", "https")):
+        _, pdf_suffix = os.path.splitext(pdf_path)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=pdf_suffix) as tmp_file:
+            # Download the PDF and save it to the temporary file
+            with urllib.request.urlopen(pdf_path) as response:
+                tmp_file.write(response.read())
+            pdf_path = tmp_file.name
+    # Open the PDF
+    doc = pymupdf.open(pdf_path)
+    images = []
+    # Convert each page to an image
+    for page_num in range(len(doc)):
+        page = doc.load_page(page_num)
+        # Render page to an image
+        pix = page.get_pixmap(matrix=pymupdf.Matrix(2, 2))
+        # Convert to PIL Image
+        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+        # Convert to numpy array
+        images.append(np.array(img))
+    # Close the document
+    doc.close()
+    # Clean up temporary file if it was a URL
+    if pdf_path.startswith(("http", "https")):
+        os.unlink(pdf_path)
+    return images
 def save_video(
     frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 5
 ) -> str:
@@ -3488,7 +3611,7 @@ FUNCTION_TOOLS = [
     florence2_sam2_instance_segmentation,
     florence2_sam2_video_tracking,
     claude35_text_extraction,
-    document_extraction,
+    agentic_document_extraction,
     document_qa,
     ocr,
     qwen25_vl_images_vqa,

{vision_agent-1.0.11.dist-info → vision_agent-1.1.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: vision-agent
-Version: 1.0.11
+Version: 1.1.2
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -28,6 +28,7 @@ Requires-Dist: pandas (==2.*)
 Requires-Dist: pillow (==10.*)
 Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
+Requires-Dist: pymupdf (>=1.23.0,<2.0.0)
 Requires-Dist: pytube (==15.0.0)
 Requires-Dist: requests (==2.*)
 Requires-Dist: rich (>=13.7.1,<14.0.0)

{vision_agent-1.0.11.dist-info → vision_agent-1.1.2.dist-info}/RECORD RENAMED Viewed

@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
 vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
 vision_agent/sim/sim.py,sha256=8y060Ck7qOFJDw9k9Vl2xQYbLkNaTd6lP1YzbcwkkXc,9944
-vision_agent/tools/__init__.py,sha256=H8M5v--cANBiOWvAfUJNj9cq9PKm_DjRrG1MeNRWpHs,2434
+vision_agent/tools/__init__.py,sha256=o9lfWBVopT_qSoSi26WcgQJTKQYNgbXv7r4z_o5j2Eg,2467
 vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
 vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=4gwL8EFMwm6l0MujftJ8G8BO2z8Dh_a4FPjy_xUmYqs,121889
+vision_agent/tools/tools.py,sha256=dKKrfKxqQYVDFRsLjMMpp1z4_5k68pkaoZUMf1BMc_Q,125694
 vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
 vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
 vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=XbH5wuD1nlaKe6zBeVPqUbQDsK6D-eCskKND3rRHOzo,8
 vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
 vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
 vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
-vision_agent-1.0.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-1.0.11.dist-info/METADATA,sha256=dbo4wR0zh5vN19V2uj65t1avenlKmG-L-ykf7BK2dns,12533
-vision_agent-1.0.11.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-vision_agent-1.0.11.dist-info/RECORD,,
+vision_agent-1.1.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-1.1.2.dist-info/METADATA,sha256=JxWPwfrAwtWx0Fpqq9b9Se7LZi22Ddqiw-YxX6nHe0A,12573
+vision_agent-1.1.2.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+vision_agent-1.1.2.dist-info/RECORD,,

{vision_agent-1.0.11.dist-info → vision_agent-1.1.2.dist-info}/LICENSE RENAMED Viewed

File without changes

{vision_agent-1.0.11.dist-info → vision_agent-1.1.2.dist-info}/WHEEL RENAMED Viewed

File without changes

vision-agent 1.0.11__py3-none-any.whl → 1.1.2__py3-none-any.whl

vision-agent 1.0.11__py3-none-any.whl → 1.1.2__py3-none-any.whl