vision-agent 1.0.11__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@ from .planner_tools import judge_od_results
8
8
  from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
9
9
  from .tools import (
10
10
  activity_recognition,
11
+ agentic_document_extraction,
11
12
  agentic_object_detection,
12
13
  agentic_sam2_instance_segmentation,
13
14
  agentic_sam2_video_tracking,
@@ -9,6 +9,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
9
9
  from importlib import resources
10
10
  from pathlib import Path
11
11
  from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
12
+ from warnings import warn
12
13
 
13
14
  import cv2
14
15
  import numpy as np
@@ -18,6 +19,7 @@ from IPython.display import display
18
19
  from PIL import Image, ImageDraw, ImageFont
19
20
  from pillow_heif import register_heif_opener # type: ignore
20
21
  from pytube import YouTube # type: ignore
22
+ import pymupdf # type: ignore
21
23
 
22
24
  from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
23
25
  from vision_agent.utils.execute import FileSerializer, MimeType
@@ -2143,6 +2145,11 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
2143
2145
  'summary': 'This table illustrates a trend of ...'},
2144
2146
  ],
2145
2147
  """
2148
+ warning = (
2149
+ "This function is deprecated. For document extraction please use the agentic-doc python package on "
2150
+ "https://pypi.org/project/agentic-doc/ or the agentic_document_extraction function."
2151
+ )
2152
+ warn(warning, DeprecationWarning, stacklevel=2)
2146
2153
 
2147
2154
  image_file = numpy_to_bytes(image)
2148
2155
 
@@ -2184,6 +2191,76 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
2184
2191
  return data
2185
2192
 
2186
2193
 
2194
def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
    """'agentic_document_extraction' is a tool that can extract structured information out of
    documents with different layouts. It returns the extracted data in a structured
    hierarchical format containing text, tables, figures, charts, and other
    information.

    Parameters:
        image (np.ndarray): The document image to analyze

    Returns:
        Dict[str, Any]: A dictionary containing the extracted information.

    Example
    -------
        >>> agentic_document_extraction(image)
        {
            "markdown": "# Document title\n\n## Document subtitle\n\nThis is a sample document.",
            "chunks": [
                {
                    "text": "# Document title",
                    "grounding": [
                        {
                            "box": [0.06125, 0.019355758266818696, 0.17375, 0.03290478905359179],
                            "page": 0
                        }
                    ],
                    "chunk_type": "page_header",
                    "chunk_id": "622e0374-c50e-4960-a013-650138b42528"
                },
                ...
            ]
        }
    """

    image_file = numpy_to_bytes(image)
    files = [("image", image_file)]
    payload = {"model": "agentic-document-analysis"}

    # NOTE(review): the reported function_name is "agentic_document_analysis",
    # not this function's own name. It is kept as-is because the receiver of
    # this metadata is not visible here - confirm whether that is intentional.
    data: Dict[str, Any] = send_inference_request(
        payload=payload,
        endpoint_name="agentic-document-analysis",
        files=files,
        v2=True,
        metadata_payload={"function_name": "agentic_document_analysis"},
    )

    # The trace is recorded before the boxes are reshaped below
    # (original note: don't display normalized bboxes).
    _display_tool_trace(
        agentic_document_extraction.__name__,
        payload,
        data,
        files,
    )

    # Each grounding box is indexed by the keys "l", "t", "r", "b" in the
    # response; flatten it in place to the [l, t, r, b] list shown in the
    # docstring example.
    for chunk in data["chunks"]:
        for grounding in chunk["grounding"]:
            box = grounding["box"]
            grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]

    return data
2262
+
2263
+
2187
2264
  def document_qa(
2188
2265
  prompt: str,
2189
2266
  image: np.ndarray,
@@ -2211,29 +2288,25 @@ def document_qa(
2211
2288
  files = [("image", image_file)]
2212
2289
 
2213
2290
  payload = {
2214
- "model": "document-analysis",
2291
+ "model": "agentic-document-analysis",
2215
2292
  }
2216
2293
 
2217
2294
  data: Dict[str, Any] = send_inference_request(
2218
2295
  payload=payload,
2219
- endpoint_name="document-analysis",
2296
+ endpoint_name="agentic-document-analysis",
2220
2297
  files=files,
2221
2298
  v2=True,
2222
2299
  metadata_payload={"function_name": "document_qa"},
2223
2300
  )
2224
2301
 
2225
- def normalize(data: Any) -> Dict[str, Any]:
2226
- if isinstance(data, Dict):
2227
- if "bbox" in data:
2228
- data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
2229
- for key in data:
2230
- data[key] = normalize(data[key])
2231
- elif isinstance(data, List):
2232
- for i in range(len(data)):
2233
- data[i] = normalize(data[i])
2234
- return data # type: ignore
2302
+ def transform_boxes(data: Dict[str, Any]) -> Dict[str, Any]:
2303
+ for chunk in data["chunks"]:
2304
+ for grounding in chunk["grounding"]:
2305
+ box = grounding["box"]
2306
+ grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]
2307
+ return data
2235
2308
 
2236
- data = normalize(data)
2309
+ data = transform_boxes(data)
2237
2310
 
2238
2311
  prompt = f"""
2239
2312
  Document Context:
@@ -3075,6 +3148,56 @@ def save_image(image: np.ndarray, file_path: str) -> None:
3075
3148
  pil_image.save(file_path)
3076
3149
 
3077
3150
 
3151
def load_pdf(pdf_path: str) -> List[np.ndarray]:
    """'load_pdf' is a utility function that loads a PDF from the given file path or URL string and converts each page to an image.

    Parameters:
        pdf_path (str): The path or http(s) URL to the PDF file.

    Returns:
        List[np.ndarray]: A list of images as NumPy arrays, one for each page of the PDF.

    Example
    -------
        >>> load_pdf("path/to/document.pdf")
    """
    # Local import: only the URL branch needs it, and it keeps this block
    # independent of the file's top-level import list.
    from urllib.parse import urlparse

    # Name of the temporary download, if any. Tracked separately from
    # `pdf_path` so the cleanup below does not depend on what the path string
    # looks like after it has been reassigned.
    temp_path: Optional[str] = None
    images: List[np.ndarray] = []

    try:
        # Handle URL case. Match the full scheme so that local paths that
        # merely start with "http" are not treated as URLs.
        if pdf_path.startswith(("http://", "https://")):
            # Take the suffix from the URL's path component only, so a query
            # string or fragment does not end up in the temp file name.
            _, pdf_suffix = os.path.splitext(urlparse(pdf_path).path)
            with tempfile.NamedTemporaryFile(
                delete=False, suffix=pdf_suffix
            ) as tmp_file:
                # Record the name first so a failed download is still removed.
                temp_path = tmp_file.name
                with urllib.request.urlopen(pdf_path) as response:
                    tmp_file.write(response.read())
            pdf_path = temp_path

        doc = pymupdf.open(pdf_path)
        try:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)

                # Render the page with a 2x scale on both axes.
                pix = page.get_pixmap(matrix=pymupdf.Matrix(2, 2))

                # Pixmap samples -> PIL image -> NumPy array.
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                images.append(np.array(img))
        finally:
            # Release the document even if rendering a page fails.
            doc.close()
    finally:
        # Remove the downloaded copy, if there was one.
        if temp_path is not None:
            os.unlink(temp_path)

    return images
3199
+
3200
+
3078
3201
  def save_video(
3079
3202
  frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 5
3080
3203
  ) -> str:
@@ -3488,7 +3611,7 @@ FUNCTION_TOOLS = [
3488
3611
  florence2_sam2_instance_segmentation,
3489
3612
  florence2_sam2_video_tracking,
3490
3613
  claude35_text_extraction,
3491
- document_extraction,
3614
+ agentic_document_extraction,
3492
3615
  document_qa,
3493
3616
  ocr,
3494
3617
  qwen25_vl_images_vqa,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: vision-agent
3
- Version: 1.0.11
3
+ Version: 1.1.2
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -28,6 +28,7 @@ Requires-Dist: pandas (==2.*)
28
28
  Requires-Dist: pillow (==10.*)
29
29
  Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
30
30
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
31
+ Requires-Dist: pymupdf (>=1.23.0,<2.0.0)
31
32
  Requires-Dist: pytube (==15.0.0)
32
33
  Requires-Dist: requests (==2.*)
33
34
  Requires-Dist: rich (>=13.7.1,<14.0.0)
@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
26
26
  vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
27
27
  vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
28
28
  vision_agent/sim/sim.py,sha256=8y060Ck7qOFJDw9k9Vl2xQYbLkNaTd6lP1YzbcwkkXc,9944
29
- vision_agent/tools/__init__.py,sha256=H8M5v--cANBiOWvAfUJNj9cq9PKm_DjRrG1MeNRWpHs,2434
29
+ vision_agent/tools/__init__.py,sha256=o9lfWBVopT_qSoSi26WcgQJTKQYNgbXv7r4z_o5j2Eg,2467
30
30
  vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
31
31
  vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
- vision_agent/tools/tools.py,sha256=4gwL8EFMwm6l0MujftJ8G8BO2z8Dh_a4FPjy_xUmYqs,121889
33
+ vision_agent/tools/tools.py,sha256=dKKrfKxqQYVDFRsLjMMpp1z4_5k68pkaoZUMf1BMc_Q,125694
34
34
  vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
35
35
  vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
36
36
  vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=XbH5wuD1nlaKe6zBeVPqUbQDsK6D-eCskKND3rRHOzo,8
40
40
  vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
41
41
  vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
42
42
  vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
43
- vision_agent-1.0.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
- vision_agent-1.0.11.dist-info/METADATA,sha256=dbo4wR0zh5vN19V2uj65t1avenlKmG-L-ykf7BK2dns,12533
45
- vision_agent-1.0.11.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
46
- vision_agent-1.0.11.dist-info/RECORD,,
43
+ vision_agent-1.1.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
+ vision_agent-1.1.2.dist-info/METADATA,sha256=JxWPwfrAwtWx0Fpqq9b9Se7LZi22Ddqiw-YxX6nHe0A,12573
45
+ vision_agent-1.1.2.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
46
+ vision_agent-1.1.2.dist-info/RECORD,,