vision-agent 1.0.11__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registries.
@@ -8,6 +8,7 @@ from .planner_tools import judge_od_results
8
8
  from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
9
9
  from .tools import (
10
10
  activity_recognition,
11
+ agentic_document_extraction,
11
12
  agentic_object_detection,
12
13
  agentic_sam2_instance_segmentation,
13
14
  agentic_sam2_video_tracking,
@@ -9,6 +9,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
9
9
  from importlib import resources
10
10
  from pathlib import Path
11
11
  from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
12
+ from warnings import warn
12
13
 
13
14
  import cv2
14
15
  import numpy as np
@@ -2143,6 +2144,11 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
2143
2144
  'summary': 'This table illustrates a trend of ...'},
2144
2145
  ],
2145
2146
  """
2147
+ warning = (
2148
+ "This function is deprecated. For document extraction please use the agentic-doc python package on "
2149
+ "https://pypi.org/project/agentic-doc/ or the agentic_document_extraction function."
2150
+ )
2151
+ warn(warning, DeprecationWarning, stacklevel=2)
2146
2152
 
2147
2153
  image_file = numpy_to_bytes(image)
2148
2154
 
@@ -2184,6 +2190,76 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
2184
2190
  return data
2185
2191
 
2186
2192
 
2193
def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
    """'agentic_document_extraction' is a tool that can extract structured information out of
    documents with different layouts. It returns the extracted data in a structured
    hierarchical format containing text, tables, figures, charts, and other
    information.

    Parameters:
        image (np.ndarray): The document image to analyze

    Returns:
        Dict[str, Any]: A dictionary containing the extracted information. Every
            grounding "box" is returned as a flat [l, t, r, b] list.

    Example
    -------
        >>> agentic_document_extraction(image)
        {
            "markdown": "# Document title\\n\\n## Document subtitle\\n\\nThis is a sample document.",
            "chunks": [
                {
                    "text": "# Document title",
                    "grounding": [
                        {
                            "box": [0.06125, 0.019355758266818696, 0.17375, 0.03290478905359179],
                            "page": 0
                        }
                    ],
                    "chunk_type": "page_header",
                    "chunk_id": "622e0374-c50e-4960-a013-650138b42528"
                },
                ...
            ]
        }
    """

    image_file = numpy_to_bytes(image)

    files = [("image", image_file)]

    payload = {
        "model": "agentic-document-analysis",
    }

    data: Dict[str, Any] = send_inference_request(
        payload=payload,
        endpoint_name="agentic-document-analysis",
        files=files,
        v2=True,
        # NOTE(review): the reported function name differs from this tool's
        # name; kept as-is because downstream consumers may key on it - confirm.
        metadata_payload={"function_name": "agentic_document_analysis"},
    )

    # Trace the response as it was received, before the boxes are reshaped.
    _display_tool_trace(
        agentic_document_extraction.__name__,
        payload,
        data,
        files,
    )

    # The service returns each box as a mapping with "l", "t", "r", "b" keys;
    # flatten it in place to [l, t, r, b]. Missing "chunks"/"grounding" entries
    # and boxes that are not mappings are left untouched instead of raising.
    for chunk in data.get("chunks") or []:
        for grounding in chunk.get("grounding") or []:
            box = grounding.get("box")
            if isinstance(box, dict):
                grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]

    return data
2261
+
2262
+
2187
2263
  def document_qa(
2188
2264
  prompt: str,
2189
2265
  image: np.ndarray,
@@ -2211,29 +2287,25 @@ def document_qa(
2211
2287
  files = [("image", image_file)]
2212
2288
 
2213
2289
  payload = {
2214
- "model": "document-analysis",
2290
+ "model": "agentic-document-analysis",
2215
2291
  }
2216
2292
 
2217
2293
  data: Dict[str, Any] = send_inference_request(
2218
2294
  payload=payload,
2219
- endpoint_name="document-analysis",
2295
+ endpoint_name="agentic-document-analysis",
2220
2296
  files=files,
2221
2297
  v2=True,
2222
2298
  metadata_payload={"function_name": "document_qa"},
2223
2299
  )
2224
2300
 
2225
- def normalize(data: Any) -> Dict[str, Any]:
2226
- if isinstance(data, Dict):
2227
- if "bbox" in data:
2228
- data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
2229
- for key in data:
2230
- data[key] = normalize(data[key])
2231
- elif isinstance(data, List):
2232
- for i in range(len(data)):
2233
- data[i] = normalize(data[i])
2234
- return data # type: ignore
2301
+ def transform_boxes(data: Dict[str, Any]) -> Dict[str, Any]:
2302
+ for chunk in data["chunks"]:
2303
+ for grounding in chunk["grounding"]:
2304
+ box = grounding["box"]
2305
+ grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]
2306
+ return data
2235
2307
 
2236
- data = normalize(data)
2308
+ data = transform_boxes(data)
2237
2309
 
2238
2310
  prompt = f"""
2239
2311
  Document Context:
@@ -3488,7 +3560,7 @@ FUNCTION_TOOLS = [
3488
3560
  florence2_sam2_instance_segmentation,
3489
3561
  florence2_sam2_video_tracking,
3490
3562
  claude35_text_extraction,
3491
- document_extraction,
3563
+ agentic_document_extraction,
3492
3564
  document_qa,
3493
3565
  ocr,
3494
3566
  qwen25_vl_images_vqa,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: vision-agent
3
- Version: 1.0.11
3
+ Version: 1.1.1
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
26
26
  vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
27
27
  vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
28
28
  vision_agent/sim/sim.py,sha256=8y060Ck7qOFJDw9k9Vl2xQYbLkNaTd6lP1YzbcwkkXc,9944
29
- vision_agent/tools/__init__.py,sha256=H8M5v--cANBiOWvAfUJNj9cq9PKm_DjRrG1MeNRWpHs,2434
29
+ vision_agent/tools/__init__.py,sha256=o9lfWBVopT_qSoSi26WcgQJTKQYNgbXv7r4z_o5j2Eg,2467
30
30
  vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
31
31
  vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
- vision_agent/tools/tools.py,sha256=4gwL8EFMwm6l0MujftJ8G8BO2z8Dh_a4FPjy_xUmYqs,121889
33
+ vision_agent/tools/tools.py,sha256=yr45nu0Hr9_KT5sQn6ggaI0FP65XP0dlAmBtHnKihPU,124180
34
34
  vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
35
35
  vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
36
36
  vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=XbH5wuD1nlaKe6zBeVPqUbQDsK6D-eCskKND3rRHOzo,8
40
40
  vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
41
41
  vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
42
42
  vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
43
- vision_agent-1.0.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
- vision_agent-1.0.11.dist-info/METADATA,sha256=dbo4wR0zh5vN19V2uj65t1avenlKmG-L-ykf7BK2dns,12533
45
- vision_agent-1.0.11.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
46
- vision_agent-1.0.11.dist-info/RECORD,,
43
+ vision_agent-1.1.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
+ vision_agent-1.1.1.dist-info/METADATA,sha256=T9CSlGabaZwR1u2ZQlV2wTkXNpQZi5Nn1KwJMSo7s2o,12532
45
+ vision_agent-1.1.1.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
46
+ vision_agent-1.1.1.dist-info/RECORD,,