vision-agent 1.0.10__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +86 -14
- {vision_agent-1.0.10.dist-info → vision_agent-1.1.1.dist-info}/METADATA +3 -3
- {vision_agent-1.0.10.dist-info → vision_agent-1.1.1.dist-info}/RECORD +6 -6
- {vision_agent-1.0.10.dist-info → vision_agent-1.1.1.dist-info}/LICENSE +0 -0
- {vision_agent-1.0.10.dist-info → vision_agent-1.1.1.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
@@ -8,6 +8,7 @@ from .planner_tools import judge_od_results
|
|
8
8
|
from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
|
9
9
|
from .tools import (
|
10
10
|
activity_recognition,
|
11
|
+
agentic_document_extraction,
|
11
12
|
agentic_object_detection,
|
12
13
|
agentic_sam2_instance_segmentation,
|
13
14
|
agentic_sam2_video_tracking,
|
vision_agent/tools/tools.py
CHANGED
@@ -9,6 +9,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
9
|
from importlib import resources
|
10
10
|
from pathlib import Path
|
11
11
|
from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
12
|
+
from warnings import warn
|
12
13
|
|
13
14
|
import cv2
|
14
15
|
import numpy as np
|
@@ -2143,6 +2144,11 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
|
|
2143
2144
|
'summary': 'This table illustrates a trend of ...'},
|
2144
2145
|
],
|
2145
2146
|
"""
|
2147
|
+
warning = (
|
2148
|
+
"This function is deprecated. For document extraction please use the agentic-doc python package on "
|
2149
|
+
"https://pypi.org/project/agentic-doc/ or the agentic_document_extraction function."
|
2150
|
+
)
|
2151
|
+
warn(warning, DeprecationWarning, stacklevel=2)
|
2146
2152
|
|
2147
2153
|
image_file = numpy_to_bytes(image)
|
2148
2154
|
|
@@ -2184,6 +2190,76 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
|
|
2184
2190
|
return data
|
2185
2191
|
|
2186
2192
|
|
2193
|
+
def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
|
2194
|
+
"""'agentic_document_extraction' is a tool that can extract structured information out of
|
2195
|
+
documents with different layouts. It returns the extracted data in a structured
|
2196
|
+
hierarchical format containing text, tables, figures, charts, and other
|
2197
|
+
information.
|
2198
|
+
|
2199
|
+
Parameters:
|
2200
|
+
image (np.ndarray): The document image to analyze
|
2201
|
+
|
2202
|
+
Returns:
|
2203
|
+
Dict[str, Any]: A dictionary containing the extracted information.
|
2204
|
+
|
2205
|
+
Example
|
2206
|
+
-------
|
2207
|
+
>>> agentic_document_extraction(image)
|
2208
|
+
{
|
2209
|
+
"markdown": "# Document title\n\n## Document subtitle\n\nThis is a sample document.",
|
2210
|
+
"chunks": [
|
2211
|
+
{
|
2212
|
+
"text": "# Document title",
|
2213
|
+
"grounding": [
|
2214
|
+
{
|
2215
|
+
"box": [0.06125, 0.019355758266818696, 0.17375, 0.03290478905359179],
|
2216
|
+
"page": 0
|
2217
|
+
}
|
2218
|
+
],
|
2219
|
+
"chunk_type": "page_header",
|
2220
|
+
"chunk_id": "622e0374-c50e-4960-a013-650138b42528"
|
2221
|
+
},
|
2222
|
+
...
|
2223
|
+
]
|
2224
|
+
}
|
2225
|
+
"""
|
2226
|
+
|
2227
|
+
image_file = numpy_to_bytes(image)
|
2228
|
+
|
2229
|
+
files = [("image", image_file)]
|
2230
|
+
|
2231
|
+
payload = {
|
2232
|
+
"model": "agentic-document-analysis",
|
2233
|
+
}
|
2234
|
+
|
2235
|
+
data: Dict[str, Any] = send_inference_request(
|
2236
|
+
payload=payload,
|
2237
|
+
endpoint_name="agentic-document-analysis",
|
2238
|
+
files=files,
|
2239
|
+
v2=True,
|
2240
|
+
metadata_payload={"function_name": "agentic_document_analysis"},
|
2241
|
+
)
|
2242
|
+
|
2243
|
+
# don't display normalized bboxes
|
2244
|
+
_display_tool_trace(
|
2245
|
+
agentic_document_extraction.__name__,
|
2246
|
+
payload,
|
2247
|
+
data,
|
2248
|
+
files,
|
2249
|
+
)
|
2250
|
+
|
2251
|
+
def transform_boxes(data: Dict[str, Any]) -> Dict[str, Any]:
|
2252
|
+
for chunk in data["chunks"]:
|
2253
|
+
for grounding in chunk["grounding"]:
|
2254
|
+
box = grounding["box"]
|
2255
|
+
grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]
|
2256
|
+
return data
|
2257
|
+
|
2258
|
+
data = transform_boxes(data)
|
2259
|
+
|
2260
|
+
return data
|
2261
|
+
|
2262
|
+
|
2187
2263
|
def document_qa(
|
2188
2264
|
prompt: str,
|
2189
2265
|
image: np.ndarray,
|
@@ -2211,29 +2287,25 @@ def document_qa(
|
|
2211
2287
|
files = [("image", image_file)]
|
2212
2288
|
|
2213
2289
|
payload = {
|
2214
|
-
"model": "document-analysis",
|
2290
|
+
"model": "agentic-document-analysis",
|
2215
2291
|
}
|
2216
2292
|
|
2217
2293
|
data: Dict[str, Any] = send_inference_request(
|
2218
2294
|
payload=payload,
|
2219
|
-
endpoint_name="document-analysis",
|
2295
|
+
endpoint_name="agentic-document-analysis",
|
2220
2296
|
files=files,
|
2221
2297
|
v2=True,
|
2222
2298
|
metadata_payload={"function_name": "document_qa"},
|
2223
2299
|
)
|
2224
2300
|
|
2225
|
-
def
|
2226
|
-
|
2227
|
-
|
2228
|
-
|
2229
|
-
|
2230
|
-
|
2231
|
-
elif isinstance(data, List):
|
2232
|
-
for i in range(len(data)):
|
2233
|
-
data[i] = normalize(data[i])
|
2234
|
-
return data # type: ignore
|
2301
|
+
def transform_boxes(data: Dict[str, Any]) -> Dict[str, Any]:
|
2302
|
+
for chunk in data["chunks"]:
|
2303
|
+
for grounding in chunk["grounding"]:
|
2304
|
+
box = grounding["box"]
|
2305
|
+
grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]
|
2306
|
+
return data
|
2235
2307
|
|
2236
|
-
data =
|
2308
|
+
data = transform_boxes(data)
|
2237
2309
|
|
2238
2310
|
prompt = f"""
|
2239
2311
|
Document Context:
|
@@ -3488,7 +3560,7 @@ FUNCTION_TOOLS = [
|
|
3488
3560
|
florence2_sam2_instance_segmentation,
|
3489
3561
|
florence2_sam2_video_tracking,
|
3490
3562
|
claude35_text_extraction,
|
3491
|
-
|
3563
|
+
agentic_document_extraction,
|
3492
3564
|
document_qa,
|
3493
3565
|
ocr,
|
3494
3566
|
qwen25_vl_images_vqa,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 1.0.10
|
3
|
+
Version: 1.1.1
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -65,10 +65,10 @@ VisionAgent is the Visual AI Pilot from LandingAI. Submit a prompt and image to
|
|
65
65
|
If you are a seasoned developer who wants to build locally using this library and enjoys having more control then we recommend setting this up. Otherwise, you can use the [VisionAgent web app](https://va.landing.ai/).
|
66
66
|
|
67
67
|
## Get Your VisionAgent API Key
|
68
|
-
The most important step is to [signup]
|
68
|
+
The most important step is to [signup](https://va.landing.ai/agent) and obtain your [API key](https://va.landing.ai/account/api-key).
|
69
69
|
|
70
70
|
### Other Prerequisites
|
71
|
-
- Python version
|
71
|
+
- Python version 3.9 or higher
|
72
72
|
- [Anthropic API key](#get-an-anthropic-api-key)
|
73
73
|
- [Gemini API key](#get-a-gemini-api-key)
|
74
74
|
|
@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
|
|
26
26
|
vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
27
27
|
vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
|
28
28
|
vision_agent/sim/sim.py,sha256=8y060Ck7qOFJDw9k9Vl2xQYbLkNaTd6lP1YzbcwkkXc,9944
|
29
|
-
vision_agent/tools/__init__.py,sha256=
|
29
|
+
vision_agent/tools/__init__.py,sha256=o9lfWBVopT_qSoSi26WcgQJTKQYNgbXv7r4z_o5j2Eg,2467
|
30
30
|
vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
|
31
31
|
vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
|
32
32
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
33
|
-
vision_agent/tools/tools.py,sha256=
|
33
|
+
vision_agent/tools/tools.py,sha256=yr45nu0Hr9_KT5sQn6ggaI0FP65XP0dlAmBtHnKihPU,124180
|
34
34
|
vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
|
35
35
|
vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
|
36
36
|
vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
|
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=XbH5wuD1nlaKe6zBeVPqUbQDsK6D-eCskKND3rRHOzo,8
|
|
40
40
|
vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
|
41
41
|
vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
|
42
42
|
vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
|
43
|
-
vision_agent-1.
|
44
|
-
vision_agent-1.
|
45
|
-
vision_agent-1.
|
46
|
-
vision_agent-1.
|
43
|
+
vision_agent-1.1.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
44
|
+
vision_agent-1.1.1.dist-info/METADATA,sha256=T9CSlGabaZwR1u2ZQlV2wTkXNpQZi5Nn1KwJMSo7s2o,12532
|
45
|
+
vision_agent-1.1.1.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
46
|
+
vision_agent-1.1.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|