vision-agent 1.0.11__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff compares the contents of two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +86 -14
- {vision_agent-1.0.11.dist-info → vision_agent-1.1.1.dist-info}/METADATA +1 -1
- {vision_agent-1.0.11.dist-info → vision_agent-1.1.1.dist-info}/RECORD +6 -6
- {vision_agent-1.0.11.dist-info → vision_agent-1.1.1.dist-info}/LICENSE +0 -0
- {vision_agent-1.0.11.dist-info → vision_agent-1.1.1.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
@@ -8,6 +8,7 @@ from .planner_tools import judge_od_results
|
|
8
8
|
from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
|
9
9
|
from .tools import (
|
10
10
|
activity_recognition,
|
11
|
+
agentic_document_extraction,
|
11
12
|
agentic_object_detection,
|
12
13
|
agentic_sam2_instance_segmentation,
|
13
14
|
agentic_sam2_video_tracking,
|
vision_agent/tools/tools.py
CHANGED
@@ -9,6 +9,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
9
|
from importlib import resources
|
10
10
|
from pathlib import Path
|
11
11
|
from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
12
|
+
from warnings import warn
|
12
13
|
|
13
14
|
import cv2
|
14
15
|
import numpy as np
|
@@ -2143,6 +2144,11 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
|
|
2143
2144
|
'summary': 'This table illustrates a trend of ...'},
|
2144
2145
|
],
|
2145
2146
|
"""
|
2147
|
+
warning = (
|
2148
|
+
"This function is deprecated. For document extraction please use the agentic-doc python package on "
|
2149
|
+
"https://pypi.org/project/agentic-doc/ or the agentic_document_extraction function."
|
2150
|
+
)
|
2151
|
+
warn(warning, DeprecationWarning, stacklevel=2)
|
2146
2152
|
|
2147
2153
|
image_file = numpy_to_bytes(image)
|
2148
2154
|
|
@@ -2184,6 +2190,76 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
|
|
2184
2190
|
return data
|
2185
2191
|
|
2186
2192
|
|
2193
|
+
def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
|
2194
|
+
"""'agentic_document_extraction' is a tool that can extract structured information out of
|
2195
|
+
documents with different layouts. It returns the extracted data in a structured
|
2196
|
+
hierarchical format containing text, tables, figures, charts, and other
|
2197
|
+
information.
|
2198
|
+
|
2199
|
+
Parameters:
|
2200
|
+
image (np.ndarray): The document image to analyze
|
2201
|
+
|
2202
|
+
Returns:
|
2203
|
+
Dict[str, Any]: A dictionary containing the extracted information.
|
2204
|
+
|
2205
|
+
Example
|
2206
|
+
-------
|
2207
|
+
>>> agentic_document_extraction(image)
|
2208
|
+
{
|
2209
|
+
"markdown": "# Document title\n\n## Document subtitle\n\nThis is a sample document.",
|
2210
|
+
"chunks": [
|
2211
|
+
{
|
2212
|
+
"text": "# Document title",
|
2213
|
+
"grounding": [
|
2214
|
+
{
|
2215
|
+
"box": [0.06125, 0.019355758266818696, 0.17375, 0.03290478905359179],
|
2216
|
+
"page": 0
|
2217
|
+
}
|
2218
|
+
],
|
2219
|
+
"chunk_type": "page_header",
|
2220
|
+
"chunk_id": "622e0374-c50e-4960-a013-650138b42528"
|
2221
|
+
},
|
2222
|
+
...
|
2223
|
+
]
|
2224
|
+
}
|
2225
|
+
"""
|
2226
|
+
|
2227
|
+
image_file = numpy_to_bytes(image)
|
2228
|
+
|
2229
|
+
files = [("image", image_file)]
|
2230
|
+
|
2231
|
+
payload = {
|
2232
|
+
"model": "agentic-document-analysis",
|
2233
|
+
}
|
2234
|
+
|
2235
|
+
data: Dict[str, Any] = send_inference_request(
|
2236
|
+
payload=payload,
|
2237
|
+
endpoint_name="agentic-document-analysis",
|
2238
|
+
files=files,
|
2239
|
+
v2=True,
|
2240
|
+
metadata_payload={"function_name": "agentic_document_analysis"},
|
2241
|
+
)
|
2242
|
+
|
2243
|
+
# don't display normalized bboxes
|
2244
|
+
_display_tool_trace(
|
2245
|
+
agentic_document_extraction.__name__,
|
2246
|
+
payload,
|
2247
|
+
data,
|
2248
|
+
files,
|
2249
|
+
)
|
2250
|
+
|
2251
|
+
def transform_boxes(data: Dict[str, Any]) -> Dict[str, Any]:
|
2252
|
+
for chunk in data["chunks"]:
|
2253
|
+
for grounding in chunk["grounding"]:
|
2254
|
+
box = grounding["box"]
|
2255
|
+
grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]
|
2256
|
+
return data
|
2257
|
+
|
2258
|
+
data = transform_boxes(data)
|
2259
|
+
|
2260
|
+
return data
|
2261
|
+
|
2262
|
+
|
2187
2263
|
def document_qa(
|
2188
2264
|
prompt: str,
|
2189
2265
|
image: np.ndarray,
|
@@ -2211,29 +2287,25 @@ def document_qa(
|
|
2211
2287
|
files = [("image", image_file)]
|
2212
2288
|
|
2213
2289
|
payload = {
|
2214
|
-
"model": "document-analysis",
|
2290
|
+
"model": "agentic-document-analysis",
|
2215
2291
|
}
|
2216
2292
|
|
2217
2293
|
data: Dict[str, Any] = send_inference_request(
|
2218
2294
|
payload=payload,
|
2219
|
-
endpoint_name="document-analysis",
|
2295
|
+
endpoint_name="agentic-document-analysis",
|
2220
2296
|
files=files,
|
2221
2297
|
v2=True,
|
2222
2298
|
metadata_payload={"function_name": "document_qa"},
|
2223
2299
|
)
|
2224
2300
|
|
2225
|
-
def
|
2226
|
-
|
2227
|
-
|
2228
|
-
|
2229
|
-
|
2230
|
-
|
2231
|
-
elif isinstance(data, List):
|
2232
|
-
for i in range(len(data)):
|
2233
|
-
data[i] = normalize(data[i])
|
2234
|
-
return data # type: ignore
|
2301
|
+
def transform_boxes(data: Dict[str, Any]) -> Dict[str, Any]:
|
2302
|
+
for chunk in data["chunks"]:
|
2303
|
+
for grounding in chunk["grounding"]:
|
2304
|
+
box = grounding["box"]
|
2305
|
+
grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]
|
2306
|
+
return data
|
2235
2307
|
|
2236
|
-
data = normalize(data)
|
2308
|
+
data = transform_boxes(data)
|
2237
2309
|
|
2238
2310
|
prompt = f"""
|
2239
2311
|
Document Context:
|
@@ -3488,7 +3560,7 @@ FUNCTION_TOOLS = [
|
|
3488
3560
|
florence2_sam2_instance_segmentation,
|
3489
3561
|
florence2_sam2_video_tracking,
|
3490
3562
|
claude35_text_extraction,
|
3491
|
-
|
3563
|
+
agentic_document_extraction,
|
3492
3564
|
document_qa,
|
3493
3565
|
ocr,
|
3494
3566
|
qwen25_vl_images_vqa,
|
@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
|
|
26
26
|
vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
27
27
|
vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
|
28
28
|
vision_agent/sim/sim.py,sha256=8y060Ck7qOFJDw9k9Vl2xQYbLkNaTd6lP1YzbcwkkXc,9944
|
29
|
-
vision_agent/tools/__init__.py,sha256=
|
29
|
+
vision_agent/tools/__init__.py,sha256=o9lfWBVopT_qSoSi26WcgQJTKQYNgbXv7r4z_o5j2Eg,2467
|
30
30
|
vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
|
31
31
|
vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
|
32
32
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
33
|
-
vision_agent/tools/tools.py,sha256=
|
33
|
+
vision_agent/tools/tools.py,sha256=yr45nu0Hr9_KT5sQn6ggaI0FP65XP0dlAmBtHnKihPU,124180
|
34
34
|
vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
|
35
35
|
vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
|
36
36
|
vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
|
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=XbH5wuD1nlaKe6zBeVPqUbQDsK6D-eCskKND3rRHOzo,8
|
|
40
40
|
vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
|
41
41
|
vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
|
42
42
|
vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
|
43
|
-
vision_agent-1.
|
44
|
-
vision_agent-1.
|
45
|
-
vision_agent-1.
|
46
|
-
vision_agent-1.
|
43
|
+
vision_agent-1.1.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
44
|
+
vision_agent-1.1.1.dist-info/METADATA,sha256=T9CSlGabaZwR1u2ZQlV2wTkXNpQZi5Nn1KwJMSo7s2o,12532
|
45
|
+
vision_agent-1.1.1.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
46
|
+
vision_agent-1.1.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|