vision-agent 1.0.11__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +137 -14
- {vision_agent-1.0.11.dist-info → vision_agent-1.1.2.dist-info}/METADATA +2 -1
- {vision_agent-1.0.11.dist-info → vision_agent-1.1.2.dist-info}/RECORD +6 -6
- {vision_agent-1.0.11.dist-info → vision_agent-1.1.2.dist-info}/LICENSE +0 -0
- {vision_agent-1.0.11.dist-info → vision_agent-1.1.2.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
@@ -8,6 +8,7 @@ from .planner_tools import judge_od_results
|
|
8
8
|
from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
|
9
9
|
from .tools import (
|
10
10
|
activity_recognition,
|
11
|
+
agentic_document_extraction,
|
11
12
|
agentic_object_detection,
|
12
13
|
agentic_sam2_instance_segmentation,
|
13
14
|
agentic_sam2_video_tracking,
|
vision_agent/tools/tools.py
CHANGED
@@ -9,6 +9,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
9
|
from importlib import resources
|
10
10
|
from pathlib import Path
|
11
11
|
from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
12
|
+
from warnings import warn
|
12
13
|
|
13
14
|
import cv2
|
14
15
|
import numpy as np
|
@@ -18,6 +19,7 @@ from IPython.display import display
|
|
18
19
|
from PIL import Image, ImageDraw, ImageFont
|
19
20
|
from pillow_heif import register_heif_opener # type: ignore
|
20
21
|
from pytube import YouTube # type: ignore
|
22
|
+
import pymupdf # type: ignore
|
21
23
|
|
22
24
|
from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
|
23
25
|
from vision_agent.utils.execute import FileSerializer, MimeType
|
@@ -2143,6 +2145,11 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
|
|
2143
2145
|
'summary': 'This table illustrates a trend of ...'},
|
2144
2146
|
],
|
2145
2147
|
"""
|
2148
|
+
warning = (
|
2149
|
+
"This function is deprecated. For document extraction please use the agentic-doc python package on "
|
2150
|
+
"https://pypi.org/project/agentic-doc/ or the agentic_document_extraction function."
|
2151
|
+
)
|
2152
|
+
warn(warning, DeprecationWarning, stacklevel=2)
|
2146
2153
|
|
2147
2154
|
image_file = numpy_to_bytes(image)
|
2148
2155
|
|
@@ -2184,6 +2191,76 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
|
|
2184
2191
|
return data
|
2185
2192
|
|
2186
2193
|
|
2194
|
+
def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
|
2195
|
+
"""'agentic_document_extraction' is a tool that can extract structured information out of
|
2196
|
+
documents with different layouts. It returns the extracted data in a structured
|
2197
|
+
hierarchical format containing text, tables, figures, charts, and other
|
2198
|
+
information.
|
2199
|
+
|
2200
|
+
Parameters:
|
2201
|
+
image (np.ndarray): The document image to analyze
|
2202
|
+
|
2203
|
+
Returns:
|
2204
|
+
Dict[str, Any]: A dictionary containing the extracted information.
|
2205
|
+
|
2206
|
+
Example
|
2207
|
+
-------
|
2208
|
+
>>> agentic_document_analysis(image)
|
2209
|
+
{
|
2210
|
+
"markdown": "# Document title\n\n## Document subtitle\n\nThis is a sample document.",
|
2211
|
+
"chunks": [
|
2212
|
+
{
|
2213
|
+
"text": "# Document title",
|
2214
|
+
"grounding": [
|
2215
|
+
{
|
2216
|
+
"box": [0.06125, 0.019355758266818696, 0.17375, 0.03290478905359179],
|
2217
|
+
"page": 0
|
2218
|
+
}
|
2219
|
+
],
|
2220
|
+
"chunk_type": "page_header",
|
2221
|
+
"chunk_id": "622e0374-c50e-4960-a013-650138b42528"
|
2222
|
+
},
|
2223
|
+
...
|
2224
|
+
]
|
2225
|
+
}
|
2226
|
+
"""
|
2227
|
+
|
2228
|
+
image_file = numpy_to_bytes(image)
|
2229
|
+
|
2230
|
+
files = [("image", image_file)]
|
2231
|
+
|
2232
|
+
payload = {
|
2233
|
+
"model": "agentic-document-analysis",
|
2234
|
+
}
|
2235
|
+
|
2236
|
+
data: Dict[str, Any] = send_inference_request(
|
2237
|
+
payload=payload,
|
2238
|
+
endpoint_name="agentic-document-analysis",
|
2239
|
+
files=files,
|
2240
|
+
v2=True,
|
2241
|
+
metadata_payload={"function_name": "agentic_document_analysis"},
|
2242
|
+
)
|
2243
|
+
|
2244
|
+
# don't display normalized bboxes
|
2245
|
+
_display_tool_trace(
|
2246
|
+
agentic_document_extraction.__name__,
|
2247
|
+
payload,
|
2248
|
+
data,
|
2249
|
+
files,
|
2250
|
+
)
|
2251
|
+
|
2252
|
+
def transform_boxes(data: Dict[str, Any]) -> Dict[str, Any]:
|
2253
|
+
for chunk in data["chunks"]:
|
2254
|
+
for grounding in chunk["grounding"]:
|
2255
|
+
box = grounding["box"]
|
2256
|
+
grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]
|
2257
|
+
return data
|
2258
|
+
|
2259
|
+
data = transform_boxes(data)
|
2260
|
+
|
2261
|
+
return data
|
2262
|
+
|
2263
|
+
|
2187
2264
|
def document_qa(
|
2188
2265
|
prompt: str,
|
2189
2266
|
image: np.ndarray,
|
@@ -2211,29 +2288,25 @@ def document_qa(
|
|
2211
2288
|
files = [("image", image_file)]
|
2212
2289
|
|
2213
2290
|
payload = {
|
2214
|
-
"model": "document-analysis",
|
2291
|
+
"model": "agentic-document-analysis",
|
2215
2292
|
}
|
2216
2293
|
|
2217
2294
|
data: Dict[str, Any] = send_inference_request(
|
2218
2295
|
payload=payload,
|
2219
|
-
endpoint_name="document-analysis",
|
2296
|
+
endpoint_name="agentic-document-analysis",
|
2220
2297
|
files=files,
|
2221
2298
|
v2=True,
|
2222
2299
|
metadata_payload={"function_name": "document_qa"},
|
2223
2300
|
)
|
2224
2301
|
|
2225
|
-
def
|
2226
|
-
|
2227
|
-
|
2228
|
-
|
2229
|
-
|
2230
|
-
|
2231
|
-
elif isinstance(data, List):
|
2232
|
-
for i in range(len(data)):
|
2233
|
-
data[i] = normalize(data[i])
|
2234
|
-
return data # type: ignore
|
2302
|
+
def transform_boxes(data: Dict[str, Any]) -> Dict[str, Any]:
|
2303
|
+
for chunk in data["chunks"]:
|
2304
|
+
for grounding in chunk["grounding"]:
|
2305
|
+
box = grounding["box"]
|
2306
|
+
grounding["box"] = [box["l"], box["t"], box["r"], box["b"]]
|
2307
|
+
return data
|
2235
2308
|
|
2236
|
-
data =
|
2309
|
+
data = transform_boxes(data)
|
2237
2310
|
|
2238
2311
|
prompt = f"""
|
2239
2312
|
Document Context:
|
@@ -3075,6 +3148,56 @@ def save_image(image: np.ndarray, file_path: str) -> None:
|
|
3075
3148
|
pil_image.save(file_path)
|
3076
3149
|
|
3077
3150
|
|
3151
|
+
def load_pdf(pdf_path: str) -> List[np.ndarray]:
|
3152
|
+
"""'load_pdf' is a utility function that loads a PDF from the given file path string and converts each page to an image.
|
3153
|
+
|
3154
|
+
Parameters:
|
3155
|
+
pdf_path (str): The path to the PDF file.
|
3156
|
+
|
3157
|
+
Returns:
|
3158
|
+
List[np.ndarray]: A list of images as NumPy arrays, one for each page of the PDF.
|
3159
|
+
|
3160
|
+
Example
|
3161
|
+
-------
|
3162
|
+
>>> load_pdf("path/to/document.pdf")
|
3163
|
+
"""
|
3164
|
+
|
3165
|
+
# Handle URL case
|
3166
|
+
if pdf_path.startswith(("http", "https")):
|
3167
|
+
_, pdf_suffix = os.path.splitext(pdf_path)
|
3168
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=pdf_suffix) as tmp_file:
|
3169
|
+
# Download the PDF and save it to the temporary file
|
3170
|
+
with urllib.request.urlopen(pdf_path) as response:
|
3171
|
+
tmp_file.write(response.read())
|
3172
|
+
pdf_path = tmp_file.name
|
3173
|
+
|
3174
|
+
# Open the PDF
|
3175
|
+
doc = pymupdf.open(pdf_path)
|
3176
|
+
images = []
|
3177
|
+
|
3178
|
+
# Convert each page to an image
|
3179
|
+
for page_num in range(len(doc)):
|
3180
|
+
page = doc.load_page(page_num)
|
3181
|
+
|
3182
|
+
# Render page to an image
|
3183
|
+
pix = page.get_pixmap(matrix=pymupdf.Matrix(2, 2))
|
3184
|
+
|
3185
|
+
# Convert to PIL Image
|
3186
|
+
img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
|
3187
|
+
|
3188
|
+
# Convert to numpy array
|
3189
|
+
images.append(np.array(img))
|
3190
|
+
|
3191
|
+
# Close the document
|
3192
|
+
doc.close()
|
3193
|
+
|
3194
|
+
# Clean up temporary file if it was a URL
|
3195
|
+
if pdf_path.startswith(("http", "https")):
|
3196
|
+
os.unlink(pdf_path)
|
3197
|
+
|
3198
|
+
return images
|
3199
|
+
|
3200
|
+
|
3078
3201
|
def save_video(
|
3079
3202
|
frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 5
|
3080
3203
|
) -> str:
|
@@ -3488,7 +3611,7 @@ FUNCTION_TOOLS = [
|
|
3488
3611
|
florence2_sam2_instance_segmentation,
|
3489
3612
|
florence2_sam2_video_tracking,
|
3490
3613
|
claude35_text_extraction,
|
3491
|
-
|
3614
|
+
agentic_document_extraction,
|
3492
3615
|
document_qa,
|
3493
3616
|
ocr,
|
3494
3617
|
qwen25_vl_images_vqa,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 1.0.11
|
3
|
+
Version: 1.1.2
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -28,6 +28,7 @@ Requires-Dist: pandas (==2.*)
|
|
28
28
|
Requires-Dist: pillow (==10.*)
|
29
29
|
Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
|
30
30
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
31
|
+
Requires-Dist: pymupdf (>=1.23.0,<2.0.0)
|
31
32
|
Requires-Dist: pytube (==15.0.0)
|
32
33
|
Requires-Dist: requests (==2.*)
|
33
34
|
Requires-Dist: rich (>=13.7.1,<14.0.0)
|
@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
|
|
26
26
|
vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
27
27
|
vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
|
28
28
|
vision_agent/sim/sim.py,sha256=8y060Ck7qOFJDw9k9Vl2xQYbLkNaTd6lP1YzbcwkkXc,9944
|
29
|
-
vision_agent/tools/__init__.py,sha256=
|
29
|
+
vision_agent/tools/__init__.py,sha256=o9lfWBVopT_qSoSi26WcgQJTKQYNgbXv7r4z_o5j2Eg,2467
|
30
30
|
vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
|
31
31
|
vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
|
32
32
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
33
|
-
vision_agent/tools/tools.py,sha256=
|
33
|
+
vision_agent/tools/tools.py,sha256=dKKrfKxqQYVDFRsLjMMpp1z4_5k68pkaoZUMf1BMc_Q,125694
|
34
34
|
vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
|
35
35
|
vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
|
36
36
|
vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
|
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=XbH5wuD1nlaKe6zBeVPqUbQDsK6D-eCskKND3rRHOzo,8
|
|
40
40
|
vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
|
41
41
|
vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
|
42
42
|
vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
|
43
|
-
vision_agent-1.
|
44
|
-
vision_agent-1.
|
45
|
-
vision_agent-1.
|
46
|
-
vision_agent-1.
|
43
|
+
vision_agent-1.1.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
44
|
+
vision_agent-1.1.2.dist-info/METADATA,sha256=JxWPwfrAwtWx0Fpqq9b9Se7LZi22Ddqiw-YxX6nHe0A,12573
|
45
|
+
vision_agent-1.1.2.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
46
|
+
vision_agent-1.1.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|