vision-agent 0.2.213__py3-none-any.whl → 0.2.215__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +58 -0
- vision_agent/utils/video.py +3 -3
- {vision_agent-0.2.213.dist-info → vision_agent-0.2.215.dist-info}/METADATA +1 -1
- {vision_agent-0.2.213.dist-info → vision_agent-0.2.215.dist-info}/RECORD +7 -7
- {vision_agent-0.2.213.dist-info → vision_agent-0.2.215.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.213.dist-info → vision_agent-0.2.215.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -1879,6 +1879,64 @@ def closest_box_distance(
|
|
1879
1879
|
return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
|
1880
1880
|
|
1881
1881
|
|
1882
|
+
def document_analysis(image: np.ndarray) -> Dict[str, Any]:
|
1883
|
+
"""'document_analysis' is an understanding tool that can handle various
|
1884
|
+
types of document image layouts. It returns a structured output containing the text,
|
1885
|
+
tables, pictures, and charts, along with their captions, summaries, labels, bounding boxes, etc.,
|
1886
|
+
avoiding information loss.
|
1887
|
+
|
1888
|
+
Parameters:
|
1889
|
+
image (np.ndarray): The document image to analyze
|
1890
|
+
|
1891
|
+
Returns:
|
1892
|
+
Dict[str, Any]: A dictionary containing the extracted information.
|
1893
|
+
|
1894
|
+
Example
|
1895
|
+
-------
|
1896
|
+
>>> document_analysis(image)
|
1897
|
+
{'pages': [{'bbox': [left_0, top_0, right_0, bottom_0],
|
1898
|
+
'chunks': [{'bbox': [left_1, top_1, right_1, bottom_1],
|
1899
|
+
'caption': 'TITLE',
|
1900
|
+
'label': 'page_header',
|
1901
|
+
'summary': 'The image contains a single word ...' },
|
1902
|
+
{'bbox': [left_2, top_2, right_2, bottom_2],
|
1903
|
+
'caption': {'data': [{'value': 200, 'year': '2024'}, ...],
|
1904
|
+
'title': 'Total CapEx Spending',
|
1905
|
+
'type': 'bar chart',
|
1906
|
+
'unit': 'Billion USD',
|
1907
|
+
'xAxis': 'Year',
|
1908
|
+
'yAxis': 'Total CapEx Spending'},
|
1909
|
+
'label': 'picture',
|
1910
|
+
'summary': 'This bar chart illustrates the trend of ...'},
|
1911
|
+
]}]}
|
1912
|
+
"""
|
1913
|
+
|
1914
|
+
image_file = numpy_to_bytes(image)
|
1915
|
+
|
1916
|
+
files = [("image", image_file)]
|
1917
|
+
|
1918
|
+
payload = {
|
1919
|
+
"model": "document-analysis",
|
1920
|
+
}
|
1921
|
+
|
1922
|
+
response: dict[str, Any] = send_inference_request(
|
1923
|
+
payload=payload,
|
1924
|
+
endpoint_name="document-analysis",
|
1925
|
+
files=files,
|
1926
|
+
v2=True,
|
1927
|
+
metadata_payload={"function_name": "document_analysis"},
|
1928
|
+
)
|
1929
|
+
|
1930
|
+
_display_tool_trace(
|
1931
|
+
document_analysis.__name__,
|
1932
|
+
payload,
|
1933
|
+
response,
|
1934
|
+
files,
|
1935
|
+
)
|
1936
|
+
|
1937
|
+
return response
|
1938
|
+
|
1939
|
+
|
1882
1940
|
# Utility and visualization functions
|
1883
1941
|
|
1884
1942
|
|
vision_agent/utils/video.py
CHANGED
@@ -106,9 +106,9 @@ def frames_to_bytes(
|
|
106
106
|
return buffer_bytes
|
107
107
|
|
108
108
|
|
109
|
-
# WARNING:
|
110
|
-
# contents change but the filename remains the same it will return the old file contents
|
111
|
-
#
|
109
|
+
# WARNING: This cache is a little dangerous because if the underlying video
|
110
|
+
# contents change but the filename remains the same it will return the old file contents.
|
111
|
+
# For vision agent it's unlikely to change the file contents while keeping the
|
112
112
|
# same file name and the time savings are very large.
|
113
113
|
@lru_cache(maxsize=8)
|
114
114
|
def extract_frames_from_video(
|
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
26
26
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
27
27
|
vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
|
28
28
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
29
|
-
vision_agent/tools/__init__.py,sha256=
|
29
|
+
vision_agent/tools/__init__.py,sha256=Ny522Y4h1xDQTW6kBP_ceUM4jc0Y14dRhcHdtMDdr24,2793
|
30
30
|
vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
|
31
31
|
vision_agent/tools/planner_tools.py,sha256=k7PPu-HhwDwusQgFSPTCWKRVVHBzPMeYB6h2xSEjdUo,13273
|
32
32
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
33
33
|
vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
|
34
|
-
vision_agent/tools/tools.py,sha256=
|
34
|
+
vision_agent/tools/tools.py,sha256=xzN1uOkVQ9l1MaXsJxT_VlDp6nLQfdBX04kex_jE0fc,92692
|
35
35
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
36
36
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
37
37
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -39,8 +39,8 @@ vision_agent/utils/execute.py,sha256=ktJX1gWBk4D_tXeWV5olGUMC4dU_Z6m5oSv-6Yu1O0w
|
|
39
39
|
vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50vG3w,12182
|
40
40
|
vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
|
41
41
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
42
|
-
vision_agent/utils/video.py,sha256=
|
43
|
-
vision_agent-0.2.
|
44
|
-
vision_agent-0.2.
|
45
|
-
vision_agent-0.2.
|
46
|
-
vision_agent-0.2.
|
42
|
+
vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
|
43
|
+
vision_agent-0.2.215.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
44
|
+
vision_agent-0.2.215.dist-info/METADATA,sha256=nSGpnpDpzJmWmGYDSShBvfjD5dbB6ZWSgOXGQ2Ci_yM,19071
|
45
|
+
vision_agent-0.2.215.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
46
|
+
vision_agent-0.2.215.dist-info/RECORD,,
|
File without changes
|
File without changes
|