vision-agent 0.2.214__py3-none-any.whl → 0.2.215__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +58 -0
- {vision_agent-0.2.214.dist-info → vision_agent-0.2.215.dist-info}/METADATA +1 -1
- {vision_agent-0.2.214.dist-info → vision_agent-0.2.215.dist-info}/RECORD +6 -6
- {vision_agent-0.2.214.dist-info → vision_agent-0.2.215.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.214.dist-info → vision_agent-0.2.215.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -1879,6 +1879,64 @@ def closest_box_distance(
|
|
1879
1879
|
return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
|
1880
1880
|
|
1881
1881
|
|
1882
|
+
def document_analysis(image: np.ndarray) -> Dict[str, Any]:
    """'document_analysis' is a document understanding tool that can handle various
    types of document image layouts. It returns a structured output containing the
    text, tables, pictures and charts found in the document, together with their
    captions, summaries, labels and bounding boxes, so that no information is lost.

    Parameters:
        image (np.ndarray): The document image to analyze

    Returns:
        Dict[str, Any]: A dictionary containing the extracted information.

    Example
    -------
        >>> document_analysis(image)
        {'pages': [{'bbox': [left_0, top_0, right_0, bottom_0],
                    'chunks': [{'bbox': [left_1, top_1, right_1, bottom_1],
                                'caption': 'TITLE',
                                'label': 'page_header',
                                'summary': 'The image contains a single word ...'},
                               {'bbox': [left_2, top_2, right_2, bottom_2],
                                'caption': {'data': [{'value': 200, 'year': '2024'}, ...],
                                            'title': 'Total CapEx Spending',
                                            'type': 'bar chart',
                                            'unit': 'Billion USD',
                                            'xAxis': 'Year',
                                            'yAxis': 'Total CapEx Spending'},
                                'label': 'picture',
                                'summary': 'This bar chart illustrates the trend of ...'},
                               ...]},
                   ...]}
    """

    # Serialize the array so it can be sent as a file part of the request.
    image_file = numpy_to_bytes(image)

    files = [("image", image_file)]

    payload = {
        "model": "document-analysis",
    }

    # NOTE(review): the meaning of `v2=True` is defined by
    # send_inference_request, which is not visible here; confirm before changing.
    response: Dict[str, Any] = send_inference_request(
        payload=payload,
        endpoint_name="document-analysis",
        files=files,
        v2=True,
        metadata_payload={"function_name": "document_analysis"},
    )

    # Record the call (tool name, request payload, response and files) in the
    # tool trace before handing the response back to the caller unchanged.
    _display_tool_trace(
        document_analysis.__name__,
        payload,
        response,
        files,
    )

    return response
|
1938
|
+
|
1939
|
+
|
1882
1940
|
# Utility and visualization functions
|
1883
1941
|
|
1884
1942
|
|
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
26
26
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
27
27
|
vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
|
28
28
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
29
|
-
vision_agent/tools/__init__.py,sha256=
|
29
|
+
vision_agent/tools/__init__.py,sha256=Ny522Y4h1xDQTW6kBP_ceUM4jc0Y14dRhcHdtMDdr24,2793
|
30
30
|
vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
|
31
31
|
vision_agent/tools/planner_tools.py,sha256=k7PPu-HhwDwusQgFSPTCWKRVVHBzPMeYB6h2xSEjdUo,13273
|
32
32
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
33
33
|
vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
|
34
|
-
vision_agent/tools/tools.py,sha256=
|
34
|
+
vision_agent/tools/tools.py,sha256=xzN1uOkVQ9l1MaXsJxT_VlDp6nLQfdBX04kex_jE0fc,92692
|
35
35
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
36
36
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
37
37
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -40,7 +40,7 @@ vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50
|
|
40
40
|
vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
|
41
41
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
42
42
|
vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
|
43
|
-
vision_agent-0.2.
|
44
|
-
vision_agent-0.2.
|
45
|
-
vision_agent-0.2.
|
46
|
-
vision_agent-0.2.
|
43
|
+
vision_agent-0.2.215.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
44
|
+
vision_agent-0.2.215.dist-info/METADATA,sha256=nSGpnpDpzJmWmGYDSShBvfjD5dbB6ZWSgOXGQ2Ci_yM,19071
|
45
|
+
vision_agent-0.2.215.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
46
|
+
vision_agent-0.2.215.dist-info/RECORD,,
|
File without changes
|
File without changes
|