vision-agent 0.2.213__py3-none-any.whl → 0.2.215__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +58 -0
- vision_agent/utils/video.py +3 -3
- {vision_agent-0.2.213.dist-info → vision_agent-0.2.215.dist-info}/METADATA +1 -1
- {vision_agent-0.2.213.dist-info → vision_agent-0.2.215.dist-info}/RECORD +7 -7
- {vision_agent-0.2.213.dist-info → vision_agent-0.2.215.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.213.dist-info → vision_agent-0.2.215.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -1879,6 +1879,64 @@ def closest_box_distance(
|
|
1879
1879
|
return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
|
1880
1880
|
|
1881
1881
|
|
1882
|
+
def document_analysis(image: np.ndarray) -> Dict[str, Any]:
|
1883
|
+
"""'document_analysis' is a document understanding tool that can handle various
|
1884
|
+
types of document image layouts. It returns a structured output containing the text,
|
1885
|
+
tables, pictures, charts and related information (captions, summaries, labels, bounding boxes, etc.),
|
1886
|
+
avoiding information loss.
|
1887
|
+
|
1888
|
+
Parameters:
|
1889
|
+
image (np.ndarray): The document image to analyze
|
1890
|
+
|
1891
|
+
Returns:
|
1892
|
+
Dict[str, Any]: A dictionary containing the extracted information.
|
1893
|
+
|
1894
|
+
Example
|
1895
|
+
-------
|
1896
|
+
>>> document_analysis(image)
|
1897
|
+
{'pages': [{'bbox': [left_0, top_0, right_0, bottom_0],
|
1898
|
+
'chunks': [{'bbox': [left_1, top_1, right_1, bottom_1],
|
1899
|
+
'caption': 'TITLE',
|
1900
|
+
'label': 'page_header',
|
1901
|
+
'summary': 'The image contains a single word ...' },
|
1902
|
+
{'bbox': [left_2, top_2, right_2, bottom_2],
|
1903
|
+
'caption': {'data': [{'value': 200, 'year': '2024'}, ...],
|
1904
|
+
'title': 'Total CapEx Spending',
|
1905
|
+
'type': 'bar chart',
|
1906
|
+
'unit': 'Billion USD',
|
1907
|
+
'xAxis': 'Year',
|
1908
|
+
'yAxis': 'Total CapEx Spending'},
|
1909
|
+
'label': 'picture',
|
1910
|
+
'summary': 'This bar chart illustrates the trend of ...'},
|
1911
|
+
]}]}
|
1912
|
+
"""
|
1913
|
+
|
1914
|
+
image_file = numpy_to_bytes(image)
|
1915
|
+
|
1916
|
+
files = [("image", image_file)]
|
1917
|
+
|
1918
|
+
payload = {
|
1919
|
+
"model": "document-analysis",
|
1920
|
+
}
|
1921
|
+
|
1922
|
+
response: dict[str, Any] = send_inference_request(
|
1923
|
+
payload=payload,
|
1924
|
+
endpoint_name="document-analysis",
|
1925
|
+
files=files,
|
1926
|
+
v2=True,
|
1927
|
+
metadata_payload={"function_name": "document_analysis"},
|
1928
|
+
)
|
1929
|
+
|
1930
|
+
_display_tool_trace(
|
1931
|
+
document_analysis.__name__,
|
1932
|
+
payload,
|
1933
|
+
response,
|
1934
|
+
files,
|
1935
|
+
)
|
1936
|
+
|
1937
|
+
return response
|
1938
|
+
|
1939
|
+
|
1882
1940
|
# Utility and visualization functions
|
1883
1941
|
|
1884
1942
|
|
vision_agent/utils/video.py
CHANGED
@@ -106,9 +106,9 @@ def frames_to_bytes(
|
|
106
106
|
return buffer_bytes
|
107
107
|
|
108
108
|
|
109
|
-
# WARNING:
|
110
|
-
# contents change but the filename remains the same it will return the old file contents
|
111
|
-
#
|
109
|
+
# WARNING: This cache is a little dangerous because if the underlying video
|
110
|
+
# contents change but the filename remains the same it will return the old file contents.
|
111
|
+
# For vision agent it's unlikely to change the file contents while keeping the
|
112
112
|
# same file name and the time savings are very large.
|
113
113
|
@lru_cache(maxsize=8)
|
114
114
|
def extract_frames_from_video(
|
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
26
26
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
27
27
|
vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
|
28
28
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
29
|
-
vision_agent/tools/__init__.py,sha256=
|
29
|
+
vision_agent/tools/__init__.py,sha256=Ny522Y4h1xDQTW6kBP_ceUM4jc0Y14dRhcHdtMDdr24,2793
|
30
30
|
vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
|
31
31
|
vision_agent/tools/planner_tools.py,sha256=k7PPu-HhwDwusQgFSPTCWKRVVHBzPMeYB6h2xSEjdUo,13273
|
32
32
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
33
33
|
vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
|
34
|
-
vision_agent/tools/tools.py,sha256=
|
34
|
+
vision_agent/tools/tools.py,sha256=xzN1uOkVQ9l1MaXsJxT_VlDp6nLQfdBX04kex_jE0fc,92692
|
35
35
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
36
36
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
37
37
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -39,8 +39,8 @@ vision_agent/utils/execute.py,sha256=ktJX1gWBk4D_tXeWV5olGUMC4dU_Z6m5oSv-6Yu1O0w
|
|
39
39
|
vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50vG3w,12182
|
40
40
|
vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
|
41
41
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
42
|
-
vision_agent/utils/video.py,sha256=
|
43
|
-
vision_agent-0.2.
|
44
|
-
vision_agent-0.2.
|
45
|
-
vision_agent-0.2.
|
46
|
-
vision_agent-0.2.
|
42
|
+
vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
|
43
|
+
vision_agent-0.2.215.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
44
|
+
vision_agent-0.2.215.dist-info/METADATA,sha256=nSGpnpDpzJmWmGYDSShBvfjD5dbB6ZWSgOXGQ2Ci_yM,19071
|
45
|
+
vision_agent-0.2.215.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
46
|
+
vision_agent-0.2.215.dist-info/RECORD,,
|
File without changes
|
File without changes
|