vision-agent 0.2.213__py3-none-any.whl → 0.2.215__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -32,6 +32,7 @@ from .tools import (
32
32
  countgd_sam2_video_tracking,
33
33
  depth_anything_v2,
34
34
  detr_segmentation,
35
+ document_analysis,
35
36
  extract_frames_and_timestamps,
36
37
  florence2_ocr,
37
38
  florence2_phrase_grounding,
@@ -1879,6 +1879,64 @@ def closest_box_distance(
1879
1879
  return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
1880
1880
 
1881
1881
 
1882
def document_analysis(image: np.ndarray) -> Dict[str, Any]:
    """'document_analysis' is a document understanding tool that can handle various
    types of document image layouts. It returns a structured output containing the
    text, tables, pictures and charts found in the document, together with each
    element's caption, summary, label and bounding box, so that information from
    the document is not lost.

    Parameters:
        image (np.ndarray): The document image to analyze

    Returns:
        Dict[str, Any]: A dictionary containing the extracted information,
            returned exactly as received from the 'document-analysis'
            inference endpoint.

    Example
    -------
        >>> document_analysis(image)
        {'pages': [{'bbox': [left_0, top_0, right_0, bottom_0],
                    'chunks': [{'bbox': [left_1, top_1, right_1, bottom_1],
                                'caption': 'TITLE',
                                'label': 'page_header',
                                'summary': 'The image contains a single word ...'},
                               {'bbox': [left_2, top_2, right_2, bottom_2],
                                'caption': {'data': [{'value': 200, 'year': '2024'}, ...],
                                            'title': 'Total CapEx Spending',
                                            'type': 'bar chart',
                                            'unit': 'Billion USD',
                                            'xAxis': 'Year',
                                            'yAxis': 'Total CapEx Spending'},
                                'label': 'picture',
                                'summary': 'This bar chart illustrates the trend of ...'},
                               ...]}]}
    """
    # The same identifier is used as the model name in the payload and as the
    # endpoint name of the inference request.
    model_name = "document-analysis"
    request_body = {"model": model_name}

    # The image is serialized to bytes and attached under the "image" field.
    attachments = [("image", numpy_to_bytes(image))]

    result: Dict[str, Any] = send_inference_request(
        payload=request_body,
        endpoint_name=model_name,
        files=attachments,
        v2=True,
        metadata_payload={"function_name": "document_analysis"},
    )

    # Record the call (tool name, request payload, response and attached files)
    # in the tool trace before handing the response back unchanged.
    _display_tool_trace(
        document_analysis.__name__,
        request_body,
        result,
        attachments,
    )
    return result
1938
+
1939
+
1882
1940
  # Utility and visualization functions
1883
1941
 
1884
1942
 
@@ -106,9 +106,9 @@ def frames_to_bytes(
106
106
  return buffer_bytes
107
107
 
108
108
 
109
- # WARNING: this cache is cache is a little dangerous because if the underlying video
110
- # contents change but the filename remains the same it will return the old file contents
111
- # but for vision agent it's unlikely to change the file contents while keeping the
109
+ # WARNING: This cache is a little dangerous because if the underlying video
110
+ # contents change but the filename remains the same it will return the old file contents.
111
+ # For vision agent it's unlikely to change the file contents while keeping the
112
112
  # same file name and the time savings are very large.
113
113
  @lru_cache(maxsize=8)
114
114
  def extract_frames_from_video(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.213
3
+ Version: 0.2.215
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
26
26
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
27
27
  vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
28
28
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
29
- vision_agent/tools/__init__.py,sha256=InL8zUTRN8i_9J6r2wAtYdtNrVkElqdO_p-e2OA8q5A,2770
29
+ vision_agent/tools/__init__.py,sha256=Ny522Y4h1xDQTW6kBP_ceUM4jc0Y14dRhcHdtMDdr24,2793
30
30
  vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
31
31
  vision_agent/tools/planner_tools.py,sha256=k7PPu-HhwDwusQgFSPTCWKRVVHBzPMeYB6h2xSEjdUo,13273
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
33
  vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
34
- vision_agent/tools/tools.py,sha256=ZcXEI0Pb54OGXnLWi690SFx22k7JlEmQ-N16LzRLHlk,90627
34
+ vision_agent/tools/tools.py,sha256=xzN1uOkVQ9l1MaXsJxT_VlDp6nLQfdBX04kex_jE0fc,92692
35
35
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
36
36
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
37
37
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -39,8 +39,8 @@ vision_agent/utils/execute.py,sha256=ktJX1gWBk4D_tXeWV5olGUMC4dU_Z6m5oSv-6Yu1O0w
39
39
  vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50vG3w,12182
40
40
  vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
41
41
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
42
- vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
43
- vision_agent-0.2.213.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
- vision_agent-0.2.213.dist-info/METADATA,sha256=iXy6vkFwSXz6UQW1LjuZMCj6YT8YwmjGklhmulFOoIc,19071
45
- vision_agent-0.2.213.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
- vision_agent-0.2.213.dist-info/RECORD,,
42
+ vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
43
+ vision_agent-0.2.215.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
+ vision_agent-0.2.215.dist-info/METADATA,sha256=nSGpnpDpzJmWmGYDSShBvfjD5dbB6ZWSgOXGQ2Ci_yM,19071
45
+ vision_agent-0.2.215.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
+ vision_agent-0.2.215.dist-info/RECORD,,