vision-agent 0.2.213__py3-none-any.whl → 0.2.215__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -32,6 +32,7 @@ from .tools import (
32
32
  countgd_sam2_video_tracking,
33
33
  depth_anything_v2,
34
34
  detr_segmentation,
35
+ document_analysis,
35
36
  extract_frames_and_timestamps,
36
37
  florence2_ocr,
37
38
  florence2_phrase_grounding,
@@ -1879,6 +1879,64 @@ def closest_box_distance(
1879
1879
  return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
1880
1880
 
1881
1881
 
1882
+ def document_analysis(image: np.ndarray) -> Dict[str, Any]:
1883
+ """'document_analysis' is a document understanding tool that can handle various
1884
+ types of document image layouts. It returns a structured output containing the text,
1885
+ tables, pictures, charts, and associated information (captions, summaries, labels, bounding boxes, etc.),
1886
+ avoiding information loss.
1887
+
1888
+ Parameters:
1889
+ image (np.ndarray): The document image to analyze
1890
+
1891
+ Returns:
1892
+ Dict[str, Any]: A dictionary containing the extracted information.
1893
+
1894
+ Example
1895
+ -------
1896
+ >>> document_analysis(image)
1897
+ {'pages': [{'bbox': [left_0, top_0, right_0, bottom_0],
1898
+ 'chunks': [{'bbox': [left_1, top_1, right_1, bottom_1],
1899
+ 'caption': 'TITLE',
1900
+ 'label': 'page_header',
1901
+ 'summary': 'The image contains a single word ...' },
1902
+ {'bbox': [left_2, top_2, right_2, bottom_2],
1903
+ 'caption': {'data': [{'value': 200, 'year': '2024'}, ...],
1904
+ 'title': 'Total CapEx Spending',
1905
+ 'type': 'bar chart',
1906
+ 'unit': 'Billion USD',
1907
+ 'xAxis': 'Year',
1908
+ 'yAxis': 'Total CapEx Spending'},
1909
+ 'label': 'picture',
1910
+ 'summary': 'This bar chart illustrates the trend of ...'},
1911
+ ]}]}
1912
+ """
1913
+
1914
+ image_file = numpy_to_bytes(image)
1915
+
1916
+ files = [("image", image_file)]
1917
+
1918
+ payload = {
1919
+ "model": "document-analysis",
1920
+ }
1921
+
1922
+ response: dict[str, Any] = send_inference_request(
1923
+ payload=payload,
1924
+ endpoint_name="document-analysis",
1925
+ files=files,
1926
+ v2=True,
1927
+ metadata_payload={"function_name": "document_analysis"},
1928
+ )
1929
+
1930
+ _display_tool_trace(
1931
+ document_analysis.__name__,
1932
+ payload,
1933
+ response,
1934
+ files,
1935
+ )
1936
+
1937
+ return response
1938
+
1939
+
1882
1940
  # Utility and visualization functions
1883
1941
 
1884
1942
 
@@ -106,9 +106,9 @@ def frames_to_bytes(
106
106
  return buffer_bytes
107
107
 
108
108
 
109
- # WARNING: this cache is cache is a little dangerous because if the underlying video
110
- # contents change but the filename remains the same it will return the old file contents
111
- # but for vision agent it's unlikely to change the file contents while keeping the
109
+ # WARNING: This cache is a little dangerous because if the underlying video
110
+ # contents change but the filename remains the same it will return the old file contents.
111
+ # For vision agent it's unlikely to change the file contents while keeping the
112
112
  # same file name and the time savings are very large.
113
113
  @lru_cache(maxsize=8)
114
114
  def extract_frames_from_video(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.213
3
+ Version: 0.2.215
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
26
26
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
27
27
  vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
28
28
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
29
- vision_agent/tools/__init__.py,sha256=InL8zUTRN8i_9J6r2wAtYdtNrVkElqdO_p-e2OA8q5A,2770
29
+ vision_agent/tools/__init__.py,sha256=Ny522Y4h1xDQTW6kBP_ceUM4jc0Y14dRhcHdtMDdr24,2793
30
30
  vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
31
31
  vision_agent/tools/planner_tools.py,sha256=k7PPu-HhwDwusQgFSPTCWKRVVHBzPMeYB6h2xSEjdUo,13273
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
33
  vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
34
- vision_agent/tools/tools.py,sha256=ZcXEI0Pb54OGXnLWi690SFx22k7JlEmQ-N16LzRLHlk,90627
34
+ vision_agent/tools/tools.py,sha256=xzN1uOkVQ9l1MaXsJxT_VlDp6nLQfdBX04kex_jE0fc,92692
35
35
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
36
36
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
37
37
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -39,8 +39,8 @@ vision_agent/utils/execute.py,sha256=ktJX1gWBk4D_tXeWV5olGUMC4dU_Z6m5oSv-6Yu1O0w
39
39
  vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50vG3w,12182
40
40
  vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
41
41
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
42
- vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
43
- vision_agent-0.2.213.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
- vision_agent-0.2.213.dist-info/METADATA,sha256=iXy6vkFwSXz6UQW1LjuZMCj6YT8YwmjGklhmulFOoIc,19071
45
- vision_agent-0.2.213.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
- vision_agent-0.2.213.dist-info/RECORD,,
42
+ vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
43
+ vision_agent-0.2.215.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
+ vision_agent-0.2.215.dist-info/METADATA,sha256=nSGpnpDpzJmWmGYDSShBvfjD5dbB6ZWSgOXGQ2Ci_yM,19071
45
+ vision_agent-0.2.215.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
+ vision_agent-0.2.215.dist-info/RECORD,,