vision-agent 0.2.216__py3-none-any.whl → 0.2.217__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -33,6 +33,7 @@ from .tools import (
33
33
  depth_anything_v2,
34
34
  detr_segmentation,
35
35
  document_extraction,
36
+ document_qa,
36
37
  extract_frames_and_timestamps,
37
38
  florence2_ocr,
38
39
  florence2_phrase_grounding,
@@ -2174,6 +2174,77 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
2174
2174
  return data
2175
2175
 
2176
2176
 
2177
+ def document_qa(
2178
+ prompt: str,
2179
+ image: np.ndarray,
2180
+ ) -> str:
2181
+ """'document_qa' is a tool that can answer any questions about arbitrary
2182
+ images of documents or presentations. It answers by analyzing the contextual document data
2183
+ and then using a model to answer specific questions. It returns text as an answer to the question.
2184
+
2185
+ Parameters:
2186
+ prompt (str): The question to be answered about the document image
2187
+ image (np.ndarray): The document image to analyze
2188
+
2189
+ Returns:
2190
+ str: The answer to the question based on the document's context.
2191
+
2192
+ Example
2193
+ -------
2194
+ >>> document_qa(image, question)
2195
+ 'The answer to the question ...'
2196
+ """
2197
+
2198
+ image_file = numpy_to_bytes(image)
2199
+
2200
+ files = [("image", image_file)]
2201
+
2202
+ payload = {
2203
+ "model": "document-analysis",
2204
+ }
2205
+
2206
+ data: dict[str, Any] = send_inference_request(
2207
+ payload=payload,
2208
+ endpoint_name="document-analysis",
2209
+ files=files,
2210
+ v2=True,
2211
+ metadata_payload={"function_name": "document_qa"},
2212
+ )
2213
+
2214
+ def normalize(data: Any) -> Dict[str, Any]:
2215
+ if isinstance(data, Dict):
2216
+ if "bbox" in data:
2217
+ data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
2218
+ for key in data:
2219
+ data[key] = normalize(data[key])
2220
+ elif isinstance(data, List):
2221
+ for i in range(len(data)):
2222
+ data[i] = normalize(data[i])
2223
+ return data # type: ignore
2224
+
2225
+ data = normalize(data)
2226
+
2227
+ prompt = f"""
2228
+ Document Context:
2229
+ {data}\n
2230
+ Question: {prompt}\n
2231
+ Please provide a clear, concise answer using only the information from the document. If the answer is not definitively contained in the document, say "I cannot find the answer in the provided document."
2232
+ """
2233
+
2234
+ lmm = AnthropicLMM()
2235
+ llm_output = lmm.generate(prompt=prompt)
2236
+ llm_output = cast(str, llm_output)
2237
+
2238
+ _display_tool_trace(
2239
+ document_qa.__name__,
2240
+ payload,
2241
+ llm_output,
2242
+ files,
2243
+ )
2244
+
2245
+ return llm_output
2246
+
2247
+
2177
2248
  # Utility and visualization functions
2178
2249
 
2179
2250
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.216
3
+ Version: 0.2.217
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
26
26
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
27
27
  vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
28
28
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
29
- vision_agent/tools/__init__.py,sha256=qzAqY2WnRLoClz3qiNtupkLtvpPlcGa5ZUCIs21WS7k,2795
29
+ vision_agent/tools/__init__.py,sha256=Jdq34jMw_KuYwk4Wexqm4DRjuLLoL1Q8wukm0NBv1Tc,2812
30
30
  vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
31
31
  vision_agent/tools/planner_tools.py,sha256=tU1qz_VIQM_yPKDmuxjMWu68ZlAZ7ePWI1g7zswyWhI,13540
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
33
  vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
34
- vision_agent/tools/tools.py,sha256=n6-UPaZ4XjF29_7EF5GRgx74GjiZ7HqZn4a1Aw-e4P0,94059
34
+ vision_agent/tools/tools.py,sha256=Xcm_9EQdDCR9X5FhIm7VJaTL0qWqhnJUVTRVrRtETrA,96112
35
35
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
36
36
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
37
37
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,7 +40,7 @@ vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50
40
40
  vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
41
41
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
42
42
  vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
43
- vision_agent-0.2.216.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
- vision_agent-0.2.216.dist-info/METADATA,sha256=B88HzV_M0A12EmhiC-968LcdospsiOUUR-aTcZFTH8A,19071
45
- vision_agent-0.2.216.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
- vision_agent-0.2.216.dist-info/RECORD,,
43
+ vision_agent-0.2.217.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
+ vision_agent-0.2.217.dist-info/METADATA,sha256=xl9AmXP9RBpC5frlASsiG7YktdIOTRuJgv8WZdRV_bA,19071
45
+ vision_agent-0.2.217.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
+ vision_agent-0.2.217.dist-info/RECORD,,