PyPI - vision-agent - Versions diffs - 0.2.216__tar.gz → 0.2.217__tar.gz - Mend

@@ -2174,6 +2174,77 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
     return data
+def document_qa(
+    prompt: str,
+    image: np.ndarray,
+) -> str:
+    """'document_qa' is a tool that can answer any questions about arbitrary
+    images of documents or presentations. It answers by analyzing the contextual document data
+    and then using a model to answer specific questions. It returns text as an answer to the question.
+    Parameters:
+        prompt (str): The question to be answered about the document image
+        image (np.ndarray): The document image to analyze
+    Returns:
+        str: The answer to the question based on the document's context.
+    Example
+    -------
+        >>> document_qa(question, image)
+        'The answer to the question ...'
+    """
+    # The image is uploaded as the "image" file part of the request.
+    # NOTE(review): numpy_to_bytes presumably encodes the array as image bytes - confirm.
+    image_file = numpy_to_bytes(image)
+    files = [("image", image_file)]
+    payload = {
+        "model": "document-analysis",
+    }
+    # Query the "document-analysis" endpoint; its response becomes the
+    # "Document Context" section of the LMM prompt built below.
+    data: dict[str, Any] = send_inference_request(
+        payload=payload,
+        endpoint_name="document-analysis",
+        files=files,
+        v2=True,
+        metadata_payload={"function_name": "document_qa"},
+    )
+    # Recursively walk the response in place: every dict entry keyed "bbox" is
+    # passed through normalize_bbox together with image.shape[:2], nested dicts
+    # and lists are visited element by element, and any other value is
+    # returned unchanged.
+    # NOTE(review): image.shape[:2] is presumably (height, width) - confirm
+    # against normalize_bbox.
+    def normalize(data: Any) -> Dict[str, Any]:
+        if isinstance(data, Dict):
+            if "bbox" in data:
+                data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
+            # This loop also revisits the "bbox" value that was just replaced;
+            # only values are reassigned, so the key set is not changed while
+            # iterating.
+            for key in data:
+                data[key] = normalize(data[key])
+        elif isinstance(data, List):
+            for i in range(len(data)):
+                data[i] = normalize(data[i])
+        return data  # type: ignore
+    data = normalize(data)
+    # From here on `prompt` no longer holds just the caller's question: it is
+    # rebound to the full LMM prompt, which embeds the default string form of
+    # the normalized response followed by the question and the answering rules.
+    prompt = f"""
+    Document Context:
+    {data}\n
+    Question: {prompt}\n
+    Please provide a clear, concise answer using only the information from the document. If the answer is not definitively contained in the document, say "I cannot find the answer in the provided document."
+    """
+    lmm = AnthropicLMM()
+    llm_output = lmm.generate(prompt=prompt)
+    # cast() only informs the type checker; it performs no runtime conversion.
+    llm_output = cast(str, llm_output)
+    # Hand the tool name, request payload, answer and uploaded files to the
+    # trace helper before returning the answer.
+    _display_tool_trace(
+        document_qa.__name__,
+        payload,
+        llm_output,
+        files,
+    )
+    return llm_output
 # Utility and visualization functions

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.216
+Version: 0.2.217
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.216"
+version = "0.2.217"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

@@ -33,6 +33,7 @@ from .tools import (
     depth_anything_v2,
     detr_segmentation,
     document_extraction,
+    document_qa,
     extract_frames_and_timestamps,
     florence2_ocr,
     florence2_phrase_grounding,

vision-agent 0.2.216__tar.gz → 0.2.217__tar.gz

vision-agent 0.2.216__tar.gz → 0.2.217__tar.gz