PyPI - vision-agent - Version diffs - 0.2.178.tar.gz → 0.2.180.tar.gz - Mend

vision-agent 0.2.178.tar.gz → 0.2.180.tar.gz

Files changed (35)

{vision_agent-0.2.178 → vision_agent-0.2.180}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.178
+Version: 0.2.180
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.178 → vision_agent-0.2.180}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.178"
+version = "0.2.180"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.178 → vision_agent-0.2.180}/vision_agent/tools/__init__.py RENAMED Viewed

@@ -65,6 +65,7 @@ from .tools import (
     template_match,
     vit_image_classification,
     vit_nsfw_classification,
+    qwen2_vl_images_vqa,
     video_temporal_localization,
 )

{vision_agent-0.2.178 → vision_agent-0.2.180}/vision_agent/tools/tools.py RENAMED Viewed

@@ -852,6 +852,39 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
     return cast(str, data["answer"])
+def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
+    """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images
+    including regular images or images of documents or presentations. It returns text
+    as an answer to the question.
+    Parameters:
+        prompt (str): The question about the document image
+        images (List[np.ndarray]): The reference images used for the question
+    Returns:
+        str: A string which is the answer to the given prompt.
+    Example
+    -------
+        >>> qwen2_vl_images_vqa('Give a summary of the document', images)
+        'The document talks about the history of the United States of America and its...'
+    """
+    for image in images:
+        if image.shape[0] < 1 or image.shape[1] < 1:
+            raise ValueError(f"Image is empty, image shape: {image.shape}")
+    files = [("images", numpy_to_bytes(image)) for image in images]
+    payload = {
+        "prompt": prompt,
+        "model": "qwen2vl",
+        "function_name": "qwen2_vl_images_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "image-to-text", files=files, v2=True
+    )
+    return cast(str, data)
 def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
     including regular videos or videos of documents or presentations. It returns text
@@ -1937,8 +1970,14 @@ def overlay_bounding_boxes(
     medias_int: List[np.ndarray] = (
         [medias] if isinstance(medias, np.ndarray) else medias
     )
-    bbox_int = [bboxes] if isinstance(bboxes[0], dict) else bboxes
-    bbox_int = cast(List[List[Dict[str, Any]]], bbox_int)
+    if len(bboxes) == 0:
+        bbox_int: List[List[Dict[str, Any]]] = [[] for _ in medias_int]
+    else:
+        if isinstance(bboxes[0], dict):
+            bbox_int = [cast(List[Dict[str, Any]], bboxes)]
+        else:
+            bbox_int = cast(List[List[Dict[str, Any]]], bboxes)
     labels = set([bb["label"] for b in bbox_int for bb in b])
     if len(labels) > len(COLORS):