PyPI - vision-agent - Versions diffs - 0.2.178__py3-none-any.whl → 0.2.180__py3-none-any.whl - Mend

vision-agent 0.2.178__py3-none-any.whl → 0.2.180__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

vision_agent/tools/__init__.py CHANGED Viewed

@@ -65,6 +65,7 @@ from .tools import (
     template_match,
     vit_image_classification,
     vit_nsfw_classification,
+    qwen2_vl_images_vqa,
     video_temporal_localization,
 )

vision_agent/tools/tools.py CHANGED Viewed

@@ -852,6 +852,39 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
     return cast(str, data["answer"])
+def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
+    """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images
+    including regular images or images of documents or presentations. It returns text
+    as an answer to the question.
+    Parameters:
+        prompt (str): The question about the document image
+        images (List[np.ndarray]): The reference images used for the question
+    Returns:
+        str: A string which is the answer to the given prompt.
+    Example
+    -------
+        >>> qwen2_vl_images_vqa('Give a summary of the document', images)
+        'The document talks about the history of the United States of America and its...'
+    """
+    for image in images:
+        if image.shape[0] < 1 or image.shape[1] < 1:
+            raise ValueError(f"Image is empty, image shape: {image.shape}")
+    files = [("images", numpy_to_bytes(image)) for image in images]
+    payload = {
+        "prompt": prompt,
+        "model": "qwen2vl",
+        "function_name": "qwen2_vl_images_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "image-to-text", files=files, v2=True
+    )
+    return cast(str, data)
 def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
     including regular videos or videos of documents or presentations. It returns text
@@ -1937,8 +1970,14 @@ def overlay_bounding_boxes(
     medias_int: List[np.ndarray] = (
         [medias] if isinstance(medias, np.ndarray) else medias
     )
-    bbox_int = [bboxes] if isinstance(bboxes[0], dict) else bboxes
-    bbox_int = cast(List[List[Dict[str, Any]]], bbox_int)
+    if len(bboxes) == 0:
+        bbox_int: List[List[Dict[str, Any]]] = [[] for _ in medias_int]
+    else:
+        if isinstance(bboxes[0], dict):
+            bbox_int = [cast(List[Dict[str, Any]], bboxes)]
+        else:
+            bbox_int = cast(List[List[Dict[str, Any]]], bboxes)
     labels = set([bb["label"] for b in bbox_int for bb in b])
     if len(labels) > len(COLORS):

{vision_agent-0.2.178.dist-info → vision_agent-0.2.180.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.178
+Version: 0.2.180
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.178.dist-info → vision_agent-0.2.180.dist-info}/RECORD RENAMED Viewed

@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=QOfv679dbD48nITflt00i-sKe-asOGt_wd6JyInxgNw,2722
+vision_agent/tools/__init__.py,sha256=OEBJGOXNpCG1Ye-N39ahjWR4lL0RPVkcX60s25LpdVA,2747
 vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
-vision_agent/tools/tools.py,sha256=DAmXGuE4Ma2vu2A8G9K9L-m9EKqU2TIg_Q7Cq9DnI_Y,79863
+vision_agent/tools/tools.py,sha256=9MbX3b_xff-cHeCh46_q6gt7b5jNSCVSwiu2rwM43Ws,81224
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
-vision_agent-0.2.178.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.178.dist-info/METADATA,sha256=bHWGGiuj8D4mlBt72OyPsoeQa2a9rucK-UXoqU_RmKA,18330
-vision_agent-0.2.178.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.178.dist-info/RECORD,,
+vision_agent-0.2.180.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.180.dist-info/METADATA,sha256=KHeuZn1H6KJXyMlkPyrmie_AqUL1MMALOIoU0kKzg2s,18330
+vision_agent-0.2.180.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.180.dist-info/RECORD,,

{vision_agent-0.2.178.dist-info → vision_agent-0.2.180.dist-info}/LICENSE RENAMED Viewed

File without changes

{vision_agent-0.2.178.dist-info → vision_agent-0.2.180.dist-info}/WHEEL RENAMED Viewed

File without changes

vision-agent 0.2.178__py3-none-any.whl → 0.2.180__py3-none-any.whl

vision-agent 0.2.178__py3-none-any.whl → 0.2.180__py3-none-any.whl