vision-agent 0.2.178__py3-none-any.whl → 0.2.180__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +41 -2
- {vision_agent-0.2.178.dist-info → vision_agent-0.2.180.dist-info}/METADATA +1 -1
- {vision_agent-0.2.178.dist-info → vision_agent-0.2.180.dist-info}/RECORD +6 -6
- {vision_agent-0.2.178.dist-info → vision_agent-0.2.180.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.178.dist-info → vision_agent-0.2.180.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -852,6 +852,39 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
|
|
852
852
|
return cast(str, data["answer"])
|
853
853
|
|
854
854
|
|
855
|
+
def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
|
856
|
+
"""'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images
|
857
|
+
including regular images or images of documents or presentations. It returns text
|
858
|
+
as an answer to the question.
|
859
|
+
|
860
|
+
Parameters:
|
861
|
+
prompt (str): The question about the document image
|
862
|
+
images (List[np.ndarray]): The reference images used for the question
|
863
|
+
|
864
|
+
Returns:
|
865
|
+
str: A string which is the answer to the given prompt.
|
866
|
+
|
867
|
+
Example
|
868
|
+
-------
|
869
|
+
>>> qwen2_vl_images_vqa('Give a summary of the document', images)
|
870
|
+
'The document talks about the history of the United States of America and its...'
|
871
|
+
"""
|
872
|
+
for image in images:
|
873
|
+
if image.shape[0] < 1 or image.shape[1] < 1:
|
874
|
+
raise ValueError(f"Image is empty, image shape: {image.shape}")
|
875
|
+
|
876
|
+
files = [("images", numpy_to_bytes(image)) for image in images]
|
877
|
+
payload = {
|
878
|
+
"prompt": prompt,
|
879
|
+
"model": "qwen2vl",
|
880
|
+
"function_name": "qwen2_vl_images_vqa",
|
881
|
+
}
|
882
|
+
data: Dict[str, Any] = send_inference_request(
|
883
|
+
payload, "image-to-text", files=files, v2=True
|
884
|
+
)
|
885
|
+
return cast(str, data)
|
886
|
+
|
887
|
+
|
855
888
|
def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
|
856
889
|
"""'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
|
857
890
|
including regular videos or videos of documents or presentations. It returns text
|
@@ -1937,8 +1970,14 @@ def overlay_bounding_boxes(
|
|
1937
1970
|
medias_int: List[np.ndarray] = (
|
1938
1971
|
[medias] if isinstance(medias, np.ndarray) else medias
|
1939
1972
|
)
|
1940
|
-
|
1941
|
-
|
1973
|
+
if len(bboxes) == 0:
|
1974
|
+
bbox_int: List[List[Dict[str, Any]]] = [[] for _ in medias_int]
|
1975
|
+
else:
|
1976
|
+
if isinstance(bboxes[0], dict):
|
1977
|
+
bbox_int = [cast(List[Dict[str, Any]], bboxes)]
|
1978
|
+
else:
|
1979
|
+
bbox_int = cast(List[List[Dict[str, Any]]], bboxes)
|
1980
|
+
|
1942
1981
|
labels = set([bb["label"] for b in bbox_int for bb in b])
|
1943
1982
|
|
1944
1983
|
if len(labels) > len(COLORS):
|
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
16
16
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
17
17
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
18
18
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
19
|
-
vision_agent/tools/__init__.py,sha256=
|
19
|
+
vision_agent/tools/__init__.py,sha256=OEBJGOXNpCG1Ye-N39ahjWR4lL0RPVkcX60s25LpdVA,2747
|
20
20
|
vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
|
-
vision_agent/tools/tools.py,sha256=
|
23
|
+
vision_agent/tools/tools.py,sha256=9MbX3b_xff-cHeCh46_q6gt7b5jNSCVSwiu2rwM43Ws,81224
|
24
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
25
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
26
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
|
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
31
|
vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
32
|
+
vision_agent-0.2.180.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.180.dist-info/METADATA,sha256=KHeuZn1H6KJXyMlkPyrmie_AqUL1MMALOIoU0kKzg2s,18330
|
34
|
+
vision_agent-0.2.180.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.180.dist-info/RECORD,,
|
File without changes
|
File without changes
|