vision-agent 0.2.178__py3-none-any.whl → 0.2.180__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +41 -2
- {vision_agent-0.2.178.dist-info → vision_agent-0.2.180.dist-info}/METADATA +1 -1
- {vision_agent-0.2.178.dist-info → vision_agent-0.2.180.dist-info}/RECORD +6 -6
- {vision_agent-0.2.178.dist-info → vision_agent-0.2.180.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.178.dist-info → vision_agent-0.2.180.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -852,6 +852,39 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
|
|
852
852
|
return cast(str, data["answer"])
|
853
853
|
|
854
854
|
|
855
|
+
def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
    """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images
    including regular images or images of documents or presentations. It returns text
    as an answer to the question.

    Parameters:
        prompt (str): The question about the images
        images (List[np.ndarray]): The reference images used for the question

    Returns:
        str: A string which is the answer to the given prompt.

    Raises:
        ValueError: If any image has fewer than two dimensions or has a
            zero-sized first or second dimension.

    Example
    -------
        >>> qwen2_vl_images_vqa('Give a summary of the document', images)
        'The document talks about the history of the United States of America and its...'
    """
    # Validate every image before converting or sending anything, so a bad
    # image late in the list fails fast without a partial request being built.
    for image in images:
        # Check ndim first: indexing shape[1] on a 0-D/1-D array would otherwise
        # escape as an IndexError instead of the intended ValueError.
        if image.ndim < 2 or image.shape[0] < 1 or image.shape[1] < 1:
            raise ValueError(f"Image is empty, image shape: {image.shape}")

    # One ("images", <bytes>) entry per image; the repeated field name is how
    # multiple images are attached to the same request.
    files = [("images", numpy_to_bytes(image)) for image in images]
    payload = {
        "prompt": prompt,
        "model": "qwen2vl",
        "function_name": "qwen2_vl_images_vqa",
    }
    # The response is returned whole and treated as the answer text, so it is
    # not annotated as a dict here.
    # NOTE(review): presumably the v2 endpoint returns a plain string — confirm
    # against send_inference_request.
    data: Any = send_inference_request(
        payload, "image-to-text", files=files, v2=True
    )
    return cast(str, data)
|
886
|
+
|
887
|
+
|
855
888
|
def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
|
856
889
|
"""'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
|
857
890
|
including regular videos or videos of documents or presentations. It returns text
|
@@ -1937,8 +1970,14 @@ def overlay_bounding_boxes(
|
|
1937
1970
|
medias_int: List[np.ndarray] = (
|
1938
1971
|
[medias] if isinstance(medias, np.ndarray) else medias
|
1939
1972
|
)
|
1940
|
-
|
1941
|
-
|
1973
|
+
if len(bboxes) == 0:
|
1974
|
+
bbox_int: List[List[Dict[str, Any]]] = [[] for _ in medias_int]
|
1975
|
+
else:
|
1976
|
+
if isinstance(bboxes[0], dict):
|
1977
|
+
bbox_int = [cast(List[Dict[str, Any]], bboxes)]
|
1978
|
+
else:
|
1979
|
+
bbox_int = cast(List[List[Dict[str, Any]]], bboxes)
|
1980
|
+
|
1942
1981
|
labels = set([bb["label"] for b in bbox_int for bb in b])
|
1943
1982
|
|
1944
1983
|
if len(labels) > len(COLORS):
|
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
16
16
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
17
17
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
18
18
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
19
|
-
vision_agent/tools/__init__.py,sha256=
|
19
|
+
vision_agent/tools/__init__.py,sha256=OEBJGOXNpCG1Ye-N39ahjWR4lL0RPVkcX60s25LpdVA,2747
|
20
20
|
vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
|
-
vision_agent/tools/tools.py,sha256=
|
23
|
+
vision_agent/tools/tools.py,sha256=9MbX3b_xff-cHeCh46_q6gt7b5jNSCVSwiu2rwM43Ws,81224
|
24
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
25
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
26
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
|
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
31
|
vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
32
|
+
vision_agent-0.2.180.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.180.dist-info/METADATA,sha256=KHeuZn1H6KJXyMlkPyrmie_AqUL1MMALOIoU0kKzg2s,18330
|
34
|
+
vision_agent-0.2.180.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.180.dist-info/RECORD,,
|
File without changes
|
File without changes
|