vision-agent 0.2.182__py3-none-any.whl → 0.2.183__py3-none-any.whl
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +33 -2
- {vision_agent-0.2.182.dist-info → vision_agent-0.2.183.dist-info}/METADATA +1 -1
- {vision_agent-0.2.182.dist-info → vision_agent-0.2.183.dist-info}/RECORD +6 -6
- {vision_agent-0.2.182.dist-info → vision_agent-0.2.183.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.182.dist-info → vision_agent-0.2.183.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -930,6 +930,37 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
|
|
930
930
|
return cast(str, data["answer"])
|
931
931
|
|
932
932
|
|
933
|
+
def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
|
934
|
+
"""'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
|
935
|
+
including regular videos or videos of documents or presentations. It returns text
|
936
|
+
as an answer to the question.
|
937
|
+
|
938
|
+
Parameters:
|
939
|
+
prompt (str): The question about the video
|
940
|
+
frames (List[np.ndarray]): The reference frames used for the question
|
941
|
+
|
942
|
+
Returns:
|
943
|
+
str: A string which is the answer to the given prompt.
|
944
|
+
|
945
|
+
Example
|
946
|
+
-------
|
947
|
+
>>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
|
948
|
+
'Lionel Messi'
|
949
|
+
"""
|
950
|
+
|
951
|
+
buffer_bytes = frames_to_bytes(frames)
|
952
|
+
files = [("video", buffer_bytes)]
|
953
|
+
payload = {
|
954
|
+
"prompt": prompt,
|
955
|
+
"model": "qwen2vl",
|
956
|
+
"function_name": "qwen2_vl_video_vqa",
|
957
|
+
}
|
958
|
+
data: Dict[str, Any] = send_inference_request(
|
959
|
+
payload, "image-to-text", files=files, v2=True
|
960
|
+
)
|
961
|
+
return cast(str, data)
|
962
|
+
|
963
|
+
|
933
964
|
def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
|
934
965
|
"""'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
|
935
966
|
including regular images or images of documents or presentations. It returns text
|
@@ -2238,13 +2269,13 @@ FUNCTION_TOOLS = [
|
|
2238
2269
|
florence2_sam2_image,
|
2239
2270
|
florence2_sam2_video_tracking,
|
2240
2271
|
florence2_phrase_grounding,
|
2241
|
-
ixc25_image_vqa,
|
2242
|
-
ixc25_video_vqa,
|
2243
2272
|
detr_segmentation,
|
2244
2273
|
depth_anything_v2,
|
2245
2274
|
generate_pose_image,
|
2246
2275
|
closest_mask_distance,
|
2247
2276
|
closest_box_distance,
|
2277
|
+
qwen2_vl_images_vqa,
|
2278
|
+
qwen2_vl_video_vqa,
|
2248
2279
|
]
|
2249
2280
|
|
2250
2281
|
UTIL_TOOLS = [
|
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
16
16
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
17
17
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
18
18
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
19
|
-
vision_agent/tools/__init__.py,sha256=
|
19
|
+
vision_agent/tools/__init__.py,sha256=17wZ4ZsoSTZZaiqBTi6pqAKUr-qf58_T_zH2GXOi1KU,2771
|
20
20
|
vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
|
-
vision_agent/tools/tools.py,sha256=
|
23
|
+
vision_agent/tools/tools.py,sha256=vc0T940b-rRiGAOJttn7BsuCpVh9rJaivOmorpE41AA,81134
|
24
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
25
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
26
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
|
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
31
|
vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
32
|
+
vision_agent-0.2.183.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.183.dist-info/METADATA,sha256=9V38VymRic0fe2uqCIjl3nhuVJYx49ZQox69izWD8k8,18330
|
34
|
+
vision_agent-0.2.183.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.183.dist-info/RECORD,,
|
File without changes
|
File without changes
|