vision-agent 0.2.177__py3-none-any.whl → 0.2.178__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +48 -0
- {vision_agent-0.2.177.dist-info → vision_agent-0.2.178.dist-info}/METADATA +1 -1
- {vision_agent-0.2.177.dist-info → vision_agent-0.2.178.dist-info}/RECORD +6 -6
- {vision_agent-0.2.177.dist-info → vision_agent-0.2.178.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.177.dist-info → vision_agent-0.2.178.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -975,6 +975,54 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
|
|
975
975
|
return answer["text"][0] # type: ignore
|
976
976
|
|
977
977
|
|
978
|
+
def video_temporal_localization(
    prompt: str,
    frames: List[np.ndarray],
    model: str = "qwen2vl",
    chunk_length: Optional[float] = None,
    chunk_length_seconds: Optional[float] = None,
    chunk_length_frames: Optional[int] = 2,
) -> List[float]:
    """'video_temporal_localization' is a tool that can find objects in a video given a question about it.
    It returns a list of floats with a value of 1.0 if the object to be found is present in the chunk of video being analyzed.

    Parameters:
        prompt (str): The question about the video
        frames (List[np.ndarray]): The reference frames used for the question
        model (str): The model to use for the inference. Valid values are 'qwen2vl', 'gpt4o', 'internlm-xcomposer'
        chunk_length (Optional[float]): length of each chunk in seconds
        chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
        chunk_length_frames (Optional[int]): length of each chunk in frames

    Returns:
        List[float]: A list of floats with a value of 1.0 if the object to be found is present in the chunk of video

    Example
    -------
        >>> video_temporal_localization('Did a goal happen?', frames)
        [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
    """

    # The frames are serialized once and attached as the "video" file part of
    # the request. NOTE(review): the encoding is decided by frames_to_bytes,
    # which is defined outside this block.
    buffer_bytes = frames_to_bytes(frames)
    files = [("video", buffer_bytes)]

    payload: Dict[str, Any] = {
        "prompt": prompt,
        "model": model,
        "function_name": "video_temporal_localization",
    }

    # Only forward the chunking options the caller actually set; a None value
    # is omitted from the payload entirely rather than sent explicitly.
    chunk_options: Dict[str, Any] = {
        "chunk_length": chunk_length,
        "chunk_length_seconds": chunk_length_seconds,
        "chunk_length_frames": chunk_length_frames,
    }
    payload.update(
        {key: value for key, value in chunk_options.items() if value is not None}
    )

    data = send_inference_request(
        payload, "video-temporal-localization", files=files, v2=True
    )

    # typing.cast is a no-op at runtime, so convert explicitly: this makes the
    # result match the declared List[float] even if the response carries ints.
    return [float(value) for value in data]
|
1024
|
+
|
1025
|
+
|
978
1026
|
def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
|
979
1027
|
"""'clip' is a tool that can classify an image or a cropped detection given a list
|
980
1028
|
of input classes or tags. It returns the same list of the input classes along with
|
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
16
16
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
17
17
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
18
18
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
19
|
-
vision_agent/tools/__init__.py,sha256=
|
19
|
+
vision_agent/tools/__init__.py,sha256=QOfv679dbD48nITflt00i-sKe-asOGt_wd6JyInxgNw,2722
|
20
20
|
vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
|
-
vision_agent/tools/tools.py,sha256=
|
23
|
+
vision_agent/tools/tools.py,sha256=DAmXGuE4Ma2vu2A8G9K9L-m9EKqU2TIg_Q7Cq9DnI_Y,79863
|
24
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
25
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
26
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
|
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
31
|
vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
32
|
+
vision_agent-0.2.178.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.178.dist-info/METADATA,sha256=bHWGGiuj8D4mlBt72OyPsoeQa2a9rucK-UXoqU_RmKA,18330
|
34
|
+
vision_agent-0.2.178.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.178.dist-info/RECORD,,
|
File without changes
|
File without changes
|