vision-agent 0.2.177__py3-none-any.whl → 0.2.178__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -65,6 +65,7 @@ from .tools import (
65
65
  template_match,
66
66
  vit_image_classification,
67
67
  vit_nsfw_classification,
68
+ video_temporal_localization,
68
69
  )
69
70
 
70
71
  __new_tools__ = [
@@ -975,6 +975,54 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
975
975
  return answer["text"][0] # type: ignore
976
976
 
977
977
 
978
def video_temporal_localization(
    prompt: str,
    frames: List[np.ndarray],
    model: str = "qwen2vl",
    chunk_length: Optional[float] = None,
    chunk_length_seconds: Optional[float] = None,
    chunk_length_frames: Optional[int] = 2,
) -> List[float]:
    """'video_temporal_localization' is a tool that can find objects in a video given
    a question about it. It returns a list of floats with a value of 1.0 if the object
    to be found is present in the chunk of video being analyzed.

    Parameters:
        prompt (str): The question about the video
        frames (List[np.ndarray]): The reference frames used for the question
        model (str): The model to use for the inference. Valid values are
            'qwen2vl', 'gpt4o', 'internlm-xcomposer'
        chunk_length (Optional[float]): length of each chunk in seconds
        chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
        chunk_length_frames (Optional[int]): length of each chunk in frames

    Returns:
        List[float]: A list of floats with a value of 1.0 if the object to be found
        is present in the chunk of video

    Example
    -------
        >>> video_temporal_localization('Did a goal happened?', frames)
        [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
    """

    # Serialize the frames into a single video payload for the inference service.
    video_bytes = frames_to_bytes(frames)
    request_files = [("video", video_bytes)]

    request_payload: Dict[str, Any] = {
        "prompt": prompt,
        "model": model,
        "function_name": "video_temporal_localization",
    }

    # Forward only the chunking options that were explicitly provided; the
    # service applies its own defaults for anything omitted.
    chunk_options = {
        "chunk_length": chunk_length,
        "chunk_length_seconds": chunk_length_seconds,
        "chunk_length_frames": chunk_length_frames,
    }
    request_payload.update(
        {name: value for name, value in chunk_options.items() if value is not None}
    )

    response = send_inference_request(
        request_payload, "video-temporal-localization", files=request_files, v2=True
    )
    # The service returns one score per chunk; coerce each to float for callers.
    return [cast(float, score) for score in response]
978
1026
  def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
979
1027
  """'clip' is a tool that can classify an image or a cropped detection given a list
980
1028
  of input classes or tags. It returns the same list of the input classes along with
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.177
3
+ Version: 0.2.178
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
16
16
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
17
17
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
18
18
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
19
- vision_agent/tools/__init__.py,sha256=q9lKBq1reyQzwsx2b5tS_zGDvIHdxz1Q6VEtguZagfo,2689
19
+ vision_agent/tools/__init__.py,sha256=QOfv679dbD48nITflt00i-sKe-asOGt_wd6JyInxgNw,2722
20
20
  vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
21
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
22
22
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
23
- vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
23
+ vision_agent/tools/tools.py,sha256=DAmXGuE4Ma2vu2A8G9K9L-m9EKqU2TIg_Q7Cq9DnI_Y,79863
24
24
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
25
25
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
26
26
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
29
29
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
30
30
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
31
31
  vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
32
- vision_agent-0.2.177.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
- vision_agent-0.2.177.dist-info/METADATA,sha256=Xz1KUOjaiAEFCT0SVKuWERZZMpsB_F14GLMdL9NDkB0,18330
34
- vision_agent-0.2.177.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
- vision_agent-0.2.177.dist-info/RECORD,,
32
+ vision_agent-0.2.178.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
+ vision_agent-0.2.178.dist-info/METADATA,sha256=bHWGGiuj8D4mlBt72OyPsoeQa2a9rucK-UXoqU_RmKA,18330
34
+ vision_agent-0.2.178.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
+ vision_agent-0.2.178.dist-info/RECORD,,