vision-agent 0.2.185__py3-none-any.whl → 0.2.187__py3-none-any.whl

vision_agent/tools/tools.py
@@ -1057,23 +1057,25 @@ def video_temporal_localization(
     prompt: str,
     frames: List[np.ndarray],
     model: str = "qwen2vl",
-    chunk_length: Optional[float] = None,
-    chunk_length_seconds: Optional[float] = None,
     chunk_length_frames: Optional[int] = 2,
 ) -> List[float]:
-    """'video_temporal_localization' is a tool that can find objects in a video given a question about it.
-    It returns a list of floats with a value of 1.0 if the object to be found is present in the chunk of video being analyzed.
+    """'video_temporal_localization' will run qwen2vl on each chunk_length_frames
+    value selected for the video. It can detect multiple objects independently per
+    chunk_length_frames given a text prompt such as a referring expression
+    but does not track objects across frames.
+    It returns a list of floats with a value of 1.0 if the objects are found in a given
+    chunk_length_frames of the video.
 
     Parameters:
         prompt (str): The question about the video
         frames (List[np.ndarray]): The reference frames used for the question
-        model (str): The model to use for the inference. Valid values are 'qwen2vl', 'gpt4o', 'internlm-xcomposer'
-        chunk_length (Optional[float]): length of each chunk in seconds
-        chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
+        model (str): The model to use for the inference. Valid values are
+            'qwen2vl', 'gpt4o', 'internlm-xcomposer'
         chunk_length_frames (Optional[int]): length of each chunk in frames
 
     Returns:
-        List[float]: A list of floats with a value of 1.0 if the object to be found is present in the chunk of video
+        List[float]: A list of floats with a value of 1.0 if the objects to be found
+            are present in the chunk_length_frames of the video.
 
     Example
     -------
@@ -1088,10 +1090,6 @@ def video_temporal_localization(
         "model": model,
         "function_name": "video_temporal_localization",
     }
-    if chunk_length is not None:
-        payload["chunk_length"] = chunk_length
-    if chunk_length_seconds is not None:
-        payload["chunk_length_seconds"] = chunk_length_seconds
     if chunk_length_frames is not None:
         payload["chunk_length_frames"] = chunk_length_frames
 
@@ -1790,9 +1788,8 @@ def flux_image_inpainting(
             where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
 
     Returns:
-        np.ndarray:
-            The generated image(s) as a numpy array in RGB format
-            with values ranging from 0 to 255.
+        np.ndarray: The generated image(s) as a numpy array in RGB format with values
+            ranging from 0 to 255.
 
     -------
     Example:
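The Returns block for flux_image_inpainting is reflowed but the contract is unchanged: an RGB numpy array with values from 0 to 255. A small sketch of consuming that output, assuming the positional order prompt, image, mask; only the inputs and the RGB/0-255 output are stated in this diff, and the shapes and file path are illustrative:

    import numpy as np
    from vision_agent.tools import flux_image_inpainting, save_image

    image = np.zeros((512, 512, 3), dtype=np.uint8)  # illustrative source image
    mask = np.zeros((512, 512), dtype=np.uint8)
    mask[128:384, 128:384] = 1                       # 1 = inpaint, 0 = preserve

    result = flux_image_inpainting("a red bicycle", image, mask)
    save_image(result, "out/inpainted.png")          # uint8 RGB, values 0-255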
@@ -1874,6 +1871,9 @@ def extract_frames_and_timestamps(
     >>> extract_frames("path/to/video.mp4")
     [{"frame": np.ndarray, "timestamp": 0.0}, ...]
     """
+    if isinstance(fps, str):
+        # fps could be a string when it's passed in from a web endpoint deployment
+        fps = float(fps)
 
     def reformat(
         frames_and_timestamps: List[Tuple[np.ndarray, float]],
@@ -1937,6 +1937,7 @@ def save_json(data: Any, file_path: str) -> None:
                 return bool(obj)
             return json.JSONEncoder.default(self, obj)
 
+    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, "w") as f:
         json.dump(data, f, cls=NumpyEncoder)
 
@@ -1979,6 +1980,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
     -------
     >>> save_image(image)
     """
+    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
     from IPython.display import display
 
     if not isinstance(image, np.ndarray) or (
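save_json and save_image now create missing parent directories before writing, so saving under a folder that does not exist yet should no longer fail with FileNotFoundError. A quick sketch; the paths are illustrative:

    import numpy as np
    from vision_agent.tools import save_image, save_json

    frame = np.zeros((64, 64, 3), dtype=np.uint8)

    # Neither results/ nor results/run_01/ needs to exist beforehand; the parent
    # directories are created via Path(file_path).parent.mkdir(parents=True, exist_ok=True).
    save_image(frame, "results/run_01/frame_000.png")
    save_json({"detections": []}, "results/run_01/detections.json")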
@@ -2009,6 +2011,9 @@ def save_video(
     >>> save_video(frames)
     "/tmp/tmpvideo123.mp4"
     """
+    if isinstance(fps, str):
+        # fps could be a string when it's passed in from a web endpoint deployment
+        fps = float(fps)
     if fps <= 0:
         raise ValueError(f"fps must be greater than 0 got {fps}")
 
@@ -2025,6 +2030,8 @@ def save_video(
         output_video_path = tempfile.NamedTemporaryFile(
             delete=False, suffix=".mp4"
         ).name
+    else:
+        Path(output_video_path).parent.mkdir(parents=True, exist_ok=True)
 
     output_video_path = video_writer(frames, fps, output_video_path)
     _save_video_to_result(output_video_path)
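save_video picks up both fixes: an fps passed as a string (as can happen when it comes in from a web endpoint deployment) is coerced to float before the fps <= 0 check, and an explicitly given output path gets its parent directory created. A sketch with illustrative frames and paths; the keyword names follow the variables visible in the hunks above:

    import numpy as np
    from vision_agent.tools import save_video

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(30)]

    # fps="15" is converted to 15.0 before validation; results/run_01/ is created if missing.
    path = save_video(frames, output_video_path="results/run_01/clip.mp4", fps="15")
    print(path)  # results/run_01/clip.mp4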
@@ -2351,6 +2358,7 @@ FUNCTION_TOOLS = [
     closest_box_distance,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
+    video_temporal_localization,
 ]
 
 UTIL_TOOLS = [
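With chunk_length and chunk_length_seconds removed and video_temporal_localization now registered in FUNCTION_TOOLS, a minimal usage sketch of the trimmed-down signature; the import path, parameter names, and the chunk_length_frames default come from the hunks above, while the video path and prompt are purely illustrative:

    from vision_agent.tools import (
        extract_frames_and_timestamps,
        video_temporal_localization,
    )

    # Decode one frame per second, then score each 2-frame chunk against the prompt.
    frames = [f["frame"] for f in extract_frames_and_timestamps("video.mp4", fps=1)]
    scores = video_temporal_localization(
        "Is there a dog in the scene?",
        frames,
        model="qwen2vl",
        chunk_length_frames=2,  # the only chunking parameter left after this change
    )
    # scores is a List[float]; 1.0 marks the chunks where the prompted object was found.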
vision_agent/utils/video.py
@@ -58,6 +58,9 @@ def video_writer(
     fps: float = _DEFAULT_INPUT_FPS,
     filename: Optional[str] = None,
 ) -> str:
+    if isinstance(fps, str):
+        # fps could be a string when it's passed in from a web endpoint deployment
+        fps = float(fps)
     if filename is None:
         filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
     container = av.open(filename, mode="w")
@@ -92,6 +95,9 @@ def frames_to_bytes(
         fps: the frames per second of the video
         file_ext: the file extension of the video file
     """
+    if isinstance(fps, str):
+        # fps could be a string when it's passed in from a web endpoint deployment
+        fps = float(fps)
     with tempfile.NamedTemporaryFile(delete=True, suffix=file_ext) as temp_file:
         video_writer(frames, fps, temp_file.name)
 
@@ -120,6 +126,9 @@ def extract_frames_from_video(
         from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
         the video. The frames are sorted by the timestamp in ascending order.
     """
+    if isinstance(fps, str):
+        # fps could be a string when it's passed in from a web endpoint deployment
+        fps = float(fps)
 
     cap = cv2.VideoCapture(video_uri)
     orig_fps = cap.get(cv2.CAP_PROP_FPS)
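The same string-to-float guard is applied to the lower-level helpers in vision_agent/utils/video.py, so an fps value that arrives as a string works end to end. A sketch, assuming the parameter names shown in the hunks above; the video URI is illustrative:

    from vision_agent.utils.video import extract_frames_from_video, frames_to_bytes

    # fps="2" is accepted and coerced to 2.0 inside each helper.
    frames_and_ts = extract_frames_from_video("clip.mp4", fps="2")
    video_bytes = frames_to_bytes([f for f, _ in frames_and_ts], fps="2", file_ext=".mp4")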
{vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.185
+Version: 0.2.187
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -63,10 +63,10 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
 ## Table of Contents
 - [🚀Quick Start](#quick-start)
 - [📚Documentation](#documentation)
-- [🔍🤖VisionAgent](#vision-agent-basic-usage)
+- [🔍🤖VisionAgent](#visionagent-basic-usage)
 - [🛠️Tools](#tools)
 - [🤖LMMs](#lmms)
-- [💻🤖VisionAgent Coder](#vision-agent-coder)
+- [💻🤖VisionAgent Coder](#visionagent-coder)
 - [🏗️Additional Backends](#additional-backends)
 
 ## Quick Start
{vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/RECORD
@@ -20,7 +20,7 @@ vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-x
 vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
-vision_agent/tools/tools.py,sha256=us3fOV3JIqFB9WidEX6NT65HwJbIxhh59RRvUcMIshI,83251
+vision_agent/tools/tools.py,sha256=rsvQ7cz2xiGiJZme8yb-r-omSarhtC0zapUSt3_pmuo,83541
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -28,8 +28,8 @@ vision_agent/utils/execute.py,sha256=2sIQn45llOENMyrKu3TPINVRLLbOvvZ6SVHFCB9MQUo
 vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
-vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
-vision_agent-0.2.185.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.185.dist-info/METADATA,sha256=Wgo1bRpQ3MgqxIDpBiN0Tj0YAUBwRtYCQ7DmhJwgKpY,18330
-vision_agent-0.2.185.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.185.dist-info/RECORD,,
+vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
+vision_agent-0.2.187.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.187.dist-info/METADATA,sha256=SYs-27G_7CqSSP8tNatzvnca9BTA-gupcRSVSezmxsw,18328
+vision_agent-0.2.187.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.187.dist-info/RECORD,,