PyPI - vision-agent - Versions diffs - 0.2.185__py3-none-any.whl → 0.2.187__py3-none-any.whl - Mend

vision-agent 0.2.185__py3-none-any.whl → 0.2.187__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

vision_agent/tools/tools.py CHANGED Viewed

@@ -1057,23 +1057,25 @@ def video_temporal_localization(
     prompt: str,
     frames: List[np.ndarray],
     model: str = "qwen2vl",
-    chunk_length: Optional[float] = None,
-    chunk_length_seconds: Optional[float] = None,
     chunk_length_frames: Optional[int] = 2,
 ) -> List[float]:
-    """'video_temporal_localization' is a tool that can find objects in a video given a question about it.
-    It returns a list of floats with a value of 1.0 if the object to be found is present in the chunk of video being analyzed.
+    """'video_temporal_localization' will run qwen2vl on each chunk_length_frames
+    value selected for the video. It can detect multiple objects independently per
+    chunk_length_frames given a text prompt such as a referring expression
+    but does not track objects across frames.
+    It returns a list of floats with a value of 1.0 if the objects are found in a given
+    chunk_length_frames of the video.
     Parameters:
         prompt (str): The question about the video
         frames (List[np.ndarray]): The reference frames used for the question
-        model (str): The model to use for the inference. Valid values are 'qwen2vl', 'gpt4o', 'internlm-xcomposer'
-        chunk_length (Optional[float]): length of each chunk in seconds
-        chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
+        model (str): The model to use for the inference. Valid values are
+            'qwen2vl', 'gpt4o', 'internlm-xcomposer'
         chunk_length_frames (Optional[int]): length of each chunk in frames
     Returns:
-        List[float]: A list of floats with a value of 1.0 if the object to be found is present in the chunk of video
+        List[float]: A list of floats with a value of 1.0 if the objects to be found
+            are present in the chunk_length_frames of the video.
     Example
     -------
@@ -1088,10 +1090,6 @@ def video_temporal_localization(
         "model": model,
         "function_name": "video_temporal_localization",
     }
-    if chunk_length is not None:
-        payload["chunk_length"] = chunk_length
-    if chunk_length_seconds is not None:
-        payload["chunk_length_seconds"] = chunk_length_seconds
     if chunk_length_frames is not None:
         payload["chunk_length_frames"] = chunk_length_frames
@@ -1790,9 +1788,8 @@ def flux_image_inpainting(
             where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
     Returns:
-        np.ndarray:
-            The generated image(s) as a numpy array in RGB format
-            with values ranging from 0 to 255.
+        np.ndarray: The generated image(s) as a numpy array in RGB format with values
+            ranging from 0 to 255.
     -------
     Example:
@@ -1874,6 +1871,9 @@ def extract_frames_and_timestamps(
         >>> extract_frames("path/to/video.mp4")
         [{"frame": np.ndarray, "timestamp": 0.0}, ...]
     """
+    if isinstance(fps, str):
+        # fps could be a string when it's passed in from a web endpoint deployment
+        fps = float(fps)
     def reformat(
         frames_and_timestamps: List[Tuple[np.ndarray, float]],
@@ -1937,6 +1937,7 @@ def save_json(data: Any, file_path: str) -> None:
                 return bool(obj)
             return json.JSONEncoder.default(self, obj)
+    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
     with open(file_path, "w") as f:
         json.dump(data, f, cls=NumpyEncoder)
@@ -1979,6 +1980,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
     -------
         >>> save_image(image)
     """
+    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
     from IPython.display import display
     if not isinstance(image, np.ndarray) or (
@@ -2009,6 +2011,9 @@ def save_video(
         >>> save_video(frames)
         "/tmp/tmpvideo123.mp4"
     """
+    if isinstance(fps, str):
+        # fps could be a string when it's passed in from a web endpoint deployment
+        fps = float(fps)
     if fps <= 0:
         raise ValueError(f"fps must be greater than 0 got {fps}")
@@ -2025,6 +2030,8 @@ def save_video(
         output_video_path = tempfile.NamedTemporaryFile(
             delete=False, suffix=".mp4"
         ).name
+    else:
+        Path(output_video_path).parent.mkdir(parents=True, exist_ok=True)
     output_video_path = video_writer(frames, fps, output_video_path)
     _save_video_to_result(output_video_path)
@@ -2351,6 +2358,7 @@ FUNCTION_TOOLS = [
     closest_box_distance,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
+    video_temporal_localization,
 ]
 UTIL_TOOLS = [

vision_agent/utils/video.py CHANGED Viewed

@@ -58,6 +58,9 @@ def video_writer(
     fps: float = _DEFAULT_INPUT_FPS,
     filename: Optional[str] = None,
 ) -> str:
+    if isinstance(fps, str):
+        # fps could be a string when it's passed in from a web endpoint deployment
+        fps = float(fps)
     if filename is None:
         filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
     container = av.open(filename, mode="w")
@@ -92,6 +95,9 @@ def frames_to_bytes(
         fps: the frames per second of the video
         file_ext: the file extension of the video file
     """
+    if isinstance(fps, str):
+        # fps could be a string when it's passed in from a web endpoint deployment
+        fps = float(fps)
     with tempfile.NamedTemporaryFile(delete=True, suffix=file_ext) as temp_file:
         video_writer(frames, fps, temp_file.name)
@@ -120,6 +126,9 @@ def extract_frames_from_video(
             from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
             the video. The frames are sorted by the timestamp in ascending order.
     """
+    if isinstance(fps, str):
+        # fps could be a string when it's passed in from a web endpoint deployment
+        fps = float(fps)
     cap = cv2.VideoCapture(video_uri)
     orig_fps = cap.get(cv2.CAP_PROP_FPS)

{vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.185
+Version: 0.2.187
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -63,10 +63,10 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
 ## Table of Contents
 - [🚀Quick Start](#quick-start)
 - [📚Documentation](#documentation)
-- [🔍🤖VisionAgent](#vision-agent-basic-usage)
+- [🔍🤖VisionAgent](#visionagent-basic-usage)
 - [🛠️Tools](#tools)
 - [🤖LMMs](#lmms)
-- [💻🤖VisionAgent Coder](#vision-agent-coder)
+- [💻🤖VisionAgent Coder](#visionagent-coder)
 - [🏗️Additional Backends](#additional-backends)
 ## Quick Start

{vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/RECORD RENAMED Viewed

@@ -20,7 +20,7 @@ vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-x
 vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
-vision_agent/tools/tools.py,sha256=us3fOV3JIqFB9WidEX6NT65HwJbIxhh59RRvUcMIshI,83251
+vision_agent/tools/tools.py,sha256=rsvQ7cz2xiGiJZme8yb-r-omSarhtC0zapUSt3_pmuo,83541
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -28,8 +28,8 @@ vision_agent/utils/execute.py,sha256=2sIQn45llOENMyrKu3TPINVRLLbOvvZ6SVHFCB9MQUo
 vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
-vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
-vision_agent-0.2.185.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.185.dist-info/METADATA,sha256=Wgo1bRpQ3MgqxIDpBiN0Tj0YAUBwRtYCQ7DmhJwgKpY,18330
-vision_agent-0.2.185.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.185.dist-info/RECORD,,
+vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
+vision_agent-0.2.187.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.187.dist-info/METADATA,sha256=SYs-27G_7CqSSP8tNatzvnca9BTA-gupcRSVSezmxsw,18328
+vision_agent-0.2.187.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.187.dist-info/RECORD,,

{vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/LICENSE RENAMED Viewed

File without changes

{vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/WHEEL RENAMED Viewed

File without changes

vision-agent 0.2.185__py3-none-any.whl → 0.2.187__py3-none-any.whl

vision-agent 0.2.185__py3-none-any.whl → 0.2.187__py3-none-any.whl