PyPI - vision-agent - Versions diffs - 0.2.176__tar.gz → 0.2.178__tar.gz - Mend

vision-agent 0.2.176.tar.gz → 0.2.178.tar.gz

Files changed (35) hide show

{vision_agent-0.2.176 → vision_agent-0.2.178}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.176
+Version: 0.2.178
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.176 → vision_agent-0.2.178}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.176"
+version = "0.2.178"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/tools/__init__.py RENAMED Viewed

@@ -1,5 +1,17 @@
 from typing import Callable, List, Optional
+from .meta_tools import (
+    create_code_artifact,
+    edit_code_artifact,
+    edit_vision_code,
+    generate_vision_code,
+    get_tool_descriptions,
+    list_artifacts,
+    object_detection_fine_tuning,
+    open_code_artifact,
+    use_object_detection_fine_tuning,
+    view_media_artifact,
+)
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tool_utils import get_tool_descriptions_by_names
 from .tools import (
@@ -53,6 +65,7 @@ from .tools import (
     template_match,
     vit_image_classification,
     vit_nsfw_classification,
+    video_temporal_localization,
 )
 __new_tools__ = [
@@ -65,7 +78,11 @@ def register_tool(imports: Optional[List] = None) -> Callable:
     def decorator(tool: Callable) -> Callable:
         import inspect
-        from .tools import get_tool_descriptions, get_tools_df, get_tools_info
+        from .tools import (  # noqa: F811
+            get_tool_descriptions,
+            get_tools_df,
+            get_tools_info,
+        )
         global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO

{vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/tools/tools.py RENAMED Viewed

@@ -975,6 +975,54 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     return answer["text"][0]  # type: ignore
+def video_temporal_localization(
+    prompt: str,
+    frames: List[np.ndarray],
+    model: str = "qwen2vl",
+    chunk_length: Optional[float] = None,
+    chunk_length_seconds: Optional[float] = None,
+    chunk_length_frames: Optional[int] = 2,
+) -> List[float]:
+    """'video_temporal_localization' is a tool that can find objects in a video given a question about it.
+    It returns a list of floats with a value of 1.0 if the object to be found is present in the chunk of video being analyzed.
+    The frames are converted with frames_to_bytes and submitted through send_inference_request
+    under the name 'video-temporal-localization'. Chunking options that are None are left out
+    of the request payload.
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+        model (str): The model to use for the inference. Valid values are 'qwen2vl', 'gpt4o', 'internlm-xcomposer'. Defaults to 'qwen2vl'.
+        chunk_length (Optional[float]): length of each chunk in seconds
+        chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
+        chunk_length_frames (Optional[int]): length of each chunk in frames. Defaults to 2.
+    Returns:
+        List[float]: A list of floats with a value of 1.0 if the object to be found is present in the chunk of video
+    Example
+    -------
+        >>> video_temporal_localization('Did a goal happen?', frames)
+        [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
+    """
+    # NOTE(review): frames_to_bytes presumably encodes the frames as a video byte
+    # buffer; it is defined outside this hunk -- confirm.
+    buffer_bytes = frames_to_bytes(frames)
+    # The buffer is attached as a single file part named "video".
+    files = [("video", buffer_bytes)]
+    payload: Dict[str, Any] = {
+        "prompt": prompt,
+        "model": model,
+        "function_name": "video_temporal_localization",
+    }
+    # Only forward the chunking options that are set; a None value is omitted
+    # from the payload rather than sent explicitly.
+    if chunk_length is not None:
+        payload["chunk_length"] = chunk_length
+    if chunk_length_seconds is not None:
+        payload["chunk_length_seconds"] = chunk_length_seconds
+    if chunk_length_frames is not None:
+        payload["chunk_length_frames"] = chunk_length_frames
+    data = send_inference_request(
+        payload, "video-temporal-localization", files=files, v2=True
+    )
+    # NOTE(review): assuming `cast` is typing.cast, this only informs the type
+    # checker and does not convert values at runtime -- confirm the import.
+    return [cast(float, value) for value in data]
 def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
     """'clip' is a tool that can classify an image or a cropped detection given a list
     of input classes or tags. It returns the same list of the input classes along with