vision-agent 0.2.177__tar.gz → 0.2.179__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. {vision_agent-0.2.177 → vision_agent-0.2.179}/PKG-INFO +1 -1
  2. {vision_agent-0.2.177 → vision_agent-0.2.179}/pyproject.toml +1 -1
  3. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/tools/__init__.py +2 -0
  4. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/tools/tools.py +81 -0
  5. {vision_agent-0.2.177 → vision_agent-0.2.179}/LICENSE +0 -0
  6. {vision_agent-0.2.177 → vision_agent-0.2.179}/README.md +0 -0
  7. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/__init__.py +0 -0
  8. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/agent/__init__.py +0 -0
  9. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/agent/agent.py +0 -0
  10. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/agent/agent_utils.py +0 -0
  11. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/agent/vision_agent.py +0 -0
  12. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/agent/vision_agent_coder.py +0 -0
  13. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  14. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/agent/vision_agent_planner.py +0 -0
  15. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  16. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/agent/vision_agent_prompts.py +0 -0
  17. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/clients/__init__.py +0 -0
  18. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/clients/http.py +0 -0
  19. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/clients/landing_public_api.py +0 -0
  20. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/fonts/__init__.py +0 -0
  21. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  22. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/lmm/__init__.py +0 -0
  23. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/lmm/lmm.py +0 -0
  24. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/lmm/types.py +0 -0
  25. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/tools/meta_tools.py +0 -0
  26. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/tools/prompts.py +0 -0
  27. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/tools/tool_utils.py +0 -0
  28. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/tools/tools_types.py +0 -0
  29. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/utils/__init__.py +0 -0
  30. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/utils/exceptions.py +0 -0
  31. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/utils/execute.py +0 -0
  32. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/utils/image_utils.py +0 -0
  33. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/utils/sim.py +0 -0
  34. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/utils/type_defs.py +0 -0
  35. {vision_agent-0.2.177 → vision_agent-0.2.179}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.177
3
+ Version: 0.2.179
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.177"
7
+ version = "0.2.179"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -65,6 +65,8 @@ from .tools import (
65
65
  template_match,
66
66
  vit_image_classification,
67
67
  vit_nsfw_classification,
68
+ qwen2_vl_images_vqa,
69
+ video_temporal_localization,
68
70
  )
69
71
 
70
72
  __new_tools__ = [
@@ -852,6 +852,39 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
852
852
  return cast(str, data["answer"])
853
853
 
854
854
 
855
def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
    """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images
    including regular images or images of documents or presentations. It returns text
    as an answer to the question.

    Parameters:
        prompt (str): The question about the images
        images (List[np.ndarray]): The reference images used for the question

    Returns:
        str: A string which is the answer to the given prompt.

    Raises:
        ValueError: If no images are provided or any image has a zero-sized dimension.

    Example
    -------
        >>> qwen2_vl_images_vqa('Give a summary of the document', images)
        'The document talks about the history of the United States of America and its...'
    """
    # Validate inputs up front so we fail fast before encoding/uploading anything.
    if not images:
        raise ValueError("No images provided")
    for image in images:
        if image.shape[0] < 1 or image.shape[1] < 1:
            raise ValueError(f"Image is empty, image shape: {image.shape}")

    files = [("images", numpy_to_bytes(image)) for image in images]
    payload = {
        "prompt": prompt,
        "model": "qwen2vl",
        "function_name": "qwen2_vl_images_vqa",
    }
    # NOTE: the "image-to-text" endpoint returns the answer directly as a string,
    # so the result is cast to str (the previous Dict[str, Any] annotation was wrong).
    data = send_inference_request(payload, "image-to-text", files=files, v2=True)
    return cast(str, data)
886
+
887
+
855
888
  def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
856
889
  """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
857
890
  including regular videos or videos of documents or presentations. It returns text
@@ -975,6 +1008,54 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
975
1008
  return answer["text"][0] # type: ignore
976
1009
 
977
1010
 
1011
def video_temporal_localization(
    prompt: str,
    frames: List[np.ndarray],
    model: str = "qwen2vl",
    chunk_length: Optional[float] = None,
    chunk_length_seconds: Optional[float] = None,
    chunk_length_frames: Optional[int] = 2,
) -> List[float]:
    """'video_temporal_localization' is a tool that can find objects in a video given a question about it.
    It returns a list of floats with a value of 1.0 if the object to be found is present in the chunk of video being analyzed.

    Parameters:
        prompt (str): The question about the video
        frames (List[np.ndarray]): The reference frames used for the question
        model (str): The model to use for the inference. Valid values are 'qwen2vl', 'gpt4o', 'internlm-xcomposer'
        chunk_length (Optional[float]): length of each chunk in seconds
        chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
        chunk_length_frames (Optional[int]): length of each chunk in frames

    Returns:
        List[float]: A list of floats with a value of 1.0 if the object to be found is present in the chunk of video

    Example
    -------
        >>> video_temporal_localization('Did a goal happen?', frames)
        [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
    """
    payload: Dict[str, Any] = {
        "prompt": prompt,
        "model": model,
        "function_name": "video_temporal_localization",
    }
    # Forward only the chunking options the caller actually supplied; unset
    # (None) values are omitted so the service applies its own defaults.
    chunk_options = {
        "chunk_length": chunk_length,
        "chunk_length_seconds": chunk_length_seconds,
        "chunk_length_frames": chunk_length_frames,
    }
    for key, value in chunk_options.items():
        if value is not None:
            payload[key] = value

    video_files = [("video", frames_to_bytes(frames))]
    data = send_inference_request(
        payload, "video-temporal-localization", files=video_files, v2=True
    )

    scores: List[float] = []
    for value in data:
        scores.append(cast(float, value))
    return scores
1057
+
1058
+
978
1059
  def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
979
1060
  """'clip' is a tool that can classify an image or a cropped detection given a list
980
1061
  of input classes or tags. It returns the same list of the input classes along with
File without changes
File without changes