PyPI - vision-agent - Versions diffs - 0.2.182__tar.gz → 0.2.184__tar.gz - Mend

vision-agent 0.2.182.tar.gz → 0.2.184.tar.gz

Files changed (35) hide show

{vision_agent-0.2.182 → vision_agent-0.2.184}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.182
+Version: 0.2.184
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.182 → vision_agent-0.2.184}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.182"
+version = "0.2.184"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.182 → vision_agent-0.2.184}/vision_agent/tools/__init__.py RENAMED Viewed

@@ -66,7 +66,9 @@ from .tools import (
     vit_image_classification,
     vit_nsfw_classification,
     qwen2_vl_images_vqa,
+    qwen2_vl_video_vqa,
     video_temporal_localization,
+    flux_image_inpainting,
 )
 __new_tools__ = [

{vision_agent-0.2.182 → vision_agent-0.2.184}/vision_agent/tools/tools.py RENAMED Viewed

@@ -930,6 +930,37 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data["answer"])
+def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+    """'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+    Returns:
+        str: A string which is the answer to the given prompt.
+    Example
+    -------
+        >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
+        'Lionel Messi'
+    """
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "model": "qwen2vl",
+        "function_name": "qwen2_vl_video_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "image-to-text", files=files, v2=True
+    )
+    return cast(str, data)
 def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
     """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
     including regular images or images of documents or presentations. It returns text
@@ -1742,6 +1773,82 @@ def closest_box_distance(
     return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
+def flux_image_inpainting(
+    prompt: str,
+    image: np.ndarray,
+    mask: np.ndarray,
+) -> np.ndarray:
+    """'flux_image_inpainting' performs image inpainting to fill the masked regions,
+    given by mask, in the image, given image based on the text prompt and surrounding image context.
+    It can be used to edit regions of an image according to the prompt given.
+    Parameters:
+        prompt (str): A detailed text description guiding what should be generated
+            in the masked area. More detailed and specific prompts typically yield better results.
+        image (np.ndarray): The source image to be inpainted.
+            The image will serve as the base context for the inpainting process.
+        mask (np.ndarray): A binary mask image with 0's and 1's,
+            where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
+    Returns:
+        np.ndarray:
+            The generated image(s) as a numpy array in RGB format
+            with values ranging from 0 to 255.
+    -------
+    Example:
+        >>> # Generate inpainting
+        >>> result = flux_image_inpainting(
+        ...     prompt="a modern black leather sofa with white pillows",
+        ...     image=image,
+        ...     mask=mask,
+        ... )
+        >>> save_image(result, "inpainted_room.png")
+    """
+    if (
+        image.shape[0] < 8
+        or image.shape[1] < 8
+        or mask.shape[0] < 8
+        or mask.shape[1] < 8
+    ):
+        raise ValueError("The image or mask does not have enough size for inpainting")
+    if np.array_equal(mask, mask.astype(bool).astype(int)):
+        mask = np.where(mask > 0, 255, 0).astype(np.uint8)
+    else:
+        raise ValueError("The mask should be a binary mask with 0's and 1's")
+    image_file = numpy_to_bytes(image)
+    mask_file = numpy_to_bytes(mask)
+    files = [
+        ("image", image_file),
+        ("mask_image", mask_file),
+    ]
+    payload = {
+        "prompt": prompt,
+        "task": "inpainting",
+        "height": image.shape[0],
+        "width": image.shape[1],
+        "strength": 0.99,
+        "guidance_scale": 18,
+        "num_inference_steps": 20,
+        "seed": None,
+    }
+    response = send_inference_request(
+        payload=payload,
+        endpoint_name="flux1",
+        files=files,
+        v2=True,
+        metadata_payload={"function_name": "flux_image_inpainting"},
+    )
+    output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
+    return output_image
 # Utility and visualization functions
@@ -2238,13 +2345,13 @@ FUNCTION_TOOLS = [
     florence2_sam2_image,
     florence2_sam2_video_tracking,
     florence2_phrase_grounding,
-    ixc25_image_vqa,
-    ixc25_video_vqa,
     detr_segmentation,
     depth_anything_v2,
     generate_pose_image,
     closest_mask_distance,
     closest_box_distance,
+    qwen2_vl_images_vqa,
+    qwen2_vl_video_vqa,
 ]
 UTIL_TOOLS = [