vision-agent 0.2.123__tar.gz → 0.2.125__tar.gz
- {vision_agent-0.2.123 → vision_agent-0.2.125}/PKG-INFO +1 -1
- {vision_agent-0.2.123 → vision_agent-0.2.125}/pyproject.toml +1 -1
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/lmm/lmm.py +26 -136
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/__init__.py +5 -3
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/tools.py +70 -5
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/tools_types.py +2 -2
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/image_utils.py +47 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/LICENSE +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/README.md +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/tool_utils.py +2 -2
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/lmm/lmm.py

@@ -1,77 +1,36 @@
-import base64
-import io
 import json
 import logging
 import os
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any,
+from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast

 import anthropic
 import requests
 from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
 from openai import AzureOpenAI, OpenAI
-from PIL import Image

-
-from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
+from vision_agent.utils.image_utils import encode_media

 from .types import Message

 _LOGGER = logging.getLogger(__name__)


-def encode_image_bytes(image: bytes) -> str:
-    image = Image.open(io.BytesIO(image)).convert("RGB")  # type: ignore
-    buffer = io.BytesIO()
-    image.save(buffer, format="PNG")  # type: ignore
-    encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
-    return encoded_image
-
-
-def encode_media(media: Union[str, Path]) -> str:
-    if type(media) is str and media.startswith(("http", "https")):
-        # for mp4 video url, we assume there is a same url but ends with png
-        # vision-agent-ui will upload this png when uploading the video
-        if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
-            return media[:-4] + ".png"
-        return media
-    extension = "png"
-    extension = Path(media).suffix
-    if extension.lower() not in {
-        ".jpg",
-        ".jpeg",
-        ".png",
-        ".webp",
-        ".bmp",
-        ".mp4",
-        ".mov",
-    }:
-        raise ValueError(f"Unsupported image extension: {extension}")
-
-    image_bytes = b""
-    if extension.lower() in {".mp4", ".mov"}:
-        frames = T.extract_frames(media)
-        image = frames[len(frames) // 2]
-        buffer = io.BytesIO()
-        Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
-        image_bytes = buffer.getvalue()
-    else:
-        image_bytes = open(media, "rb").read()
-    return encode_image_bytes(image_bytes)
-
-
 class LMM(ABC):
     @abstractmethod
     def generate(
-        self,
+        self,
+        prompt: str,
+        media: Optional[Sequence[Union[str, Path]]] = None,
+        **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         pass

     @abstractmethod
     def chat(
         self,
-        chat:
+        chat: Sequence[Message],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         pass
@@ -79,7 +38,7 @@ class LMM(ABC):
     @abstractmethod
     def __call__(
         self,
-        input: Union[str,
+        input: Union[str, Sequence[Message]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         pass
@@ -111,7 +70,7 @@ class OpenAILMM(LMM):

     def __call__(
         self,
-        input: Union[str,
+        input: Union[str, Sequence[Message]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
@@ -120,13 +79,13 @@ class OpenAILMM(LMM):

     def chat(
         self,
-        chat:
+        chat: Sequence[Message],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         """Chat with the LMM model.

         Parameters:
-            chat (
+            chat (Squence[Dict[str, str]]): A list of dictionaries containing the chat
                 messages. The messages can be in the format:
                 [{"role": "user", "content": "Hello!"}, ...]
                 or if it contains media, it should be in the format:
@@ -147,6 +106,7 @@ class OpenAILMM(LMM):
                                 "url": (
                                     encoded_media
                                     if encoded_media.startswith(("http", "https"))
+                                    or encoded_media.startswith("data:image/")
                                     else f"data:image/png;base64,{encoded_media}"
                                 ),
                                 "detail": "low",
@@ -174,7 +134,7 @@ class OpenAILMM(LMM):
     def generate(
         self,
         prompt: str,
-        media: Optional[
+        media: Optional[Sequence[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         message: List[Dict[str, Any]] = [
@@ -192,7 +152,12 @@ class OpenAILMM(LMM):
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url":
+                            "url": (
+                                encoded_media
+                                if encoded_media.startswith(("http", "https"))
+                                or encoded_media.startswith("data:image/")
+                                else f"data:image/png;base64,{encoded_media}"
+                            ),
                             "detail": "low",
                         },
                     },
@@ -214,81 +179,6 @@ class OpenAILMM(LMM):
         else:
             return cast(str, response.choices[0].message.content)

-    def generate_classifier(self, question: str) -> Callable:
-        api_doc = T.get_tool_documentation([T.clip])
-        prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
-        response = self.client.chat.completions.create(
-            model=self.model_name,
-            messages=[
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": prompt},
-            ],
-            response_format={"type": "json_object"},
-        )
-
-        try:
-            params = json.loads(cast(str, response.choices[0].message.content))[
-                "Parameters"
-            ]
-        except json.JSONDecodeError:
-            _LOGGER.error(
-                f"Failed to decode response: {response.choices[0].message.content}"
-            )
-            raise ValueError("Failed to decode response")
-
-        return lambda x: T.clip(x, params["prompt"])
-
-    def generate_detector(self, question: str) -> Callable:
-        api_doc = T.get_tool_documentation([T.owl_v2])
-        prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
-        response = self.client.chat.completions.create(
-            model=self.model_name,
-            messages=[
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": prompt},
-            ],
-            response_format={"type": "json_object"},
-        )
-
-        try:
-            params = json.loads(cast(str, response.choices[0].message.content))[
-                "Parameters"
-            ]
-        except json.JSONDecodeError:
-            _LOGGER.error(
-                f"Failed to decode response: {response.choices[0].message.content}"
-            )
-            raise ValueError("Failed to decode response")
-
-        return lambda x: T.owl_v2(params["prompt"], x)
-
-    def generate_segmentor(self, question: str) -> Callable:
-        api_doc = T.get_tool_documentation([T.grounding_sam])
-        prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
-        response = self.client.chat.completions.create(
-            model=self.model_name,
-            messages=[
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": prompt},
-            ],
-            response_format={"type": "json_object"},
-        )
-
-        try:
-            params = json.loads(cast(str, response.choices[0].message.content))[
-                "Parameters"
-            ]
-        except json.JSONDecodeError:
-            _LOGGER.error(
-                f"Failed to decode response: {response.choices[0].message.content}"
-            )
-            raise ValueError("Failed to decode response")
-
-        return lambda x: T.grounding_sam(params["prompt"], x)
-
-    def generate_image_qa_tool(self, question: str) -> Callable:
-        return lambda x: T.git_vqa_v2(question, x)
-

 class AzureOpenAILMM(OpenAILMM):
     def __init__(
@@ -362,7 +252,7 @@ class OllamaLMM(LMM):

     def __call__(
         self,
-        input: Union[str,
+        input: Union[str, Sequence[Message]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
@@ -371,13 +261,13 @@ class OllamaLMM(LMM):

     def chat(
         self,
-        chat:
+        chat: Sequence[Message],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         """Chat with the LMM model.

         Parameters:
-            chat (
+            chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
                 messages. The messages can be in the format:
                 [{"role": "user", "content": "Hello!"}, ...]
                 or if it contains media, it should be in the format:
@@ -429,7 +319,7 @@ class OllamaLMM(LMM):
     def generate(
         self,
         prompt: str,
-        media: Optional[
+        media: Optional[Sequence[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         url = f"{self.url}/generate"
@@ -493,7 +383,7 @@ class ClaudeSonnetLMM(LMM):

     def __call__(
         self,
-        input: Union[str,
+        input: Union[str, Sequence[Dict[str, Any]]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
@@ -502,7 +392,7 @@ class ClaudeSonnetLMM(LMM):

     def chat(
         self,
-        chat:
+        chat: Sequence[Dict[str, Any]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         messages: List[MessageParam] = []
@@ -551,7 +441,7 @@ class ClaudeSonnetLMM(LMM):
     def generate(
         self,
         prompt: str,
-        media: Optional[
+        media: Optional[Sequence[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         content: List[Union[TextBlockParam, ImageBlockParam]] = [
{vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/__init__.py

@@ -16,6 +16,8 @@ from .tools import (
     clip,
     closest_box_distance,
     closest_mask_distance,
+    countgd_counting,
+    countgd_example_based_counting,
     depth_anything_v2,
     detr_segmentation,
     dpt_hybrid_midas,
@@ -30,6 +32,8 @@ from .tools import (
     generate_soft_edge_image,
     get_tool_documentation,
     git_vqa_v2,
+    gpt4o_image_vqa,
+    gpt4o_video_vqa,
     grounding_dino,
     grounding_sam,
     ixc25_image_vqa,
@@ -37,13 +41,11 @@ from .tools import (
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
-    countgd_counting,
-    countgd_example_based_counting,
     ocr,
     overlay_bounding_boxes,
+    overlay_counting_results,
     overlay_heat_map,
     overlay_segmentation_masks,
-    overlay_counting_results,
     owl_v2,
     save_image,
     save_json,
{vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/tools.py

@@ -13,26 +13,27 @@ import cv2
 import numpy as np
 import requests
 from moviepy.editor import ImageSequenceClip
-from PIL import Image, ImageDraw,
+from PIL import Image, ImageDraw, ImageEnhance, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore

 from vision_agent.clients.landing_public_api import LandingPublicAPI
+from vision_agent.lmm.lmm import OpenAILMM
 from vision_agent.tools.tool_utils import (
+    filter_bboxes_by_threshold,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
     get_tools_info,
     send_inference_request,
     send_task_inference_request,
-    filter_bboxes_by_threshold,
 )
 from vision_agent.tools.tools_types import (
     FineTuning,
     Florence2FtRequest,
     JobStatus,
-    PromptTask,
     ODResponseData,
+    PromptTask,
 )
 from vision_agent.utils import extract_frames_from_video
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
@@ -42,6 +43,7 @@ from vision_agent.utils.image_utils import (
     convert_quad_box_to_bbox,
     convert_to_b64,
     denormalize_bbox,
+    encode_image_bytes,
     frames_to_bytes,
     get_image_size,
     normalize_bbox,
@@ -691,6 +693,69 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data["answer"])


+def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
+    """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
+    including regular images or images of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> gpt4o_image_vqa('What is the cat doing?', image)
+        'drinking milk'
+    """
+
+    lmm = OpenAILMM()
+    buffer = io.BytesIO()
+    Image.fromarray(image).save(buffer, format="PNG")
+    image_bytes = buffer.getvalue()
+    image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+    resp = lmm.generate(prompt, [image_b64])
+    return cast(str, resp)
+
+
+def gpt4o_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+    """'gpt4o_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> gpt4o_video_vqa('Which football player made the goal?', frames)
+        'Lionel Messi'
+    """
+
+    lmm = OpenAILMM()
+
+    if len(frames) > 10:
+        step = len(frames) / 10
+        frames = [frames[int(i * step)] for i in range(10)]
+
+    frames_b64 = []
+    for frame in frames:
+        buffer = io.BytesIO()
+        Image.fromarray(frame).save(buffer, format="PNG")
+        image_bytes = buffer.getvalue()
+        image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+        frames_b64.append(image_b64)
+
+    resp = lmm.generate(prompt, frames_b64)
+    return cast(str, resp)
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the
@@ -1755,7 +1820,6 @@ def overlay_counting_results(

 FUNCTION_TOOLS = [
     owl_v2,
-    extract_frames,
     ocr,
     clip,
     vit_image_classification,
@@ -1776,6 +1840,7 @@ FUNCTION_TOOLS = [
 ]

 UTIL_TOOLS = [
+    extract_frames,
     save_json,
     load_image,
     save_image,
@@ -1791,7 +1856,7 @@ TOOLS = FUNCTION_TOOLS + UTIL_TOOLS
 TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
 TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
-TOOLS_INFO = get_tools_info(
+TOOLS_INFO = get_tools_info(FUNCTION_TOOLS)  # type: ignore
 UTILITIES_DOCSTRING = get_tool_documentation(
     [
         save_json,
{vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/tools_types.py

@@ -1,8 +1,8 @@
 from enum import Enum
+from typing import List, Optional, Tuple, Union
 from uuid import UUID
-from typing import List, Tuple, Optional, Union

-from pydantic import BaseModel, ConfigDict, Field,
+from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer


 class BboxInput(BaseModel):
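SerializationInfo and field_serializer belong to pydantic v2's custom-serializer API; this hunk only touches imports, so where they are applied is not visible here. A generic sketch of the pattern they enable, with a purely hypothetical model that is not from tools_types.py:

    from typing import List

    from pydantic import BaseModel, SerializationInfo, field_serializer

    class ExampleBoxes(BaseModel):  # hypothetical, for illustration only
        bboxes: List[List[float]]

        @field_serializer("bboxes")
        def serialize_bboxes(
            self, value: List[List[float]], info: SerializationInfo
        ) -> List[List[float]]:
            # round coordinates on dump; info exposes the serialization context
            return [[round(v, 2) for v in box] for box in value]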
|
@@ -13,6 +13,8 @@ from moviepy.editor import ImageSequenceClip
|
|
13
13
|
from PIL import Image, ImageDraw, ImageFont
|
14
14
|
from PIL.Image import Image as ImageType
|
15
15
|
|
16
|
+
from vision_agent.utils import extract_frames_from_video
|
17
|
+
|
16
18
|
COLORS = [
|
17
19
|
(158, 218, 229),
|
18
20
|
(219, 219, 141),
|
@@ -172,6 +174,51 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
|
|
172
174
|
)
|
173
175
|
|
174
176
|
|
177
|
+
def encode_image_bytes(image: bytes) -> str:
|
178
|
+
image = Image.open(io.BytesIO(image)).convert("RGB") # type: ignore
|
179
|
+
buffer = io.BytesIO()
|
180
|
+
image.save(buffer, format="PNG") # type: ignore
|
181
|
+
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
182
|
+
return encoded_image
|
183
|
+
|
184
|
+
|
185
|
+
def encode_media(media: Union[str, Path]) -> str:
|
186
|
+
if isinstance(media, str) and media.startswith(("http", "https")):
|
187
|
+
# for mp4 video url, we assume there is a same url but ends with png
|
188
|
+
# vision-agent-ui will upload this png when uploading the video
|
189
|
+
if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
|
190
|
+
return media[:-4] + ".png"
|
191
|
+
return media
|
192
|
+
|
193
|
+
# if media is already a base64 encoded image return
|
194
|
+
if isinstance(media, str) and media.startswith("data:image/"):
|
195
|
+
return media
|
196
|
+
|
197
|
+
extension = "png"
|
198
|
+
extension = Path(media).suffix
|
199
|
+
if extension.lower() not in {
|
200
|
+
".jpg",
|
201
|
+
".jpeg",
|
202
|
+
".png",
|
203
|
+
".webp",
|
204
|
+
".bmp",
|
205
|
+
".mp4",
|
206
|
+
".mov",
|
207
|
+
}:
|
208
|
+
raise ValueError(f"Unsupported image extension: {extension}")
|
209
|
+
|
210
|
+
image_bytes = b""
|
211
|
+
if extension.lower() in {".mp4", ".mov"}:
|
212
|
+
frames = extract_frames_from_video(str(media), fps=1)
|
213
|
+
image = frames[len(frames) // 2]
|
214
|
+
buffer = io.BytesIO()
|
215
|
+
Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
|
216
|
+
image_bytes = buffer.getvalue()
|
217
|
+
else:
|
218
|
+
image_bytes = open(media, "rb").read()
|
219
|
+
return encode_image_bytes(image_bytes)
|
220
|
+
|
221
|
+
|
175
222
|
def denormalize_bbox(
|
176
223
|
bbox: List[Union[int, float]], image_size: Tuple[int, ...]
|
177
224
|
) -> List[float]:
|
{vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/tool_utils.py

@@ -1,6 +1,6 @@
-import os
 import inspect
 import logging
+import os
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple

 import pandas as pd
@@ -10,10 +10,10 @@ from requests import Session
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry

+from vision_agent.tools.tools_types import BoundingBoxes
 from vision_agent.utils.exceptions import RemoteToolCallFailed
 from vision_agent.utils.execute import Error, MimeType
 from vision_agent.utils.type_defs import LandingaiAPIKey
-from vision_agent.tools.tools_types import BoundingBoxes

 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)