vision-agent 1.1.12.tar.gz → 1.1.14.tar.gz
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in the public registry.
- {vision_agent-1.1.12 → vision_agent-1.1.14}/PKG-INFO +1 -1
- {vision_agent-1.1.12 → vision_agent-1.1.14}/pyproject.toml +1 -2
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/.sim_tools/df.csv +1 -1
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/lmm/lmm.py +139 -11
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/tools/tools.py +4 -7
- {vision_agent-1.1.12 → vision_agent-1.1.14}/.gitignore +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/LICENSE +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/README.md +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/__init__.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/agent/README.md +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/agent/agent.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/agent/vision_agent_v2.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/clients/http.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/configs/__init__.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/configs/anthropic_config.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/configs/config.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/configs/openai_config.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/models/__init__.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/models/agent_types.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/models/lmm_types.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/models/tools_types.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/sim/__init__.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/sim/sim.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/tools/planner_tools.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/utils/agent.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/utils/execute.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/utils/tools.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/utils/tools_doc.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/utils/video.py +0 -0
- {vision_agent-1.1.12 → vision_agent-1.1.14}/vision_agent/utils/video_tracking.py +0 -0
pyproject.toml

```diff
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "vision-agent"
-version = "1.1.12"
+version = "1.1.14"
 description = "Toolset for Vision Agent"
 authors = [{ name = "Landing AI", email = "dev@landing.ai" }]
 requires-python = ">=3.9,<4.0"
@@ -58,7 +58,6 @@ dev = [
     "types-pillow>=9.5.0.4,<10",
     "data-science-types>=0.2.23,<0.3",
     "types-tqdm>=4.65.0.1,<5",
-    "setuptools>=68.0.0,<69",
     "griffe>=0.45.3,<0.46",
     "mkdocs>=1.5.3,<2",
     "mkdocstrings[python]>=0.23.0,<0.24",
```
vision_agent/lmm/lmm.py

```diff
@@ -3,12 +3,16 @@ import os
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast
+import base64
 
 import anthropic
 import requests
 from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
 from openai import AzureOpenAI, OpenAI
 
+from google import genai  # type: ignore
+from google.genai import types  # type: ignore
+
 from vision_agent.models import Message
 from vision_agent.utils.image_utils import encode_media
 
```
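These import changes swap the OpenAI-compatible endpoint for the google-genai SDK. Below is a minimal sketch of the client pattern the new code builds on, assuming a valid Google API key; the model name and prompt are illustrative and not taken from the package:

```python
# Minimal google-genai sketch (key, model, and prompt are placeholders).
from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_GOOGLE_API_KEY")

response = client.models.generate_content(
    model="gemini-2.0-flash",  # assumed model name for this sketch
    contents=[{"text": "Say hello in one word."}],
    config=types.GenerateContentConfig(max_output_tokens=32, temperature=0.0),
)
print(response.text)
```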
vision_agent/lmm/lmm.py (continued)

```diff
@@ -516,28 +520,152 @@ class AnthropicLMM(LMM):
         return cast(str, response.content[0].text)
 
 
-class GoogleLMM(
+class GoogleLMM(LMM):
     r"""An LMM class for the Google LMMs."""
 
     def __init__(
         self,
+        model_name: str = "gemini-2.5-pro-preview-03-25",
         api_key: Optional[str] = None,
-        model_name: str = "gemini-2.0-flash-exp",
-        max_tokens: int = 4096,
-        image_detail: str = "low",
         image_size: int = 768,
+        image_detail: str = "low",
         **kwargs: Any,
     ):
-        base_url = "https://generativelanguage.googleapis.com/v1beta/openai/"
         if not api_key:
-            api_key = os.environ.get("
-
-        self.client = OpenAI(api_key=api_key, base_url=base_url)
+            api_key = os.environ.get("GOOGLE_API_KEY")
 
+        # Create the client using the Google Genai client
+        self.client = genai.Client(api_key=api_key)
         self.model_name = model_name
         self.image_size = image_size
         self.image_detail = image_detail
-
-        if "max_tokens" not in kwargs:
-            kwargs["max_tokens"] = max_tokens
         self.kwargs = kwargs
+
+    def __call__(
+        self,
+        input: Union[str, Sequence[Dict[str, Any]]],
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
+        if isinstance(input, str):
+            return self.generate(input, **kwargs)
+        return self.chat(input, **kwargs)
+
+    def chat(
+        self,
+        chat: Sequence[Dict[str, Any]],
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
+        prompt_parts = []
+        for message in chat:
+            if message["role"] != "user":
+                continue  # Gemini expects only user input
+            prompt_parts.extend(self._convert_message_parts(message, **kwargs))
+
+        tmp_kwargs = self.kwargs | kwargs
+        generation_config = self._create_generation_config(tmp_kwargs)
+
+        if tmp_kwargs.get("stream"):
+
+            def f() -> Iterator[Optional[str]]:
+                # Use the client to stream content
+                response_stream = self.client.models.generate_content_stream(
+                    model=self.model_name,
+                    contents=prompt_parts,
+                    config=generation_config,
+                )
+                for chunk in response_stream:
+                    if chunk.text:
+                        yield chunk.text
+
+            return f()
+        else:
+            # Use the client for non-streaming
+            response = self.client.models.generate_content(
+                model=self.model_name,
+                contents=prompt_parts,
+                config=generation_config,
+            )
+            return cast(str, response.text)
+
+    def generate(
+        self,
+        prompt: str,
+        media: Optional[Sequence[Union[str, Path]]] = None,
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
+        prompt_parts = [{"text": prompt}]
+        if media:
+            for m in media:
+                prompt_parts.append(self._convert_media_part(m, **kwargs))
+
+        tmp_kwargs = self.kwargs | kwargs
+        generation_config = self._create_generation_config(tmp_kwargs)
+
+        if tmp_kwargs.get("stream"):
+
+            def f() -> Iterator[Optional[str]]:
+                response_stream = self.client.models.generate_content_stream(
+                    model=self.model_name,
+                    contents=prompt_parts,
+                    config=generation_config,
+                )
+                for chunk in response_stream:
+                    if chunk.text:
+                        yield chunk.text
+
+            return f()
+        else:
+            response = self.client.models.generate_content(
+                model=self.model_name,
+                contents=prompt_parts,
+                config=generation_config,
+            )
+            return cast(str, response.text)
+
+    def _convert_message_parts(
+        self, message: Dict[str, Any], **kwargs: Any
+    ) -> List[Any]:
+        parts = [{"text": message["content"]}]
+        if "media" in message:
+            for media_path in message["media"]:
+                parts.append(self._convert_media_part(media_path, **kwargs))
+        return parts
+
+    def _convert_media_part(self, media: Union[str, Path], **kwargs: Any) -> types.Part:
+        resize = kwargs.get("resize", self.image_size)
+        encoded_media = encode_media(str(media), resize=resize)
+
+        if encoded_media.startswith("data:image/"):
+            encoded_media = encoded_media.split(",", 1)[-1]
+
+        binary_data = base64.b64decode(encoded_media)
+
+        return types.Part.from_bytes(
+            data=binary_data,
+            mime_type="image/png",
+        )
+
+    def _create_generation_config(
+        self, kwargs: Dict[str, Any]
+    ) -> types.GenerateContentConfig:
+        # Extract generation-specific parameters
+        config_params = {}
+
+        # Handle known parameters
+        for param in [
+            "max_output_tokens",
+            "temperature",
+            "top_p",
+            "top_k",
+            "response_mime_type",
+            "stop_sequences",
+            "candidate_count",
+            "seed",
+            "safety_settings",
+            "system_instruction",
+        ]:
+            if param in kwargs:
+                config_params[param] = kwargs[param]
+
+        # Create a GenerateContentConfig object
+        return types.GenerateContentConfig(**config_params)
```
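Taking the rewritten class at face value, here is a hedged usage sketch; the import path points at the module shown in this diff, and the API key, media file name, and generation parameters are placeholders:

```python
# Usage sketch for the new GoogleLMM (key and media path are placeholders).
from vision_agent.lmm.lmm import GoogleLMM

lmm = GoogleLMM(api_key="YOUR_GOOGLE_API_KEY")  # defaults to gemini-2.5-pro-preview-03-25

# A plain string is routed through generate().
print(lmm("Describe what a vision agent does in one sentence."))

# Chat-style input: only "user" messages are forwarded, and "media" paths are
# resized, decoded, and attached as image/png parts.
chat = [
    {
        "role": "user",
        "content": "What objects are visible in this image?",
        "media": ["example.png"],  # hypothetical local file
    }
]
print(lmm.chat(chat, temperature=0.2, max_output_tokens=512))

# stream=True returns an iterator of text chunks instead of a single string.
for chunk in lmm.chat(chat, stream=True):
    if chunk:
        print(chunk, end="", flush=True)
```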
vision_agent/tools/tools.py

```diff
@@ -2959,13 +2959,10 @@ def gemini_image_generation(
         return image
     else:
         try:
-
-
-
-
-                num_retries=1,
-            )
-
+            current_dir = os.path.dirname(os.path.abspath(__file__))
+            img_path = os.path.join(current_dir, "../../assets/gemini.png")
+            with open(img_path, "rb") as img_file:
+                output_image_bytes = img_file.read()
         except Exception as e:
             raise ValueError(f"Fallback generation failed: {str(e)}")
 
```
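The fallback in gemini_image_generation now returns a bundled placeholder image rather than retrying the generation call. A short sketch of that path resolution, with an added validity check that is illustrative and not part of the package:

```python
# Sketch of the new fallback path; the PIL check is an added, illustrative step.
import os
from io import BytesIO

from PIL import Image

current_dir = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(current_dir, "../../assets/gemini.png")
with open(img_path, "rb") as img_file:
    output_image_bytes = img_file.read()

# Confirm the placeholder bytes decode as an image before handing them downstream.
Image.open(BytesIO(output_image_bytes)).verify()
```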