vision_agent-1.1.7-py3-none-any.whl → vision_agent-1.1.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/vision_agent/.sim_tools/df.csv
+++ b/vision_agent/.sim_tools/df.csv
@@ -559,6 +559,30 @@ desc,doc,name
   ... )
   >>> save_image(result, ""inpainted_room.png"")
   ",flux_image_inpainting
+ "'gemini_image_generation' performs image inpainting given an image and text prompt. It can be used to edit parts of an image or the entire image according to the prompt given.","gemini_image_generation(prompt: str, image: numpy.ndarray) -> numpy.ndarray:
+ 'gemini_image_generation' performs image inpainting given an image and text prompt.
+ It can be used to edit parts of an image or the entire image according to the prompt given.
+
+ Parameters:
+     prompt (str): A detailed text description guiding what should be generated
+         in the image. More detailed and specific prompts typically yield
+         better results.
+     image (np.ndarray): The source image to be inpainted. The image will serve as
+         the base context for the inpainting process.
+
+ Returns:
+     np.ndarray: The generated image(s) as a numpy array in RGB format with values
+         ranging from 0 to 255.
+
+ -------
+ Example:
+     >>> # Generate inpainting
+     >>> result = gemini_image_generation(
+     ...     prompt="a modern black leather sofa with white pillows",
+     ...     image=image,
+     ... )
+     >>> save_image(result, ""inpainted_room.png"")
+ ",gemini_image_generation
  'siglip_classification' is a tool that can classify an image or a cropped detection given a list of input labels or tags. It returns the same list of the input labels along with their probability scores based on image content.,"siglip_classification(image: numpy.ndarray, labels: List[str]) -> Dict[str, Any]:
  'siglip_classification' is a tool that can classify an image or a cropped detection given a list
  of input labels or tags. It returns the same list of the input labels along with
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -31,6 +31,7 @@ from .tools import (
      florence2_sam2_instance_segmentation,
      florence2_sam2_video_tracking,
      flux_image_inpainting,
+     gemini_image_generation,
      generate_pose_image,
      get_tools,
      get_tools_descriptions,
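
One line in the public export list is all the wiring the new tool needs. A quick sanity-check sketch (assumes vision-agent 1.1.8 is installed in the current environment):

```python
# Sketch: verify the new tool is exported from the public tools namespace.
from vision_agent.tools import gemini_image_generation

print(gemini_image_generation.__name__)  # -> "gemini_image_generation"
```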
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -10,6 +10,7 @@ from importlib import resources
  from pathlib import Path
  from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
  from warnings import warn
+ import time

  import cv2
  import numpy as np
@@ -20,6 +21,8 @@ from PIL import Image, ImageDraw, ImageFont
  from pillow_heif import register_heif_opener # type: ignore
  from pytube import YouTube # type: ignore
  import pymupdf # type: ignore
+ from google import genai # type: ignore
+ from google.genai import types # type: ignore

  from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
  from vision_agent.utils.execute import FileSerializer, MimeType
@@ -2841,6 +2844,147 @@ def flux_image_inpainting(
      return output_image


+ def gemini_image_generation(
+     prompt: str,
+     image: Optional[np.ndarray] = None,
+ ) -> np.ndarray:
+     """'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt.
+     It can be used to edit parts of an image or the entire image according to the prompt given.
+
+     Parameters:
+         prompt (str): A detailed text description guiding what should be generated
+             in the image. More detailed and specific prompts typically yield
+             better results.
+         image (np.ndarray, optional): The source image to be inpainted. The image will serve as
+             the base context for the inpainting process.
+
+     Returns:
+         np.ndarray: The generated image(s) as a numpy array in RGB format with values
+             ranging from 0 to 255.
+
+     -------
+     Example:
+         >>> # Generate inpainting
+         >>> result = gemini_image_generation(
+         ...     prompt="a modern black leather sofa with white pillows",
+         ...     image=image,
+         ... )
+         >>> save_image(result, "inpainted_room.png")
+     """
+     client = genai.Client()
+     files = []
+     image_file = None
+
+     def try_generate_content(
+         input_prompt: types.Content, num_retries: int = 3
+     ) -> Optional[bytes]:
+         """Try to generate content with multiple attempts."""
+         for attempt in range(num_retries):
+             try:
+                 resp = client.models.generate_content(
+                     model="gemini-2.0-flash-exp-image-generation",
+                     contents=input_prompt,
+                     config=types.GenerateContentConfig(
+                         response_modalities=["Text", "Image"]
+                     ),
+                 )
+
+                 if (
+                     not resp.candidates
+                     or not resp.candidates[0].content
+                     or not resp.candidates[0].content.parts
+                     or not resp.candidates[0].content.parts[0].inline_data
+                     or not resp.candidates[0].content.parts[0].inline_data.data
+                 ):
+                     _LOGGER.warning(f"Attempt {attempt + 1}: No candidates returned")
+                     time.sleep(5)
+                     continue
+                 else:
+                     return (
+                         resp.candidates[0].content.parts[0].inline_data.data
+                         if isinstance(
+                             resp.candidates[0].content.parts[0].inline_data.data, bytes
+                         )
+                         else None
+                     )
+
+             except genai.errors.ClientError as e:
+                 _LOGGER.warning(f"Attempt {attempt + 1} failed: {str(e)}")
+                 time.sleep(5)
+
+         return None
+
+     if image is not None:
+         # Resize if needed
+         max_size = (512, 512)
+         if image.shape[0] > max_size[0] or image.shape[1] > max_size[1]:
+             scaling_factor = min(
+                 max_size[0] / image.shape[0], max_size[1] / image.shape[1]
+             )
+             new_size = (
+                 int(image.shape[1] * scaling_factor),
+                 int(image.shape[0] * scaling_factor),
+             )
+             image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
+
+         # Convert to RGB
+         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+         image_file = numpy_to_bytes(image)
+         files = [("image", image_file)]
+
+         input_prompt = types.Content(
+             parts=[
+                 types.Part(
+                     text="I want you to edit this image given this prompt: " + prompt
+                 ),
+                 types.Part(inline_data={"mime_type": "image/png", "data": image_file}),
+             ]
+         )
+
+     else:
+         input_prompt = types.Content(parts=[types.Part(text=prompt)])
+
+     # Try to generate content
+     output_image_bytes = try_generate_content(input_prompt)
+
+     # Handle fallback if all attempts failed
+     if output_image_bytes is None:
+         if image is not None:
+             _LOGGER.warning("Returning original image after all retries failed.")
+             return image
+         else:
+             try:
+                 _LOGGER.warning("All retries failed; prompting for fresh generation.")
+                 time.sleep(10)
+                 output_image_bytes = try_generate_content(
+                     types.Content(parts=[types.Part(text="Generate an image.")]),
+                     num_retries=1,
+                 )
+
+             except Exception as e:
+                 raise ValueError(f"Fallback generation failed: {str(e)}")
+
+     # Convert bytes to image
+     if output_image_bytes is not None:
+         output_image_temp = io.BytesIO(output_image_bytes)
+         output_image_pil = Image.open(output_image_temp)
+         final_image = np.array(output_image_pil)
+     else:
+         raise ValueError("Fallback generation failed")
+
+     _display_tool_trace(
+         gemini_image_generation.__name__,
+         {
+             "prompt": prompt,
+             "model": "gemini-2.0-flash-exp-image-generation",
+         },
+         final_image,
+         files,
+     )
+
+     return final_image
+
+
  def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any]:
      """'siglip_classification' is a tool that can classify an image or a cropped detection given a list
      of input labels or tags. It returns the same list of the input labels along with
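
The hunk above is the entire new surface of this release: a Gemini-backed generation/inpainting tool that retries `generate_content` up to three times against `gemini-2.0-flash-exp-image-generation` (sleeping five seconds between attempts), downscales inputs larger than 512×512, converts BGR to RGB, and on total failure either returns the original image or makes one last generic generation attempt. A minimal usage sketch, assuming `GOOGLE_API_KEY` is set, that `save_image` is re-exported from `vision_agent.tools` as the docstring example suggests, and with a hypothetical input file:

```python
import cv2

from vision_agent.tools import gemini_image_generation, save_image

# Hypothetical input path; any BGR image loaded with OpenCV works, since
# the tool converts BGR to RGB (and downscales anything above 512x512).
image = cv2.imread("room.png")

# With an image, the prompt drives an edit; without one, the tool
# generates a new image from the prompt alone.
result = gemini_image_generation(
    prompt="a modern black leather sofa with white pillows",
    image=image,
)

save_image(result, "inpainted_room.png")
```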
--- a/vision_agent-1.1.7.dist-info/METADATA
+++ b/vision_agent-1.1.8.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: vision-agent
- Version: 1.1.7
+ Version: 1.1.8
  Summary: Toolset for Vision Agent
  Project-URL: Homepage, https://landing.ai
  Project-URL: repository, https://github.com/landing-ai/vision-agent
@@ -12,13 +12,15 @@ Requires-Dist: anthropic<0.32,>=0.31.0
  Requires-Dist: av<12,>=11.0.0
  Requires-Dist: dotenv<0.10,>=0.9.9
  Requires-Dist: flake8<8,>=7.0.0
+ Requires-Dist: google-genai<2,>=1.0.0
+ Requires-Dist: httpx==0.27.2
  Requires-Dist: ipykernel<7,>=6.29.4
  Requires-Dist: libcst<2,>=1.5.0
  Requires-Dist: matplotlib<4,>=3.9.2
  Requires-Dist: nbclient<0.11,>=0.10.0
  Requires-Dist: nbformat<6,>=5.10.4
  Requires-Dist: numpy<2.0.0,>=1.21.0
- Requires-Dist: openai==1.*
+ Requires-Dist: openai==1.55.3
  Requires-Dist: opencv-python==4.*
  Requires-Dist: opentelemetry-api<2,>=1.29.0
  Requires-Dist: pandas==2.*
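
The new `google-genai` requirement backs the tool above, while `httpx` and `openai` move from loose ranges to exact pins, presumably to sidestep incompatibilities in newer releases. A quick way to confirm the resolved versions after upgrading (a standard-library sketch; expects the packages to be installed):

```python
from importlib.metadata import version

# Expected per the pins above: httpx 0.27.2, openai 1.55.3, google-genai >=1,<2.
for pkg in ("vision-agent", "google-genai", "httpx", "openai"):
    print(pkg, version(pkg))
```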
@@ -75,7 +77,7 @@ The most important step is to [signup](https://va.landing.ai/agent) and obtain y
  ### Other Prerequisites
  - Python version 3.9 or higher
  - [Anthropic API key](#get-an-anthropic-api-key)
- - [Gemini API key](#get-a-gemini-api-key)
+ - [Google API key](#get-a-google-api-key)

  ### Why do I need Anthropic and Google API Keys?
  VisionAgent uses models from Anthropic and Google to respond to prompts and generate code.
@@ -84,7 +86,7 @@ When you run the web-based version of VisionAgent, the app uses the LandingAI AP

  When you run VisionAgent programmatically, the app will need to use your API keys to access the Anthropic and Google models. This ensures that any projects you run with VisionAgent aren’t limited by the rate limits in place with the LandingAI accounts, and it also prevents many users from overloading the LandingAI rate limits.

- Anthropic and Gemini each have their own rate limits and paid tiers. Refer to their documentation and pricing to learn more.
+ Anthropic and Google each have their own rate limits and paid tiers. Refer to their documentation and pricing to learn more.

  > **_NOTE:_** In VisionAgent v1.0.2 and earlier, VisionAgent was powered by Anthropic Claude-3.5 and OpenAI o1. If using one of these VisionAgent versions, you get an OpenAI API key and set it as an environment variable.

@@ -94,7 +96,7 @@ Anthropic and Gemini each have their own rate limits and paid tiers. Refer to th
  2. In the Anthropic Console, go to the [API Keys](https://console.anthropic.com/settings/keys) page.
  3. Generate an API key.

- ### Get a Gemini API Key
+ ### Get a Google API Key
  1. If you don’t have one yet, create a [Google AI Studio account](https://aistudio.google.com/).
  2. In Google AI Studio, go to the [Get API Key](https://aistudio.google.com/app/apikey) page.
  3. Generate an API key.
@@ -109,8 +111,8 @@ pip install vision-agent
  ## Quickstart: Prompt VisionAgent
  Follow this quickstart to learn how to prompt VisionAgent. After learning the basics, customize your prompt and workflow to meet your needs.

- 1. Get your Anthropic, Gemini, and VisionAgent API keys.
- 2. [Set the Anthropic, Gemini, and VisionAgent API keys as environment variables](#set-api-keys-as-environment-variables).
+ 1. Get your Anthropic, Google, and VisionAgent API keys.
+ 2. [Set the Anthropic, Google, and VisionAgent API keys as environment variables](#set-api-keys-as-environment-variables).
  3. [Install VisionAgent](#installation).
  4. Create a folder called `quickstart`.
  5. Find an image you want to analyze and save it to the `quickstart` folder.
@@ -119,13 +121,13 @@ Follow this quickstart to learn how to prompt VisionAgent. After learning the ba
  8. VisionAgent creates a file called `generated_code.py` and saves the generated code there.

  ### Set API Keys as Environment Variables
- Before running VisionAgent code, you must set the Anthropic, Gemini, and VisionAgent API keys as environment variables. Each operating system offers different ways to do this.
+ Before running VisionAgent code, you must set the Anthropic, Google, and VisionAgent API keys as environment variables. Each operating system offers different ways to do this.

  Here is the code for setting the variables:
  ```bash
  export VISION_AGENT_API_KEY="your-api-key"
  export ANTHROPIC_API_KEY="your-api-key"
- export GEMINI_API_KEY="your-api-key"
+ export GOOGLE_API_KEY="your-api-key"
  ```
  ### Sample Script: Prompt VisionAgent
  To use VisionAgent to generate code, use the following script as a starting point:
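
The README's swap from `GEMINI_API_KEY` to `GOOGLE_API_KEY` matches the implementation: `genai.Client()` is constructed without an explicit key, so the google-genai SDK is left to resolve credentials from the environment. A sketch of the same setup done from Python rather than the shell (key values are placeholders):

```python
import os

# Placeholder values; substitute real keys before running VisionAgent.
os.environ["VISION_AGENT_API_KEY"] = "your-api-key"
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
os.environ["GOOGLE_API_KEY"] = "your-api-key"  # was GEMINI_API_KEY before 1.1.8
```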
--- a/vision_agent-1.1.7.dist-info/RECORD
+++ b/vision_agent-1.1.8.dist-info/RECORD
@@ -1,5 +1,5 @@
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
- vision_agent/.sim_tools/df.csv,sha256=jCyBDlLxI9_yAxzLZcoN2BPpveF1yh29AlfdSAGTZ4A,40842
+ vision_agent/.sim_tools/df.csv,sha256=pMJKoMzCpcvSSopvWuWlHl7NHCICgUxAqgFQ-m0l7HM,42068
  vision_agent/.sim_tools/embs.npy,sha256=QN8Ojc0Mv4_OS6WA4elvBhXTDHcpx2g1pLxsGqk4IQU,245888
  vision_agent/agent/README.md,sha256=3XSPG_VO7-6y6P8COvcgSSonWj5uvfgvfmOkBpfKK8Q,5527
  vision_agent/agent/__init__.py,sha256=_-nGLHhRTLViXxBSb9D4OwLTqk9HXKPEkTBkvK8c7OU,206
@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
  vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
  vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
  vision_agent/sim/sim.py,sha256=WQY_x9A4VT647qGDBScJ3R8_Iv0aoYLHTgwcQSCXwv4,10059
- vision_agent/tools/__init__.py,sha256=o9lfWBVopT_qSoSi26WcgQJTKQYNgbXv7r4z_o5j2Eg,2467
+ vision_agent/tools/__init__.py,sha256=PRUka2eqHwPWJxwfpLj-O2Ab7hXG_dsE1Aov3TE6teM,2496
  vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
  vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
- vision_agent/tools/tools.py,sha256=dKKrfKxqQYVDFRsLjMMpp1z4_5k68pkaoZUMf1BMc_Q,125694
+ vision_agent/tools/tools.py,sha256=pJTk-nQKd68iBXlR-C4oGo_o7V3WPXc4OhOKtw5pf0o,130906
  vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
  vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
  vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=Days0dETPRQLSDamMKPnXFsc5g5IKX9QJcPPNmSHNdM,8
  vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
  vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
  vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
- vision_agent-1.1.7.dist-info/METADATA,sha256=vDncVy4FczlJzizC0R64y3wHDMVqJXs5YKjK0U5NIHQ,12530
- vision_agent-1.1.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- vision_agent-1.1.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-1.1.7.dist-info/RECORD,,
+ vision_agent-1.1.8.dist-info/METADATA,sha256=e8RqIuV0Y54jyNTYy7kOxfWeT8e0R4pVjMhLXZMcV7k,12600
+ vision_agent-1.1.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ vision_agent-1.1.8.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-1.1.8.dist-info/RECORD,,