vision_agent-1.1.7-py3-none-any.whl → vision_agent-1.1.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/vision_agent/.sim_tools/df.csv
+++ b/vision_agent/.sim_tools/df.csv
@@ -559,6 +559,30 @@ desc,doc,name
   ... )
   >>> save_image(result, ""inpainted_room.png"")
   ",flux_image_inpainting
+ "'gemini_image_generation' performs image inpainting given an image and text prompt. It can be used to edit parts of an image or the entire image according to the prompt given.","gemini_image_generation(prompt: str, image: numpy.ndarray) -> numpy.ndarray:
+ 'gemini_image_generation' performs image inpainting given an image and text prompt.
+ It can be used to edit parts of an image or the entire image according to the prompt given.
+
+ Parameters:
+     prompt (str): A detailed text description guiding what should be generated
+         in the image. More detailed and specific prompts typically yield
+         better results.
+     image (np.ndarray): The source image to be inpainted. The image will serve as
+         the base context for the inpainting process.
+
+ Returns:
+     np.ndarray: The generated image(s) as a numpy array in RGB format with values
+         ranging from 0 to 255.
+
+ -------
+ Example:
+     >>> # Generate inpainting
+     >>> result = gemini_image_generation(
+     ...     prompt="a modern black leather sofa with white pillows",
+     ...     image=image,
+     ... )
+     >>> save_image(result, ""inpainted_room.png"")
+ ",gemini_image_generation
  'siglip_classification' is a tool that can classify an image or a cropped detection given a list of input labels or tags. It returns the same list of the input labels along with their probability scores based on image content.,"siglip_classification(image: numpy.ndarray, labels: List[str]) -> Dict[str, Any]:
  'siglip_classification' is a tool that can classify an image or a cropped detection given a list
  of input labels or tags. It returns the same list of the input labels along with
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -31,6 +31,7 @@ from .tools import (
      florence2_sam2_instance_segmentation,
      florence2_sam2_video_tracking,
      flux_image_inpainting,
+     gemini_image_generation,
      generate_pose_image,
      get_tools,
      get_tools_descriptions,
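
One line in the public export list is all the wiring the new tool needs. A quick sanity-check sketch (assumes vision-agent 1.1.8 is installed in the current environment):

```python
# Sketch: verify the new tool is exported from the public tools namespace.
from vision_agent.tools import gemini_image_generation

print(gemini_image_generation.__name__)  # -> "gemini_image_generation"
```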
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -10,6 +10,7 @@ from importlib import resources
  from pathlib import Path
  from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
  from warnings import warn
+ import time

  import cv2
  import numpy as np
@@ -20,6 +21,8 @@ from PIL import Image, ImageDraw, ImageFont
  from pillow_heif import register_heif_opener # type: ignore
  from pytube import YouTube # type: ignore
  import pymupdf # type: ignore
+ from google import genai # type: ignore
+ from google.genai import types # type: ignore

  from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
  from vision_agent.utils.execute import FileSerializer, MimeType
@@ -2841,6 +2844,147 @@ def flux_image_inpainting(
      return output_image


+ def gemini_image_generation(
+     prompt: str,
+     image: Optional[np.ndarray] = None,
+ ) -> np.ndarray:
+     """'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt.
+     It can be used to edit parts of an image or the entire image according to the prompt given.
+
+     Parameters:
+         prompt (str): A detailed text description guiding what should be generated
+             in the image. More detailed and specific prompts typically yield
+             better results.
+         image (np.ndarray, optional): The source image to be inpainted. The image will serve as
+             the base context for the inpainting process.
+
+     Returns:
+         np.ndarray: The generated image(s) as a numpy array in RGB format with values
+             ranging from 0 to 255.
+
+     -------
+     Example:
+         >>> # Generate inpainting
+         >>> result = gemini_image_generation(
+         ...     prompt="a modern black leather sofa with white pillows",
+         ...     image=image,
+         ... )
+         >>> save_image(result, "inpainted_room.png")
+     """
+     client = genai.Client()
+     files = []
+     image_file = None
+
+     def try_generate_content(
+         input_prompt: types.Content, num_retries: int = 3
+     ) -> Optional[bytes]:
+         """Try to generate content with multiple attempts."""
+         for attempt in range(num_retries):
+             try:
+                 resp = client.models.generate_content(
+                     model="gemini-2.0-flash-exp-image-generation",
+                     contents=input_prompt,
+                     config=types.GenerateContentConfig(
+                         response_modalities=["Text", "Image"]
+                     ),
+                 )
+
+                 if (
+                     not resp.candidates
+                     or not resp.candidates[0].content
+                     or not resp.candidates[0].content.parts
+                     or not resp.candidates[0].content.parts[0].inline_data
+                     or not resp.candidates[0].content.parts[0].inline_data.data
+                 ):
+                     _LOGGER.warning(f"Attempt {attempt + 1}: No candidates returned")
+                     time.sleep(5)
+                     continue
+                 else:
+                     return (
+                         resp.candidates[0].content.parts[0].inline_data.data
+                         if isinstance(
+                             resp.candidates[0].content.parts[0].inline_data.data, bytes
+                         )
+                         else None
+                     )
+
+             except genai.errors.ClientError as e:
+                 _LOGGER.warning(f"Attempt {attempt + 1} failed: {str(e)}")
+                 time.sleep(5)
+
+         return None
+
+     if image is not None:
+         # Resize if needed
+         max_size = (512, 512)
+         if image.shape[0] > max_size[0] or image.shape[1] > max_size[1]:
+             scaling_factor = min(
+                 max_size[0] / image.shape[0], max_size[1] / image.shape[1]
+             )
+             new_size = (
+                 int(image.shape[1] * scaling_factor),
+                 int(image.shape[0] * scaling_factor),
+             )
+             image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
+
+         # Convert to RGB
+         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+         image_file = numpy_to_bytes(image)
+         files = [("image", image_file)]
+
+         input_prompt = types.Content(
+             parts=[
+                 types.Part(
+                     text="I want you to edit this image given this prompt: " + prompt
+                 ),
+                 types.Part(inline_data={"mime_type": "image/png", "data": image_file}),
+             ]
+         )
+
+     else:
+         input_prompt = types.Content(parts=[types.Part(text=prompt)])
+
+     # Try to generate content
+     output_image_bytes = try_generate_content(input_prompt)
+
+     # Handle fallback if all attempts failed
+     if output_image_bytes is None:
+         if image is not None:
+             _LOGGER.warning("Returning original image after all retries failed.")
+             return image
+         else:
+             try:
+                 _LOGGER.warning("All retries failed; prompting for fresh generation.")
+                 time.sleep(10)
+                 output_image_bytes = try_generate_content(
+                     types.Content(parts=[types.Part(text="Generate an image.")]),
+                     num_retries=1,
+                 )
+
+             except Exception as e:
+                 raise ValueError(f"Fallback generation failed: {str(e)}")
+
+     # Convert bytes to image
+     if output_image_bytes is not None:
+         output_image_temp = io.BytesIO(output_image_bytes)
+         output_image_pil = Image.open(output_image_temp)
+         final_image = np.array(output_image_pil)
+     else:
+         raise ValueError("Fallback generation failed")
+
+     _display_tool_trace(
+         gemini_image_generation.__name__,
+         {
+             "prompt": prompt,
+             "model": "gemini-2.0-flash-exp-image-generation",
+         },
+         final_image,
+         files,
+     )
+
+     return final_image
+
+
  def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any]:
      """'siglip_classification' is a tool that can classify an image or a cropped detection given a list
      of input labels or tags. It returns the same list of the input labels along with
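
The hunk above is the entire new surface of this release: a Gemini-backed generation/inpainting tool that retries `generate_content` up to three times against `gemini-2.0-flash-exp-image-generation` (sleeping five seconds between attempts), downscales inputs larger than 512×512, converts BGR to RGB, and on total failure either returns the original image or makes one last generic generation attempt. A minimal usage sketch, assuming `GOOGLE_API_KEY` is set, that `save_image` is re-exported from `vision_agent.tools` as the docstring example suggests, and with a hypothetical input file:

```python
import cv2

from vision_agent.tools import gemini_image_generation, save_image

# Hypothetical input path; any BGR image loaded with OpenCV works, since
# the tool converts BGR to RGB (and downscales anything above 512x512).
image = cv2.imread("room.png")

# With an image, the prompt drives an edit; without one, the tool
# generates a new image from the prompt alone.
result = gemini_image_generation(
    prompt="a modern black leather sofa with white pillows",
    image=image,
)

save_image(result, "inpainted_room.png")
```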
--- a/vision_agent-1.1.7.dist-info/METADATA
+++ b/vision_agent-1.1.8.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: vision-agent
- Version: 1.1.7
+ Version: 1.1.8
  Summary: Toolset for Vision Agent
  Project-URL: Homepage, https://landing.ai
  Project-URL: repository, https://github.com/landing-ai/vision-agent
@@ -12,13 +12,15 @@ Requires-Dist: anthropic<0.32,>=0.31.0
  Requires-Dist: av<12,>=11.0.0
  Requires-Dist: dotenv<0.10,>=0.9.9
  Requires-Dist: flake8<8,>=7.0.0
+ Requires-Dist: google-genai<2,>=1.0.0
+ Requires-Dist: httpx==0.27.2
  Requires-Dist: ipykernel<7,>=6.29.4
  Requires-Dist: libcst<2,>=1.5.0
  Requires-Dist: matplotlib<4,>=3.9.2
  Requires-Dist: nbclient<0.11,>=0.10.0
  Requires-Dist: nbformat<6,>=5.10.4
  Requires-Dist: numpy<2.0.0,>=1.21.0
- Requires-Dist: openai==1.*
+ Requires-Dist: openai==1.55.3
  Requires-Dist: opencv-python==4.*
  Requires-Dist: opentelemetry-api<2,>=1.29.0
  Requires-Dist: pandas==2.*
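
The new `google-genai` requirement backs the tool above, while `httpx` and `openai` move from loose ranges to exact pins, presumably to sidestep incompatibilities in newer releases. A quick way to confirm the resolved versions after upgrading (a standard-library sketch; expects the packages to be installed):

```python
from importlib.metadata import version

# Expected per the pins above: httpx 0.27.2, openai 1.55.3, google-genai >=1,<2.
for pkg in ("vision-agent", "google-genai", "httpx", "openai"):
    print(pkg, version(pkg))
```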
@@ -75,7 +77,7 @@ The most important step is to [signup](https://va.landing.ai/agent) and obtain y
  ### Other Prerequisites
  - Python version 3.9 or higher
  - [Anthropic API key](#get-an-anthropic-api-key)
- - [Gemini API key](#get-a-gemini-api-key)
+ - [Google API key](#get-a-google-api-key)

  ### Why do I need Anthropic and Google API Keys?
  VisionAgent uses models from Anthropic and Google to respond to prompts and generate code.
@@ -84,7 +86,7 @@ When you run the web-based version of VisionAgent, the app uses the LandingAI AP

  When you run VisionAgent programmatically, the app will need to use your API keys to access the Anthropic and Google models. This ensures that any projects you run with VisionAgent aren’t limited by the rate limits in place with the LandingAI accounts, and it also prevents many users from overloading the LandingAI rate limits.

- Anthropic and Gemini each have their own rate limits and paid tiers. Refer to their documentation and pricing to learn more.
+ Anthropic and Google each have their own rate limits and paid tiers. Refer to their documentation and pricing to learn more.

  > **_NOTE:_** In VisionAgent v1.0.2 and earlier, VisionAgent was powered by Anthropic Claude-3.5 and OpenAI o1. If using one of these VisionAgent versions, you get an OpenAI API key and set it as an environment variable.

@@ -94,7 +96,7 @@ Anthropic and Gemini each have their own rate limits and paid tiers. Refer to th
  2. In the Anthropic Console, go to the [API Keys](https://console.anthropic.com/settings/keys) page.
  3. Generate an API key.

- ### Get a Gemini API Key
+ ### Get a Google API Key
  1. If you don’t have one yet, create a [Google AI Studio account](https://aistudio.google.com/).
  2. In Google AI Studio, go to the [Get API Key](https://aistudio.google.com/app/apikey) page.
  3. Generate an API key.
@@ -109,8 +111,8 @@ pip install vision-agent
  ## Quickstart: Prompt VisionAgent
  Follow this quickstart to learn how to prompt VisionAgent. After learning the basics, customize your prompt and workflow to meet your needs.

- 1. Get your Anthropic, Gemini, and VisionAgent API keys.
- 2. [Set the Anthropic, Gemini, and VisionAgent API keys as environment variables](#set-api-keys-as-environment-variables).
+ 1. Get your Anthropic, Google, and VisionAgent API keys.
+ 2. [Set the Anthropic, Google, and VisionAgent API keys as environment variables](#set-api-keys-as-environment-variables).
  3. [Install VisionAgent](#installation).
  4. Create a folder called `quickstart`.
  5. Find an image you want to analyze and save it to the `quickstart` folder.
@@ -119,13 +121,13 @@ Follow this quickstart to learn how to prompt VisionAgent. After learning the ba
  8. VisionAgent creates a file called `generated_code.py` and saves the generated code there.

  ### Set API Keys as Environment Variables
- Before running VisionAgent code, you must set the Anthropic, Gemini, and VisionAgent API keys as environment variables. Each operating system offers different ways to do this.
+ Before running VisionAgent code, you must set the Anthropic, Google, and VisionAgent API keys as environment variables. Each operating system offers different ways to do this.

  Here is the code for setting the variables:
  ```bash
  export VISION_AGENT_API_KEY="your-api-key"
  export ANTHROPIC_API_KEY="your-api-key"
- export GEMINI_API_KEY="your-api-key"
+ export GOOGLE_API_KEY="your-api-key"
  ```
  ### Sample Script: Prompt VisionAgent
  To use VisionAgent to generate code, use the following script as a starting point:
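
The README's swap from `GEMINI_API_KEY` to `GOOGLE_API_KEY` matches the implementation: `genai.Client()` is constructed without an explicit key, so the google-genai SDK is left to resolve credentials from the environment. A sketch of the same setup done from Python rather than the shell (key values are placeholders):

```python
import os

# Placeholder values; substitute real keys before running VisionAgent.
os.environ["VISION_AGENT_API_KEY"] = "your-api-key"
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
os.environ["GOOGLE_API_KEY"] = "your-api-key"  # was GEMINI_API_KEY before 1.1.8
```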
--- a/vision_agent-1.1.7.dist-info/RECORD
+++ b/vision_agent-1.1.8.dist-info/RECORD
@@ -1,5 +1,5 @@
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
- vision_agent/.sim_tools/df.csv,sha256=jCyBDlLxI9_yAxzLZcoN2BPpveF1yh29AlfdSAGTZ4A,40842
+ vision_agent/.sim_tools/df.csv,sha256=pMJKoMzCpcvSSopvWuWlHl7NHCICgUxAqgFQ-m0l7HM,42068
  vision_agent/.sim_tools/embs.npy,sha256=QN8Ojc0Mv4_OS6WA4elvBhXTDHcpx2g1pLxsGqk4IQU,245888
  vision_agent/agent/README.md,sha256=3XSPG_VO7-6y6P8COvcgSSonWj5uvfgvfmOkBpfKK8Q,5527
  vision_agent/agent/__init__.py,sha256=_-nGLHhRTLViXxBSb9D4OwLTqk9HXKPEkTBkvK8c7OU,206
@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
  vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
  vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
  vision_agent/sim/sim.py,sha256=WQY_x9A4VT647qGDBScJ3R8_Iv0aoYLHTgwcQSCXwv4,10059
- vision_agent/tools/__init__.py,sha256=o9lfWBVopT_qSoSi26WcgQJTKQYNgbXv7r4z_o5j2Eg,2467
+ vision_agent/tools/__init__.py,sha256=PRUka2eqHwPWJxwfpLj-O2Ab7hXG_dsE1Aov3TE6teM,2496
  vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
  vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
- vision_agent/tools/tools.py,sha256=dKKrfKxqQYVDFRsLjMMpp1z4_5k68pkaoZUMf1BMc_Q,125694
+ vision_agent/tools/tools.py,sha256=pJTk-nQKd68iBXlR-C4oGo_o7V3WPXc4OhOKtw5pf0o,130906
  vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
  vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
  vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=Days0dETPRQLSDamMKPnXFsc5g5IKX9QJcPPNmSHNdM,8
  vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
  vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
  vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
- vision_agent-1.1.7.dist-info/METADATA,sha256=vDncVy4FczlJzizC0R64y3wHDMVqJXs5YKjK0U5NIHQ,12530
- vision_agent-1.1.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- vision_agent-1.1.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-1.1.7.dist-info/RECORD,,
+ vision_agent-1.1.8.dist-info/METADATA,sha256=e8RqIuV0Y54jyNTYy7kOxfWeT8e0R4pVjMhLXZMcV7k,12600
+ vision_agent-1.1.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ vision_agent-1.1.8.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-1.1.8.dist-info/RECORD,,