vision-agent 1.1.7__py3-none-any.whl → 1.1.9__py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- vision_agent/.sim_tools/df.csv +24 -0
- vision_agent/agent/vision_agent_prompts_v2.py +1 -1
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +144 -0
- {vision_agent-1.1.7.dist-info → vision_agent-1.1.9.dist-info}/METADATA +11 -9
- {vision_agent-1.1.7.dist-info → vision_agent-1.1.9.dist-info}/RECORD +8 -8
- {vision_agent-1.1.7.dist-info → vision_agent-1.1.9.dist-info}/WHEEL +0 -0
- {vision_agent-1.1.7.dist-info → vision_agent-1.1.9.dist-info}/licenses/LICENSE +0 -0
vision_agent/.sim_tools/df.csv
CHANGED
@@ -559,6 +559,30 @@ desc,doc,name
         ... )
         >>> save_image(result, ""inpainted_room.png"")
     ",flux_image_inpainting
+"'gemini_image_generation' performs image inpainting given an image and text prompt. It can be used to edit parts of an image or the entire image according to the prompt given.","gemini_image_generation(prompt: str, image: numpy.ndarray) -> numpy.ndarray:
+    'gemini_image_generation' performs image inpainting given an image and text prompt.
+    It can be used to edit parts of an image or the entire image according to the prompt given.
+
+    Parameters:
+        prompt (str): A detailed text description guiding what should be generated
+            in the image. More detailed and specific prompts typically yield
+            better results.
+        image (np.ndarray): The source image to be inpainted. The image will serve as
+            the base context for the inpainting process.
+
+    Returns:
+        np.ndarray: The generated image(s) as a numpy array in RGB format with values
+            ranging from 0 to 255.
+
+    -------
+    Example:
+        >>> # Generate inpainting
+        >>> result = gemini_image_generation(
+        ...     prompt="a modern black leather sofa with white pillows",
+        ...     image=image,
+        ... )
+        >>> save_image(result, ""inpainted_room.png"")
+    ",gemini_image_generation
 'siglip_classification' is a tool that can classify an image or a cropped detection given a list of input labels or tags. It returns the same list of the input labels along with their probability scores based on image content.,"siglip_classification(image: numpy.ndarray, labels: List[str]) -> Dict[str, Any]:
     'siglip_classification' is a tool that can classify an image or a cropped detection given a list
     of input labels or tags. It returns the same list of the input labels along with
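For context, `df.csv` is the tool-description table that ships inside the wheel (it sits alongside `embs.npy` under `.sim_tools`, and appears to back the package's tool similarity search). A quick way to inspect the newly added row; this is a sketch that assumes a normal, non-zipped install of the package with pandas available:

```python
from importlib import resources

import pandas as pd

# Locate df.csv inside the installed package (the path is visible in the
# RECORD diff below). resources.files returns a real filesystem path for
# a standard wheel install.
csv_path = resources.files("vision_agent") / ".sim_tools" / "df.csv"

df = pd.read_csv(csv_path)  # columns: desc, doc, name

# Print the description and documented signature of the new tool.
row = df[df["name"] == "gemini_image_generation"].iloc[0]
print(row["desc"])
print(row["doc"])
```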
vision_agent/agent/vision_agent_prompts_v2.py
CHANGED
@@ -1,5 +1,5 @@
 CONVERSATION = """
-**Role**: You are a help agent that called
+**Role**: You are a help agent that called VisionAgent, built by LandingAI, that assists users write code to solve vision tasks.

 **Actions**:
 `generate_or_edit_vision_code` - This action will generate code for the user to solve a vision task. It will also edit vision code for the user, this is useful if the user wants to modify vision-related aspects of the code such as changing the vision model or the image pre-processing steps.
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -10,6 +10,7 @@ from importlib import resources
 from pathlib import Path
 from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
 from warnings import warn
+import time

 import cv2
 import numpy as np
@@ -20,6 +21,8 @@ from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
 import pymupdf  # type: ignore
+from google import genai  # type: ignore
+from google.genai import types  # type: ignore

 from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
 from vision_agent.utils.execute import FileSerializer, MimeType
@@ -2841,6 +2844,147 @@ def flux_image_inpainting(
     return output_image


+def gemini_image_generation(
+    prompt: str,
+    image: Optional[np.ndarray] = None,
+) -> np.ndarray:
+    """'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt.
+    It can be used to edit parts of an image or the entire image according to the prompt given.
+
+    Parameters:
+        prompt (str): A detailed text description guiding what should be generated
+            in the image. More detailed and specific prompts typically yield
+            better results.
+        image (np.ndarray, optional): The source image to be inpainted. The image will serve as
+            the base context for the inpainting process.
+
+    Returns:
+        np.ndarray: The generated image(s) as a numpy array in RGB format with values
+            ranging from 0 to 255.
+
+    -------
+    Example:
+        >>> # Generate inpainting
+        >>> result = gemini_image_generation(
+        ...     prompt="a modern black leather sofa with white pillows",
+        ...     image=image,
+        ... )
+        >>> save_image(result, "inpainted_room.png")
+    """
+    client = genai.Client()
+    files = []
+    image_file = None
+
+    def try_generate_content(
+        input_prompt: types.Content, num_retries: int = 3
+    ) -> Optional[bytes]:
+        """Try to generate content with multiple attempts."""
+        for attempt in range(num_retries):
+            try:
+                resp = client.models.generate_content(
+                    model="gemini-2.0-flash-exp-image-generation",
+                    contents=input_prompt,
+                    config=types.GenerateContentConfig(
+                        response_modalities=["Text", "Image"]
+                    ),
+                )
+
+                if (
+                    not resp.candidates
+                    or not resp.candidates[0].content
+                    or not resp.candidates[0].content.parts
+                    or not resp.candidates[0].content.parts[0].inline_data
+                    or not resp.candidates[0].content.parts[0].inline_data.data
+                ):
+                    _LOGGER.warning(f"Attempt {attempt + 1}: No candidates returned")
+                    time.sleep(5)
+                    continue
+                else:
+                    return (
+                        resp.candidates[0].content.parts[0].inline_data.data
+                        if isinstance(
+                            resp.candidates[0].content.parts[0].inline_data.data, bytes
+                        )
+                        else None
+                    )
+
+            except genai.errors.ClientError as e:
+                _LOGGER.warning(f"Attempt {attempt + 1} failed: {str(e)}")
+                time.sleep(5)
+
+        return None
+
+    if image is not None:
+        # Resize if needed
+        max_size = (512, 512)
+        if image.shape[0] > max_size[0] or image.shape[1] > max_size[1]:
+            scaling_factor = min(
+                max_size[0] / image.shape[0], max_size[1] / image.shape[1]
+            )
+            new_size = (
+                int(image.shape[1] * scaling_factor),
+                int(image.shape[0] * scaling_factor),
+            )
+            image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
+
+        # Convert to RGB
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        image_file = numpy_to_bytes(image)
+        files = [("image", image_file)]
+
+        input_prompt = types.Content(
+            parts=[
+                types.Part(
+                    text="I want you to edit this image given this prompt: " + prompt
+                ),
+                types.Part(inline_data={"mime_type": "image/png", "data": image_file}),
+            ]
+        )
+
+    else:
+        input_prompt = types.Content(parts=[types.Part(text=prompt)])
+
+    # Try to generate content
+    output_image_bytes = try_generate_content(input_prompt)
+
+    # Handle fallback if all attempts failed
+    if output_image_bytes is None:
+        if image is not None:
+            _LOGGER.warning("Returning original image after all retries failed.")
+            return image
+        else:
+            try:
+                _LOGGER.warning("All retries failed; prompting for fresh generation.")
+                time.sleep(10)
+                output_image_bytes = try_generate_content(
+                    types.Content(parts=[types.Part(text="Generate an image.")]),
+                    num_retries=1,
+                )
+
+            except Exception as e:
+                raise ValueError(f"Fallback generation failed: {str(e)}")
+
+    # Convert bytes to image
+    if output_image_bytes is not None:
+        output_image_temp = io.BytesIO(output_image_bytes)
+        output_image_pil = Image.open(output_image_temp)
+        final_image = np.array(output_image_pil)
+    else:
+        raise ValueError("Fallback generation failed")
+
+    _display_tool_trace(
+        gemini_image_generation.__name__,
+        {
+            "prompt": prompt,
+            "model": "gemini-2.0-flash-exp-image-generation",
+        },
+        final_image,
+        files,
+    )
+
+    return final_image
+
+
 def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any]:
     """'siglip_classification' is a tool that can classify an image or a cropped detection given a list
     of input labels or tags. It returns the same list of the input labels along with
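Taken together, the new tool downscales inputs larger than 512×512, retries the Gemini call up to three times with a 5-second pause between attempts, and degrades gracefully: when editing, it returns the original image after all retries fail; when generating from scratch, it retries once with a bare "Generate an image." prompt. Below is a minimal usage sketch based on the docstring above. It assumes `GOOGLE_API_KEY` is set, that `gemini_image_generation` and `save_image` are re-exported from `vision_agent.tools` (the one-line `__init__.py` change is not expanded in this viewer), and that `room.png` is a hypothetical local file. Note the implementation applies a BGR-to-RGB conversion before upload, so a raw `cv2.imread` result fits naturally:

```python
import cv2

from vision_agent.tools import gemini_image_generation, save_image

# Inpainting-style edit: the source image is the base context and the
# prompt describes the desired change.
image = cv2.imread("room.png")  # hypothetical input file (BGR, as cv2 loads it)
edited = gemini_image_generation(
    prompt="a modern black leather sofa with white pillows",
    image=image,
)
save_image(edited, "inpainted_room.png")

# Pure text-to-image: omit the image argument entirely.
generated = gemini_image_generation(prompt="a watercolor lighthouse at dusk")
save_image(generated, "lighthouse.png")
```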
{vision_agent-1.1.7.dist-info → vision_agent-1.1.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vision-agent
-Version: 1.1.7
+Version: 1.1.9
 Summary: Toolset for Vision Agent
 Project-URL: Homepage, https://landing.ai
 Project-URL: repository, https://github.com/landing-ai/vision-agent
@@ -12,13 +12,15 @@ Requires-Dist: anthropic<0.32,>=0.31.0
 Requires-Dist: av<12,>=11.0.0
 Requires-Dist: dotenv<0.10,>=0.9.9
 Requires-Dist: flake8<8,>=7.0.0
+Requires-Dist: google-genai<2,>=1.0.0
+Requires-Dist: httpx==0.27.2
 Requires-Dist: ipykernel<7,>=6.29.4
 Requires-Dist: libcst<2,>=1.5.0
 Requires-Dist: matplotlib<4,>=3.9.2
 Requires-Dist: nbclient<0.11,>=0.10.0
 Requires-Dist: nbformat<6,>=5.10.4
 Requires-Dist: numpy<2.0.0,>=1.21.0
-Requires-Dist: openai==1
+Requires-Dist: openai==1.55.3
 Requires-Dist: opencv-python==4.*
 Requires-Dist: opentelemetry-api<2,>=1.29.0
 Requires-Dist: pandas==2.*
@@ -75,7 +77,7 @@ The most important step is to [signup](https://va.landing.ai/agent) and obtain y
 ### Other Prerequisites
 - Python version 3.9 or higher
 - [Anthropic API key](#get-an-anthropic-api-key)
-- [
+- [Google API key](#get-a-google-api-key)

 ### Why do I need Anthropic and Google API Keys?
 VisionAgent uses models from Anthropic and Google to respond to prompts and generate code.
@@ -84,7 +86,7 @@ When you run the web-based version of VisionAgent, the app uses the LandingAI AP

 When you run VisionAgent programmatically, the app will need to use your API keys to access the Anthropic and Google models. This ensures that any projects you run with VisionAgent aren’t limited by the rate limits in place with the LandingAI accounts, and it also prevents many users from overloading the LandingAI rate limits.

-Anthropic and
+Anthropic and Google each have their own rate limits and paid tiers. Refer to their documentation and pricing to learn more.

 > **_NOTE:_** In VisionAgent v1.0.2 and earlier, VisionAgent was powered by Anthropic Claude-3.5 and OpenAI o1. If using one of these VisionAgent versions, you get an OpenAI API key and set it as an environment variable.

@@ -94,7 +96,7 @@ Anthropic and Gemini each have their own rate limits and paid tiers. Refer to th
 2. In the Anthropic Console, go to the [API Keys](https://console.anthropic.com/settings/keys) page.
 3. Generate an API key.

-### Get a
+### Get a Google API Key
 1. If you don’t have one yet, create a [Google AI Studio account](https://aistudio.google.com/).
 2. In Google AI Studio, go to the [Get API Key](https://aistudio.google.com/app/apikey) page.
 3. Generate an API key.
@@ -109,8 +111,8 @@ pip install vision-agent
 ## Quickstart: Prompt VisionAgent
 Follow this quickstart to learn how to prompt VisionAgent. After learning the basics, customize your prompt and workflow to meet your needs.

-1. Get your Anthropic,
-2. [Set the Anthropic,
+1. Get your Anthropic, Google, and VisionAgent API keys.
+2. [Set the Anthropic, Google, and VisionAgent API keys as environment variables](#set-api-keys-as-environment-variables).
 3. [Install VisionAgent](#installation).
 4. Create a folder called `quickstart`.
 5. Find an image you want to analyze and save it to the `quickstart` folder.
@@ -119,13 +121,13 @@ Follow this quickstart to learn how to prompt VisionAgent. After learning the ba
 8. VisionAgent creates a file called `generated_code.py` and saves the generated code there.

 ### Set API Keys as Environment Variables
-Before running VisionAgent code, you must set the Anthropic,
+Before running VisionAgent code, you must set the Anthropic, Google, and VisionAgent API keys as environment variables. Each operating system offers different ways to do this.

 Here is the code for setting the variables:
 ```bash
 export VISION_AGENT_API_KEY="your-api-key"
 export ANTHROPIC_API_KEY="your-api-key"
-export
+export GOOGLE_API_KEY="your-api-key"
 ```
 ### Sample Script: Prompt VisionAgent
 To use VisionAgent to generate code, use the following script as a starting point:
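If you prefer to set the keys from Python rather than the shell, the same three variables from the METADATA hunk above can be assigned via `os.environ` before any vision_agent model clients are constructed (a sketch equivalent to the shell exports):

```python
import os

# Equivalent to the shell exports in the README; set these before
# importing or running any vision_agent code that builds model clients.
os.environ["VISION_AGENT_API_KEY"] = "your-api-key"
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
os.environ["GOOGLE_API_KEY"] = "your-api-key"
```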
{vision_agent-1.1.7.dist-info → vision_agent-1.1.9.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
-vision_agent/.sim_tools/df.csv,sha256=
+vision_agent/.sim_tools/df.csv,sha256=pMJKoMzCpcvSSopvWuWlHl7NHCICgUxAqgFQ-m0l7HM,42068
 vision_agent/.sim_tools/embs.npy,sha256=QN8Ojc0Mv4_OS6WA4elvBhXTDHcpx2g1pLxsGqk4IQU,245888
 vision_agent/agent/README.md,sha256=3XSPG_VO7-6y6P8COvcgSSonWj5uvfgvfmOkBpfKK8Q,5527
 vision_agent/agent/__init__.py,sha256=_-nGLHhRTLViXxBSb9D4OwLTqk9HXKPEkTBkvK8c7OU,206
@@ -8,7 +8,7 @@ vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=53b_DhQtffX5wxLuCbNQ8
 vision_agent/agent/vision_agent_coder_v2.py,sha256=ELc_J8Q4NKPs7YETu3a9O0Vk1zN3k6QfHBgu0M0IWGk,17450
 vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=YARVphHKLMNUqCeOsrManvgecl77RP1g51vtt7JpdWk,35937
 vision_agent/agent/vision_agent_planner_v2.py,sha256=Aww_BJhTFKZ5XjYe8FW57z2Gwp2se0vg1t1DKLGRAyQ,22050
-vision_agent/agent/vision_agent_prompts_v2.py,sha256=
+vision_agent/agent/vision_agent_prompts_v2.py,sha256=6l0o6yAEcaTBOxkHPNJcdV2wkLpoMIiB_9ZqgL2qo2k,4231
 vision_agent/agent/vision_agent_v2.py,sha256=iPW6DowH7wCFIA5vb1SdSLfZFWbn_oSC7Xa8uO8KIJI,11675
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
 vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
 vision_agent/sim/sim.py,sha256=WQY_x9A4VT647qGDBScJ3R8_Iv0aoYLHTgwcQSCXwv4,10059
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=PRUka2eqHwPWJxwfpLj-O2Ab7hXG_dsE1Aov3TE6teM,2496
 vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
 vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=pJTk-nQKd68iBXlR-C4oGo_o7V3WPXc4OhOKtw5pf0o,130906
 vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
 vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
 vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=Days0dETPRQLSDamMKPnXFsc5g5IKX9QJcPPNmSHNdM,8
 vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
 vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
 vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
-vision_agent-1.1.
-vision_agent-1.1.
-vision_agent-1.1.
-vision_agent-1.1.
+vision_agent-1.1.9.dist-info/METADATA,sha256=iI5W90sxEbji3gPNJfUHyFxAynPf1vC-VvTMULioSgc,12600
+vision_agent-1.1.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+vision_agent-1.1.9.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-1.1.9.dist-info/RECORD,,
{vision_agent-1.1.7.dist-info → vision_agent-1.1.9.dist-info}/WHEEL
File without changes
{vision_agent-1.1.7.dist-info → vision_agent-1.1.9.dist-info}/licenses/LICENSE
File without changes