vision-agent 1.1.6__tar.gz → 1.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. vision_agent-1.1.8/.gitignore +99 -0
  2. {vision_agent-1.1.6 → vision_agent-1.1.8}/PKG-INFO +43 -47
  3. {vision_agent-1.1.6 → vision_agent-1.1.8}/README.md +8 -7
  4. vision_agent-1.1.8/pyproject.toml +122 -0
  5. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/.sim_tools/df.csv +24 -0
  6. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/tools/__init__.py +1 -0
  7. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/tools/tools.py +144 -0
  8. vision_agent-1.1.6/pyproject.toml +0 -108
  9. {vision_agent-1.1.6 → vision_agent-1.1.8}/LICENSE +0 -0
  10. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/.sim_tools/embs.npy +0 -0
  11. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/__init__.py +0 -0
  12. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/agent/README.md +0 -0
  13. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/agent/__init__.py +0 -0
  14. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/agent/agent.py +0 -0
  15. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  16. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  17. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
  18. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  19. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  20. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/agent/vision_agent_v2.py +0 -0
  21. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/clients/__init__.py +0 -0
  22. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/clients/http.py +0 -0
  23. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/configs/__init__.py +0 -0
  24. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/configs/anthropic_config.py +0 -0
  25. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/configs/config.py +0 -0
  26. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/configs/openai_config.py +0 -0
  27. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/fonts/__init__.py +0 -0
  28. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  29. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/lmm/__init__.py +0 -0
  30. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/lmm/lmm.py +0 -0
  31. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/models/__init__.py +0 -0
  32. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/models/agent_types.py +0 -0
  33. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/models/lmm_types.py +0 -0
  34. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/models/tools_types.py +0 -0
  35. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/sim/__init__.py +0 -0
  36. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/sim/sim.py +0 -0
  37. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/tools/meta_tools.py +0 -0
  38. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/tools/planner_tools.py +0 -0
  39. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/tools/prompts.py +0 -0
  40. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/utils/__init__.py +0 -0
  41. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/utils/agent.py +0 -0
  42. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/utils/exceptions.py +0 -0
  43. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/utils/execute.py +0 -0
  44. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/utils/image_utils.py +0 -0
  45. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/utils/tools.py +0 -0
  46. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/utils/tools_doc.py +0 -0
  47. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/utils/video.py +0 -0
  48. {vision_agent-1.1.6 → vision_agent-1.1.8}/vision_agent/utils/video_tracking.py +0 -0
@@ -0,0 +1,99 @@
1
+ # Prerequisites
2
+ *.d
3
+
4
+ # Object files
5
+ *.o
6
+ *.ko
7
+ *.obj
8
+ *.elf
9
+
10
+ # Env files
11
+ .env
12
+
13
+ # Precompiled Headers
14
+ *.gch
15
+ *.pch
16
+
17
+ # Libraries
18
+ *.lib
19
+ *.a
20
+ *.la
21
+ *.lo
22
+
23
+ # Shared objects (inc. Windows DLLs)
24
+ *.dll
25
+ *.so
26
+ *.so.*
27
+ *.dylib
28
+
29
+ # Executables
30
+ *.exe
31
+ *.out
32
+ *.app
33
+ *.i*86
34
+ *.x86_64
35
+ *.hex
36
+
37
+ # Debug files
38
+ *.dSYM/
39
+ *.su
40
+
41
+ # Mac files
42
+ .DS_Store
43
+ .DS_STORE
44
+
45
+ # Old HG stuff
46
+ .hg
47
+ .hgignore
48
+ .hgtags
49
+
50
+ .git
51
+ __pycache__
52
+ .ipynb_checkpoints
53
+ */__pycache__
54
+ */.ipynb_checkpoints
55
+ .local
56
+ .jupyter
57
+ .ipython
58
+ */.terraform
59
+ terraform.*
60
+ .terraform.*
61
+ shinobi-dvr/*
62
+ .vscode/
63
+
64
+ # mypy
65
+ .mypy_cache/*
66
+
67
+ # Distribution / packaging
68
+ .Python
69
+ build/
70
+ develop-eggs/
71
+ dist/
72
+ downloads/
73
+ eggs/
74
+ .eggs/
75
+ lib/
76
+ lib64/
77
+ parts/
78
+ sdist/
79
+ var/
80
+ wheels/
81
+ pip-wheel-metadata/
82
+ share/python-wheels/
83
+ *.egg-info/
84
+ .installed.cfg
85
+ *.egg
86
+ MANIFEST
87
+
88
+ # Output from various tools
89
+ examples/output
90
+ tests/output
91
+ docs-build
92
+ site
93
+
94
+ # Local or WIP files
95
+ local/
96
+
97
+ vision-agent-benchmark/
98
+ vision_agent/tools/suggestion.py
99
+ vision_agent/agent/visual_design_patterns.py
@@ -1,46 +1,42 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: vision-agent
3
- Version: 1.1.6
3
+ Version: 1.1.8
4
4
  Summary: Toolset for Vision Agent
5
- Author: Landing AI
6
- Author-email: dev@landing.ai
7
- Requires-Python: >=3.9,<4.0
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.9
10
- Classifier: Programming Language :: Python :: 3.10
11
- Classifier: Programming Language :: Python :: 3.11
12
- Classifier: Programming Language :: Python :: 3.12
13
- Classifier: Programming Language :: Python :: 3.13
14
- Requires-Dist: anthropic (>=0.31.0,<0.32.0)
15
- Requires-Dist: av (>=11.0.0,<12.0.0)
16
- Requires-Dist: dotenv (>=0.9.9,<0.10.0)
17
- Requires-Dist: flake8 (>=7.0.0,<8.0.0)
18
- Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
19
- Requires-Dist: libcst (>=1.5.0,<2.0.0)
20
- Requires-Dist: matplotlib (>=3.9.2,<4.0.0)
21
- Requires-Dist: nbclient (>=0.10.0,<0.11.0)
22
- Requires-Dist: nbformat (>=5.10.4,<6.0.0)
23
- Requires-Dist: numpy (>=1.21.0,<2.0.0)
24
- Requires-Dist: openai (==1.*)
25
- Requires-Dist: opencv-python (==4.*)
26
- Requires-Dist: opentelemetry-api (>=1.29.0,<2.0.0)
27
- Requires-Dist: pandas (==2.*)
28
- Requires-Dist: pillow (==10.*)
29
- Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
30
- Requires-Dist: pydantic (>=2.0.0,<3.0.0)
31
- Requires-Dist: pymupdf (>=1.23.0,<2.0.0)
32
- Requires-Dist: pytube (==15.0.0)
33
- Requires-Dist: requests (==2.*)
34
- Requires-Dist: rich (>=13.7.1,<14.0.0)
35
- Requires-Dist: scikit-learn (>=1.5.2,<2.0.0)
36
- Requires-Dist: scipy (==1.13.*)
37
- Requires-Dist: tabulate (>=0.9.0,<0.10.0)
38
- Requires-Dist: tenacity (>=8.3.0,<9.0.0)
39
- Requires-Dist: tqdm (>=4.64.0,<5.0.0)
40
- Requires-Dist: typing_extensions (==4.*)
41
5
  Project-URL: Homepage, https://landing.ai
42
- Project-URL: documentation, https://github.com/landing-ai/vision-agent
43
6
  Project-URL: repository, https://github.com/landing-ai/vision-agent
7
+ Project-URL: documentation, https://github.com/landing-ai/vision-agent
8
+ Author-email: Landing AI <dev@landing.ai>
9
+ License-File: LICENSE
10
+ Requires-Python: <4.0,>=3.9
11
+ Requires-Dist: anthropic<0.32,>=0.31.0
12
+ Requires-Dist: av<12,>=11.0.0
13
+ Requires-Dist: dotenv<0.10,>=0.9.9
14
+ Requires-Dist: flake8<8,>=7.0.0
15
+ Requires-Dist: google-genai<2,>=1.0.0
16
+ Requires-Dist: httpx==0.27.2
17
+ Requires-Dist: ipykernel<7,>=6.29.4
18
+ Requires-Dist: libcst<2,>=1.5.0
19
+ Requires-Dist: matplotlib<4,>=3.9.2
20
+ Requires-Dist: nbclient<0.11,>=0.10.0
21
+ Requires-Dist: nbformat<6,>=5.10.4
22
+ Requires-Dist: numpy<2.0.0,>=1.21.0
23
+ Requires-Dist: openai==1.55.3
24
+ Requires-Dist: opencv-python==4.*
25
+ Requires-Dist: opentelemetry-api<2,>=1.29.0
26
+ Requires-Dist: pandas==2.*
27
+ Requires-Dist: pillow-heif<0.17,>=0.16.0
28
+ Requires-Dist: pillow==10.*
29
+ Requires-Dist: pydantic<3,>=2.0.0
30
+ Requires-Dist: pymupdf<2,>=1.23.0
31
+ Requires-Dist: pytube==15.0.0
32
+ Requires-Dist: requests==2.*
33
+ Requires-Dist: rich<14,>=13.7.1
34
+ Requires-Dist: scikit-learn<2,>=1.5.2
35
+ Requires-Dist: scipy==1.13.*
36
+ Requires-Dist: tabulate<0.10,>=0.9.0
37
+ Requires-Dist: tenacity<9,>=8.3.0
38
+ Requires-Dist: tqdm<5.0.0,>=4.64.0
39
+ Requires-Dist: typing-extensions==4.*
44
40
  Description-Content-Type: text/markdown
45
41
 
46
42
  <div align="center">
@@ -81,7 +77,7 @@ The most important step is to [signup](https://va.landing.ai/agent) and obtain y
81
77
  ### Other Prerequisites
82
78
  - Python version 3.9 or higher
83
79
  - [Anthropic API key](#get-an-anthropic-api-key)
84
- - [Gemini API key](#get-a-gemini-api-key)
80
+ - [Google API key](#get-a-google-api-key)
85
81
 
86
82
  ### Why do I need Anthropic and Google API Keys?
87
83
  VisionAgent uses models from Anthropic and Google to respond to prompts and generate code.
@@ -90,7 +86,7 @@ When you run the web-based version of VisionAgent, the app uses the LandingAI AP
90
86
 
91
87
  When you run VisionAgent programmatically, the app will need to use your API keys to access the Anthropic and Google models. This ensures that any projects you run with VisionAgent aren’t limited by the rate limits in place with the LandingAI accounts, and it also prevents many users from overloading the LandingAI rate limits.
92
88
 
93
- Anthropic and Gemini each have their own rate limits and paid tiers. Refer to their documentation and pricing to learn more.
89
+ Anthropic and Google each have their own rate limits and paid tiers. Refer to their documentation and pricing to learn more.
94
90
 
95
91
  > **_NOTE:_** In VisionAgent v1.0.2 and earlier, VisionAgent was powered by Anthropic Claude-3.5 and OpenAI o1. If you are using one of these VisionAgent versions, you must get an OpenAI API key and set it as an environment variable.
96
92
 
@@ -100,13 +96,14 @@ Anthropic and Gemini each have their own rate limits and paid tiers. Refer to th
100
96
  2. In the Anthropic Console, go to the [API Keys](https://console.anthropic.com/settings/keys) page.
101
97
  3. Generate an API key.
102
98
 
103
- ### Get a Gemini API Key
99
+ ### Get a Google API Key
104
100
  1. If you don’t have one yet, create a [Google AI Studio account](https://aistudio.google.com/).
105
101
  2. In Google AI Studio, go to the [Get API Key](https://aistudio.google.com/app/apikey) page.
106
102
  3. Generate an API key.
107
103
 
108
104
 
109
105
  ## Installation
106
+
110
107
  ```bash
111
108
  pip install vision-agent
112
109
  ```
@@ -114,8 +111,8 @@ pip install vision-agent
114
111
  ## Quickstart: Prompt VisionAgent
115
112
  Follow this quickstart to learn how to prompt VisionAgent. After learning the basics, customize your prompt and workflow to meet your needs.
116
113
 
117
- 1. Get your Anthropic, Gemini, and VisionAgent API keys.
118
- 2. [Set the Anthropic, Gemini, and VisionAgent API keys as environment variables](#set-api-keys-as-environment-variables).
114
+ 1. Get your Anthropic, Google, and VisionAgent API keys.
115
+ 2. [Set the Anthropic, Google, and VisionAgent API keys as environment variables](#set-api-keys-as-environment-variables).
119
116
  3. [Install VisionAgent](#installation).
120
117
  4. Create a folder called `quickstart`.
121
118
  5. Find an image you want to analyze and save it to the `quickstart` folder.
@@ -124,13 +121,13 @@ Follow this quickstart to learn how to prompt VisionAgent. After learning the ba
124
121
  8. VisionAgent creates a file called `generated_code.py` and saves the generated code there.
125
122
 
126
123
  ### Set API Keys as Environment Variables
127
- Before running VisionAgent code, you must set the Anthropic, Gemini, and VisionAgent API keys as environment variables. Each operating system offers different ways to do this.
124
+ Before running VisionAgent code, you must set the Anthropic, Google, and VisionAgent API keys as environment variables. Each operating system offers different ways to do this.
128
125
 
129
126
  Here is the code for setting the variables:
130
127
  ```bash
131
128
  export VISION_AGENT_API_KEY="your-api-key"
132
129
  export ANTHROPIC_API_KEY="your-api-key"
133
- export GEMINI_API_KEY="your-api-key"
130
+ export GOOGLE_API_KEY="your-api-key"
134
131
  ```
135
132
  ### Sample Script: Prompt VisionAgent
136
133
  To use VisionAgent to generate code, use the following script as a starting point:
@@ -269,4 +266,3 @@ with this code:
269
266
  - [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/): Learn how to use this library.
270
267
  - [VisionAgent Web App Docs](https://support.landing.ai/docs/agentic-ai): Learn how to use the web-based version of VisionAgent.
271
268
  - [Video Tutorials](https://www.youtube.com/playlist?list=PLrKGAzovU85fvo22OnVtPl90mxBygIf79): Watch the latest video tutorials to see how VisionAgent is used in a variety of use cases.
272
-
@@ -36,7 +36,7 @@ The most important step is to [signup](https://va.landing.ai/agent) and obtain y
36
36
  ### Other Prerequisites
37
37
  - Python version 3.9 or higher
38
38
  - [Anthropic API key](#get-an-anthropic-api-key)
39
- - [Gemini API key](#get-a-gemini-api-key)
39
+ - [Google API key](#get-a-google-api-key)
40
40
 
41
41
  ### Why do I need Anthropic and Google API Keys?
42
42
  VisionAgent uses models from Anthropic and Google to respond to prompts and generate code.
@@ -45,7 +45,7 @@ When you run the web-based version of VisionAgent, the app uses the LandingAI AP
45
45
 
46
46
  When you run VisionAgent programmatically, the app will need to use your API keys to access the Anthropic and Google models. This ensures that any projects you run with VisionAgent aren’t limited by the rate limits in place with the LandingAI accounts, and it also prevents many users from overloading the LandingAI rate limits.
47
47
 
48
- Anthropic and Gemini each have their own rate limits and paid tiers. Refer to their documentation and pricing to learn more.
48
+ Anthropic and Google each have their own rate limits and paid tiers. Refer to their documentation and pricing to learn more.
49
49
 
50
50
  > **_NOTE:_** In VisionAgent v1.0.2 and earlier, VisionAgent was powered by Anthropic Claude-3.5 and OpenAI o1. If you are using one of these VisionAgent versions, you must get an OpenAI API key and set it as an environment variable.
51
51
 
@@ -55,13 +55,14 @@ Anthropic and Gemini each have their own rate limits and paid tiers. Refer to th
55
55
  2. In the Anthropic Console, go to the [API Keys](https://console.anthropic.com/settings/keys) page.
56
56
  3. Generate an API key.
57
57
 
58
- ### Get a Gemini API Key
58
+ ### Get a Google API Key
59
59
  1. If you don’t have one yet, create a [Google AI Studio account](https://aistudio.google.com/).
60
60
  2. In Google AI Studio, go to the [Get API Key](https://aistudio.google.com/app/apikey) page.
61
61
  3. Generate an API key.
62
62
 
63
63
 
64
64
  ## Installation
65
+
65
66
  ```bash
66
67
  pip install vision-agent
67
68
  ```
@@ -69,8 +70,8 @@ pip install vision-agent
69
70
  ## Quickstart: Prompt VisionAgent
70
71
  Follow this quickstart to learn how to prompt VisionAgent. After learning the basics, customize your prompt and workflow to meet your needs.
71
72
 
72
- 1. Get your Anthropic, Gemini, and VisionAgent API keys.
73
- 2. [Set the Anthropic, Gemini, and VisionAgent API keys as environment variables](#set-api-keys-as-environment-variables).
73
+ 1. Get your Anthropic, Google, and VisionAgent API keys.
74
+ 2. [Set the Anthropic, Google, and VisionAgent API keys as environment variables](#set-api-keys-as-environment-variables).
74
75
  3. [Install VisionAgent](#installation).
75
76
  4. Create a folder called `quickstart`.
76
77
  5. Find an image you want to analyze and save it to the `quickstart` folder.
@@ -79,13 +80,13 @@ Follow this quickstart to learn how to prompt VisionAgent. After learning the ba
79
80
  8. VisionAgent creates a file called `generated_code.py` and saves the generated code there.
80
81
 
81
82
  ### Set API Keys as Environment Variables
82
- Before running VisionAgent code, you must set the Anthropic, Gemini, and VisionAgent API keys as environment variables. Each operating system offers different ways to do this.
83
+ Before running VisionAgent code, you must set the Anthropic, Google, and VisionAgent API keys as environment variables. Each operating system offers different ways to do this.
83
84
 
84
85
  Here is the code for setting the variables:
85
86
  ```bash
86
87
  export VISION_AGENT_API_KEY="your-api-key"
87
88
  export ANTHROPIC_API_KEY="your-api-key"
88
- export GEMINI_API_KEY="your-api-key"
89
+ export GOOGLE_API_KEY="your-api-key"
89
90
  ```
90
91
  ### Sample Script: Prompt VisionAgent
91
92
  To use VisionAgent to generate code, use the following script as a starting point:
@@ -0,0 +1,122 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "vision-agent"
7
+ version = "1.1.8"
8
+ description = "Toolset for Vision Agent"
9
+ authors = [{ name = "Landing AI", email = "dev@landing.ai" }]
10
+ requires-python = ">=3.9,<4.0"
11
+ readme = "README.md"
12
+ dependencies = [
13
+ "numpy>=1.21.0,<2.0.0",
14
+ "pillow==10.*",
15
+ "requests==2.*",
16
+ "tqdm>=4.64.0,<5.0.0",
17
+ "pandas==2.*",
18
+ "openai==1.55.3",
19
+ "httpx==0.27.2",
20
+ "flake8>=7.0.0,<8",
21
+ "typing_extensions==4.*",
22
+ "opencv-python==4.*",
23
+ "tabulate>=0.9.0,<0.10",
24
+ "scipy==1.13.*",
25
+ "nbclient>=0.10.0,<0.11",
26
+ "nbformat>=5.10.4,<6",
27
+ "rich>=13.7.1,<14",
28
+ "ipykernel>=6.29.4,<7",
29
+ "tenacity>=8.3.0,<9",
30
+ "pillow-heif>=0.16.0,<0.17",
31
+ "pytube==15.0.0",
32
+ "anthropic>=0.31.0,<0.32",
33
+ "pydantic>=2.0.0,<3",
34
+ "av>=11.0.0,<12",
35
+ "libcst>=1.5.0,<2",
36
+ "matplotlib>=3.9.2,<4",
37
+ "scikit-learn>=1.5.2,<2",
38
+ "opentelemetry-api>=1.29.0,<2",
39
+ "dotenv>=0.9.9,<0.10",
40
+ "pymupdf>=1.23.0,<2",
41
+ "google-genai>=1.0.0,<2",
42
+ ]
43
+
44
+ [project.urls]
45
+ Homepage = "https://landing.ai"
46
+ repository = "https://github.com/landing-ai/vision-agent"
47
+ documentation = "https://github.com/landing-ai/vision-agent"
48
+
49
+ [dependency-groups]
50
+ dev = [
51
+ "autoflake==1.*",
52
+ "pytest==7.*",
53
+ "black>=23,<25",
54
+ "isort==5.*",
55
+ "responses>=0.23.1,<0.24",
56
+ "mypy<1.8.0",
57
+ "types-requests>=2.31.0.0,<3",
58
+ "types-pillow>=9.5.0.4,<10",
59
+ "data-science-types>=0.2.23,<0.3",
60
+ "types-tqdm>=4.65.0.1,<5",
61
+ "setuptools>=68.0.0,<69",
62
+ "griffe>=0.45.3,<0.46",
63
+ "mkdocs>=1.5.3,<2",
64
+ "mkdocstrings[python]>=0.23.0,<0.24",
65
+ "mkdocs-material>=9.4.2,<10",
66
+ "types-tabulate>=0.9.0.20240106,<0.10",
67
+ "scikit-image<0.23.1",
68
+ "pre-commit>=3.8.0,<4",
69
+ ]
70
+
71
+ [tool.hatch.build.targets.wheel]
72
+ include = [
73
+ "vision_agent",
74
+ "vision_agent/.sim_tools/*",
75
+ ]
76
+
77
+
78
+ [tool.hatch.build.targets.sdist]
79
+ include = [
80
+ "vision_agent",
81
+ "vision_agent/.sim_tools/*",
82
+ ]
83
+
84
+ [tool.pytest.ini_options]
85
+ log_cli = true
86
+ log_cli_level = "INFO"
87
+ log_cli_format = "%(asctime)s [%(levelname)s] %(message)s (%(filename)s:%(lineno)s)"
88
+ log_cli_date_format = "%Y-%m-%d %H:%M:%S"
89
+
90
+ [tool.black]
91
+ exclude = '.vscode|.eggs|venv'
92
+ line-length = 88 # suggested by black official site
93
+
94
+ [tool.isort]
95
+ line_length = 88
96
+ profile = "black"
97
+
98
+ [tool.mypy]
99
+ plugins = "pydantic.mypy"
100
+
101
+ exclude = "tests"
102
+ show_error_context = true
103
+ pretty = true
104
+ check_untyped_defs = true
105
+ disallow_untyped_defs = true
106
+ no_implicit_optional = true
107
+ strict_optional = true
108
+ strict_equality = true
109
+ extra_checks = true
110
+ warn_redundant_casts = true
111
+ warn_unused_configs = true
112
+ warn_unused_ignores = true
113
+ warn_return_any = true
114
+ show_error_codes = true
115
+
116
+ [[tool.mypy.overrides]]
117
+ ignore_missing_imports = true
118
+ module = [
119
+ "cv2.*",
120
+ "openai.*",
121
+ "sentence_transformers.*",
122
+ ]
@@ -559,6 +559,30 @@ desc,doc,name
559
559
  ... )
560
560
  >>> save_image(result, ""inpainted_room.png"")
561
561
  ",flux_image_inpainting
562
+ "'gemini_image_generation' performs image inpainting given an image and text prompt. It can be used to edit parts of an image or the entire image according to the prompt given.","gemini_image_generation(prompt: str, image: numpy.ndarray) -> numpy.ndarray:
563
+ 'gemini_image_generation' performs image inpainting given an image and text prompt.
564
+ It can be used to edit parts of an image or the entire image according to the prompt given.
565
+
566
+ Parameters:
567
+ prompt (str): A detailed text description guiding what should be generated
568
+ in the image. More detailed and specific prompts typically yield
569
+ better results.
570
+ image (np.ndarray): The source image to be inpainted. The image will serve as
571
+ the base context for the inpainting process.
572
+
573
+ Returns:
574
+ np.ndarray: The generated image(s) as a numpy array in RGB format with values
575
+ ranging from 0 to 255.
576
+
577
+ -------
578
+ Example:
579
+ >>> # Generate inpainting
580
+ >>> result = gemini_image_generation(
581
+ ... prompt="a modern black leather sofa with white pillows",
582
+ ... image=image,
583
+ ... )
584
+ >>> save_image(result, ""inpainted_room.png"")
585
+ ",gemini_image_generation
562
586
  'siglip_classification' is a tool that can classify an image or a cropped detection given a list of input labels or tags. It returns the same list of the input labels along with their probability scores based on image content.,"siglip_classification(image: numpy.ndarray, labels: List[str]) -> Dict[str, Any]:
563
587
  'siglip_classification' is a tool that can classify an image or a cropped detection given a list
564
588
  of input labels or tags. It returns the same list of the input labels along with
@@ -31,6 +31,7 @@ from .tools import (
31
31
  florence2_sam2_instance_segmentation,
32
32
  florence2_sam2_video_tracking,
33
33
  flux_image_inpainting,
34
+ gemini_image_generation,
34
35
  generate_pose_image,
35
36
  get_tools,
36
37
  get_tools_descriptions,
@@ -10,6 +10,7 @@ from importlib import resources
10
10
  from pathlib import Path
11
11
  from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
12
12
  from warnings import warn
13
+ import time
13
14
 
14
15
  import cv2
15
16
  import numpy as np
@@ -20,6 +21,8 @@ from PIL import Image, ImageDraw, ImageFont
20
21
  from pillow_heif import register_heif_opener # type: ignore
21
22
  from pytube import YouTube # type: ignore
22
23
  import pymupdf # type: ignore
24
+ from google import genai # type: ignore
25
+ from google.genai import types # type: ignore
23
26
 
24
27
  from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
25
28
  from vision_agent.utils.execute import FileSerializer, MimeType
@@ -2841,6 +2844,147 @@ def flux_image_inpainting(
2841
2844
  return output_image
2842
2845
 
2843
2846
 
2847
+ def gemini_image_generation(
2848
+ prompt: str,
2849
+ image: Optional[np.ndarray] = None,
2850
+ ) -> np.ndarray:
2851
+ """'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt.
2852
+ It can be used to edit parts of an image or the entire image according to the prompt given.
2853
+
2854
+ Parameters:
2855
+ prompt (str): A detailed text description guiding what should be generated
2856
+ in the image. More detailed and specific prompts typically yield
2857
+ better results.
2858
+ image (np.ndarray, optional): The source image to be inpainted. The image will serve as
2859
+ the base context for the inpainting process.
2860
+
2861
+ Returns:
2862
+ np.ndarray: The generated image(s) as a numpy array in RGB format with values
2863
+ ranging from 0 to 255.
2864
+
2865
+ -------
2866
+ Example:
2867
+ >>> # Generate inpainting
2868
+ >>> result = gemini_image_generation(
2869
+ ... prompt="a modern black leather sofa with white pillows",
2870
+ ... image=image,
2871
+ ... )
2872
+ >>> save_image(result, "inpainted_room.png")
2873
+ """
2874
+ client = genai.Client()
2875
+ files = []
2876
+ image_file = None
2877
+
2878
+ def try_generate_content(
2879
+ input_prompt: types.Content, num_retries: int = 3
2880
+ ) -> Optional[bytes]:
2881
+ """Try to generate content with multiple attempts."""
2882
+ for attempt in range(num_retries):
2883
+ try:
2884
+ resp = client.models.generate_content(
2885
+ model="gemini-2.0-flash-exp-image-generation",
2886
+ contents=input_prompt,
2887
+ config=types.GenerateContentConfig(
2888
+ response_modalities=["Text", "Image"]
2889
+ ),
2890
+ )
2891
+
2892
+ if (
2893
+ not resp.candidates
2894
+ or not resp.candidates[0].content
2895
+ or not resp.candidates[0].content.parts
2896
+ or not resp.candidates[0].content.parts[0].inline_data
2897
+ or not resp.candidates[0].content.parts[0].inline_data.data
2898
+ ):
2899
+ _LOGGER.warning(f"Attempt {attempt + 1}: No candidates returned")
2900
+ time.sleep(5)
2901
+ continue
2902
+ else:
2903
+ return (
2904
+ resp.candidates[0].content.parts[0].inline_data.data
2905
+ if isinstance(
2906
+ resp.candidates[0].content.parts[0].inline_data.data, bytes
2907
+ )
2908
+ else None
2909
+ )
2910
+
2911
+ except genai.errors.ClientError as e:
2912
+ _LOGGER.warning(f"Attempt {attempt + 1} failed: {str(e)}")
2913
+ time.sleep(5)
2914
+
2915
+ return None
2916
+
2917
+ if image is not None:
2918
+ # Resize if needed
2919
+ max_size = (512, 512)
2920
+ if image.shape[0] > max_size[0] or image.shape[1] > max_size[1]:
2921
+ scaling_factor = min(
2922
+ max_size[0] / image.shape[0], max_size[1] / image.shape[1]
2923
+ )
2924
+ new_size = (
2925
+ int(image.shape[1] * scaling_factor),
2926
+ int(image.shape[0] * scaling_factor),
2927
+ )
2928
+ image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
2929
+
2930
+ # Convert to RGB
2931
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
2932
+ image_file = numpy_to_bytes(image)
2933
+ files = [("image", image_file)]
2934
+
2935
+ input_prompt = types.Content(
2936
+ parts=[
2937
+ types.Part(
2938
+ text="I want you to edit this image given this prompt: " + prompt
2939
+ ),
2940
+ types.Part(inline_data={"mime_type": "image/png", "data": image_file}),
2941
+ ]
2942
+ )
2943
+
2944
+ else:
2945
+ input_prompt = types.Content(parts=[types.Part(text=prompt)])
2946
+
2947
+ # Try to generate content
2948
+ output_image_bytes = try_generate_content(input_prompt)
2949
+
2950
+ # Handle fallback if all attempts failed
2951
+ if output_image_bytes is None:
2952
+ if image is not None:
2953
+ _LOGGER.warning("Returning original image after all retries failed.")
2954
+ return image
2955
+ else:
2956
+ try:
2957
+ _LOGGER.warning("All retries failed; prompting for fresh generation.")
2958
+ time.sleep(10)
2959
+ output_image_bytes = try_generate_content(
2960
+ types.Content(parts=[types.Part(text="Generate an image.")]),
2961
+ num_retries=1,
2962
+ )
2963
+
2964
+ except Exception as e:
2965
+ raise ValueError(f"Fallback generation failed: {str(e)}")
2966
+
2967
+ # Convert bytes to image
2968
+ if output_image_bytes is not None:
2969
+ output_image_temp = io.BytesIO(output_image_bytes)
2970
+ output_image_pil = Image.open(output_image_temp)
2971
+ final_image = np.array(output_image_pil)
2972
+ else:
2973
+ raise ValueError("Fallback generation failed")
2974
+
2975
+ _display_tool_trace(
2976
+ gemini_image_generation.__name__,
2977
+ {
2978
+ "prompt": prompt,
2979
+ "model": "gemini-2.0-flash-exp-image-generation",
2980
+ },
2981
+ final_image,
2982
+ files,
2983
+ )
2984
+
2985
+ return final_image
2986
+
2987
+
2844
2988
  def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any]:
2845
2989
  """'siglip_classification' is a tool that can classify an image or a cropped detection given a list
2846
2990
  of input labels or tags. It returns the same list of the input labels along with
@@ -1,108 +0,0 @@
1
- [build-system]
2
- requires = ["poetry-core"]
3
- build-backend = "poetry.core.masonry.api"
4
-
5
- [tool.poetry]
6
- name = "vision-agent"
7
- version = "1.1.6"
8
- description = "Toolset for Vision Agent"
9
- authors = ["Landing AI <dev@landing.ai>"]
10
- readme = "README.md"
11
- packages = [{include = "vision_agent"}]
12
- include = [{path = "vision_agent/.sim_tools/*"}]
13
-
14
- [tool.poetry.urls]
15
- "Homepage" = "https://landing.ai"
16
- "repository" = "https://github.com/landing-ai/vision-agent"
17
- "documentation" = "https://github.com/landing-ai/vision-agent"
18
-
19
- [tool.poetry.dependencies] # main dependency group
20
- python = ">=3.9,<4.0"
21
-
22
- numpy = ">=1.21.0,<2.0.0"
23
- pillow = "10.*"
24
- requests = "2.*"
25
- tqdm = ">=4.64.0,<5.0.0"
26
- pandas = "2.*"
27
- openai = "1.*"
28
- flake8 = "^7.0.0"
29
- typing_extensions = "4.*"
30
- opencv-python = "4.*"
31
- tabulate = "^0.9.0"
32
- scipy = "1.13.*"
33
- nbclient = "^0.10.0"
34
- nbformat = "^5.10.4"
35
- rich = "^13.7.1"
36
- ipykernel = "^6.29.4"
37
- tenacity = "^8.3.0"
38
- pillow-heif = "^0.16.0"
39
- pytube = "15.0.0"
40
- anthropic = "^0.31.0"
41
- pydantic = "^2.0.0"
42
- av = "^11.0.0"
43
- libcst = "^1.5.0"
44
- matplotlib = "^3.9.2"
45
- scikit-learn = "^1.5.2"
46
- opentelemetry-api = "^1.29.0"
47
- dotenv = "^0.9.9"
48
- pymupdf = "^1.23.0"
49
-
50
- [tool.poetry.group.dev.dependencies]
51
- autoflake = "1.*"
52
- pytest = "7.*"
53
- black = ">=23,<25"
54
- isort = "5.*"
55
- responses = "^0.23.1"
56
- mypy = "<1.8.0"
57
- types-requests = "^2.31.0.0"
58
- types-pillow = "^9.5.0.4"
59
- data-science-types = "^0.2.23"
60
- types-tqdm = "^4.65.0.1"
61
- setuptools = "^68.0.0"
62
- griffe = "^0.45.3"
63
- mkdocs = "^1.5.3"
64
- mkdocstrings = {extras = ["python"], version = "^0.23.0"}
65
- mkdocs-material = "^9.4.2"
66
- types-tabulate = "^0.9.0.20240106"
67
- scikit-image = "<0.23.1"
68
- pre-commit = "^3.8.0"
69
-
70
- [tool.pytest.ini_options]
71
- log_cli = true
72
- log_cli_level = "INFO"
73
- log_cli_format = "%(asctime)s [%(levelname)s] %(message)s (%(filename)s:%(lineno)s)"
74
- log_cli_date_format = "%Y-%m-%d %H:%M:%S"
75
-
76
- [tool.black]
77
- exclude = '.vscode|.eggs|venv'
78
- line-length = 88 # suggested by black official site
79
-
80
- [tool.isort]
81
- line_length = 88
82
- profile = "black"
83
-
84
- [tool.mypy]
85
- plugins = "pydantic.mypy"
86
-
87
- exclude = "tests"
88
- show_error_context = true
89
- pretty = true
90
- check_untyped_defs = true
91
- disallow_untyped_defs = true
92
- no_implicit_optional = true
93
- strict_optional = true
94
- strict_equality = true
95
- extra_checks = true
96
- warn_redundant_casts = true
97
- warn_unused_configs = true
98
- warn_unused_ignores = true
99
- warn_return_any = true
100
- show_error_codes = true
101
-
102
- [[tool.mypy.overrides]]
103
- ignore_missing_imports = true
104
- module = [
105
- "cv2.*",
106
- "openai.*",
107
- "sentence_transformers.*",
108
- ]
File without changes