vision-agent 1.0.7__tar.gz → 1.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. vision_agent-1.0.9/PKG-INFO +259 -0
  2. vision_agent-1.0.9/README.md +214 -0
  3. {vision_agent-1.0.7 → vision_agent-1.0.9}/pyproject.toml +1 -1
  4. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/utils/tools.py +3 -1
  5. vision_agent-1.0.7/PKG-INFO +0 -179
  6. vision_agent-1.0.7/README.md +0 -136
  7. {vision_agent-1.0.7 → vision_agent-1.0.9}/LICENSE +0 -0
  8. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/.sim_tools/df.csv +0 -0
  9. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/.sim_tools/embs.npy +0 -0
  10. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/__init__.py +0 -0
  11. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/agent/README.md +0 -0
  12. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/agent/__init__.py +0 -0
  13. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/agent/agent.py +0 -0
  14. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  15. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  16. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
  17. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  18. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  19. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/agent/vision_agent_v2.py +0 -0
  20. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/clients/__init__.py +0 -0
  21. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/clients/http.py +0 -0
  22. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/configs/__init__.py +0 -0
  23. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/configs/anthropic_config.py +0 -0
  24. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/configs/config.py +0 -0
  25. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/configs/openai_config.py +0 -0
  26. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/fonts/__init__.py +0 -0
  27. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  28. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/lmm/__init__.py +0 -0
  29. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/lmm/lmm.py +0 -0
  30. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/models/__init__.py +0 -0
  31. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/models/agent_types.py +0 -0
  32. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/models/lmm_types.py +0 -0
  33. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/models/tools_types.py +0 -0
  34. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/sim/__init__.py +0 -0
  35. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/sim/sim.py +0 -0
  36. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/tools/__init__.py +0 -0
  37. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/tools/meta_tools.py +0 -0
  38. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/tools/planner_tools.py +0 -0
  39. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/tools/prompts.py +0 -0
  40. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/tools/tools.py +0 -0
  41. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/utils/__init__.py +0 -0
  42. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/utils/agent.py +0 -0
  43. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/utils/exceptions.py +0 -0
  44. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/utils/execute.py +0 -0
  45. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/utils/image_utils.py +0 -0
  46. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/utils/tools_doc.py +0 -0
  47. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/utils/video.py +0 -0
  48. {vision_agent-1.0.7 → vision_agent-1.0.9}/vision_agent/utils/video_tracking.py +0 -0
@@ -0,0 +1,259 @@
+ Metadata-Version: 2.3
+ Name: vision-agent
+ Version: 1.0.9
+ Summary: Toolset for Vision Agent
+ Author: Landing AI
+ Author-email: dev@landing.ai
+ Requires-Python: >=3.9,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: anthropic (>=0.31.0,<0.32.0)
+ Requires-Dist: av (>=11.0.0,<12.0.0)
+ Requires-Dist: dotenv (>=0.9.9,<0.10.0)
+ Requires-Dist: flake8 (>=7.0.0,<8.0.0)
+ Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
+ Requires-Dist: libcst (>=1.5.0,<2.0.0)
+ Requires-Dist: matplotlib (>=3.9.2,<4.0.0)
+ Requires-Dist: nbclient (>=0.10.0,<0.11.0)
+ Requires-Dist: nbformat (>=5.10.4,<6.0.0)
+ Requires-Dist: numpy (>=1.21.0,<2.0.0)
+ Requires-Dist: openai (==1.*)
+ Requires-Dist: opencv-python (==4.*)
+ Requires-Dist: opentelemetry-api (>=1.29.0,<2.0.0)
+ Requires-Dist: pandas (==2.*)
+ Requires-Dist: pillow (==10.*)
+ Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
+ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
+ Requires-Dist: pytube (==15.0.0)
+ Requires-Dist: requests (==2.*)
+ Requires-Dist: rich (>=13.7.1,<14.0.0)
+ Requires-Dist: scikit-learn (>=1.5.2,<2.0.0)
+ Requires-Dist: scipy (==1.13.*)
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+ Requires-Dist: tenacity (>=8.3.0,<9.0.0)
+ Requires-Dist: tqdm (>=4.64.0,<5.0.0)
+ Requires-Dist: typing_extensions (==4.*)
+ Project-URL: Homepage, https://landing.ai
+ Project-URL: documentation, https://github.com/landing-ai/vision-agent
+ Project-URL: repository, https://github.com/landing-ai/vision-agent
+ Description-Content-Type: text/markdown
+
+ <div align="center">
+ <picture>
+ <source media="(prefers-color-scheme: dark)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
+ <source media="(prefers-color-scheme: light)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_dark.svg?raw=true">
+ <img alt="VisionAgent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
+ </picture>
+
+ [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
+ ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
+ [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
+ ![version](https://img.shields.io/pypi/pyversions/vision-agent)
+ </div>
+
+ ## VisionAgent
+ VisionAgent is the Visual AI Pilot from LandingAI. Submit a prompt and image to VisionAgent, and the app selects the best models for your tasks. VisionAgent then generates code so that you can build vision-enabled apps in minutes.
+
+ ## How to Use This VisionAgent Library
+ - [Prompt VisionAgent](#quickstart-prompt-visionagent): Submit your prompt and VisionAgent generates code.
+ - [Call specific tools](#use-specific-tools-from-visionagent): Incorporate specific tools from the library into your code.
+
+ Instead of downloading this library, you can also use the [VisionAgent web app](https://va.landing.ai/).
+
+ ## Prerequisites
+ - Python version 3.9, 3.10, or 3.11
+ - [VisionAgent API key](https://va.landing.ai/account/api-key)
+ - [Anthropic API key](#get-an-anthropic-api-key)
+ - [Gemini API key](#get-a-gemini-api-key)
+
+ ### Why do I need Anthropic and Google API Keys?
+ VisionAgent uses models from Anthropic and Google to respond to prompts and generate code.
+
+ When you run the web-based version of VisionAgent, the app uses the LandingAI API keys to access these models.
+
+ When you run VisionAgent programmatically, the app needs your API keys to access the Anthropic and Google models. This ensures that the projects you run with VisionAgent aren’t limited by LandingAI’s rate limits, and it prevents many users from overloading the LandingAI accounts.
+
+ Anthropic and Gemini each have their own rate limits and paid tiers. Refer to their documentation and pricing to learn more.
+
+ > **_NOTE:_** In VisionAgent v1.0.2 and earlier, VisionAgent was powered by Anthropic Claude-3.5 and OpenAI o1. If you're using one of these VisionAgent versions, get an OpenAI API key and set it as an environment variable instead.
+
+
+ ### Get an Anthropic API Key
+ 1. If you don’t have one yet, create an [Anthropic Console account](https://console.anthropic.com/).
+ 2. In the Anthropic Console, go to the [API Keys](https://console.anthropic.com/settings/keys) page.
+ 3. Generate an API key.
+
+ ### Get a Gemini API Key
+ 1. If you don’t have one yet, create a [Google AI Studio account](https://aistudio.google.com/).
+ 2. In Google AI Studio, go to the [Get API Key](https://aistudio.google.com/app/apikey) page.
+ 3. Generate an API key.
+
+
+ ## Installation
+ ```bash
+ pip install vision-agent
+ ```
+
+ ## Quickstart: Prompt VisionAgent
+ Follow this quickstart to learn how to prompt VisionAgent. After learning the basics, customize your prompt and workflow to meet your needs.
+
+ 1. Get your Anthropic, Gemini, and VisionAgent API keys.
+ 2. [Set the Anthropic, Gemini, and VisionAgent API keys as environment variables](#set-api-keys-as-environment-variables).
+ 3. [Install VisionAgent](#installation).
+ 4. Create a folder called `quickstart`.
+ 5. Find an image you want to analyze and save it to the `quickstart` folder.
+ 6. Copy the [Sample Script](#sample-script-prompt-visionagent) to a file called `source.py`. Save the file to the `quickstart` folder.
+ 7. Run `source.py`.
+ 8. VisionAgent creates a file called `generated_code.py` and saves the generated code there.
+
+ ### Set API Keys as Environment Variables
+ Before running VisionAgent code, you must set the Anthropic, Gemini, and VisionAgent API keys as environment variables. Each operating system offers different ways to do this.
+
+ Here is the code for setting the variables:
+ ```bash
+ export VISION_AGENT_API_KEY="your-api-key"
+ export ANTHROPIC_API_KEY="your-api-key"
+ export GEMINI_API_KEY="your-api-key"
+ ```
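
A missing key otherwise surfaces only when the agent first calls a model or tool, so it can help to fail fast. Here is a minimal sketch (not part of the package) that checks the three variables named above before you construct an agent:

```python
import os

# The three keys the quickstart expects; raising immediately is clearer
# than failing mid-run on the first model or tool call.
REQUIRED_KEYS = ["VISION_AGENT_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY"]

missing = [key for key in REQUIRED_KEYS if not os.environ.get(key)]
if missing:
    raise EnvironmentError(f"Missing environment variables: {', '.join(missing)}")
```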
+ ### Sample Script: Prompt VisionAgent
+ To use VisionAgent to generate code, use the following script as a starting point:
+
+ ```python
+ # Import the classes you need from the VisionAgent package
+ from vision_agent.agent import VisionAgentCoderV2
+ from vision_agent.models import AgentMessage
+
+ # Enable verbose output
+ agent = VisionAgentCoderV2(verbose=True)
+
+ # Add your prompt (content) and image file (media)
+ code_context = agent.generate_code(
+     [
+         AgentMessage(
+             role="user",
+             content="Describe the image",
+             media=["friends.jpg"]
+         )
+     ]
+ )
+
+ # Write the output to a file
+ with open("generated_code.py", "w") as f:
+     f.write(code_context.code + "\n" + code_context.test)
+ ```
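
The generated file is an ordinary Python program. A small sketch of executing it from the same folder, assuming `generated_code.py` was written as above:

```python
import runpy

# Execute the generated file in-process; equivalent to running
# `python generated_code.py` from the quickstart folder.
runpy.run_path("generated_code.py", run_name="__main__")
```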
+ ### What to Expect When You Prompt VisionAgent
+ When you submit a prompt, VisionAgent performs the following tasks:
+
+ 1. Generates a plan for the code generation task. If verbose output is on, the numbered steps of the plan are displayed.
+ 2. Generates code and a test case based on the plan.
+ 3. Tests the generated code with the test case. If the test case fails, VisionAgent iterates on the code generation process until the test case passes.
+
+ ## Example: Count Cans in an Image
+ Check out this Jupyter Notebook to learn how to use VisionAgent to count the number of cans in an image:
+
+ [Count Cans in an Image](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks/counting_cans.ipynb)
+
+ ## Use Specific Tools from VisionAgent
+ The VisionAgent library includes a set of [tools](vision_agent/tools), which are standalone models or functions that complete specific tasks. When you prompt VisionAgent, VisionAgent selects one or more of these tools to complete the tasks outlined in your prompt.
+
+ For example, if you prompt VisionAgent to “count the number of dogs in an image”, VisionAgent might use the `florence2_object_detection` tool to detect all the dogs, and then the `countgd_object_detection` tool to count the number of detected dogs.
+
+ After installing the VisionAgent library, you can also use the tools in your own scripts. For example, if you’re writing a script to track objects in videos, you can call the `owlv2_sam2_video_tracking` function. In other words, you can use the VisionAgent tools outside of simply prompting VisionAgent.
+
+ The tools are in the [vision_agent.tools](vision_agent/tools) API.
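
To see which tool functions a given release exposes, you can enumerate the module's public names with standard Python; this sketch assumes nothing beyond the import used throughout this README:

```python
import vision_agent.tools as T

# List the public names exported by the tools module.
tools = sorted(name for name in dir(T) if not name.startswith("_"))
print("\n".join(tools))
```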
+
+ ### Sample Script: Use Specific Tools for Images
+ You can call the `countgd_object_detection` function to count the number of objects in an image.
+
+ To do this, you could run this script:
+ ```python
+ # Import the VisionAgent Tools library; import Matplotlib to visualize the results
+ import vision_agent.tools as T
+ import matplotlib.pyplot as plt
+
+ # Load the image
+ image = T.load_image("people.png")
+
+ # Call the function to count objects in an image, and specify that you want to count people
+ dets = T.countgd_object_detection("person", image)
+
+ # Visualize the countgd bounding boxes on the image
+ viz = T.overlay_bounding_boxes(image, dets)
+
+ # Save the visualization to a file
+ T.save_image(viz, "people_detected.png")
+
+ # Display the visualization
+ plt.imshow(viz)
+ plt.show()
+ ```
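
The detections come back as a list, so the count is simply its length. If you also want to drop low-confidence hits before visualizing, a sketch like the following works, under the assumption that each detection is a dict with a `score` field between 0 and 1 (as the tool docstrings describe):

```python
# Count the detections, then keep only the confident ones; assumes each
# entry is a dict with a "score" key per the documented return format.
print(f"Found {len(dets)} people")
confident = [d for d in dets if d["score"] >= 0.5]
viz = T.overlay_bounding_boxes(image, confident)
```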
+ ### Sample Script: Use Specific Tools for Videos
+ You can call the `countgd_sam2_video_tracking` function to track people in a video and pair it with the `extract_frames_and_timestamps` function to return the frames and timestamps in which those people appear.
+
+ To do this, you could run this script:
+ ```python
+ # Import the VisionAgent Tools library
+ import vision_agent.tools as T
+
+ # Call the function to get the frames and timestamps
+ frames_and_ts = T.extract_frames_and_timestamps("people.mp4")
+
+ # Extract the frames from the frames_and_ts list
+ frames = [f["frame"] for f in frames_and_ts]
+
+ # Call the function to track objects, and specify that you want to track people
+ tracks = T.countgd_sam2_video_tracking("person", frames)
+
+ # Visualize the countgd tracking results on the frames and save the video
+ viz = T.overlay_segmentation_masks(frames, tracks)
+ T.save_video(viz, "people_detected.mp4")
+ ```
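
Because `extract_frames_and_timestamps` returns a timestamp alongside each frame, you can also report when people appear. A sketch, assuming each entry carries a `timestamp` key (mirroring the `frame` key used above) and that `tracks` yields one list of detections per frame:

```python
# Pair each frame's tracking results with its timestamp and report
# the moments where at least one person was tracked.
timestamps = [f["timestamp"] for f in frames_and_ts]
for ts, frame_tracks in zip(timestamps, tracks):
    if frame_tracks:
        print(f"{len(frame_tracks)} person(s) tracked at {ts:.2f}s")
```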
+
+
+ ## Use Other LLM Providers
+ VisionAgent uses [Anthropic Claude 3.7 Sonnet](https://www.anthropic.com/claude/sonnet) and [Gemini 2.0 Flash Experimental](https://ai.google.dev/gemini-api/docs/models/experimental-models) (`gemini-2.0-flash-exp`) to respond to prompts and generate code. We’ve found that these provide the best performance for VisionAgent, and both are available on their providers’ free tiers (with rate limits).
+
+ If you prefer to use only one of these models, or a different set of models, you can change the selected LLM provider in this file: `vision_agent/configs/config.py`. You must also add the provider’s API key as an [environment variable](#set-api-keys-as-environment-variables).
+
+ For example, if you want to use **only** the Anthropic model, run this command:
+ ```bash
+ cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
+ ```
+
+ Or, you can manually enter the model details in the `config.py` file. For example, if you want to change the planner model from Anthropic to OpenAI, you would replace this code:
+ ```python
+ planner: Type[LMM] = Field(default=AnthropicLMM)
+ planner_kwargs: dict = Field(
+     default_factory=lambda: {
+         "model_name": "claude-3-7-sonnet-20250219",
+         "temperature": 0.0,
+         "image_size": 768,
+     }
+ )
+ ```
+
+ with this code:
+
+ ```python
+ planner: Type[LMM] = Field(default=OpenAILMM)
+ planner_kwargs: dict = Field(
+     default_factory=lambda: {
+         "model_name": "gpt-4o-2024-11-20",
+         "temperature": 0.0,
+         "image_size": 768,
+         "image_detail": "low",
+     }
+ )
+ ```
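
If you switch the planner to `OpenAILMM` as above, remember that the OpenAI client reads its key from the environment too. A hedged sketch of the extra setup step, assuming the standard variable name the OpenAI SDK uses (`OPENAI_API_KEY`):

```python
import os

# The OpenAI SDK looks for OPENAI_API_KEY by default; confirm it is set
# before constructing any agent whose planner is OpenAILMM.
assert os.environ.get("OPENAI_API_KEY"), "Set OPENAI_API_KEY before using OpenAILMM"
```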
+
+ ## Resources
+ - [Discord](https://discord.com/invite/RVcW3j9RgR): Check out our community of VisionAgent users to share use cases and learn about updates.
+ - [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/): Learn how to use this library.
+ - [VisionAgent Web App Docs](https://support.landing.ai/docs/agentic-ai): Learn how to use the web-based version of VisionAgent.
+ - [Video Tutorials](https://www.youtube.com/playlist?list=PLrKGAzovU85fvo22OnVtPl90mxBygIf79): Watch the latest video tutorials to see how VisionAgent is used in a variety of use cases.
+
@@ -0,0 +1,214 @@
+ <div align="center">
+ <picture>
+ <source media="(prefers-color-scheme: dark)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
+ <source media="(prefers-color-scheme: light)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_dark.svg?raw=true">
+ <img alt="VisionAgent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
+ </picture>
+
+ [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
+ ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
+ [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
+ ![version](https://img.shields.io/pypi/pyversions/vision-agent)
+ </div>
+
+ ## VisionAgent
+ VisionAgent is the Visual AI Pilot from LandingAI. Submit a prompt and image to VisionAgent, and the app selects the best models for your tasks. VisionAgent then generates code so that you can build vision-enabled apps in minutes.
+
+ ## How to Use This VisionAgent Library
+ - [Prompt VisionAgent](#quickstart-prompt-visionagent): Submit your prompt and VisionAgent generates code.
+ - [Call specific tools](#use-specific-tools-from-visionagent): Incorporate specific tools from the library into your code.
+
+ Instead of downloading this library, you can also use the [VisionAgent web app](https://va.landing.ai/).
+
+ ## Prerequisites
+ - Python version 3.9, 3.10, or 3.11
+ - [VisionAgent API key](https://va.landing.ai/account/api-key)
+ - [Anthropic API key](#get-an-anthropic-api-key)
+ - [Gemini API key](#get-a-gemini-api-key)
+
+ ### Why do I need Anthropic and Google API Keys?
+ VisionAgent uses models from Anthropic and Google to respond to prompts and generate code.
+
+ When you run the web-based version of VisionAgent, the app uses the LandingAI API keys to access these models.
+
+ When you run VisionAgent programmatically, the app needs your API keys to access the Anthropic and Google models. This ensures that the projects you run with VisionAgent aren’t limited by LandingAI’s rate limits, and it prevents many users from overloading the LandingAI accounts.
+
+ Anthropic and Gemini each have their own rate limits and paid tiers. Refer to their documentation and pricing to learn more.
+
+ > **_NOTE:_** In VisionAgent v1.0.2 and earlier, VisionAgent was powered by Anthropic Claude-3.5 and OpenAI o1. If you're using one of these VisionAgent versions, get an OpenAI API key and set it as an environment variable instead.
+
+
+ ### Get an Anthropic API Key
+ 1. If you don’t have one yet, create an [Anthropic Console account](https://console.anthropic.com/).
+ 2. In the Anthropic Console, go to the [API Keys](https://console.anthropic.com/settings/keys) page.
+ 3. Generate an API key.
+
+ ### Get a Gemini API Key
+ 1. If you don’t have one yet, create a [Google AI Studio account](https://aistudio.google.com/).
+ 2. In Google AI Studio, go to the [Get API Key](https://aistudio.google.com/app/apikey) page.
+ 3. Generate an API key.
+
+
+ ## Installation
+ ```bash
+ pip install vision-agent
+ ```
+
+ ## Quickstart: Prompt VisionAgent
+ Follow this quickstart to learn how to prompt VisionAgent. After learning the basics, customize your prompt and workflow to meet your needs.
+
+ 1. Get your Anthropic, Gemini, and VisionAgent API keys.
+ 2. [Set the Anthropic, Gemini, and VisionAgent API keys as environment variables](#set-api-keys-as-environment-variables).
+ 3. [Install VisionAgent](#installation).
+ 4. Create a folder called `quickstart`.
+ 5. Find an image you want to analyze and save it to the `quickstart` folder.
+ 6. Copy the [Sample Script](#sample-script-prompt-visionagent) to a file called `source.py`. Save the file to the `quickstart` folder.
+ 7. Run `source.py`.
+ 8. VisionAgent creates a file called `generated_code.py` and saves the generated code there.
+
+ ### Set API Keys as Environment Variables
+ Before running VisionAgent code, you must set the Anthropic, Gemini, and VisionAgent API keys as environment variables. Each operating system offers different ways to do this.
+
+ Here is the code for setting the variables:
+ ```bash
+ export VISION_AGENT_API_KEY="your-api-key"
+ export ANTHROPIC_API_KEY="your-api-key"
+ export GEMINI_API_KEY="your-api-key"
+ ```
+ ### Sample Script: Prompt VisionAgent
+ To use VisionAgent to generate code, use the following script as a starting point:
+
+ ```python
+ # Import the classes you need from the VisionAgent package
+ from vision_agent.agent import VisionAgentCoderV2
+ from vision_agent.models import AgentMessage
+
+ # Enable verbose output
+ agent = VisionAgentCoderV2(verbose=True)
+
+ # Add your prompt (content) and image file (media)
+ code_context = agent.generate_code(
+     [
+         AgentMessage(
+             role="user",
+             content="Describe the image",
+             media=["friends.jpg"]
+         )
+     ]
+ )
+
+ # Write the output to a file
+ with open("generated_code.py", "w") as f:
+     f.write(code_context.code + "\n" + code_context.test)
+ ```
+ ### What to Expect When You Prompt VisionAgent
+ When you submit a prompt, VisionAgent performs the following tasks:
+
+ 1. Generates a plan for the code generation task. If verbose output is on, the numbered steps of the plan are displayed.
+ 2. Generates code and a test case based on the plan.
+ 3. Tests the generated code with the test case. If the test case fails, VisionAgent iterates on the code generation process until the test case passes.
+
+ ## Example: Count Cans in an Image
+ Check out this Jupyter Notebook to learn how to use VisionAgent to count the number of cans in an image:
+
+ [Count Cans in an Image](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks/counting_cans.ipynb)
+
+ ## Use Specific Tools from VisionAgent
+ The VisionAgent library includes a set of [tools](vision_agent/tools), which are standalone models or functions that complete specific tasks. When you prompt VisionAgent, VisionAgent selects one or more of these tools to complete the tasks outlined in your prompt.
+
+ For example, if you prompt VisionAgent to “count the number of dogs in an image”, VisionAgent might use the `florence2_object_detection` tool to detect all the dogs, and then the `countgd_object_detection` tool to count the number of detected dogs.
+
+ After installing the VisionAgent library, you can also use the tools in your own scripts. For example, if you’re writing a script to track objects in videos, you can call the `owlv2_sam2_video_tracking` function. In other words, you can use the VisionAgent tools outside of simply prompting VisionAgent.
+
+ The tools are in the [vision_agent.tools](vision_agent/tools) API.
+
+ ### Sample Script: Use Specific Tools for Images
+ You can call the `countgd_object_detection` function to count the number of objects in an image.
+
+ To do this, you could run this script:
+ ```python
+ # Import the VisionAgent Tools library; import Matplotlib to visualize the results
+ import vision_agent.tools as T
+ import matplotlib.pyplot as plt
+
+ # Load the image
+ image = T.load_image("people.png")
+
+ # Call the function to count objects in an image, and specify that you want to count people
+ dets = T.countgd_object_detection("person", image)
+
+ # Visualize the countgd bounding boxes on the image
+ viz = T.overlay_bounding_boxes(image, dets)
+
+ # Save the visualization to a file
+ T.save_image(viz, "people_detected.png")
+
+ # Display the visualization
+ plt.imshow(viz)
+ plt.show()
+ ```
+ ### Sample Script: Use Specific Tools for Videos
+ You can call the `countgd_sam2_video_tracking` function to track people in a video and pair it with the `extract_frames_and_timestamps` function to return the frames and timestamps in which those people appear.
+
+ To do this, you could run this script:
+ ```python
+ # Import the VisionAgent Tools library
+ import vision_agent.tools as T
+
+ # Call the function to get the frames and timestamps
+ frames_and_ts = T.extract_frames_and_timestamps("people.mp4")
+
+ # Extract the frames from the frames_and_ts list
+ frames = [f["frame"] for f in frames_and_ts]
+
+ # Call the function to track objects, and specify that you want to track people
+ tracks = T.countgd_sam2_video_tracking("person", frames)
+
+ # Visualize the countgd tracking results on the frames and save the video
+ viz = T.overlay_segmentation_masks(frames, tracks)
+ T.save_video(viz, "people_detected.mp4")
+ ```
+
+
+ ## Use Other LLM Providers
+ VisionAgent uses [Anthropic Claude 3.7 Sonnet](https://www.anthropic.com/claude/sonnet) and [Gemini 2.0 Flash Experimental](https://ai.google.dev/gemini-api/docs/models/experimental-models) (`gemini-2.0-flash-exp`) to respond to prompts and generate code. We’ve found that these provide the best performance for VisionAgent, and both are available on their providers’ free tiers (with rate limits).
+
+ If you prefer to use only one of these models, or a different set of models, you can change the selected LLM provider in this file: `vision_agent/configs/config.py`. You must also add the provider’s API key as an [environment variable](#set-api-keys-as-environment-variables).
+
+ For example, if you want to use **only** the Anthropic model, run this command:
+ ```bash
+ cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
+ ```
+
+ Or, you can manually enter the model details in the `config.py` file. For example, if you want to change the planner model from Anthropic to OpenAI, you would replace this code:
+ ```python
+ planner: Type[LMM] = Field(default=AnthropicLMM)
+ planner_kwargs: dict = Field(
+     default_factory=lambda: {
+         "model_name": "claude-3-7-sonnet-20250219",
+         "temperature": 0.0,
+         "image_size": 768,
+     }
+ )
+ ```
+
+ with this code:
+
+ ```python
+ planner: Type[LMM] = Field(default=OpenAILMM)
+ planner_kwargs: dict = Field(
+     default_factory=lambda: {
+         "model_name": "gpt-4o-2024-11-20",
+         "temperature": 0.0,
+         "image_size": 768,
+         "image_detail": "low",
+     }
+ )
+ ```
+
+ ## Resources
+ - [Discord](https://discord.com/invite/RVcW3j9RgR): Check out our community of VisionAgent users to share use cases and learn about updates.
+ - [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/): Learn how to use this library.
+ - [VisionAgent Web App Docs](https://support.landing.ai/docs/agentic-ai): Learn how to use the web-based version of VisionAgent.
+ - [Video Tutorials](https://www.youtube.com/playlist?list=PLrKGAzovU85fvo22OnVtPl90mxBygIf79): Watch the latest video tutorials to see how VisionAgent is used in a variety of use cases.
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
  [tool.poetry]
  name = "vision-agent"
- version = "1.0.7"
+ version = "1.0.9"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
@@ -27,7 +27,9 @@ def get_vision_agent_api_key() -> str:
      if vision_agent_api_key:
          return vision_agent_api_key
      else:
-         raise ValueError("VISION_AGENT_API_KEY not found in environment variables.")
+         raise ValueError(
+             "VISION_AGENT_API_KEY not found in environment variables, required for tool usage. You can get a free key from https://va.landing.ai/account/api-key"
+         )
 
 
  def should_report_tool_traces() -> bool:
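
The change above makes the failure actionable: the tool layer still raises as soon as the key is absent, but the 1.0.9 message now says where to get one. A minimal sketch of surfacing this check at startup, assuming the helper is importable from `vision_agent.utils.tools` as the file path in this diff suggests:

```python
# Hypothetical early check at program start; the import path follows
# the file shown in this diff (vision_agent/utils/tools.py).
from vision_agent.utils.tools import get_vision_agent_api_key

try:
    get_vision_agent_api_key()
except ValueError as exc:
    # The new message points to https://va.landing.ai/account/api-key
    raise SystemExit(f"Configure your key first: {exc}")
```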
@@ -1,179 +0,0 @@
- Metadata-Version: 2.1
- Name: vision-agent
- Version: 1.0.7
- Summary: Toolset for Vision Agent
- Author: Landing AI
- Author-email: dev@landing.ai
- Requires-Python: >=3.9,<4.0
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Requires-Dist: anthropic (>=0.31.0,<0.32.0)
- Requires-Dist: av (>=11.0.0,<12.0.0)
- Requires-Dist: dotenv (>=0.9.9,<0.10.0)
- Requires-Dist: flake8 (>=7.0.0,<8.0.0)
- Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
- Requires-Dist: libcst (>=1.5.0,<2.0.0)
- Requires-Dist: matplotlib (>=3.9.2,<4.0.0)
- Requires-Dist: nbclient (>=0.10.0,<0.11.0)
- Requires-Dist: nbformat (>=5.10.4,<6.0.0)
- Requires-Dist: numpy (>=1.21.0,<2.0.0)
- Requires-Dist: openai (>=1.0.0,<2.0.0)
- Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
- Requires-Dist: opentelemetry-api (>=1.29.0,<2.0.0)
- Requires-Dist: pandas (>=2.0.0,<3.0.0)
- Requires-Dist: pillow (>=10.0.0,<11.0.0)
- Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
- Requires-Dist: pydantic (>=2.0.0,<3.0.0)
- Requires-Dist: pytube (==15.0.0)
- Requires-Dist: requests (>=2.0.0,<3.0.0)
- Requires-Dist: rich (>=13.7.1,<14.0.0)
- Requires-Dist: scikit-learn (>=1.5.2,<2.0.0)
- Requires-Dist: scipy (>=1.13.0,<1.14.0)
- Requires-Dist: tabulate (>=0.9.0,<0.10.0)
- Requires-Dist: tenacity (>=8.3.0,<9.0.0)
- Requires-Dist: tqdm (>=4.64.0,<5.0.0)
- Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
- Project-URL: Homepage, https://landing.ai
- Project-URL: documentation, https://github.com/landing-ai/vision-agent
- Project-URL: repository, https://github.com/landing-ai/vision-agent
- Description-Content-Type: text/markdown
-
- <div align="center">
- <picture>
- <source media="(prefers-color-scheme: dark)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
- <source media="(prefers-color-scheme: light)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_dark.svg?raw=true">
- <img alt="VisionAgent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
- </picture>
-
- [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
- ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
- [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
- ![version](https://img.shields.io/pypi/pyversions/vision-agent)
- </div>
-
- ## VisionAgent
- VisionAgent is a library that helps you utilize agent frameworks to generate code to
- solve your vision tasks. Check out our Discord for updates and roadmaps! The fastest
- way to test out VisionAgent is to use our web application, which you can find [here](https://va.landing.ai/).
-
- ## Installation
- ```bash
- pip install vision-agent
- ```
-
- ```bash
- export ANTHROPIC_API_KEY="your-api-key"
- export GEMINI_API_KEY="your-api-key"
- ```
-
- > **_NOTE:_** We found that using both Anthropic Claude-3.7 and Gemini-2.0-Flash-Exp provides the best performance for VisionAgent. If you want to use a different LLM provider, or only one of them, see 'Using Other LLM Providers' below.
-
- You will also need to set your VisionAgent API key to be able to authenticate when using the hosted vision tools that we provide through our APIs. Currently, the APIs are free to use, so you only need to get a key from [here](https://va.landing.ai/account/api-key).
-
- ```bash
- export VISION_AGENT_API_KEY="your-api-key"
- ```
-
- ## Documentation
-
- [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
-
- ## Examples
- ### Counting cans in an image
- You can run VisionAgent in a local Jupyter Notebook: [Counting cans in an image](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks/counting_cans.ipynb)
-
- ### Generating code
- You can use VisionAgent to generate code to count the number of people in an image:
- ```python
- from vision_agent.agent import VisionAgentCoderV2
- from vision_agent.models import AgentMessage
-
- agent = VisionAgentCoderV2(verbose=True)
- code_context = agent.generate_code(
-     [
-         AgentMessage(
-             role="user",
-             content="Count the number of people in this image",
-             media=["people.png"]
-         )
-     ]
- )
-
- with open("generated_code.py", "w") as f:
-     f.write(code_context.code + "\n" + code_context.test)
- ```
-
- ### Using the tools directly
- VisionAgent produces code that utilizes our tools. You can also use the tools directly.
- For example, if you wanted to detect people in an image and visualize the results:
- ```python
- import vision_agent.tools as T
- import matplotlib.pyplot as plt
-
- image = T.load_image("people.png")
- dets = T.countgd_object_detection("person", image)
- # visualize the countgd bounding boxes on the image
- viz = T.overlay_bounding_boxes(image, dets)
-
- # save the visualization to a file
- T.save_image(viz, "people_detected.png")
-
- # display the visualization
- plt.imshow(viz)
- plt.show()
- ```
-
- You can also use the tools for running on video files:
- ```python
- import vision_agent.tools as T
-
- frames_and_ts = T.extract_frames_and_timestamps("people.mp4")
- # extract the frames from the frames_and_ts list
- frames = [f["frame"] for f in frames_and_ts]
-
- # run the countgd tracking on the frames
- tracks = T.countgd_sam2_video_tracking("person", frames)
- # visualize the countgd tracking results on the frames and save the video
- viz = T.overlay_segmentation_masks(frames, tracks)
- T.save_video(viz, "people_detected.mp4")
- ```
-
- ## Using Other LLM Providers
- You can use other LLM providers by changing `config.py` in the `vision_agent/configs`
- directory. For example, to change to Anthropic, simply run:
- ```bash
- cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
- ```
-
- You can also modify the existing `config.py` file yourself to use a different LLM
- provider. For example, to change the planner from Anthropic to OpenAI inside
- `config.py`, you would replace this code:
- ```python
- planner: Type[LMM] = Field(default=AnthropicLMM)
- planner_kwargs: dict = Field(
-     default_factory=lambda: {
-         "model_name": "claude-3-7-sonnet-20250219",
-         "temperature": 0.0,
-         "image_size": 768,
-     }
- )
- ```
-
- with this code:
-
- ```python
- planner: Type[LMM] = Field(default=OpenAILMM)
- planner_kwargs: dict = Field(
-     default_factory=lambda: {
-         "model_name": "gpt-4o-2024-11-20",
-         "temperature": 0.0,
-         "image_size": 768,
-         "image_detail": "low",
-     }
- )
- ```
-
- > **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our Discord channel.
-
@@ -1,136 +0,0 @@
- <div align="center">
- <picture>
- <source media="(prefers-color-scheme: dark)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
- <source media="(prefers-color-scheme: light)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_dark.svg?raw=true">
- <img alt="VisionAgent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
- </picture>
-
- [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
- ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
- [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
- ![version](https://img.shields.io/pypi/pyversions/vision-agent)
- </div>
-
- ## VisionAgent
- VisionAgent is a library that helps you utilize agent frameworks to generate code to
- solve your vision tasks. Check out our Discord for updates and roadmaps! The fastest
- way to test out VisionAgent is to use our web application, which you can find [here](https://va.landing.ai/).
-
- ## Installation
- ```bash
- pip install vision-agent
- ```
-
- ```bash
- export ANTHROPIC_API_KEY="your-api-key"
- export GEMINI_API_KEY="your-api-key"
- ```
-
- > **_NOTE:_** We found that using both Anthropic Claude-3.7 and Gemini-2.0-Flash-Exp provides the best performance for VisionAgent. If you want to use a different LLM provider, or only one of them, see 'Using Other LLM Providers' below.
-
- You will also need to set your VisionAgent API key to be able to authenticate when using the hosted vision tools that we provide through our APIs. Currently, the APIs are free to use, so you only need to get a key from [here](https://va.landing.ai/account/api-key).
-
- ```bash
- export VISION_AGENT_API_KEY="your-api-key"
- ```
-
- ## Documentation
-
- [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
-
- ## Examples
- ### Counting cans in an image
- You can run VisionAgent in a local Jupyter Notebook: [Counting cans in an image](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks/counting_cans.ipynb)
-
- ### Generating code
- You can use VisionAgent to generate code to count the number of people in an image:
- ```python
- from vision_agent.agent import VisionAgentCoderV2
- from vision_agent.models import AgentMessage
-
- agent = VisionAgentCoderV2(verbose=True)
- code_context = agent.generate_code(
-     [
-         AgentMessage(
-             role="user",
-             content="Count the number of people in this image",
-             media=["people.png"]
-         )
-     ]
- )
-
- with open("generated_code.py", "w") as f:
-     f.write(code_context.code + "\n" + code_context.test)
- ```
-
- ### Using the tools directly
- VisionAgent produces code that utilizes our tools. You can also use the tools directly.
- For example, if you wanted to detect people in an image and visualize the results:
- ```python
- import vision_agent.tools as T
- import matplotlib.pyplot as plt
-
- image = T.load_image("people.png")
- dets = T.countgd_object_detection("person", image)
- # visualize the countgd bounding boxes on the image
- viz = T.overlay_bounding_boxes(image, dets)
-
- # save the visualization to a file
- T.save_image(viz, "people_detected.png")
-
- # display the visualization
- plt.imshow(viz)
- plt.show()
- ```
-
- You can also use the tools for running on video files:
- ```python
- import vision_agent.tools as T
-
- frames_and_ts = T.extract_frames_and_timestamps("people.mp4")
- # extract the frames from the frames_and_ts list
- frames = [f["frame"] for f in frames_and_ts]
-
- # run the countgd tracking on the frames
- tracks = T.countgd_sam2_video_tracking("person", frames)
- # visualize the countgd tracking results on the frames and save the video
- viz = T.overlay_segmentation_masks(frames, tracks)
- T.save_video(viz, "people_detected.mp4")
- ```
-
- ## Using Other LLM Providers
- You can use other LLM providers by changing `config.py` in the `vision_agent/configs`
- directory. For example, to change to Anthropic, simply run:
- ```bash
- cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
- ```
-
- You can also modify the existing `config.py` file yourself to use a different LLM
- provider. For example, to change the planner from Anthropic to OpenAI inside
- `config.py`, you would replace this code:
- ```python
- planner: Type[LMM] = Field(default=AnthropicLMM)
- planner_kwargs: dict = Field(
-     default_factory=lambda: {
-         "model_name": "claude-3-7-sonnet-20250219",
-         "temperature": 0.0,
-         "image_size": 768,
-     }
- )
- ```
-
- with this code:
-
- ```python
- planner: Type[LMM] = Field(default=OpenAILMM)
- planner_kwargs: dict = Field(
-     default_factory=lambda: {
-         "model_name": "gpt-4o-2024-11-20",
-         "temperature": 0.0,
-         "image_size": 768,
-         "image_detail": "low",
-     }
- )
- ```
-
- > **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our Discord channel.