vision-agent 0.2.155__tar.gz → 0.2.157__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.155 → vision_agent-0.2.157}/PKG-INFO +213 -95
- {vision_agent-0.2.155 → vision_agent-0.2.157}/README.md +212 -94
- {vision_agent-0.2.155 → vision_agent-0.2.157}/pyproject.toml +1 -1
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/agent/agent_utils.py +6 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/agent/vision_agent.py +0 -2
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/agent/vision_agent_coder.py +7 -3
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/agent/vision_agent_prompts.py +7 -6
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/tools/__init__.py +0 -1
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/tools/meta_tools.py +3 -1
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/tools/tools.py +58 -59
- {vision_agent-0.2.155 → vision_agent-0.2.157}/LICENSE +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/tools/tool_utils.py +1 -1
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/tools/tools_types.py +1 -1
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.155
|
3
|
+
Version: 0.2.157
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -56,19 +56,23 @@ accomplish the task you want. Vision Agent aims to provide an in-seconds experie
|
|
56
56
|
allowing users to describe their problem in text and have the agent framework generate
|
57
57
|
code to solve the task for them. Check out our discord for updates and roadmaps!
|
58
58
|
|
59
|
+
## Table of Contents
|
60
|
+
- [🚀Quick Start](#quick-start)
|
61
|
+
- [📚Documentation](#documentation)
|
62
|
+
- [🔍🤖Vision Agent](#vision-agent-basic-usage)
|
63
|
+
- [🛠️Tools](#tools)
|
64
|
+
- [🤖LMMs](#lmms)
|
65
|
+
- [💻🤖Vision Agent Coder](#vision-agent-coder)
|
66
|
+
- [🏗️Additional Backends](#additional-backends)
|
59
67
|
|
60
|
-
##
|
68
|
+
## Quick Start
|
69
|
+
### Web Application
|
70
|
+
The fastest way to test out Vision Agent is to use our web application. You can find it
|
71
|
+
[here](https://va.landing.ai/).
|
61
72
|
|
62
|
-
Try Vision Agent live on (note this may not be running the most up-to-date version) [va.landing.ai](https://va.landing.ai/)
|
63
73
|
|
64
|
-
## Documentation
|
65
|
-
|
66
|
-
[Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
|
67
|
-
|
68
|
-
|
69
|
-
## Getting Started
|
70
74
|
### Installation
|
71
|
-
To get started, you can install
|
75
|
+
To get started with the python library, you can install it using pip:
|
72
76
|
|
73
77
|
```bash
|
74
78
|
pip install vision-agent
|
@@ -82,17 +86,93 @@ export ANTHROPIC_API_KEY="your-api-key"
|
|
82
86
|
export OPENAI_API_KEY="your-api-key"
|
83
87
|
```
|
84
88
|
|
85
|
-
###
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
89
|
+
### Basic Usage
|
90
|
+
To get started you can just import the `VisionAgent` and start chatting with it:
|
91
|
+
```python
|
92
|
+
>>> from vision_agent.agent import VisionAgent
|
93
|
+
>>> agent = VisionAgent()
|
94
|
+
>>> resp = agent("Hello")
|
95
|
+
>>> print(resp)
|
96
|
+
[{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "{'thoughts': 'The user has greeted me. I will respond with a greeting and ask how I can assist them.', 'response': 'Hello! How can I assist you today?', 'let_user_respond': True}"}]
|
97
|
+
>>> resp.append({"role": "user", "content": "Can you count the number of people in this image?", "media": ["people.jpg"]})
|
98
|
+
>>> resp = agent(resp)
|
99
|
+
```
|
100
|
+
|
101
|
+
The chat messages are similar to `OpenAI`'s format with `role` and `content` keys but
|
102
|
+
in addition to those you can add `media` which is a list of media files that can either
|
103
|
+
be images or video files.
|
104
|
+
|
105
|
+
## Documentation
|
106
|
+
|
107
|
+
[Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
|
108
|
+
|
109
|
+
## Vision Agent Basic Usage
|
110
|
+
### Chatting and Message Formats
|
111
|
+
`VisionAgent` is an agent that can chat with you and call other tools or agents to
|
112
|
+
write vision code for you. You can interact with it like you would ChatGPT or any other
|
113
|
+
chatbot. The agent uses Claude-3.5 for its LMM and OpenAI for embeddings for searching
|
114
|
+
for tools.
|
115
|
+
|
116
|
+
The message format is:
|
117
|
+
```json
|
118
|
+
{
|
119
|
+
"role": "user",
|
120
|
+
"content": "Hello",
|
121
|
+
"media": ["image.jpg"]
|
122
|
+
}
|
123
|
+
```
|
124
|
+
Where `role` can be `user`, `assistant` or `observation` if the agent has executed a
|
125
|
+
function and needs to observe the output. `content` is always the text message and
|
126
|
+
`media` is a list of media files that can be images or videos that you want the agent
|
127
|
+
to examine.
|
128
|
+
|
129
|
+
When the agent responds, inside its `content` you will find the following data structure:
|
130
|
+
```json
|
131
|
+
{
|
132
|
+
"thoughts": "The user has greeted me. I will respond with a greeting and ask how I can assist them.",
|
133
|
+
"response": "Hello! How can I assist you today?",
|
134
|
+
"let_user_respond": true
|
135
|
+
}
|
136
|
+
```
|
137
|
+
|
138
|
+
`thoughts` are the thoughts the agent had when processing the message, `response` is the
|
139
|
+
response it generated which could contain a python execution, and `let_user_respond` is
|
140
|
+
a boolean that tells the agent if it should wait for the user to respond before
|
141
|
+
continuing, for example it may want to execute code and look at the output before
|
142
|
+
letting the user respond.
|
92
143
|
|
93
|
-
|
94
|
-
|
95
|
-
|
144
|
+
### Chatting and Artifacts
|
145
|
+
If you run `chat_with_code` you will also notice an `Artifact` object. `Artifact`'s
|
146
|
+
are a way to sync files between local and remote environments. The agent will read and
|
147
|
+
write to the artifact object, which is just a pickle object, when it wants to save or
|
148
|
+
load files.
|
149
|
+
|
150
|
+
```python
|
151
|
+
import vision_agent as va
|
152
|
+
from vision_agent.tools.meta_tools import Artifact
|
153
|
+
|
154
|
+
artifacts = Artifact("artifact.pkl")
|
155
|
+
# you can store text files such as code or images in the artifact
|
156
|
+
with open("code.py", "r") as f:
|
157
|
+
artifacts["code.py"] = f.read()
|
158
|
+
with open("image.png", "rb") as f:
|
159
|
+
artifacts["image.png"] = f.read()
|
160
|
+
|
161
|
+
agent = va.agent.VisionAgent()
|
162
|
+
response, artifacts = agent.chat_with_code(
|
163
|
+
[
|
164
|
+
{
|
165
|
+
"role": "user",
|
166
|
+
"content": "Can you write code to count the number of people in image.png",
|
167
|
+
}
|
168
|
+
],
|
169
|
+
artifacts=artifacts,
|
170
|
+
)
|
171
|
+
```
|
172
|
+
|
173
|
+
### Running the Streamlit App
|
174
|
+
To test out things quickly, sometimes it's easier to run the streamlit app locally to
|
175
|
+
chat with `VisionAgent`, you can run the following command:
|
96
176
|
|
97
177
|
```bash
|
98
178
|
pip install -r examples/chat/requirements.txt
|
@@ -100,25 +180,117 @@ export WORKSPACE=/path/to/your/workspace
|
|
100
180
|
export ZMQ_PORT=5555
|
101
181
|
streamlit run examples/chat/app.py
|
102
182
|
```
|
103
|
-
You can find more details about the streamlit app [here](examples/chat/)
|
183
|
+
You can find more details about the streamlit app [here](examples/chat/). There are
|
184
|
+
still some concurrency issues with the streamlit app so if you find it doing weird things
|
185
|
+
clear your workspace and restart the app.
|
186
|
+
|
187
|
+
## Tools
|
188
|
+
There are a variety of tools for the model or the user to use. Some are executed locally
|
189
|
+
while others are hosted for you. You can easily access them yourself, for example if
|
190
|
+
you want to run `owl_v2_image` and visualize the output you can run:
|
104
191
|
|
105
|
-
#### Basic Programmatic Usage
|
106
192
|
```python
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
193
|
+
import vision_agent.tools as T
|
194
|
+
import matplotlib.pyplot as plt
|
195
|
+
|
196
|
+
image = T.load_image("dogs.jpg")
|
197
|
+
dets = T.owl_v2_image("dogs", image)
|
198
|
+
# visualize the owl_v2_ bounding boxes on the image
|
199
|
+
viz = T.overlay_bounding_boxes(image, dets)
|
200
|
+
|
201
|
+
# plot the image in matplotlib or save it
|
202
|
+
plt.imshow(viz)
|
203
|
+
plt.show()
|
204
|
+
T.save_image(viz, "viz.png")
|
114
205
|
```
|
115
206
|
|
116
|
-
|
117
|
-
embeddings for tool searching.
|
207
|
+
Or if you want to run on video data, for example track sharks and people at 10 FPS:
|
118
208
|
|
119
|
-
|
120
|
-
|
121
|
-
|
209
|
+
```python
|
210
|
+
frames_and_ts = T.extract_frames_and_timestamps("sharks.mp4", fps=10)
|
211
|
+
# extract only the frames from frames and timestamps
|
212
|
+
frames = [f["frame"] for f in frames_and_ts]
|
213
|
+
# track the sharks and people in the frames, returns segmentation masks
|
214
|
+
track = T.florence2_sam2_video_tracking("shark, person", frames)
|
215
|
+
# plot the segmentation masks on the frames
|
216
|
+
viz = T.overlay_segmentation_masks(frames, track)
|
217
|
+
T.save_video(viz, "viz.mp4")
|
218
|
+
```
|
219
|
+
|
220
|
+
You can find all available tools in `vision_agent/tools/tools.py`, however the
|
221
|
+
`VisionAgent` only utilizes a subset of tools that have been tested and provide
|
222
|
+
the best performance. Those can be found in the same file under the `FUNCTION_TOOLS`
|
223
|
+
variable inside `tools.py`.
|
224
|
+
|
225
|
+
#### Custom Tools
|
226
|
+
If you can't find the tool you are looking for you can also add custom tools to the
|
227
|
+
agent:
|
228
|
+
|
229
|
+
```python
|
230
|
+
import vision_agent as va
|
231
|
+
import numpy as np
|
232
|
+
|
233
|
+
@va.tools.register_tool(imports=["import numpy as np"])
|
234
|
+
def custom_tool(image_path: str) -> str:
|
235
|
+
"""My custom tool documentation.
|
236
|
+
|
237
|
+
Parameters:
|
238
|
+
image_path (str): The path to the image.
|
239
|
+
|
240
|
+
Returns:
|
241
|
+
str: The result of the tool.
|
242
|
+
|
243
|
+
Example
|
244
|
+
-------
|
245
|
+
>>> custom_tool("image.jpg")
|
246
|
+
"""
|
247
|
+
|
248
|
+
return np.zeros((10, 10))
|
249
|
+
```
|
250
|
+
|
251
|
+
You need to ensure you call `@va.tools.register_tool` with any imports it uses. Global
|
252
|
+
variables will not be captured by `register_tool` so you need to include them in the
|
253
|
+
function. Make sure the documentation is in the same format above with description,
|
254
|
+
`Parameters:`, `Returns:`, and `Example\n-------`. The `VisionAgent` will use your
|
255
|
+
documentation when trying to determine when to use your tool. You can find an example
|
256
|
+
use case [here](examples/custom_tools/) for adding a custom tool. Note you may need to
|
257
|
+
play around with the prompt to ensure the model picks the tool when you want it to.
|
258
|
+
|
259
|
+
Can't find the tool you need and want us to host it? Check out our
|
260
|
+
[vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where
|
261
|
+
we add the source code for all the tools used in `VisionAgent`.
|
262
|
+
|
263
|
+
## LMMs
|
264
|
+
All of our agents are based off of LMMs or Large Multimodal Models. We provide a thin
|
265
|
+
abstraction layer on top of the underlying provider APIs to be able to more easily
|
266
|
+
handle media.
|
267
|
+
|
268
|
+
|
269
|
+
```python
|
270
|
+
from vision_agent.lmm import AnthropicLMM
|
271
|
+
|
272
|
+
lmm = AnthropicLMM()
|
273
|
+
response = lmm("Describe this image", media=["apple.jpg"])
|
274
|
+
>>> "This is an image of an apple."
|
275
|
+
```
|
276
|
+
|
277
|
+
Or you can use the `OpenAI` chat interface and pass it other media like videos:
|
278
|
+
|
279
|
+
```python
|
280
|
+
response = lmm(
|
281
|
+
[
|
282
|
+
{
|
283
|
+
"role": "user",
|
284
|
+
"content": "What's going on in this video?",
|
285
|
+
"media": ["video.mp4"]
|
286
|
+
}
|
287
|
+
]
|
288
|
+
)
|
289
|
+
```
|
290
|
+
|
291
|
+
## Vision Agent Coder
|
292
|
+
Underneath the hood, `VisionAgent` uses `VisionAgentCoder` to generate code to solve
|
293
|
+
vision tasks. You can use `VisionAgentCoder` directly to generate code if you want:
|
122
294
|
|
123
295
|
```python
|
124
296
|
>>> from vision_agent.agent import VisionAgentCoder
|
@@ -128,17 +300,17 @@ You can interact with the agent as you would with any LLM or LMM model:
|
|
128
300
|
|
129
301
|
Which produces the following code:
|
130
302
|
```python
|
131
|
-
from vision_agent.tools import load_image,
|
303
|
+
from vision_agent.tools import load_image, florence2_sam2_image
|
132
304
|
|
133
305
|
def calculate_filled_percentage(image_path: str) -> float:
|
134
306
|
# Step 1: Load the image
|
135
307
|
image = load_image(image_path)
|
136
308
|
|
137
309
|
# Step 2: Segment the jar
|
138
|
-
jar_segments =
|
310
|
+
jar_segments = florence2_sam2_image("jar", image)
|
139
311
|
|
140
312
|
# Step 3: Segment the coffee beans
|
141
|
-
coffee_beans_segments =
|
313
|
+
coffee_beans_segments = florence2_sam2_image("coffee beans", image)
|
142
314
|
|
143
315
|
# Step 4: Calculate the area of the segmented jar
|
144
316
|
jar_area = 0
|
@@ -166,7 +338,7 @@ mode by passing in the verbose argument:
|
|
166
338
|
>>> agent = VisionAgentCoder(verbosity=2)
|
167
339
|
```
|
168
340
|
|
169
|
-
|
341
|
+
### Detailed Usage
|
170
342
|
You can also have it return more information by calling `chat_with_workflow`. The format
|
171
343
|
of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
|
172
344
|
|
@@ -186,7 +358,7 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `med
|
|
186
358
|
With this you can examine more detailed information such as the testing code, testing
|
187
359
|
results, plan or working memory it used to complete the task.
|
188
360
|
|
189
|
-
|
361
|
+
### Multi-turn conversations
|
190
362
|
You can have multi-turn conversations with vision-agent as well, giving it feedback on
|
191
363
|
the code and having it update. You just need to add the code as a response from the
|
192
364
|
assistant:
|
@@ -212,60 +384,6 @@ conv.append(
|
|
212
384
|
result = agent.chat_with_workflow(conv)
|
213
385
|
```
|
214
386
|
|
215
|
-
### Tools
|
216
|
-
There are a variety of tools for the model or the user to use. Some are executed locally
|
217
|
-
while others are hosted for you. You can easily access them yourself, for example if
|
218
|
-
you want to run `owl_v2_image` and visualize the output you can run:
|
219
|
-
|
220
|
-
```python
|
221
|
-
import vision_agent.tools as T
|
222
|
-
import matplotlib.pyplot as plt
|
223
|
-
|
224
|
-
image = T.load_image("dogs.jpg")
|
225
|
-
dets = T.owl_v2_image("dogs", image)
|
226
|
-
viz = T.overlay_bounding_boxes(image, dets)
|
227
|
-
plt.imshow(viz)
|
228
|
-
plt.show()
|
229
|
-
```
|
230
|
-
|
231
|
-
You can find all available tools in `vision_agent/tools/tools.py`, however,
|
232
|
-
`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide
|
233
|
-
the best performance. Those can be found in the same file under the `TOOLS` variable.
|
234
|
-
|
235
|
-
If you can't find the tool you are looking for you can also add custom tools to the
|
236
|
-
agent:
|
237
|
-
|
238
|
-
```python
|
239
|
-
import vision_agent as va
|
240
|
-
import numpy as np
|
241
|
-
|
242
|
-
@va.tools.register_tool(imports=["import numpy as np"])
|
243
|
-
def custom_tool(image_path: str) -> str:
|
244
|
-
"""My custom tool documentation.
|
245
|
-
|
246
|
-
Parameters:
|
247
|
-
image_path (str): The path to the image.
|
248
|
-
|
249
|
-
Returns:
|
250
|
-
str: The result of the tool.
|
251
|
-
|
252
|
-
Example
|
253
|
-
-------
|
254
|
-
>>> custom_tool("image.jpg")
|
255
|
-
"""
|
256
|
-
|
257
|
-
return np.zeros((10, 10))
|
258
|
-
```
|
259
|
-
|
260
|
-
You need to ensure you call `@va.tools.register_tool` with any imports it uses. Global
|
261
|
-
variables will not be captured by `register_tool` so you need to include them in the
|
262
|
-
function. Make sure the documentation is in the same format above with description,
|
263
|
-
`Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
|
264
|
-
[here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
|
265
|
-
|
266
|
-
Can't find the tool you need and want add it to `VisionAgent`? Check out our
|
267
|
-
[vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where
|
268
|
-
we add the source code for all the tools used in `VisionAgent`.
|
269
387
|
|
270
388
|
## Additional Backends
|
271
389
|
### Anthropic
|
@@ -370,9 +488,9 @@ agent = va.agent.AzureVisionAgentCoder()
|
|
370
488
|
|
371
489
|
******************************************************************************************************************************
|
372
490
|
|
373
|
-
|
491
|
+
## Q&A
|
374
492
|
|
375
|
-
|
493
|
+
### How to get started with OpenAI API credits
|
376
494
|
|
377
495
|
1. Visit the [OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
|
378
496
|
2. Follow the instructions to purchase and manage your API credits.
|
@@ -15,19 +15,23 @@ accomplish the task you want. Vision Agent aims to provide an in-seconds experie
|
|
15
15
|
allowing users to describe their problem in text and have the agent framework generate
|
16
16
|
code to solve the task for them. Check out our discord for updates and roadmaps!
|
17
17
|
|
18
|
+
## Table of Contents
|
19
|
+
- [🚀Quick Start](#quick-start)
|
20
|
+
- [📚Documentation](#documentation)
|
21
|
+
- [🔍🤖Vision Agent](#vision-agent-basic-usage)
|
22
|
+
- [🛠️Tools](#tools)
|
23
|
+
- [🤖LMMs](#lmms)
|
24
|
+
- [💻🤖Vision Agent Coder](#vision-agent-coder)
|
25
|
+
- [🏗️Additional Backends](#additional-backends)
|
18
26
|
|
19
|
-
##
|
27
|
+
## Quick Start
|
28
|
+
### Web Application
|
29
|
+
The fastest way to test out Vision Agent is to use our web application. You can find it
|
30
|
+
[here](https://va.landing.ai/).
|
20
31
|
|
21
|
-
Try Vision Agent live on (note this may not be running the most up-to-date version) [va.landing.ai](https://va.landing.ai/)
|
22
32
|
|
23
|
-
## Documentation
|
24
|
-
|
25
|
-
[Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
|
26
|
-
|
27
|
-
|
28
|
-
## Getting Started
|
29
33
|
### Installation
|
30
|
-
To get started, you can install
|
34
|
+
To get started with the python library, you can install it using pip:
|
31
35
|
|
32
36
|
```bash
|
33
37
|
pip install vision-agent
|
@@ -41,17 +45,93 @@ export ANTHROPIC_API_KEY="your-api-key"
|
|
41
45
|
export OPENAI_API_KEY="your-api-key"
|
42
46
|
```
|
43
47
|
|
44
|
-
###
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
### Basic Usage
|
49
|
+
To get started you can just import the `VisionAgent` and start chatting with it:
|
50
|
+
```python
|
51
|
+
>>> from vision_agent.agent import VisionAgent
|
52
|
+
>>> agent = VisionAgent()
|
53
|
+
>>> resp = agent("Hello")
|
54
|
+
>>> print(resp)
|
55
|
+
[{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "{'thoughts': 'The user has greeted me. I will respond with a greeting and ask how I can assist them.', 'response': 'Hello! How can I assist you today?', 'let_user_respond': True}"}]
|
56
|
+
>>> resp.append({"role": "user", "content": "Can you count the number of people in this image?", "media": ["people.jpg"]})
|
57
|
+
>>> resp = agent(resp)
|
58
|
+
```
|
59
|
+
|
60
|
+
The chat messages are similar to `OpenAI`'s format with `role` and `content` keys but
|
61
|
+
in addition to those you can add `media` which is a list of media files that can either
|
62
|
+
be images or video files.
|
63
|
+
|
64
|
+
## Documentation
|
65
|
+
|
66
|
+
[Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
|
67
|
+
|
68
|
+
## Vision Agent Basic Usage
|
69
|
+
### Chatting and Message Formats
|
70
|
+
`VisionAgent` is an agent that can chat with you and call other tools or agents to
|
71
|
+
write vision code for you. You can interact with it like you would ChatGPT or any other
|
72
|
+
chatbot. The agent uses Claude-3.5 for its LMM and OpenAI for embeddings for searching
|
73
|
+
for tools.
|
74
|
+
|
75
|
+
The message format is:
|
76
|
+
```json
|
77
|
+
{
|
78
|
+
"role": "user",
|
79
|
+
"content": "Hello",
|
80
|
+
"media": ["image.jpg"]
|
81
|
+
}
|
82
|
+
```
|
83
|
+
Where `role` can be `user`, `assistant` or `observation` if the agent has executed a
|
84
|
+
function and needs to observe the output. `content` is always the text message and
|
85
|
+
`media` is a list of media files that can be images or videos that you want the agent
|
86
|
+
to examine.
|
87
|
+
|
88
|
+
When the agent responds, inside its `content` you will find the following data structure:
|
89
|
+
```json
|
90
|
+
{
|
91
|
+
"thoughts": "The user has greeted me. I will respond with a greeting and ask how I can assist them.",
|
92
|
+
"response": "Hello! How can I assist you today?",
|
93
|
+
"let_user_respond": true
|
94
|
+
}
|
95
|
+
```
|
96
|
+
|
97
|
+
`thoughts` are the thoughts the agent had when processing the message, `response` is the
|
98
|
+
response it generated which could contain a python execution, and `let_user_respond` is
|
99
|
+
a boolean that tells the agent if it should wait for the user to respond before
|
100
|
+
continuing, for example it may want to execute code and look at the output before
|
101
|
+
letting the user respond.
|
51
102
|
|
52
|
-
|
53
|
-
|
54
|
-
|
103
|
+
### Chatting and Artifacts
|
104
|
+
If you run `chat_with_code` you will also notice an `Artifact` object. `Artifact`'s
|
105
|
+
are a way to sync files between local and remote environments. The agent will read and
|
106
|
+
write to the artifact object, which is just a pickle object, when it wants to save or
|
107
|
+
load files.
|
108
|
+
|
109
|
+
```python
|
110
|
+
import vision_agent as va
|
111
|
+
from vision_agent.tools.meta_tools import Artifact
|
112
|
+
|
113
|
+
artifacts = Artifact("artifact.pkl")
|
114
|
+
# you can store text files such as code or images in the artifact
|
115
|
+
with open("code.py", "r") as f:
|
116
|
+
artifacts["code.py"] = f.read()
|
117
|
+
with open("image.png", "rb") as f:
|
118
|
+
artifacts["image.png"] = f.read()
|
119
|
+
|
120
|
+
agent = va.agent.VisionAgent()
|
121
|
+
response, artifacts = agent.chat_with_code(
|
122
|
+
[
|
123
|
+
{
|
124
|
+
"role": "user",
|
125
|
+
"content": "Can you write code to count the number of people in image.png",
|
126
|
+
}
|
127
|
+
],
|
128
|
+
artifacts=artifacts,
|
129
|
+
)
|
130
|
+
```
|
131
|
+
|
132
|
+
### Running the Streamlit App
|
133
|
+
To test out things quickly, sometimes it's easier to run the streamlit app locally to
|
134
|
+
chat with `VisionAgent`, you can run the following command:
|
55
135
|
|
56
136
|
```bash
|
57
137
|
pip install -r examples/chat/requirements.txt
|
@@ -59,25 +139,117 @@ export WORKSPACE=/path/to/your/workspace
|
|
59
139
|
export ZMQ_PORT=5555
|
60
140
|
streamlit run examples/chat/app.py
|
61
141
|
```
|
62
|
-
You can find more details about the streamlit app [here](examples/chat/)
|
142
|
+
You can find more details about the streamlit app [here](examples/chat/). There are
|
143
|
+
still some concurrency issues with the streamlit app so if you find it doing weird things
|
144
|
+
clear your workspace and restart the app.
|
145
|
+
|
146
|
+
## Tools
|
147
|
+
There are a variety of tools for the model or the user to use. Some are executed locally
|
148
|
+
while others are hosted for you. You can easily access them yourself, for example if
|
149
|
+
you want to run `owl_v2_image` and visualize the output you can run:
|
63
150
|
|
64
|
-
#### Basic Programmatic Usage
|
65
151
|
```python
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
152
|
+
import vision_agent.tools as T
|
153
|
+
import matplotlib.pyplot as plt
|
154
|
+
|
155
|
+
image = T.load_image("dogs.jpg")
|
156
|
+
dets = T.owl_v2_image("dogs", image)
|
157
|
+
# visualize the owl_v2_ bounding boxes on the image
|
158
|
+
viz = T.overlay_bounding_boxes(image, dets)
|
159
|
+
|
160
|
+
# plot the image in matplotlib or save it
|
161
|
+
plt.imshow(viz)
|
162
|
+
plt.show()
|
163
|
+
T.save_image(viz, "viz.png")
|
73
164
|
```
|
74
165
|
|
75
|
-
|
76
|
-
embeddings for tool searching.
|
166
|
+
Or if you want to run on video data, for example track sharks and people at 10 FPS:
|
77
167
|
|
78
|
-
|
79
|
-
|
80
|
-
|
168
|
+
```python
|
169
|
+
frames_and_ts = T.extract_frames_and_timestamps("sharks.mp4", fps=10)
|
170
|
+
# extract only the frames from frames and timestamps
|
171
|
+
frames = [f["frame"] for f in frames_and_ts]
|
172
|
+
# track the sharks and people in the frames, returns segmentation masks
|
173
|
+
track = T.florence2_sam2_video_tracking("shark, person", frames)
|
174
|
+
# plot the segmentation masks on the frames
|
175
|
+
viz = T.overlay_segmentation_masks(frames, track)
|
176
|
+
T.save_video(viz, "viz.mp4")
|
177
|
+
```
|
178
|
+
|
179
|
+
You can find all available tools in `vision_agent/tools/tools.py`, however the
|
180
|
+
`VisionAgent` only utilizes a subset of tools that have been tested and provide
|
181
|
+
the best performance. Those can be found in the same file under the `FUNCTION_TOOLS`
|
182
|
+
variable inside `tools.py`.
|
183
|
+
|
184
|
+
#### Custom Tools
|
185
|
+
If you can't find the tool you are looking for you can also add custom tools to the
|
186
|
+
agent:
|
187
|
+
|
188
|
+
```python
|
189
|
+
import vision_agent as va
|
190
|
+
import numpy as np
|
191
|
+
|
192
|
+
@va.tools.register_tool(imports=["import numpy as np"])
|
193
|
+
def custom_tool(image_path: str) -> str:
|
194
|
+
"""My custom tool documentation.
|
195
|
+
|
196
|
+
Parameters:
|
197
|
+
image_path (str): The path to the image.
|
198
|
+
|
199
|
+
Returns:
|
200
|
+
str: The result of the tool.
|
201
|
+
|
202
|
+
Example
|
203
|
+
-------
|
204
|
+
>>> custom_tool("image.jpg")
|
205
|
+
"""
|
206
|
+
|
207
|
+
return np.zeros((10, 10))
|
208
|
+
```
|
209
|
+
|
210
|
+
You need to ensure you call `@va.tools.register_tool` with any imports it uses. Global
|
211
|
+
variables will not be captured by `register_tool` so you need to include them in the
|
212
|
+
function. Make sure the documentation is in the same format above with description,
|
213
|
+
`Parameters:`, `Returns:`, and `Example\n-------`. The `VisionAgent` will use your
|
214
|
+
documentation when trying to determine when to use your tool. You can find an example
|
215
|
+
use case [here](examples/custom_tools/) for adding a custom tool. Note you may need to
|
216
|
+
play around with the prompt to ensure the model picks the tool when you want it to.
|
217
|
+
|
218
|
+
Can't find the tool you need and want us to host it? Check out our
|
219
|
+
[vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where
|
220
|
+
we add the source code for all the tools used in `VisionAgent`.
|
221
|
+
|
222
|
+
## LMMs
|
223
|
+
All of our agents are based off of LMMs or Large Multimodal Models. We provide a thin
|
224
|
+
abstraction layer on top of the underlying provider APIs to be able to more easily
|
225
|
+
handle media.
|
226
|
+
|
227
|
+
|
228
|
+
```python
|
229
|
+
from vision_agent.lmm import AnthropicLMM
|
230
|
+
|
231
|
+
lmm = AnthropicLMM()
|
232
|
+
response = lmm("Describe this image", media=["apple.jpg"])
|
233
|
+
>>> "This is an image of an apple."
|
234
|
+
```
|
235
|
+
|
236
|
+
Or you can use the `OpenAI` chat interface and pass it other media like videos:
|
237
|
+
|
238
|
+
```python
|
239
|
+
response = lmm(
|
240
|
+
[
|
241
|
+
{
|
242
|
+
"role": "user",
|
243
|
+
"content": "What's going on in this video?",
|
244
|
+
"media": ["video.mp4"]
|
245
|
+
}
|
246
|
+
]
|
247
|
+
)
|
248
|
+
```
|
249
|
+
|
250
|
+
## Vision Agent Coder
|
251
|
+
Underneath the hood, `VisionAgent` uses `VisionAgentCoder` to generate code to solve
|
252
|
+
vision tasks. You can use `VisionAgentCoder` directly to generate code if you want:
|
81
253
|
|
82
254
|
```python
|
83
255
|
>>> from vision_agent.agent import VisionAgentCoder
|
@@ -87,17 +259,17 @@ You can interact with the agent as you would with any LLM or LMM model:
|
|
87
259
|
|
88
260
|
Which produces the following code:
|
89
261
|
```python
|
90
|
-
from vision_agent.tools import load_image,
|
262
|
+
from vision_agent.tools import load_image, florence2_sam2_image
|
91
263
|
|
92
264
|
def calculate_filled_percentage(image_path: str) -> float:
|
93
265
|
# Step 1: Load the image
|
94
266
|
image = load_image(image_path)
|
95
267
|
|
96
268
|
# Step 2: Segment the jar
|
97
|
-
jar_segments =
|
269
|
+
jar_segments = florence2_sam2_image("jar", image)
|
98
270
|
|
99
271
|
# Step 3: Segment the coffee beans
|
100
|
-
coffee_beans_segments =
|
272
|
+
coffee_beans_segments = florence2_sam2_image("coffee beans", image)
|
101
273
|
|
102
274
|
# Step 4: Calculate the area of the segmented jar
|
103
275
|
jar_area = 0
|
@@ -125,7 +297,7 @@ mode by passing in the verbose argument:
|
|
125
297
|
>>> agent = VisionAgentCoder(verbosity=2)
|
126
298
|
```
|
127
299
|
|
128
|
-
|
300
|
+
### Detailed Usage
|
129
301
|
You can also have it return more information by calling `chat_with_workflow`. The format
|
130
302
|
of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
|
131
303
|
|
@@ -145,7 +317,7 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `med
|
|
145
317
|
With this you can examine more detailed information such as the testing code, testing
|
146
318
|
results, plan or working memory it used to complete the task.
|
147
319
|
|
148
|
-
|
320
|
+
### Multi-turn conversations
|
149
321
|
You can have multi-turn conversations with vision-agent as well, giving it feedback on
|
150
322
|
the code and having it update. You just need to add the code as a response from the
|
151
323
|
assistant:
|
@@ -171,60 +343,6 @@ conv.append(
|
|
171
343
|
result = agent.chat_with_workflow(conv)
|
172
344
|
```
|
173
345
|
|
174
|
-
### Tools
|
175
|
-
There are a variety of tools for the model or the user to use. Some are executed locally
|
176
|
-
while others are hosted for you. You can easily access them yourself, for example if
|
177
|
-
you want to run `owl_v2_image` and visualize the output you can run:
|
178
|
-
|
179
|
-
```python
|
180
|
-
import vision_agent.tools as T
|
181
|
-
import matplotlib.pyplot as plt
|
182
|
-
|
183
|
-
image = T.load_image("dogs.jpg")
|
184
|
-
dets = T.owl_v2_image("dogs", image)
|
185
|
-
viz = T.overlay_bounding_boxes(image, dets)
|
186
|
-
plt.imshow(viz)
|
187
|
-
plt.show()
|
188
|
-
```
|
189
|
-
|
190
|
-
You can find all available tools in `vision_agent/tools/tools.py`, however,
|
191
|
-
`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide
|
192
|
-
the best performance. Those can be found in the same file under the `TOOLS` variable.
|
193
|
-
|
194
|
-
If you can't find the tool you are looking for you can also add custom tools to the
|
195
|
-
agent:
|
196
|
-
|
197
|
-
```python
|
198
|
-
import vision_agent as va
|
199
|
-
import numpy as np
|
200
|
-
|
201
|
-
@va.tools.register_tool(imports=["import numpy as np"])
|
202
|
-
def custom_tool(image_path: str) -> str:
|
203
|
-
"""My custom tool documentation.
|
204
|
-
|
205
|
-
Parameters:
|
206
|
-
image_path (str): The path to the image.
|
207
|
-
|
208
|
-
Returns:
|
209
|
-
str: The result of the tool.
|
210
|
-
|
211
|
-
Example
|
212
|
-
-------
|
213
|
-
>>> custom_tool("image.jpg")
|
214
|
-
"""
|
215
|
-
|
216
|
-
return np.zeros((10, 10))
|
217
|
-
```
|
218
|
-
|
219
|
-
You need to ensure you call `@va.tools.register_tool` with any imports it uses. Global
|
220
|
-
variables will not be captured by `register_tool` so you need to include them in the
|
221
|
-
function. Make sure the documentation is in the same format above with description,
|
222
|
-
`Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
|
223
|
-
[here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
|
224
|
-
|
225
|
-
Can't find the tool you need and want to add it to `VisionAgent`? Check out our
|
226
|
-
[vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where
|
227
|
-
we add the source code for all the tools used in `VisionAgent`.
|
228
346
|
|
229
347
|
## Additional Backends
|
230
348
|
### Anthropic
|
@@ -329,9 +447,9 @@ agent = va.agent.AzureVisionAgentCoder()
|
|
329
447
|
|
330
448
|
******************************************************************************************************************************
|
331
449
|
|
332
|
-
|
450
|
+
## Q&A
|
333
451
|
|
334
|
-
|
452
|
+
### How to get started with OpenAI API credits
|
335
453
|
|
336
454
|
1. Visit the [OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
|
337
455
|
2. Follow the instructions to purchase and manage your API credits.
|
@@ -77,3 +77,9 @@ def extract_code(code: str) -> str:
|
|
77
77
|
if code.startswith("python\n"):
|
78
78
|
code = code[len("python\n") :]
|
79
79
|
return code
|
80
|
+
|
81
|
+
|
82
|
+
def remove_installs_from_code(code: str) -> str:
|
83
|
+
pattern = r"\n!pip install.*?(\n|\Z)\n"
|
84
|
+
code = re.sub(pattern, "", code, flags=re.DOTALL)
|
85
|
+
return code
|
@@ -407,8 +407,6 @@ class VisionAgent(Agent):
|
|
407
407
|
code_interpreter.download_file(
|
408
408
|
str(remote_artifacts_path.name), str(self.local_artifacts_path)
|
409
409
|
)
|
410
|
-
artifacts.load(self.local_artifacts_path)
|
411
|
-
artifacts.save()
|
412
410
|
return orig_chat, artifacts
|
413
411
|
|
414
412
|
def streaming_message(self, message: Dict[str, Any]) -> None:
|
@@ -13,7 +13,11 @@ from tabulate import tabulate
|
|
13
13
|
|
14
14
|
import vision_agent.tools as T
|
15
15
|
from vision_agent.agent import Agent
|
16
|
-
from vision_agent.agent.agent_utils import
|
16
|
+
from vision_agent.agent.agent_utils import (
|
17
|
+
extract_code,
|
18
|
+
extract_json,
|
19
|
+
remove_installs_from_code,
|
20
|
+
)
|
17
21
|
from vision_agent.agent.vision_agent_coder_prompts import (
|
18
22
|
CODE,
|
19
23
|
FIX_BUG,
|
@@ -836,8 +840,8 @@ class VisionAgentCoder(Agent):
|
|
836
840
|
media=media_list,
|
837
841
|
)
|
838
842
|
success = cast(bool, results["success"])
|
839
|
-
code = cast(str, results["code"])
|
840
|
-
test = cast(str, results["test"])
|
843
|
+
code = remove_installs_from_code(cast(str, results["code"]))
|
844
|
+
test = remove_installs_from_code(cast(str, results["test"]))
|
841
845
|
working_memory.extend(results["working_memory"]) # type: ignore
|
842
846
|
plan.append({"code": code, "test": test, "plan": plan_i})
|
843
847
|
|
@@ -28,7 +28,8 @@ Here is the current conversation so far:
|
|
28
28
|
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
|
29
29
|
2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
|
30
30
|
3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
|
31
|
-
4. **
|
31
|
+
4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
|
32
|
+
5. **Output in JSON**: Respond in the following format in JSON:
|
32
33
|
|
33
34
|
```json
|
34
35
|
{{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
|
@@ -62,7 +63,7 @@ OBSERVATION:
|
|
62
63
|
[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
|
63
64
|
|
64
65
|
|
65
|
-
AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "
|
66
|
+
AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected two dogs, do the results look good to you?", "let_user_respond": true}
|
66
67
|
"""
|
67
68
|
|
68
69
|
EXAMPLES_CODE1_EXTRA = """
|
@@ -91,7 +92,7 @@ OBSERVATION:
|
|
91
92
|
----- stdout -----
|
92
93
|
[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
|
93
94
|
|
94
|
-
AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "
|
95
|
+
AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
|
95
96
|
"""
|
96
97
|
|
97
98
|
EXAMPLES_CODE2 = """
|
@@ -157,16 +158,16 @@ OBSERVATION:
|
|
157
158
|
----- stdout -----
|
158
159
|
2
|
159
160
|
|
160
|
-
AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "
|
161
|
+
AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
|
161
162
|
|
162
163
|
USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
|
163
164
|
|
164
|
-
AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "
|
165
|
+
AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "<execute_python>object_detection_fine_tuning([{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
|
165
166
|
|
166
167
|
OBSERVATION:
|
167
168
|
[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
|
168
169
|
|
169
|
-
AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "
|
170
|
+
AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "<execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
|
170
171
|
|
171
172
|
OBSERVATION:
|
172
173
|
[Artifact code.py edits]
|
@@ -116,7 +116,9 @@ class Artifacts:
|
|
116
116
|
)
|
117
117
|
output_str = "[Artifacts loaded]\n"
|
118
118
|
for k in self.artifacts.keys():
|
119
|
-
output_str +=
|
119
|
+
output_str += (
|
120
|
+
f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n"
|
121
|
+
)
|
120
122
|
output_str += "[End of artifacts]\n"
|
121
123
|
print(output_str)
|
122
124
|
return output_str
|
@@ -13,7 +13,7 @@ from uuid import UUID
|
|
13
13
|
import cv2
|
14
14
|
import numpy as np
|
15
15
|
import requests
|
16
|
-
from PIL import Image, ImageDraw,
|
16
|
+
from PIL import Image, ImageDraw, ImageFont
|
17
17
|
from pillow_heif import register_heif_opener # type: ignore
|
18
18
|
from pytube import YouTube # type: ignore
|
19
19
|
|
@@ -1150,10 +1150,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
|
|
1150
1150
|
def florence2_phrase_grounding(
|
1151
1151
|
prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
|
1152
1152
|
) -> List[Dict[str, Any]]:
|
1153
|
-
"""'florence2_phrase_grounding'
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1153
|
+
"""'florence2_phrase_grounding' is a tool that can detect multiple
|
1154
|
+
objects given a text prompt which can be object names or caption. You
|
1155
|
+
can optionally separate the object names in the text with commas. It returns a list
|
1156
|
+
of bounding boxes with normalized coordinates, label names and associated
|
1157
1157
|
probability scores of 1.0.
|
1158
1158
|
|
1159
1159
|
Parameters:
|
@@ -1812,6 +1812,11 @@ def save_image(image: np.ndarray, file_path: str) -> None:
|
|
1812
1812
|
"""
|
1813
1813
|
from IPython.display import display
|
1814
1814
|
|
1815
|
+
if not isinstance(image, np.ndarray) or (
|
1816
|
+
image.shape[0] == 0 and image.shape[1] == 0
|
1817
|
+
):
|
1818
|
+
raise ValueError("The image is not a valid NumPy array with shape (H, W, C)")
|
1819
|
+
|
1815
1820
|
pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
|
1816
1821
|
display(pil_image)
|
1817
1822
|
pil_image.save(file_path)
|
@@ -1838,6 +1843,15 @@ def save_video(
|
|
1838
1843
|
if fps <= 0:
|
1839
1844
|
raise ValueError(f"fps must be greater than 0 got {fps}")
|
1840
1845
|
|
1846
|
+
if not isinstance(frames, list) or len(frames) == 0:
|
1847
|
+
raise ValueError("Frames must be a list of NumPy arrays")
|
1848
|
+
|
1849
|
+
for frame in frames:
|
1850
|
+
if not isinstance(frame, np.ndarray) or (
|
1851
|
+
frame.shape[0] == 0 and frame.shape[1] == 0
|
1852
|
+
):
|
1853
|
+
raise ValueError("A frame is not a valid NumPy array with shape (H, W, C)")
|
1854
|
+
|
1841
1855
|
if output_video_path is None:
|
1842
1856
|
output_video_path = tempfile.NamedTemporaryFile(
|
1843
1857
|
delete=False, suffix=".mp4"
|
@@ -1907,30 +1921,36 @@ def overlay_bounding_boxes(
|
|
1907
1921
|
bboxes = bbox_int[i]
|
1908
1922
|
bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
|
1909
1923
|
|
1910
|
-
|
1911
|
-
|
1912
|
-
|
1913
|
-
|
1914
|
-
|
1915
|
-
|
1916
|
-
|
1917
|
-
|
1918
|
-
|
1919
|
-
|
1920
|
-
|
1921
|
-
|
1922
|
-
|
1923
|
-
scores = elt["score"]
|
1924
|
-
|
1925
|
-
# denormalize the box if it is normalized
|
1926
|
-
box = denormalize_bbox(box, (height, width))
|
1927
|
-
draw.rectangle(box, outline=color[label], width=4)
|
1928
|
-
text = f"{label}: {scores:.2f}"
|
1929
|
-
text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
|
1930
|
-
draw.rectangle(
|
1931
|
-
(box[0], box[1], text_box[2], text_box[3]), fill=color[label]
|
1924
|
+
if len(bboxes) > 20:
|
1925
|
+
pil_image = _plot_counting(pil_image, bboxes, color)
|
1926
|
+
else:
|
1927
|
+
width, height = pil_image.size
|
1928
|
+
fontsize = max(12, int(min(width, height) / 40))
|
1929
|
+
draw = ImageDraw.Draw(pil_image)
|
1930
|
+
font = ImageFont.truetype(
|
1931
|
+
str(
|
1932
|
+
resources.files("vision_agent.fonts").joinpath(
|
1933
|
+
"default_font_ch_en.ttf"
|
1934
|
+
)
|
1935
|
+
),
|
1936
|
+
fontsize,
|
1932
1937
|
)
|
1933
|
-
|
1938
|
+
|
1939
|
+
for elt in bboxes:
|
1940
|
+
label = elt["label"]
|
1941
|
+
box = elt["bbox"]
|
1942
|
+
scores = elt["score"]
|
1943
|
+
|
1944
|
+
# denormalize the box if it is normalized
|
1945
|
+
box = denormalize_bbox(box, (height, width))
|
1946
|
+
draw.rectangle(box, outline=color[label], width=4)
|
1947
|
+
text = f"{label}: {scores:.2f}"
|
1948
|
+
text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
|
1949
|
+
draw.rectangle(
|
1950
|
+
(box[0], box[1], text_box[2], text_box[3]), fill=color[label]
|
1951
|
+
)
|
1952
|
+
draw.text((box[0], box[1]), text, fill="black", font=font)
|
1953
|
+
|
1934
1954
|
frame_out.append(np.array(pil_image))
|
1935
1955
|
return frame_out[0] if len(frame_out) == 1 else frame_out
|
1936
1956
|
|
@@ -2089,39 +2109,19 @@ def overlay_heat_map(
|
|
2089
2109
|
return np.array(combined)
|
2090
2110
|
|
2091
2111
|
|
2092
|
-
def
|
2093
|
-
image:
|
2094
|
-
|
2095
|
-
|
2096
|
-
|
2097
|
-
|
2098
|
-
Parameters:
|
2099
|
-
image (np.ndarray): The image to display the bounding boxes on.
|
2100
|
-
instances (List[Dict[str, Any]]): A list of dictionaries containing the bounding
|
2101
|
-
box information of each instance
|
2102
|
-
|
2103
|
-
Returns:
|
2104
|
-
np.ndarray: The image with the instance_id displayed
|
2105
|
-
|
2106
|
-
Example
|
2107
|
-
-------
|
2108
|
-
>>> image_with_bboxes = overlay_counting_results(
|
2109
|
-
image, [{'score': 0.99, 'label': 'object', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
|
2110
|
-
)
|
2111
|
-
"""
|
2112
|
-
pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
|
2113
|
-
color = (158, 218, 229)
|
2114
|
-
|
2115
|
-
width, height = pil_image.size
|
2112
|
+
def _plot_counting(
|
2113
|
+
image: Image.Image,
|
2114
|
+
bboxes: List[Dict[str, Any]],
|
2115
|
+
colors: Dict[str, Tuple[int, int, int]],
|
2116
|
+
) -> Image.Image:
|
2117
|
+
width, height = image.size
|
2116
2118
|
fontsize = max(10, int(min(width, height) / 80))
|
2117
|
-
|
2118
|
-
draw = ImageDraw.Draw(pil_image)
|
2119
|
+
draw = ImageDraw.Draw(image)
|
2119
2120
|
font = ImageFont.truetype(
|
2120
2121
|
str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
|
2121
2122
|
fontsize,
|
2122
2123
|
)
|
2123
|
-
|
2124
|
-
for i, elt in enumerate(instances, 1):
|
2124
|
+
for i, elt in enumerate(bboxes, 1):
|
2125
2125
|
label = f"{i}"
|
2126
2126
|
box = elt["bbox"]
|
2127
2127
|
|
@@ -2143,7 +2143,7 @@ def overlay_counting_results(
|
|
2143
2143
|
text_y1 = cy + text_height / 2
|
2144
2144
|
|
2145
2145
|
# Draw the rectangle encapsulating the text
|
2146
|
-
draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=
|
2146
|
+
draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=colors[elt["label"]])
|
2147
2147
|
|
2148
2148
|
# Draw the text at the center of the bounding box
|
2149
2149
|
draw.text(
|
@@ -2154,7 +2154,7 @@ def overlay_counting_results(
|
|
2154
2154
|
anchor="lt",
|
2155
2155
|
)
|
2156
2156
|
|
2157
|
-
return
|
2157
|
+
return image
|
2158
2158
|
|
2159
2159
|
|
2160
2160
|
FUNCTION_TOOLS = [
|
@@ -2187,7 +2187,6 @@ UTIL_TOOLS = [
|
|
2187
2187
|
overlay_bounding_boxes,
|
2188
2188
|
overlay_segmentation_masks,
|
2189
2189
|
overlay_heat_map,
|
2190
|
-
overlay_counting_results,
|
2191
2190
|
]
|
2192
2191
|
|
2193
2192
|
TOOLS = FUNCTION_TOOLS + UTIL_TOOLS
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{vision_agent-0.2.155 → vision_agent-0.2.157}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|