vision-agent 0.2.156__py3-none-any.whl → 0.2.158__py3-none-any.whl
- vision_agent/agent/vision_agent.py +3 -1
- {vision_agent-0.2.156.dist-info → vision_agent-0.2.158.dist-info}/METADATA +213 -95
- {vision_agent-0.2.156.dist-info → vision_agent-0.2.158.dist-info}/RECORD +5 -5
- {vision_agent-0.2.156.dist-info → vision_agent-0.2.158.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.156.dist-info → vision_agent-0.2.158.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py

@@ -383,7 +383,7 @@ class VisionAgent(Agent):
                 obs_chat_elt: Message = {"role": "observation", "content": obs}
                 if media_obs and result.success:
                     obs_chat_elt["media"] = [
-                        Path(
+                        Path(self.local_artifacts_path).parent / media_ob
                         for media_ob in media_obs
                     ]
 
@@ -407,6 +407,8 @@ class VisionAgent(Agent):
             code_interpreter.download_file(
                 str(remote_artifacts_path.name), str(self.local_artifacts_path)
             )
+            artifacts.load(self.local_artifacts_path)
+            artifacts.save()
         return orig_chat, artifacts
 
     def streaming_message(self, message: Dict[str, Any]) -> None:
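For readers skimming the patch: the two added lines reload the `artifacts` object from the freshly downloaded local file and write it back out, keeping the in-memory and on-disk copies in sync. A minimal sketch of that load/save behaviour, assuming the artifacts object acts like the pickle-backed mapping the METADATA below describes (the class here is illustrative, not the package's implementation):

```python
import pickle
from pathlib import Path


class ArtifactsSketch(dict):
    """Hypothetical pickle-backed artifacts store with load/save like the calls above."""

    def __init__(self, path: str) -> None:
        super().__init__()
        self.path = Path(path)

    def load(self, path: str) -> None:
        # replace the in-memory contents with the freshly downloaded local file
        with open(path, "rb") as f:
            self.update(pickle.load(f))

    def save(self) -> None:
        # persist the current contents back to the local artifacts path
        with open(self.path, "wb") as f:
            pickle.dump(dict(self), f)
```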
{vision_agent-0.2.156.dist-info → vision_agent-0.2.158.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.156
+Version: 0.2.158
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -56,19 +56,23 @@ accomplish the task you want. Vision Agent aims to provide an in-seconds experie
 allowing users to describe their problem in text and have the agent framework generate
 code to solve the task for them. Check out our discord for updates and roadmaps!
 
+## Table of Contents
+- [🚀Quick Start](#quick-start)
+- [📚Documentation](#documentation)
+- [🔍🤖Vision Agent](#vision-agent-basic-usage)
+- [🛠️Tools](#tools)
+- [🤖LMMs](#lmms)
+- [💻🤖Vision Agent Coder](#vision-agent-coder)
+- [🏗️Additional Backends](#additional-backends)
 
-##
+## Quick Start
+### Web Application
+The fastest way to test out Vision Agent is to use our web application. You can find it
+[here](https://va.landing.ai/).
 
-Try Vision Agent live on (note this may not be running the most up-to-date version) [va.landing.ai](https://va.landing.ai/)
 
-## Documentation
-
-[Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
-
-
-## Getting Started
 ### Installation
-To get started, you can install
+To get started with the Python library, you can install it using pip:
 
 ```bash
 pip install vision-agent
@@ -82,17 +86,93 @@ export ANTHROPIC_API_KEY="your-api-key"
 export OPENAI_API_KEY="your-api-key"
 ```
 
-###
-
-
-
-
-
-
+### Basic Usage
+To get started, you can just import the `VisionAgent` and start chatting with it:
+```python
+>>> from vision_agent.agent import VisionAgent
+>>> agent = VisionAgent()
+>>> resp = agent("Hello")
+>>> print(resp)
+[{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "{'thoughts': 'The user has greeted me. I will respond with a greeting and ask how I can assist them.', 'response': 'Hello! How can I assist you today?', 'let_user_respond': True}"}]
+>>> resp.append({"role": "user", "content": "Can you count the number of people in this image?", "media": ["people.jpg"]})
+>>> resp = agent(resp)
+```
+
+The chat messages are similar to `OpenAI`'s format with `role` and `content` keys, but
+in addition to those you can add `media`, which is a list of media files that can either
+be images or video files.
+
+## Documentation
+
+[Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
+
+## Vision Agent Basic Usage
+### Chatting and Message Formats
+`VisionAgent` is an agent that can chat with you and call other tools or agents to
+write vision code for you. You can interact with it like you would ChatGPT or any other
+chatbot. The agent uses Claude-3.5 for its LMM and OpenAI embeddings for searching
+for tools.
+
+The message format is:
+```json
+{
+    "role": "user",
+    "content": "Hello",
+    "media": ["image.jpg"]
+}
+```
+Where `role` can be `user`, `assistant` or `observation` if the agent has executed a
+function and needs to observe the output. `content` is always the text message and
+`media` is a list of media files that can be images or videos that you want the agent
+to examine.
+
+When the agent responds, inside its `content` you will find the following data structure:
+```json
+{
+    "thoughts": "The user has greeted me. I will respond with a greeting and ask how I can assist them.",
+    "response": "Hello! How can I assist you today?",
+    "let_user_respond": true
+}
+```
+
+`thoughts` are the thoughts the agent had when processing the message, `response` is the
+response it generated, which could contain a Python execution, and `let_user_respond` is
+a boolean that tells the agent whether it should wait for the user to respond before
+continuing; for example, it may want to execute code and look at the output before
+letting the user respond.
 
-
-
-
+### Chatting and Artifacts
+If you run `chat_with_code` you will also notice an `Artifact` object. `Artifact`s
+are a way to sync files between local and remote environments. The agent will read and
+write to the artifact object, which is just a pickle object, when it wants to save or
+load files.
+
+```python
+import vision_agent as va
+from vision_agent.tools.meta_tools import Artifact
+
+artifacts = Artifact("artifact.pkl")
+# you can store text files such as code or images in the artifact
+with open("code.py", "r") as f:
+    artifacts["code.py"] = f.read()
+with open("image.png", "rb") as f:
+    artifacts["image.png"] = f.read()
+
+agent = va.agent.VisionAgent()
+response, artifacts = agent.chat_with_code(
+    [
+        {
+            "role": "user",
+            "content": "Can you write code to count the number of people in image.png",
+        }
+    ],
+    artifacts=artifacts,
+)
+```
+
+### Running the Streamlit App
+To test out things quickly, sometimes it's easier to run the streamlit app locally to
+chat with `VisionAgent`; you can run the following command:
 
 ```bash
 pip install -r examples/chat/requirements.txt
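The `let_user_respond` flow described in the hunk above can be driven programmatically. A minimal sketch, assuming the assistant `content` parses as the Python-literal string shown in the Basic Usage example (the `ast.literal_eval` call and the follow-up prompt are illustrative assumptions, not part of the package):

```python
import ast

from vision_agent.agent import VisionAgent

agent = VisionAgent()
chat = agent("Hello")

# The last entry is the assistant's reply; per the example output above, its
# content is a dict-like string with thoughts/response/let_user_respond.
reply = ast.literal_eval(chat[-1]["content"])
print(reply["thoughts"])
print(reply["response"])

if reply["let_user_respond"]:
    # hand control back to the user with a follow-up message (hypothetical prompt)
    chat.append(
        {
            "role": "user",
            "content": "Can you count the number of people in this image?",
            "media": ["people.jpg"],
        }
    )
    chat = agent(chat)
```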
@@ -100,25 +180,117 @@ export WORKSPACE=/path/to/your/workspace
 export ZMQ_PORT=5555
 streamlit run examples/chat/app.py
 ```
-You can find more details about the streamlit app [here](examples/chat/)
+You can find more details about the streamlit app [here](examples/chat/). There are
+still some concurrency issues with the streamlit app, so if you find it doing weird things,
+clear your workspace and restart the app.
+
+## Tools
+There are a variety of tools for the model or the user to use. Some are executed locally
+while others are hosted for you. You can easily access them yourself; for example, if
+you want to run `owl_v2_image` and visualize the output you can run:
 
-#### Basic Programmatic Usage
 ```python
-
-
-
-
-
-
-
+import vision_agent.tools as T
+import matplotlib.pyplot as plt
+
+image = T.load_image("dogs.jpg")
+dets = T.owl_v2_image("dogs", image)
+# visualize the owl_v2_ bounding boxes on the image
+viz = T.overlay_bounding_boxes(image, dets)
+
+# plot the image in matplotlib or save it
+plt.imshow(viz)
+plt.show()
+T.save_image(viz, "viz.png")
 ```
 
-
-embeddings for tool searching.
+Or if you want to run on video data, for example track sharks and people at 10 FPS:
 
-
-
-
+```python
+frames_and_ts = T.extract_frames_and_timestamps("sharks.mp4", fps=10)
+# extract only the frames from frames and timestamps
+frames = [f["frame"] for f in frames_and_ts]
+# track the sharks and people in the frames, returns segmentation masks
+track = T.florence2_sam2_video_tracking("shark, person", frames)
+# plot the segmentation masks on the frames
+viz = T.overlay_segmentation_masks(frames, track)
+T.save_video(viz, "viz.mp4")
+```
+
+You can find all available tools in `vision_agent/tools/tools.py`; however, the
+`VisionAgent` will only utilize a subset of tools that have been tested and provide
+the best performance. Those can be found in the same file under the `FUNCTION_TOOLS`
+variable inside `tools.py`.
+
+#### Custom Tools
+If you can't find the tool you are looking for, you can also add custom tools to the
+agent:
+
+```python
+import vision_agent as va
+import numpy as np
+
+@va.tools.register_tool(imports=["import numpy as np"])
+def custom_tool(image_path: str) -> str:
+    """My custom tool documentation.
+
+    Parameters:
+        image_path (str): The path to the image.
+
+    Returns:
+        str: The result of the tool.
+
+    Example
+    -------
+    >>> custom_tool("image.jpg")
+    """
+
+    return np.zeros((10, 10))
+```
+
+You need to ensure you call `@va.tools.register_tool` with any imports it uses. Global
+variables will not be captured by `register_tool`, so you need to include them in the
+function. Make sure the documentation is in the same format above with description,
+`Parameters:`, `Returns:`, and `Example\n-------`. The `VisionAgent` will use your
+documentation when trying to determine when to use your tool. You can find an example
+use case [here](examples/custom_tools/) for adding a custom tool. Note you may need to
+play around with the prompt to ensure the model picks the tool when you want it to.
+
+Can't find the tool you need and want us to host it? Check out our
+[vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where
+we add the source code for all the tools used in `VisionAgent`.
+
+## LMMs
+All of our agents are based on LMMs, or Large Multimodal Models. We provide a thin
+abstraction layer on top of the underlying provider APIs to be able to more easily
+handle media.
+
+
+```python
+from vision_agent.lmm import AnthropicLMM
+
+lmm = AnthropicLMM()
+response = lmm("Describe this image", media=["apple.jpg"])
+>>> "This is an image of an apple."
+```
+
+Or you can use the `OpenAI`-style chat interface and pass it other media like videos:
+
+```python
+response = lmm(
+    [
+        {
+            "role": "user",
+            "content": "What's going on in this video?",
+            "media": ["video.mp4"]
+        }
+    ]
+)
+```
+
+## Vision Agent Coder
+Under the hood, `VisionAgent` uses `VisionAgentCoder` to generate code to solve
+vision tasks. You can use `VisionAgentCoder` directly to generate code if you want:
 
 ```python
 >>> from vision_agent.agent import VisionAgentCoder
@@ -128,17 +300,17 @@ You can interact with the agent as you would with any LLM or LMM model:
 
 Which produces the following code:
 ```python
-from vision_agent.tools import load_image,
+from vision_agent.tools import load_image, florence2_sam2_image
 
 def calculate_filled_percentage(image_path: str) -> float:
     # Step 1: Load the image
     image = load_image(image_path)
 
     # Step 2: Segment the jar
-    jar_segments =
+    jar_segments = florence2_sam2_image("jar", image)
 
     # Step 3: Segment the coffee beans
-    coffee_beans_segments =
+    coffee_beans_segments = florence2_sam2_image("coffee beans", image)
 
     # Step 4: Calculate the area of the segmented jar
     jar_area = 0
@@ -166,7 +338,7 @@ mode by passing in the verbose argument:
 >>> agent = VisionAgentCoder(verbosity=2)
 ```
 
-
+### Detailed Usage
 You can also have it return more information by calling `chat_with_workflow`. The format
 of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
 
@@ -186,7 +358,7 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `med
 With this you can examine more detailed information such as the testing code, testing
 results, plan or working memory it used to complete the task.
 
-
+### Multi-turn conversations
 You can have multi-turn conversations with vision-agent as well, giving it feedback on
 the code and having it update. You just need to add the code as a response from the
 assistant:
@@ -212,60 +384,6 @@ conv.append(
 result = agent.chat_with_workflow(conv)
 ```
 
-### Tools
-There are a variety of tools for the model or the user to use. Some are executed locally
-while others are hosted for you. You can easily access them yourself, for example if
-you want to run `owl_v2_image` and visualize the output you can run:
-
-```python
-import vision_agent.tools as T
-import matplotlib.pyplot as plt
-
-image = T.load_image("dogs.jpg")
-dets = T.owl_v2_image("dogs", image)
-viz = T.overlay_bounding_boxes(image, dets)
-plt.imshow(viz)
-plt.show()
-```
-
-You can find all available tools in `vision_agent/tools/tools.py`, however,
-`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide
-the best performance. Those can be found in the same file under the `TOOLS` variable.
-
-If you can't find the tool you are looking for you can also add custom tools to the
-agent:
-
-```python
-import vision_agent as va
-import numpy as np
-
-@va.tools.register_tool(imports=["import numpy as np"])
-def custom_tool(image_path: str) -> str:
-    """My custom tool documentation.
-
-    Parameters:
-        image_path (str): The path to the image.
-
-    Returns:
-        str: The result of the tool.
-
-    Example
-    -------
-    >>> custom_tool("image.jpg")
-    """
-
-    return np.zeros((10, 10))
-```
-
-You need to ensure you call `@va.tools.register_tool` with any imports it uses. Global
-variables will not be captured by `register_tool` so you need to include them in the
-function. Make sure the documentation is in the same format above with description,
-`Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
-[here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
-
-Can't find the tool you need and want add it to `VisionAgent`? Check out our
-[vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where
-we add the source code for all the tools used in `VisionAgent`.
 
 ## Additional Backends
 ### Anthropic
@@ -370,9 +488,9 @@ agent = va.agent.AzureVisionAgentCoder()
 
 ******************************************************************************************************************************
 
-
+## Q&A
 
-
+### How to get started with OpenAI API credits
 
 1. Visit the [OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
 2. Follow the instructions to purchase and manage your API credits.
{vision_agent-0.2.156.dist-info → vision_agent-0.2.158.dist-info}/RECORD

@@ -2,7 +2,7 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/__init__.py,sha256=NF2LABqHixLvbsOIO-fe-VKZ7awvShLtcT0oQT4eWtI,235
 vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
 vision_agent/agent/agent_utils.py,sha256=eIpLz2NunEqEsBBrECJaD34-2uY0bsFNnW-XKfqqohs,2518
-vision_agent/agent/vision_agent.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=GAoTxGrWrJGk-4rC-e1BvjMzv0UuIVv45rGVW3kmLJk,18463
 vision_agent/agent/vision_agent_coder.py,sha256=2ZoGikn2nakGDfs20XRshZjQUyvbw6l47UhExJAYkqI,38515
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
 vision_agent/agent/vision_agent_prompts.py,sha256=LZ9Bnx7ZFkqbNOMqwfdiWZU4niND9Z1ArcFHNSn_jzA,11187
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.158.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.158.dist-info/METADATA,sha256=VGAG_jcVVy8RRa_H6KuxCUIkrATVfyw_WBOceGhVgN4,17753
+vision_agent-0.2.158.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.158.dist-info/RECORD,,
{vision_agent-0.2.156.dist-info → vision_agent-0.2.158.dist-info}/LICENSE: File without changes
{vision_agent-0.2.156.dist-info → vision_agent-0.2.158.dist-info}/WHEEL: File without changes