vision-agent 0.2.140__tar.gz → 0.2.142__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.140 → vision_agent-0.2.142}/PKG-INFO +60 -12
- {vision_agent-0.2.140 → vision_agent-0.2.142}/README.md +59 -11
- {vision_agent-0.2.140 → vision_agent-0.2.142}/pyproject.toml +1 -1
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/agent/__init__.py +2 -1
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/agent/agent_utils.py +8 -2
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/agent/vision_agent.py +97 -17
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/agent/vision_agent_coder.py +93 -66
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/agent/vision_agent_coder_prompts.py +53 -19
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/agent/vision_agent_prompts.py +31 -9
- vision_agent-0.2.142/vision_agent/lmm/__init__.py +2 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/lmm/lmm.py +6 -9
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/tools/__init__.py +1 -1
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/tools/meta_tools.py +65 -33
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/tools/tools.py +115 -30
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/tools/tools_types.py +1 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/utils/image_utils.py +18 -7
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/utils/video.py +2 -1
- vision_agent-0.2.140/vision_agent/lmm/__init__.py +0 -2
- {vision_agent-0.2.140 → vision_agent-0.2.142}/LICENSE +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.140 → vision_agent-0.2.142}/vision_agent/utils/type_defs.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.140
|
3
|
+
Version: 0.2.142
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -74,10 +74,11 @@ To get started, you can install the library using pip:
|
|
74
74
|
pip install vision-agent
|
75
75
|
```
|
76
76
|
|
77
|
-
Ensure you have an OpenAI API key and set
|
78
|
-
using Azure OpenAI please see the Azure setup section):
|
77
|
+
Ensure you have an Anthropic key and an OpenAI API key and set them in your environment
|
78
|
+
variables (if you are using Azure OpenAI please see the Azure setup section):
|
79
79
|
|
80
80
|
```bash
|
81
|
+
export ANTHROPIC_API_KEY="your-api-key"
|
81
82
|
export OPENAI_API_KEY="your-api-key"
|
82
83
|
```
|
83
84
|
|
@@ -112,6 +113,9 @@ You can find more details about the streamlit app [here](examples/chat/).
|
|
112
113
|
>>> resp = agent(resp)
|
113
114
|
```
|
114
115
|
|
116
|
+
`VisionAgent` currently utilizes Claude-3.5 as its default LMM and uses OpenAI for
|
117
|
+
embeddings for tool searching.
|
118
|
+
|
115
119
|
### Vision Agent Coder
|
116
120
|
#### Basic Usage
|
117
121
|
You can interact with the agent as you would with any LLM or LMM model:
|
@@ -173,7 +177,8 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `med
|
|
173
177
|
"code": "from vision_agent.tools import ..."
|
174
178
|
"test": "calculate_filled_percentage('jar.jpg')",
|
175
179
|
"test_result": "...",
|
176
|
-
"
|
180
|
+
"plans": {"plan1": {"thoughts": "..."}, ...},
|
181
|
+
"plan_thoughts": "...",
|
177
182
|
"working_memory": ...,
|
178
183
|
}
|
179
184
|
```
|
@@ -210,20 +215,25 @@ result = agent.chat_with_workflow(conv)
|
|
210
215
|
### Tools
|
211
216
|
There are a variety of tools for the model or the user to use. Some are executed locally
|
212
217
|
while others are hosted for you. You can easily access them yourself, for example if
|
213
|
-
you want to run `
|
218
|
+
you want to run `owl_v2_image` and visualize the output you can run:
|
214
219
|
|
215
220
|
```python
|
216
221
|
import vision_agent.tools as T
|
217
222
|
import matplotlib.pyplot as plt
|
218
223
|
|
219
224
|
image = T.load_image("dogs.jpg")
|
220
|
-
dets = T.
|
225
|
+
dets = T.owl_v2_image("dogs", image)
|
221
226
|
viz = T.overlay_bounding_boxes(image, dets)
|
222
227
|
plt.imshow(viz)
|
223
228
|
plt.show()
|
224
229
|
```
|
225
230
|
|
226
|
-
You can
|
231
|
+
You can find all available tools in `vision_agent/tools/tools.py`, however,
|
232
|
+
`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide
|
233
|
+
the best performance. Those can be found in the same file under the `TOOLS` variable.
|
234
|
+
|
235
|
+
If you can't find the tool you are looking for you can also add custom tools to the
|
236
|
+
agent:
|
227
237
|
|
228
238
|
```python
|
229
239
|
import vision_agent as va
|
@@ -258,9 +268,48 @@ Can't find the tool you need and want add it to `VisionAgent`? Check out our
|
|
258
268
|
we add the source code for all the tools used in `VisionAgent`.
|
259
269
|
|
260
270
|
## Additional Backends
|
271
|
+
### Anthropic
|
272
|
+
`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
|
273
|
+
Anthropic API key and set it in your environment variables:
|
274
|
+
|
275
|
+
```bash
|
276
|
+
export ANTHROPIC_API_KEY="your-api-key"
|
277
|
+
```
|
278
|
+
|
279
|
+
Because Anthropic does not support embedding models, the default embedding model used
|
280
|
+
is the OpenAI model so you will also need to set your OpenAI API key:
|
281
|
+
|
282
|
+
```bash
|
283
|
+
export OPENAI_API_KEY="your-api-key"
|
284
|
+
```
|
285
|
+
|
286
|
+
Usage is the same as `VisionAgentCoder`:
|
287
|
+
|
288
|
+
```python
|
289
|
+
>>> import vision_agent as va
|
290
|
+
>>> agent = va.agent.AnthropicVisionAgentCoder()
|
291
|
+
>>> agent("Count the apples in the image", media="apples.jpg")
|
292
|
+
```
|
293
|
+
|
294
|
+
### OpenAI
|
295
|
+
`OpenAIVisionAgentCoder` uses OpenAI. To get started you just need to get an OpenAI API
|
296
|
+
key and set it in your environment variables:
|
297
|
+
|
298
|
+
```bash
|
299
|
+
export OPENAI_API_KEY="your-api-key"
|
300
|
+
```
|
301
|
+
|
302
|
+
Usage is the same as `VisionAgentCoder`:
|
303
|
+
|
304
|
+
```python
|
305
|
+
>>> import vision_agent as va
|
306
|
+
>>> agent = va.agent.OpenAIVisionAgentCoder()
|
307
|
+
>>> agent("Count the apples in the image", media="apples.jpg")
|
308
|
+
```
|
309
|
+
|
310
|
+
|
261
311
|
### Ollama
|
262
|
-
|
263
|
-
a few models:
|
312
|
+
`OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:
|
264
313
|
|
265
314
|
```bash
|
266
315
|
ollama pull llama3.1
|
@@ -281,9 +330,8 @@ tools. You can use it just like you would use `VisionAgentCoder`:
|
|
281
330
|
> WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B.
|
282
331
|
|
283
332
|
### Azure OpenAI
|
284
|
-
|
285
|
-
|
286
|
-
`VisionAgentCoder`:
|
333
|
+
`AzureVisionAgentCoder` uses Azure OpenAI models. To get started follow the Azure Setup
|
334
|
+
section below. You can use it just like you would use `VisionAgentCoder`:
|
287
335
|
|
288
336
|
```python
|
289
337
|
>>> import vision_agent as va
|
@@ -33,10 +33,11 @@ To get started, you can install the library using pip:
|
|
33
33
|
pip install vision-agent
|
34
34
|
```
|
35
35
|
|
36
|
-
Ensure you have an OpenAI API key and set
|
37
|
-
using Azure OpenAI please see the Azure setup section):
|
36
|
+
Ensure you have an Anthropic key and an OpenAI API key and set them in your environment
|
37
|
+
variables (if you are using Azure OpenAI please see the Azure setup section):
|
38
38
|
|
39
39
|
```bash
|
40
|
+
export ANTHROPIC_API_KEY="your-api-key"
|
40
41
|
export OPENAI_API_KEY="your-api-key"
|
41
42
|
```
|
42
43
|
|
@@ -71,6 +72,9 @@ You can find more details about the streamlit app [here](examples/chat/).
|
|
71
72
|
>>> resp = agent(resp)
|
72
73
|
```
|
73
74
|
|
75
|
+
`VisionAgent` currently utilizes Claude-3.5 as its default LMM and uses OpenAI for
|
76
|
+
embeddings for tool searching.
|
77
|
+
|
74
78
|
### Vision Agent Coder
|
75
79
|
#### Basic Usage
|
76
80
|
You can interact with the agent as you would with any LLM or LMM model:
|
@@ -132,7 +136,8 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `med
|
|
132
136
|
"code": "from vision_agent.tools import ..."
|
133
137
|
"test": "calculate_filled_percentage('jar.jpg')",
|
134
138
|
"test_result": "...",
|
135
|
-
"
|
139
|
+
"plans": {"plan1": {"thoughts": "..."}, ...},
|
140
|
+
"plan_thoughts": "...",
|
136
141
|
"working_memory": ...,
|
137
142
|
}
|
138
143
|
```
|
@@ -169,20 +174,25 @@ result = agent.chat_with_workflow(conv)
|
|
169
174
|
### Tools
|
170
175
|
There are a variety of tools for the model or the user to use. Some are executed locally
|
171
176
|
while others are hosted for you. You can easily access them yourself, for example if
|
172
|
-
you want to run `
|
177
|
+
you want to run `owl_v2_image` and visualize the output you can run:
|
173
178
|
|
174
179
|
```python
|
175
180
|
import vision_agent.tools as T
|
176
181
|
import matplotlib.pyplot as plt
|
177
182
|
|
178
183
|
image = T.load_image("dogs.jpg")
|
179
|
-
dets = T.
|
184
|
+
dets = T.owl_v2_image("dogs", image)
|
180
185
|
viz = T.overlay_bounding_boxes(image, dets)
|
181
186
|
plt.imshow(viz)
|
182
187
|
plt.show()
|
183
188
|
```
|
184
189
|
|
185
|
-
You can
|
190
|
+
You can find all available tools in `vision_agent/tools/tools.py`, however,
|
191
|
+
`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide
|
192
|
+
the best performance. Those can be found in the same file under the `TOOLS` variable.
|
193
|
+
|
194
|
+
If you can't find the tool you are looking for you can also add custom tools to the
|
195
|
+
agent:
|
186
196
|
|
187
197
|
```python
|
188
198
|
import vision_agent as va
|
@@ -217,9 +227,48 @@ Can't find the tool you need and want add it to `VisionAgent`? Check out our
|
|
217
227
|
we add the source code for all the tools used in `VisionAgent`.
|
218
228
|
|
219
229
|
## Additional Backends
|
230
|
+
### Anthropic
|
231
|
+
`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
|
232
|
+
Anthropic API key and set it in your environment variables:
|
233
|
+
|
234
|
+
```bash
|
235
|
+
export ANTHROPIC_API_KEY="your-api-key"
|
236
|
+
```
|
237
|
+
|
238
|
+
Because Anthropic does not support embedding models, the default embedding model used
|
239
|
+
is the OpenAI model so you will also need to set your OpenAI API key:
|
240
|
+
|
241
|
+
```bash
|
242
|
+
export OPENAI_API_KEY="your-api-key"
|
243
|
+
```
|
244
|
+
|
245
|
+
Usage is the same as `VisionAgentCoder`:
|
246
|
+
|
247
|
+
```python
|
248
|
+
>>> import vision_agent as va
|
249
|
+
>>> agent = va.agent.AnthropicVisionAgentCoder()
|
250
|
+
>>> agent("Count the apples in the image", media="apples.jpg")
|
251
|
+
```
|
252
|
+
|
253
|
+
### OpenAI
|
254
|
+
`OpenAIVisionAgentCoder` uses OpenAI. To get started you just need to get an OpenAI API
|
255
|
+
key and set it in your environment variables:
|
256
|
+
|
257
|
+
```bash
|
258
|
+
export OPENAI_API_KEY="your-api-key"
|
259
|
+
```
|
260
|
+
|
261
|
+
Usage is the same as `VisionAgentCoder`:
|
262
|
+
|
263
|
+
```python
|
264
|
+
>>> import vision_agent as va
|
265
|
+
>>> agent = va.agent.OpenAIVisionAgentCoder()
|
266
|
+
>>> agent("Count the apples in the image", media="apples.jpg")
|
267
|
+
```
|
268
|
+
|
269
|
+
|
220
270
|
### Ollama
|
221
|
-
|
222
|
-
a few models:
|
271
|
+
`OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:
|
223
272
|
|
224
273
|
```bash
|
225
274
|
ollama pull llama3.1
|
@@ -240,9 +289,8 @@ tools. You can use it just like you would use `VisionAgentCoder`:
|
|
240
289
|
> WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B.
|
241
290
|
|
242
291
|
### Azure OpenAI
|
243
|
-
|
244
|
-
|
245
|
-
`VisionAgentCoder`:
|
292
|
+
`AzureVisionAgentCoder` uses Azure OpenAI models. To get started follow the Azure Setup
|
293
|
+
section below. You can use it just like you would use `VisionAgentCoder`:
|
246
294
|
|
247
295
|
```python
|
248
296
|
>>> import vision_agent as va
|
@@ -40,12 +40,18 @@ def _strip_markdown_code(inp_str: str) -> str:
|
|
40
40
|
|
41
41
|
|
42
42
|
def extract_json(json_str: str) -> Dict[str, Any]:
|
43
|
-
|
43
|
+
json_str_mod = json_str.replace("\n", " ").strip()
|
44
|
+
json_str_mod = json_str_mod.replace("'", '"')
|
45
|
+
json_str_mod = json_str_mod.replace(": True", ": true").replace(
|
46
|
+
": False", ": false"
|
47
|
+
)
|
44
48
|
|
45
49
|
try:
|
46
|
-
return json.loads(
|
50
|
+
return json.loads(json_str_mod) # type: ignore
|
47
51
|
except json.JSONDecodeError:
|
48
52
|
json_orig = json_str
|
53
|
+
# don't replace quotes here or booleans since it can also introduce errors
|
54
|
+
json_str = json_str.replace("\n", " ").strip()
|
49
55
|
json_str = _strip_markdown_code(json_str)
|
50
56
|
json_str = _find_markdown_json(json_str)
|
51
57
|
json_dict = _extract_sub_json(json_str)
|
@@ -3,18 +3,23 @@ import logging
|
|
3
3
|
import os
|
4
4
|
import tempfile
|
5
5
|
from pathlib import Path
|
6
|
-
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
6
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
7
7
|
|
8
8
|
from vision_agent.agent import Agent
|
9
9
|
from vision_agent.agent.agent_utils import extract_json
|
10
10
|
from vision_agent.agent.vision_agent_prompts import (
|
11
11
|
EXAMPLES_CODE1,
|
12
12
|
EXAMPLES_CODE2,
|
13
|
+
EXAMPLES_CODE3,
|
13
14
|
VA_CODE,
|
14
15
|
)
|
15
|
-
from vision_agent.lmm import LMM, Message, OpenAILMM
|
16
|
+
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
|
16
17
|
from vision_agent.tools import META_TOOL_DOCSTRING
|
17
|
-
from vision_agent.tools.meta_tools import
|
18
|
+
from vision_agent.tools.meta_tools import (
|
19
|
+
Artifacts,
|
20
|
+
check_and_load_image,
|
21
|
+
use_extra_vision_agent_args,
|
22
|
+
)
|
18
23
|
from vision_agent.utils import CodeInterpreterFactory
|
19
24
|
from vision_agent.utils.execute import CodeInterpreter, Execution
|
20
25
|
|
@@ -30,7 +35,7 @@ class BoilerplateCode:
|
|
30
35
|
pre_code = [
|
31
36
|
"from typing import *",
|
32
37
|
"from vision_agent.utils.execute import CodeInterpreter",
|
33
|
-
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact,
|
38
|
+
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
|
34
39
|
"artifacts = Artifacts('{remote_path}')",
|
35
40
|
"artifacts.load('{remote_path}')",
|
36
41
|
]
|
@@ -68,10 +73,18 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
|
68
73
|
|
69
74
|
prompt = VA_CODE.format(
|
70
75
|
documentation=META_TOOL_DOCSTRING,
|
71
|
-
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
|
76
|
+
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
|
72
77
|
conversation=conversation,
|
73
78
|
)
|
74
|
-
|
79
|
+
message: Message = {"role": "user", "content": prompt}
|
80
|
+
# only add recent media so we don't overload the model with old images
|
81
|
+
if (
|
82
|
+
chat[-1]["role"] == "observation"
|
83
|
+
and "media" in chat[-1]
|
84
|
+
and len(chat[-1]["media"]) > 0 # type: ignore
|
85
|
+
):
|
86
|
+
message["media"] = chat[-1]["media"]
|
87
|
+
return extract_json(orch([message], stream=False)) # type: ignore
|
75
88
|
|
76
89
|
|
77
90
|
def run_code_action(
|
@@ -136,10 +149,8 @@ class VisionAgent(Agent):
|
|
136
149
|
code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
|
137
150
|
"""
|
138
151
|
|
139
|
-
self.agent = (
|
140
|
-
|
141
|
-
)
|
142
|
-
self.max_iterations = 100
|
152
|
+
self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
153
|
+
self.max_iterations = 12
|
143
154
|
self.verbosity = verbosity
|
144
155
|
self.code_sandbox_runtime = code_sandbox_runtime
|
145
156
|
self.callback_message = callback_message
|
@@ -267,7 +278,8 @@ class VisionAgent(Agent):
|
|
267
278
|
orig_chat.append({"role": "observation", "content": artifacts_loaded})
|
268
279
|
self.streaming_message({"role": "observation", "content": artifacts_loaded})
|
269
280
|
|
270
|
-
if
|
281
|
+
if int_chat[-1]["role"] == "user":
|
282
|
+
last_user_message_content = cast(str, int_chat[-1].get("content", ""))
|
271
283
|
user_code_action = parse_execution(last_user_message_content, False)
|
272
284
|
if user_code_action is not None:
|
273
285
|
user_result, user_obs = run_code_action(
|
@@ -309,8 +321,7 @@ class VisionAgent(Agent):
|
|
309
321
|
else:
|
310
322
|
self.streaming_message({"role": "assistant", "content": response})
|
311
323
|
|
312
|
-
|
313
|
-
break
|
324
|
+
finished = response["let_user_respond"]
|
314
325
|
|
315
326
|
code_action = parse_execution(
|
316
327
|
response["response"], test_multi_plan, customized_tool_names
|
@@ -321,13 +332,22 @@ class VisionAgent(Agent):
|
|
321
332
|
code_action, code_interpreter, str(remote_artifacts_path)
|
322
333
|
)
|
323
334
|
|
335
|
+
media_obs = check_and_load_image(code_action)
|
336
|
+
|
324
337
|
if self.verbosity >= 1:
|
325
338
|
_LOGGER.info(obs)
|
339
|
+
|
340
|
+
chat_elt: Message = {"role": "observation", "content": obs}
|
341
|
+
if media_obs and result.success:
|
342
|
+
chat_elt["media"] = [
|
343
|
+
Path(code_interpreter.remote_path) / media_ob
|
344
|
+
for media_ob in media_obs
|
345
|
+
]
|
346
|
+
|
326
347
|
# don't add execution results to internal chat
|
327
|
-
int_chat.append(
|
328
|
-
|
329
|
-
|
330
|
-
)
|
348
|
+
int_chat.append(chat_elt)
|
349
|
+
chat_elt["execution"] = result
|
350
|
+
orig_chat.append(chat_elt)
|
331
351
|
self.streaming_message(
|
332
352
|
{
|
333
353
|
"role": "observation",
|
@@ -353,3 +373,63 @@ class VisionAgent(Agent):
|
|
353
373
|
|
354
374
|
def log_progress(self, data: Dict[str, Any]) -> None:
|
355
375
|
pass
|
376
|
+
|
377
|
+
|
378
|
+
class OpenAIVisionAgent(VisionAgent):
|
379
|
+
def __init__(
|
380
|
+
self,
|
381
|
+
agent: Optional[LMM] = None,
|
382
|
+
verbosity: int = 0,
|
383
|
+
local_artifacts_path: Optional[Union[str, Path]] = None,
|
384
|
+
code_sandbox_runtime: Optional[str] = None,
|
385
|
+
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
386
|
+
) -> None:
|
387
|
+
"""Initialize the VisionAgent using OpenAI LMMs.
|
388
|
+
|
389
|
+
Parameters:
|
390
|
+
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
391
|
+
of other agents.
|
392
|
+
verbosity (int): The verbosity level of the agent.
|
393
|
+
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
394
|
+
artifacts file.
|
395
|
+
code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
|
396
|
+
"""
|
397
|
+
|
398
|
+
agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
|
399
|
+
super().__init__(
|
400
|
+
agent,
|
401
|
+
verbosity,
|
402
|
+
local_artifacts_path,
|
403
|
+
code_sandbox_runtime,
|
404
|
+
callback_message,
|
405
|
+
)
|
406
|
+
|
407
|
+
|
408
|
+
class AnthropicVisionAgent(VisionAgent):
|
409
|
+
def __init__(
|
410
|
+
self,
|
411
|
+
agent: Optional[LMM] = None,
|
412
|
+
verbosity: int = 0,
|
413
|
+
local_artifacts_path: Optional[Union[str, Path]] = None,
|
414
|
+
code_sandbox_runtime: Optional[str] = None,
|
415
|
+
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
416
|
+
) -> None:
|
417
|
+
"""Initialize the VisionAgent using Anthropic LMMs.
|
418
|
+
|
419
|
+
Parameters:
|
420
|
+
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
421
|
+
of other agents.
|
422
|
+
verbosity (int): The verbosity level of the agent.
|
423
|
+
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
424
|
+
artifacts file.
|
425
|
+
code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
|
426
|
+
"""
|
427
|
+
|
428
|
+
agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
429
|
+
super().__init__(
|
430
|
+
agent,
|
431
|
+
verbosity,
|
432
|
+
local_artifacts_path,
|
433
|
+
code_sandbox_runtime,
|
434
|
+
callback_message,
|
435
|
+
)
|