vision-agent 0.2.161__tar.gz → 0.2.163__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.161 → vision_agent-0.2.163}/PKG-INFO +8 -7
- {vision_agent-0.2.161 → vision_agent-0.2.163}/README.md +6 -6
- {vision_agent-0.2.161 → vision_agent-0.2.163}/pyproject.toml +2 -1
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/agent/__init__.py +8 -0
- vision_agent-0.2.163/vision_agent/agent/agent_utils.py +181 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/agent/vision_agent.py +54 -22
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/agent/vision_agent_coder.py +222 -512
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/agent/vision_agent_coder_prompts.py +12 -221
- vision_agent-0.2.163/vision_agent/agent/vision_agent_planner.py +583 -0
- vision_agent-0.2.163/vision_agent/agent/vision_agent_planner_prompts.py +199 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/tools/__init__.py +0 -1
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/tools/meta_tools.py +107 -35
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/tools/tools.py +2 -2
- vision_agent-0.2.161/vision_agent/agent/agent_utils.py +0 -85
- {vision_agent-0.2.161 → vision_agent-0.2.163}/LICENSE +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.163
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -27,6 +27,7 @@ Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
|
|
27
27
|
Requires-Dist: pydantic (==2.7.4)
|
28
28
|
Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
|
29
29
|
Requires-Dist: pytube (==15.0.0)
|
30
|
+
Requires-Dist: redbaron (>=0.9.2,<0.10.0)
|
30
31
|
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
31
32
|
Requires-Dist: rich (>=13.7.1,<14.0.0)
|
32
33
|
Requires-Dist: scipy (>=1.13.0,<1.14.0)
|
@@ -142,7 +143,7 @@ continuing, for example it may want to execute code and look at the output befor
|
|
142
143
|
letting the user respond.
|
143
144
|
|
144
145
|
### Chatting and Artifacts
|
145
|
-
If you run `
|
146
|
+
If you run `chat_with_artifacts` you will also notice an `Artifact` object. `Artifact`s
|
146
147
|
are a way to sync files between local and remote environments. The agent will read and
|
147
148
|
write to the artifact object, which is just a pickle object, when it wants to save or
|
148
149
|
load files.
|
@@ -159,7 +160,7 @@ with open("image.png", "rb") as f:
|
|
159
160
|
artifacts["image.png"] = f.read()
|
160
161
|
|
161
162
|
agent = va.agent.VisionAgent()
|
162
|
-
response, artifacts = agent.
|
163
|
+
response, artifacts = agent.chat_with_artifacts(
|
163
164
|
[
|
164
165
|
{
|
165
166
|
"role": "user",
|
@@ -339,11 +340,11 @@ mode by passing in the verbose argument:
|
|
339
340
|
```
|
340
341
|
|
341
342
|
### Detailed Usage
|
342
|
-
You can also have it return more information by calling `
|
343
|
+
You can also have it return more information by calling `generate_code`. The format
|
343
344
|
of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
|
344
345
|
|
345
346
|
```python
|
346
|
-
>>> results = agent.
|
347
|
+
>>> results = agent.generate_code([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
|
347
348
|
>>> print(results)
|
348
349
|
{
|
349
350
|
"code": "from vision_agent.tools import ..."
|
@@ -372,7 +373,7 @@ conv = [
|
|
372
373
|
"media": ["workers.png"],
|
373
374
|
}
|
374
375
|
]
|
375
|
-
result = agent.
|
376
|
+
result = agent.generate_code(conv)
|
376
377
|
code = result["code"]
|
377
378
|
conv.append({"role": "assistant", "content": code})
|
378
379
|
conv.append(
|
@@ -381,7 +382,7 @@ conv.append(
|
|
381
382
|
"content": "Can you also return the number of workers wearing safety gear?",
|
382
383
|
}
|
383
384
|
)
|
384
|
-
result = agent.
|
385
|
+
result = agent.generate_code(conv)
|
385
386
|
```
|
386
387
|
|
387
388
|
|
@@ -101,7 +101,7 @@ continuing, for example it may want to execute code and look at the output befor
|
|
101
101
|
letting the user respond.
|
102
102
|
|
103
103
|
### Chatting and Artifacts
|
104
|
-
If you run `
|
104
|
+
If you run `chat_with_artifacts` you will also notice an `Artifact` object. `Artifact`s
|
105
105
|
are a way to sync files between local and remote environments. The agent will read and
|
106
106
|
write to the artifact object, which is just a pickle object, when it wants to save or
|
107
107
|
load files.
|
@@ -118,7 +118,7 @@ with open("image.png", "rb") as f:
|
|
118
118
|
artifacts["image.png"] = f.read()
|
119
119
|
|
120
120
|
agent = va.agent.VisionAgent()
|
121
|
-
response, artifacts = agent.
|
121
|
+
response, artifacts = agent.chat_with_artifacts(
|
122
122
|
[
|
123
123
|
{
|
124
124
|
"role": "user",
|
@@ -298,11 +298,11 @@ mode by passing in the verbose argument:
|
|
298
298
|
```
|
299
299
|
|
300
300
|
### Detailed Usage
|
301
|
-
You can also have it return more information by calling `
|
301
|
+
You can also have it return more information by calling `generate_code`. The format
|
302
302
|
of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
|
303
303
|
|
304
304
|
```python
|
305
|
-
>>> results = agent.
|
305
|
+
>>> results = agent.generate_code([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
|
306
306
|
>>> print(results)
|
307
307
|
{
|
308
308
|
"code": "from vision_agent.tools import ..."
|
@@ -331,7 +331,7 @@ conv = [
|
|
331
331
|
"media": ["workers.png"],
|
332
332
|
}
|
333
333
|
]
|
334
|
-
result = agent.
|
334
|
+
result = agent.generate_code(conv)
|
335
335
|
code = result["code"]
|
336
336
|
conv.append({"role": "assistant", "content": code})
|
337
337
|
conv.append(
|
@@ -340,7 +340,7 @@ conv.append(
|
|
340
340
|
"content": "Can you also return the number of workers wearing safety gear?",
|
341
341
|
}
|
342
342
|
)
|
343
|
-
result = agent.
|
343
|
+
result = agent.generate_code(conv)
|
344
344
|
```
|
345
345
|
|
346
346
|
|
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
4
4
|
|
5
5
|
[tool.poetry]
|
6
6
|
name = "vision-agent"
|
7
|
-
version = "0.2.
|
7
|
+
version = "0.2.163"
|
8
8
|
description = "Toolset for Vision Agent"
|
9
9
|
authors = ["Landing AI <dev@landing.ai>"]
|
10
10
|
readme = "README.md"
|
@@ -43,6 +43,7 @@ pytube = "15.0.0"
|
|
43
43
|
anthropic = "^0.31.0"
|
44
44
|
pydantic = "2.7.4"
|
45
45
|
av = "^11.0.0"
|
46
|
+
redbaron = "^0.9.2"
|
46
47
|
|
47
48
|
[tool.poetry.group.dev.dependencies]
|
48
49
|
autoflake = "1.*"
|
@@ -7,3 +7,11 @@ from .vision_agent_coder import (
|
|
7
7
|
OpenAIVisionAgentCoder,
|
8
8
|
VisionAgentCoder,
|
9
9
|
)
|
10
|
+
from .vision_agent_planner import (
|
11
|
+
AnthropicVisionAgentPlanner,
|
12
|
+
AzureVisionAgentPlanner,
|
13
|
+
OllamaVisionAgentPlanner,
|
14
|
+
OpenAIVisionAgentPlanner,
|
15
|
+
PlanContext,
|
16
|
+
VisionAgentPlanner,
|
17
|
+
)
|
@@ -0,0 +1,181 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
import re
|
4
|
+
import sys
|
5
|
+
from typing import Any, Dict, List, Optional
|
6
|
+
|
7
|
+
from rich.console import Console
|
8
|
+
from rich.style import Style
|
9
|
+
from rich.syntax import Syntax
|
10
|
+
|
11
|
+
import vision_agent.tools as T
|
12
|
+
|
13
|
+
# Configure logging to write to stdout.
logging.basicConfig(stream=sys.stdout)
# Module-level logger named after this module.
_LOGGER = logging.getLogger(__name__)
# Shared rich console used by print_code.
_CONSOLE = Console()
# NOTE(review): not referenced in the visible part of this module -- confirm it is still needed.
_MAX_TABULATE_COL_WIDTH = 80
|
17
|
+
|
18
|
+
|
19
|
+
def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
    """Parse the span from the first ``{`` to the last ``}`` in ``json_str``.

    The span is first parsed exactly as found so that valid JSON is never
    altered. Only if that fails are trailing commas before ``}`` or ``]``
    removed and the parse retried.

    Parameters:
        json_str (str): Text that may contain a JSON object.

    Returns:
        Optional[Dict[str, Any]]: The parsed object, or None when no brace
            span exists or it cannot be parsed.
    """
    match = re.search(r"\{.*\}", json_str, re.DOTALL)
    if match is None:
        return None

    candidate = match.group()
    try:
        return json.loads(candidate)  # type: ignore
    except json.JSONDecodeError:
        pass

    # remove trailing commas such as `{"a": 1,}` or `[1, 2, ]`
    repaired = re.sub(r",\s*([}\]])", r"\1", candidate)
    try:
        return json.loads(repaired)  # type: ignore
    except json.JSONDecodeError:
        return None


def _find_markdown_json(json_str: str) -> str:
    """Return the stripped body of the first ```json fenced block.

    If no such fenced block is present the input is returned unchanged.
    """
    match = re.search(r"```json(.*?)```", json_str, re.DOTALL)
    if match:
        return match.group(1).strip()
    return json_str


def _strip_markdown_code(inp_str: str) -> str:
    """Remove every ```python fenced block (fences included) from ``inp_str``."""
    return re.sub(r"```python.*?```", "", inp_str, flags=re.DOTALL)


def extract_json(json_str: str) -> Dict[str, Any]:
    """Extract a JSON object from ``json_str``, tolerating common formatting slips.

    Attempts, in order:
      1. parse the string exactly as given, so valid JSON is returned untouched;
      2. flatten newlines, lower-case ``: True`` / ``: False`` and swap single
         quotes for double quotes;
      3. the same without the quote swap;
      4. drop ```python blocks, unwrap a ```json block and parse the outermost
         ``{...}`` span.

    Parameters:
        json_str (str): The text to extract JSON from.

    Returns:
        Dict[str, Any]: The parsed JSON.

    Raises:
        ValueError: If no attempt yields parseable JSON.
    """
    # Valid JSON must win before any textual rewriting, otherwise string values
    # containing e.g. ": True" would be silently modified.
    try:
        return json.loads(json_str)  # type: ignore
    except json.JSONDecodeError:
        pass

    json_str_mod = json_str.replace("\n", " ").strip()
    json_str_mod = json_str_mod.replace(": True", ": true").replace(
        ": False", ": false"
    )

    # sometimes the json is in single quotes
    try:
        return json.loads(json_str_mod.replace("'", '"'))  # type: ignore
    except json.JSONDecodeError:
        pass

    try:
        return json.loads(json_str_mod)  # type: ignore
    except json.JSONDecodeError:
        json_orig = json_str
        # don't replace quotes here or booleans since it can also introduce errors
        json_str = json_str.replace("\n", " ").strip()
        json_str = _strip_markdown_code(json_str)
        json_str = _find_markdown_json(json_str)
        json_dict = _extract_sub_json(json_str)

        if json_dict is None:
            error_msg = f"Could not extract JSON from the given str: {json_orig}"
            _LOGGER.exception(error_msg)
            raise ValueError(error_msg)

        return json_dict
|
78
|
+
|
79
|
+
|
80
|
+
def extract_code(code: str) -> str:
    """Return the contents of the first ```python fenced block in ``code``.

    A fence that starts on its own line (``"\\n```python"``) is preferred over
    one appearing mid-line. If there is no ```python fence the input is
    returned unchanged. If the fence is never closed, everything after the
    opening fence is returned.

    Parameters:
        code (str): Text that may wrap code in a markdown python fence.

    Returns:
        str: The text between the opening fence and the next "```".
    """
    for start in ("\n```python", "```python"):
        start_idx = code.find(start)
        if start_idx != -1:
            break
    else:
        return code

    code = code[start_idx + len(start) :]
    end_idx = code.find("```")
    # An unterminated fence gives -1; slicing with it would drop the last
    # character, so only cut when a closing fence really exists.
    if end_idx != -1:
        code = code[:end_idx]
    if code.startswith("python\n"):
        code = code[len("python\n") :]
    return code
|
93
|
+
|
94
|
+
|
95
|
+
def extract_tag(
    content: str,
    tag: str,
) -> Optional[str]:
    """Collect the text inside every ``<tag>...</tag>`` pair in ``content``.

    Pairs are read left to right. Scanning stops at the first opening tag that
    has no closing tag after it.

    Parameters:
        content (str): The text to search.
        tag (str): The tag name without angle brackets.

    Returns:
        Optional[str]: The inner texts joined with newlines, or None if no
            complete pair was found.
    """
    open_tag = f"<{tag}>"
    close_tag = f"</{tag}>"
    all_inner_content = []
    pos = 0

    while True:
        start = content.find(open_tag, pos)
        if start == -1:
            break
        start += len(open_tag)
        # Look for the closing tag only after the opening tag; a stray closing
        # tag earlier in the text must not cause the same pair to be read twice.
        end = content.find(close_tag, start)
        if end == -1:
            break
        all_inner_content.append(content[start:end])
        pos = end + len(close_tag)

    if not all_inner_content:
        return None
    return "\n".join(all_inner_content)
|
114
|
+
|
115
|
+
|
116
|
+
def remove_installs_from_code(code: str) -> str:
    """Remove every line that starts with ``!pip install`` from ``code``.

    Only the install line and its own line terminator are removed, so the
    neighbouring lines stay intact. Install lines on the first or last line
    of the text are handled too.

    Parameters:
        code (str): Source text that may contain notebook-style install lines.

    Returns:
        str: The text without the install lines.
    """
    # MULTILINE anchors `^` at every line start; `[^\n]*` keeps the match on a
    # single line so that following code is never swallowed.
    pattern = r"^!pip install[^\n]*\n?"
    return re.sub(pattern, "", code, flags=re.MULTILINE)
|
120
|
+
|
121
|
+
|
122
|
+
def format_memory(memory: List[Dict[str, str]]) -> str:
    """Render feedback entries as a numbered markdown-style text block.

    Each entry must provide the keys "code" and "feedback"; an "edits" key is
    rendered when present.

    Parameters:
        memory (List[Dict[str, str]]): The feedback entries, in order.

    Returns:
        str: The concatenated sections, or an empty string for no entries.
    """
    sections = []
    for idx, entry in enumerate(memory):
        parts = [
            f"### Feedback {idx}:\n",
            f"Code {idx}:\n```python\n{entry['code']}```\n\n",
            f"Feedback {idx}: {entry['feedback']}\n\n",
        ]
        if "edits" in entry:
            parts.append(f"Edits {idx}:\n{entry['edits']}\n")
        parts.append("\n")
        sections.append("".join(parts))

    return "".join(sections)
|
133
|
+
|
134
|
+
|
135
|
+
def format_plans(plans: Dict[str, Any]) -> str:
    """Render plans as text: a "name: thoughts" line followed by its instructions.

    Parameters:
        plans (Dict[str, Any]): Maps a plan name to a dict holding "thoughts"
            and an iterable of "instructions" strings.

    Returns:
        str: The rendered plans, or an empty string when ``plans`` is empty.
    """
    rendered = []
    for name, plan in plans.items():
        header = "\n" + f"{name}: {plan['thoughts']}\n"
        steps = "\n -".join(plan["instructions"])
        rendered.append(header + " -" + steps)

    return "".join(rendered)
|
142
|
+
|
143
|
+
|
144
|
+
class DefaultImports:
    """Holds the statements that are placed ahead of code before it is run."""

    # Statements shared by every program, in the order they are emitted.
    common_imports = [
        "import os",
        "import numpy as np",
        "from vision_agent.tools import *",
        "from typing import *",
        "from pillow_heif import register_heif_opener",
        "register_heif_opener()",
    ]

    @staticmethod
    def to_code_string() -> str:
        """Return the common imports followed by ``T.__new_tools__``, one per line."""
        import_lines = DefaultImports.common_imports + T.__new_tools__
        return "\n".join(import_lines)

    @staticmethod
    def prepend_imports(code: str) -> str:
        """Return ``code`` with the default imports and a blank line placed before it.

        NOTE: call this only after custom tools have been registered, since the
        import list is read at call time.
        """
        header = DefaultImports.to_code_string()
        return header + "\n\n" + code
|
166
|
+
|
167
|
+
|
168
|
+
def print_code(title: str, code: str, test: Optional[str] = None) -> None:
    """Print a titled, syntax-highlighted listing of ``code`` to the console.

    The code is shown with the default imports prepended. When ``test`` is
    non-empty it is printed in a second highlighted section.

    Parameters:
        title (str): Heading printed above the listing.
        code (str): The code to display.
        test (Optional[str]): Test code to display after the main listing.
    """
    rule = "=" * 30

    _CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
    _CONSOLE.print(rule + " Code " + rule)
    code_with_imports = DefaultImports.prepend_imports(code)
    _CONSOLE.print(
        Syntax(code_with_imports, "python", theme="gruvbox-dark", line_numbers=True)
    )

    if not test:
        return

    _CONSOLE.print(rule + " Test " + rule)
    _CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
|
@@ -14,8 +14,8 @@ from vision_agent.agent.vision_agent_prompts import (
|
|
14
14
|
VA_CODE,
|
15
15
|
)
|
16
16
|
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
|
17
|
-
from vision_agent.tools import META_TOOL_DOCSTRING
|
18
17
|
from vision_agent.tools.meta_tools import (
|
18
|
+
META_TOOL_DOCSTRING,
|
19
19
|
Artifacts,
|
20
20
|
check_and_load_image,
|
21
21
|
use_extra_vision_agent_args,
|
@@ -103,7 +103,7 @@ def execute_code_action(
|
|
103
103
|
def parse_execution(
|
104
104
|
response: str,
|
105
105
|
test_multi_plan: bool = True,
|
106
|
-
|
106
|
+
custom_tool_names: Optional[List[str]] = None,
|
107
107
|
) -> Optional[str]:
|
108
108
|
code = None
|
109
109
|
remaining = response
|
@@ -122,7 +122,7 @@ def parse_execution(
|
|
122
122
|
code = "\n".join(all_code)
|
123
123
|
|
124
124
|
if code is not None:
|
125
|
-
code = use_extra_vision_agent_args(code, test_multi_plan,
|
125
|
+
code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
|
126
126
|
return code
|
127
127
|
|
128
128
|
|
@@ -195,9 +195,8 @@ class VisionAgent(Agent):
|
|
195
195
|
agent: Optional[LMM] = None,
|
196
196
|
verbosity: int = 0,
|
197
197
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
198
|
-
code_sandbox_runtime: Optional[str] = None,
|
199
198
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
200
|
-
code_interpreter: Optional[CodeInterpreter] = None,
|
199
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
201
200
|
) -> None:
|
202
201
|
"""Initialize the VisionAgent.
|
203
202
|
|
@@ -207,14 +206,17 @@ class VisionAgent(Agent):
|
|
207
206
|
verbosity (int): The verbosity level of the agent.
|
208
207
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
209
208
|
artifacts file.
|
210
|
-
|
211
|
-
|
209
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
210
|
+
function to send intermediate update messages.
|
211
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
212
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
213
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
214
|
+
object is provided it will use that.
|
212
215
|
"""
|
213
216
|
|
214
217
|
self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
215
218
|
self.max_iterations = 12
|
216
219
|
self.verbosity = verbosity
|
217
|
-
self.code_sandbox_runtime = code_sandbox_runtime
|
218
220
|
self.code_interpreter = code_interpreter
|
219
221
|
self.callback_message = callback_message
|
220
222
|
if self.verbosity >= 1:
|
@@ -233,7 +235,7 @@ class VisionAgent(Agent):
|
|
233
235
|
input: Union[str, List[Message]],
|
234
236
|
media: Optional[Union[str, Path]] = None,
|
235
237
|
artifacts: Optional[Artifacts] = None,
|
236
|
-
) ->
|
238
|
+
) -> str:
|
237
239
|
"""Chat with VisionAgent and get the conversation response.
|
238
240
|
|
239
241
|
Parameters:
|
@@ -250,15 +252,33 @@ class VisionAgent(Agent):
|
|
250
252
|
input = [{"role": "user", "content": input}]
|
251
253
|
if media is not None:
|
252
254
|
input[0]["media"] = [media]
|
253
|
-
results, _ = self.
|
254
|
-
return results
|
255
|
+
results, _ = self.chat_with_artifacts(input, artifacts)
|
256
|
+
return results[-1]["content"] # type: ignore
|
257
|
+
|
258
|
+
def chat(
    self,
    chat: List[Message],
) -> List[Message]:
    """Run a conversation through VisionAgent and return only the messages.

    This delegates to ``chat_with_artifacts`` and discards the artifacts it
    returns.

    Parameters:
        chat (List[Message]): A conversation in the format of:
            [{"role": "user", "content": "describe your task here..."}]
            or if it contains media files, it should be in the format of:
            [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]

    Returns:
        List[Message]: The conversation response.
    """
    conversation_and_artifacts = self.chat_with_artifacts(chat)
    return conversation_and_artifacts[0]
|
255
275
|
|
256
|
-
def
|
276
|
+
def chat_with_artifacts(
|
257
277
|
self,
|
258
278
|
chat: List[Message],
|
259
279
|
artifacts: Optional[Artifacts] = None,
|
260
280
|
test_multi_plan: bool = True,
|
261
|
-
|
281
|
+
custom_tool_names: Optional[List[str]] = None,
|
262
282
|
) -> Tuple[List[Message], Artifacts]:
|
263
283
|
"""Chat with VisionAgent, it will use code to execute actions to accomplish
|
264
284
|
its tasks.
|
@@ -272,7 +292,7 @@ class VisionAgent(Agent):
|
|
272
292
|
test_multi_plan (bool): If True, it will test tools for multiple plans and
|
273
293
|
pick the best one based off of the tool results. If False, it will go
|
274
294
|
with the first plan.
|
275
|
-
|
295
|
+
custom_tool_names (List[str]): A list of customized tools for agent to
|
276
296
|
pick and use. If not provided, default to full tool set from
|
277
297
|
vision_agent.tools.
|
278
298
|
|
@@ -287,11 +307,13 @@ class VisionAgent(Agent):
|
|
287
307
|
# this is setting remote artifacts path
|
288
308
|
artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
|
289
309
|
|
310
|
+
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
290
311
|
code_interpreter = (
|
291
312
|
self.code_interpreter
|
292
313
|
if self.code_interpreter is not None
|
314
|
+
and not isinstance(self.code_interpreter, str)
|
293
315
|
else CodeInterpreterFactory.new_instance(
|
294
|
-
code_sandbox_runtime=self.
|
316
|
+
code_sandbox_runtime=self.code_interpreter,
|
295
317
|
)
|
296
318
|
)
|
297
319
|
with code_interpreter:
|
@@ -389,7 +411,7 @@ class VisionAgent(Agent):
|
|
389
411
|
finished = response["let_user_respond"]
|
390
412
|
|
391
413
|
code_action = parse_execution(
|
392
|
-
response["response"], test_multi_plan,
|
414
|
+
response["response"], test_multi_plan, custom_tool_names
|
393
415
|
)
|
394
416
|
|
395
417
|
if last_response == response:
|
@@ -480,8 +502,8 @@ class OpenAIVisionAgent(VisionAgent):
|
|
480
502
|
agent: Optional[LMM] = None,
|
481
503
|
verbosity: int = 0,
|
482
504
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
483
|
-
code_sandbox_runtime: Optional[str] = None,
|
484
505
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
506
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
485
507
|
) -> None:
|
486
508
|
"""Initialize the VisionAgent using OpenAI LMMs.
|
487
509
|
|
@@ -491,7 +513,12 @@ class OpenAIVisionAgent(VisionAgent):
|
|
491
513
|
verbosity (int): The verbosity level of the agent.
|
492
514
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
493
515
|
artifacts file.
|
494
|
-
|
516
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
517
|
+
function to send intermediate update messages.
|
518
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
519
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
520
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
521
|
+
object is provided it will use that.
|
495
522
|
"""
|
496
523
|
|
497
524
|
agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
|
@@ -499,8 +526,8 @@ class OpenAIVisionAgent(VisionAgent):
|
|
499
526
|
agent,
|
500
527
|
verbosity,
|
501
528
|
local_artifacts_path,
|
502
|
-
code_sandbox_runtime,
|
503
529
|
callback_message,
|
530
|
+
code_interpreter,
|
504
531
|
)
|
505
532
|
|
506
533
|
|
@@ -510,8 +537,8 @@ class AnthropicVisionAgent(VisionAgent):
|
|
510
537
|
agent: Optional[LMM] = None,
|
511
538
|
verbosity: int = 0,
|
512
539
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
513
|
-
code_sandbox_runtime: Optional[str] = None,
|
514
540
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
541
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
515
542
|
) -> None:
|
516
543
|
"""Initialize the VisionAgent using Anthropic LMMs.
|
517
544
|
|
@@ -521,7 +548,12 @@ class AnthropicVisionAgent(VisionAgent):
|
|
521
548
|
verbosity (int): The verbosity level of the agent.
|
522
549
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
523
550
|
artifacts file.
|
524
|
-
|
551
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
552
|
+
function to send intermediate update messages.
|
553
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
554
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
555
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
556
|
+
object is provided it will use that.
|
525
557
|
"""
|
526
558
|
|
527
559
|
agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
@@ -529,6 +561,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
529
561
|
agent,
|
530
562
|
verbosity,
|
531
563
|
local_artifacts_path,
|
532
|
-
code_sandbox_runtime,
|
533
564
|
callback_message,
|
565
|
+
code_interpreter,
|
534
566
|
)
|