vision-agent 0.2.117__tar.gz → 0.2.119__tar.gz
- {vision_agent-0.2.117 → vision_agent-0.2.119}/PKG-INFO +12 -8
- {vision_agent-0.2.117 → vision_agent-0.2.119}/README.md +11 -7
- {vision_agent-0.2.117 → vision_agent-0.2.119}/pyproject.toml +1 -1
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/agent/agent.py +1 -1
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/agent/vision_agent.py +107 -49
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/agent/vision_agent_coder.py +46 -23
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/agent/vision_agent_prompts.py +43 -22
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/clients/landing_public_api.py +2 -2
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/lmm/lmm.py +15 -6
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/lmm/types.py +3 -1
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/tools/__init__.py +2 -2
- vision_agent-0.2.119/vision_agent/tools/meta_tools.py +410 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/tools/tools.py +36 -14
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/tools/tools_types.py +3 -3
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/utils/execute.py +69 -22
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/utils/image_utils.py +2 -2
- vision_agent-0.2.117/vision_agent/tools/meta_tools.py +0 -402
- {vision_agent-0.2.117 → vision_agent-0.2.119}/LICENSE +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.117 → vision_agent-0.2.119}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.117
+Version: 0.2.119
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -81,15 +81,15 @@ export OPENAI_API_KEY="your-api-key"
 ```

 ### Vision Agent
-There are two agents that you can use.
+There are two agents that you can use. `VisionAgent` is a conversational agent that has
 access to tools that allow it to write an navigate python code and file systems. It can
-converse with the user in natural language. VisionAgentCoder is an agent
-code for vision tasks, such as counting people in an image. However, it
-and can only respond with code. VisionAgent can call
-code.
+converse with the user in natural language. `VisionAgentCoder` is an agent specifically
+for writing code for vision tasks, such as counting people in an image. However, it
+cannot chat with you and can only respond with code. `VisionAgent` can call
+`VisionAgentCoder` to write vision code.

 #### Basic Usage
-To run the streamlit app locally to chat with
+To run the streamlit app locally to chat with `VisionAgent`, you can run the following
 command:

 ```bash
@@ -186,7 +186,7 @@ the code and having it update. You just need to add the code as a response from
 assistant:

 ```python
-agent = va.agent.
+agent = va.agent.VisionAgentCoder(verbosity=2)
 conv = [
     {
         "role": "user",
@@ -252,6 +252,10 @@ function. Make sure the documentation is in the same format above with descripti
 `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
 [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.

+Can't find the tool you need and want add it to `VisionAgent`? Check out our
+[vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where
+we add the source code for all the tools used in `VisionAgent`.
+
 ## Additional Backends
 ### Ollama
 We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
{vision_agent-0.2.117 → vision_agent-0.2.119}/README.md

@@ -41,15 +41,15 @@ export OPENAI_API_KEY="your-api-key"
 ```

 ### Vision Agent
-There are two agents that you can use.
+There are two agents that you can use. `VisionAgent` is a conversational agent that has
 access to tools that allow it to write an navigate python code and file systems. It can
-converse with the user in natural language. VisionAgentCoder is an agent
-code for vision tasks, such as counting people in an image. However, it
-and can only respond with code. VisionAgent can call
-code.
+converse with the user in natural language. `VisionAgentCoder` is an agent specifically
+for writing code for vision tasks, such as counting people in an image. However, it
+cannot chat with you and can only respond with code. `VisionAgent` can call
+`VisionAgentCoder` to write vision code.

 #### Basic Usage
-To run the streamlit app locally to chat with
+To run the streamlit app locally to chat with `VisionAgent`, you can run the following
 command:

 ```bash
@@ -146,7 +146,7 @@ the code and having it update. You just need to add the code as a response from
 assistant:

 ```python
-agent = va.agent.
+agent = va.agent.VisionAgentCoder(verbosity=2)
 conv = [
     {
         "role": "user",
@@ -212,6 +212,10 @@ function. Make sure the documentation is in the same format above with descripti
 `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
 [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.

+Can't find the tool you need and want add it to `VisionAgent`? Check out our
+[vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where
+we add the source code for all the tools used in `VisionAgent`.
+
 ## Additional Backends
 ### Ollama
 We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
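A hedged sketch of how the two `VisionAgentCoder` pieces from the README changes above fit together. Only the constructor call and the conversation format come from this diff; the `va` import alias and calling the agent directly (via `__call__`, as other `Agent` subclasses do) are assumptions.

```python
# Sketch only: pieced together from the README snippets above; the "va" alias and the
# direct call below are assumptions not shown in this excerpt.
import vision_agent as va

agent = va.agent.VisionAgentCoder(verbosity=2)
conv = [
    {
        "role": "user",
        "content": "Can you write code to count the dogs in this image?",
        "media": ["dog.jpg"],  # hypothetical local image path
    }
]
code = agent(conv)  # per the README text, VisionAgentCoder only responds with code
print(code)
```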
{vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/agent/vision_agent.py

@@ -1,8 +1,9 @@
 import copy
 import logging
 import os
+import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast

 from vision_agent.agent import Agent
 from vision_agent.agent.agent_utils import extract_json
@@ -13,8 +14,9 @@ from vision_agent.agent.vision_agent_prompts import (
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
+from vision_agent.tools.meta_tools import Artifacts
 from vision_agent.utils import CodeInterpreterFactory
-from vision_agent.utils.execute import CodeInterpreter
+from vision_agent.utils.execute import CodeInterpreter, Execution

 logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
@@ -24,23 +26,30 @@ if str(WORKSPACE) != "":
     os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}"


-class DefaultImports:
-
+class BoilerplateCode:
+    pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+        "artifacts = Artifacts('{remote_path}')",
+        "artifacts.load('{remote_path}')",
+    ]
+    post_code = [
+        "artifacts.save()",
     ]

     @staticmethod
-    def
-        return "\n".join(DefaultImports.code)
-
-    @staticmethod
-    def prepend_imports(code: str) -> str:
+    def add_boilerplate(code: str, **format: Any) -> str:
         """Run this method to prepend the default imports to the code.
         NOTE: be sure to run this method after the custom tools have been registered.
         """
-        return
+        return (
+            "\n".join([s.format(**format) for s in BoilerplateCode.pre_code])
+            + "\n\n"
+            + code
+            + "\n\n"
+            + "\n".join([s.format(**format) for s in BoilerplateCode.post_code])
+        )


 def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
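The `BoilerplateCode` change above wraps whatever code the conversational agent executes with artifact setup and teardown. A small standalone sketch of what `add_boilerplate` produces, with the pre/post lists abbreviated and a made-up remote path:

```python
# Standalone sketch mirroring the wrapping logic shown in the hunk above; the lists are
# abbreviated and the remote_path value is a placeholder.
pre_code = [
    "artifacts = Artifacts('{remote_path}')",
    "artifacts.load('{remote_path}')",
]
post_code = ["artifacts.save()"]

def add_boilerplate(code: str, **format) -> str:
    # Prepend artifact loading and append artifact saving around the agent's code.
    return (
        "\n".join(s.format(**format) for s in pre_code)
        + "\n\n" + code + "\n\n"
        + "\n".join(s.format(**format) for s in post_code)
    )

print(add_boilerplate("dogs = detect_dogs('dog.jpg')", remote_path="/home/user/artifacts.pkl"))
```

Because the wrapper is applied on every execution, the artifacts file in the sandbox is reloaded before and re-saved after each `<execute_python>` block.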
@@ -60,35 +69,17 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     prompt = VA_CODE.format(
         documentation=META_TOOL_DOCSTRING,
         examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
-        dir=WORKSPACE,
         conversation=conversation,
     )
     return extract_json(orch([{"role": "user", "content": prompt}], stream=False)) # type: ignore


-def run_code_action(
-
-
-
-
-
-    if result.success:
-        for res in result.results:
-            if res.text is not None:
-                return_str += res.text.replace("\\n", "\n")
-    if result.logs.stdout:
-        return_str += "----- stdout -----\n"
-        for log in result.logs.stdout:
-            return_str += log.replace("\\n", "\n")
-    else:
-        # for log in result.logs.stderr:
-        #     return_str += log.replace("\\n", "\n")
-    if result.error:
-        return_str += (
-            "\n" + result.error.value + "\n".join(result.error.traceback_raw)
-        )
-
-    return return_str
+def run_code_action(
+    code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
+) -> Execution:
+    return code_interpreter.exec_isolation(
+        BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
+    )


 def parse_execution(response: str) -> Optional[str]:
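`run_code_action` now simply wraps the code with the boilerplate and runs it via `exec_isolation`, returning the raw `Execution`. The `parse_execution` helper that feeds it is unchanged and its body is not part of this diff; a plausible sketch, assuming a simple regex over the `<execute_python>` tags used throughout the prompts:

```python
import re
from typing import Optional

# Hedged sketch: parse_execution's real body is not shown in this diff; this version only
# illustrates the <execute_python> tag convention the prompts use.
def parse_execution_sketch(response: str) -> Optional[str]:
    match = re.search(r"<execute_python>(.*?)</execute_python>", response, re.DOTALL)
    return match.group(1).strip() if match else None

reply = '{"response": "<execute_python>print(1 + 1)</execute_python>", "let_user_respond": false}'
print(parse_execution_sketch(reply))  # -> print(1 + 1)
```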
@@ -101,8 +92,8 @@ def parse_execution(response: str) -> Optional[str]:

 class VisionAgent(Agent):
     """Vision Agent is an agent that can chat with the user and call tools or other
-    agents to generate code for it. Vision Agent uses python code to execute actions
-    the user. Vision Agent is inspired by by OpenDev
+    agents to generate code for it. Vision Agent uses python code to execute actions
+    for the user. Vision Agent is inspired by by OpenDev
     https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030

     Example
@@ -118,8 +109,20 @@ class VisionAgent(Agent):
         self,
         agent: Optional[LMM] = None,
         verbosity: int = 0,
+        local_artifacts_path: Optional[Union[str, Path]] = None,
         code_sandbox_runtime: Optional[str] = None,
     ) -> None:
+        """Initialize the VisionAgent.
+
+        Parameters:
+            agent (Optional[LMM]): The agent to use for conversation and orchestration
+                of other agents.
+            verbosity (int): The verbosity level of the agent.
+            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
+                artifacts file.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
+        """
+
         self.agent = (
             OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
         )
@@ -128,12 +131,21 @@ class VisionAgent(Agent):
         self.code_sandbox_runtime = code_sandbox_runtime
         if self.verbosity >= 1:
             _LOGGER.setLevel(logging.INFO)
+        self.local_artifacts_path = cast(
+            str,
+            (
+                Path(local_artifacts_path)
+                if local_artifacts_path is not None
+                else Path(tempfile.NamedTemporaryFile(delete=False).name)
+            ),
+        )

     def __call__(
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
-
+        artifacts: Optional[Artifacts] = None,
+    ) -> List[Message]:
         """Chat with VisionAgent and get the conversation response.

         Parameters:
@@ -141,6 +153,7 @@ class VisionAgent(Agent):
                 [{"role": "user", "content": "describe your task here..."}, ...] or a
                 string of just the contents.
             media (Optional[Union[str, Path]]): The media file to be used in the task.
+            artifacts (Optional[Artifacts]): The artifacts to use in the task.

         Returns:
             str: The conversation response.
@@ -149,22 +162,23 @@
             input = [{"role": "user", "content": input}]
             if media is not None:
                 input[0]["media"] = [media]
-        results = self.chat_with_code(input)
-        return results
+        results, _ = self.chat_with_code(input, artifacts)
+        return results

     def chat_with_code(
         self,
         chat: List[Message],
-
+        artifacts: Optional[Artifacts] = None,
+    ) -> Tuple[List[Message], Artifacts]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.

         Parameters:
-            chat (List[Message]): A conversation
-                in the format of:
+            chat (List[Message]): A conversation in the format of:
                 [{"role": "user", "content": "describe your task here..."}]
                 or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
+            artifacts (Optional[Artifacts]): The artifacts to use in the task.

         Returns:
             List[Message]: The conversation response.
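Putting the new signatures together, a usage sketch based only on what this diff shows: `local_artifacts_path` on the constructor, `Artifacts` from `vision_agent.tools.meta_tools`, and `chat_with_code` returning a `(messages, artifacts)` tuple. The import location of `VisionAgent` itself and the file paths are assumptions.

```python
# Hedged usage sketch; importing VisionAgent from vision_agent.agent is an assumption,
# and "artifacts.pkl" / "dog.jpg" are placeholder paths.
from vision_agent.agent import VisionAgent
from vision_agent.tools.meta_tools import Artifacts

agent = VisionAgent(verbosity=1, local_artifacts_path="artifacts.pkl")
artifacts = Artifacts("artifacts.pkl")

messages, artifacts = agent.chat_with_code(
    [{"role": "user", "content": "Can you detect the dogs in this image?", "media": ["dog.jpg"]}],
    artifacts,
)
for msg in messages:
    print(msg["role"], ":", str(msg["content"])[:80])
```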
@@ -173,6 +187,10 @@
         if not chat:
             raise ValueError("chat cannot be empty")

+        if not artifacts:
+            # this is setting remote artifacts path
+            artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
+
         with CodeInterpreterFactory.new_instance(
             code_sandbox_runtime=self.code_sandbox_runtime
         ) as code_interpreter:
@@ -182,9 +200,14 @@
             for chat_i in int_chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
-                        media =
-
-
+                        media = cast(str, media)
+                        artifacts.artifacts[Path(media).name] = open(media, "rb").read()
+
+                        media_remote_path = (
+                            Path(code_interpreter.remote_path) / Path(media).name
+                        )
+                        chat_i["content"] += f" Media name {media_remote_path}" # type: ignore
+                        media_list.append(media_remote_path)

             int_chat = cast(
                 List[Message],
@@ -204,6 +227,22 @@

             finished = False
             iterations = 0
+            last_response = None
+
+            # Save the current state of artifacts, will include any images the user
+            # passed in.
+            artifacts.save(self.local_artifacts_path)
+
+            # Upload artifacts to remote location and show where they are going
+            # to be loaded to. The actual loading happens in BoilerplateCode as
+            # part of the pre_code.
+            remote_artifacts_path = code_interpreter.upload_file(
+                self.local_artifacts_path
+            )
+            artifacts_loaded = artifacts.show()
+            int_chat.append({"role": "observation", "content": artifacts_loaded})
+            orig_chat.append({"role": "observation", "content": artifacts_loaded})
+
             while not finished and iterations < self.max_iterations:
                 response = run_conversation(self.agent, int_chat)
                 if self.verbosity >= 1:
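The hunk above is the local-to-remote half of the artifact round trip: artifacts are saved to `local_artifacts_path`, uploaded into the sandbox, and their listing (`artifacts.show()`) is appended to the chat as an observation. A sketch of the local side of that cycle, using only the `Artifacts` members that appear in this diff; the file names are placeholders.

```python
# Sketch of the local artifact handling, using Artifacts members visible in this diff
# (the .artifacts dict, save, show, load); "dog.jpg" is assumed to exist locally.
from pathlib import Path
from vision_agent.tools.meta_tools import Artifacts

artifacts = Artifacts("artifacts.pkl")
artifacts.artifacts[Path("dog.jpg").name] = open("dog.jpg", "rb").read()

artifacts.save("artifacts.pkl")   # persisted before code_interpreter.upload_file(...) ships it
print(artifacts.show())           # the "[Artifacts loaded] ..." observation handed to the agent

artifacts.load("artifacts.pkl")   # what the agent does again after download_file(...)
```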
@@ -211,20 +250,39 @@
                 int_chat.append({"role": "assistant", "content": str(response)})
                 orig_chat.append({"role": "assistant", "content": str(response)})

+                # sometimes it gets stuck in a loop, so we force it to exit
+                if last_response == response:
+                    response["let_user_respond"] = True
+
                 if response["let_user_respond"]:
                     break

                 code_action = parse_execution(response["response"])

                 if code_action is not None:
-
+                    result = run_code_action(
+                        code_action, code_interpreter, str(remote_artifacts_path)
+                    )
+                    obs = str(result.logs)
+
                     if self.verbosity >= 1:
                         _LOGGER.info(obs)
+                    # don't add execution results to internal chat
                     int_chat.append({"role": "observation", "content": obs})
-                    orig_chat.append(
+                    orig_chat.append(
+                        {"role": "observation", "content": obs, "execution": result}
+                    )

                 iterations += 1
-
+                last_response = response
+
+            # after running the agent, download the artifacts locally
+            code_interpreter.download_file(
+                str(remote_artifacts_path.name), str(self.local_artifacts_path)
+            )
+            artifacts.load(self.local_artifacts_path)
+            artifacts.save()
+            return orig_chat, artifacts

     def log_progress(self, data: Dict[str, Any]) -> None:
         pass
{vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/agent/vision_agent_coder.py

@@ -718,9 +718,14 @@ class VisionAgentCoder(Agent):
         for chat_i in chat:
             if "media" in chat_i:
                 for media in chat_i["media"]:
-                    media =
+                    media = (
+                        media
+                        if type(media) is str
+                        and media.startswith(("http", "https"))
+                        else code_interpreter.upload_file(cast(str, media))
+                    )
                     chat_i["content"] += f" Media name {media}" # type: ignore
-                    media_list.append(media)
+                    media_list.append(str(media))

         int_chat = cast(
             List[Message],
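The coder-side change above keeps `http(s)` URLs untouched and only uploads local files into the code interpreter. A standalone illustration of that branching; the interpreter object here is a stand-in, not the real `CodeInterpreter`.

```python
# Standalone illustration of the URL-vs-upload branch added above; FakeInterpreter is a
# stand-in for the real CodeInterpreter, which is not constructed in this snippet.
from pathlib import Path
from typing import Union

class FakeInterpreter:
    def upload_file(self, path: str) -> str:
        return f"/home/user/{Path(path).name}"

def resolve_media(media: Union[str, Path], code_interpreter) -> str:
    if type(media) is str and media.startswith(("http", "https")):
        return media  # remote URLs are passed through unchanged
    return str(code_interpreter.upload_file(str(media)))

print(resolve_media("https://example.com/dog.jpg", FakeInterpreter()))
print(resolve_media("local/dog.jpg", FakeInterpreter()))
```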
@@ -744,29 +749,14 @@ class VisionAgentCoder(Agent):
        results = {"code": "", "test": "", "plan": []}
        plan = []
        success = False
-
-
-
-                "log_content": "Creating plans",
-                "status": "started",
-            }
-        )
-        plans = write_plans(
-            int_chat,
-            T.get_tool_descriptions_by_names(
-                customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
-            ),
-            format_memory(working_memory),
-            self.planner,
+
+        plans = self._create_plans(
+            int_chat, customized_tool_names, working_memory, self.planner
        )

-        if
-
-
-                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
-                _LOGGER.info(
-                    f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
-                )
+        if test_multi_plan:
+            self._log_plans(plans, self.verbosity)
+
        tool_infos = retrieve_tools(
            plans,
            self.tool_recommender,
@@ -860,6 +850,39 @@ class VisionAgentCoder(Agent):
         if self.report_progress_callback is not None:
             self.report_progress_callback(data)

+    def _create_plans(
+        self,
+        int_chat: List[Message],
+        customized_tool_names: Optional[List[str]],
+        working_memory: List[Dict[str, str]],
+        planner: LMM,
+    ) -> Dict[str, Any]:
+        self.log_progress(
+            {
+                "type": "log",
+                "log_content": "Creating plans",
+                "status": "started",
+            }
+        )
+        plans = write_plans(
+            int_chat,
+            T.get_tool_descriptions_by_names(
+                customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
+            ),
+            format_memory(working_memory),
+            planner,
+        )
+        return plans
+
+    def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
+        if verbosity >= 1:
+            for p in plans:
+                # tabulate will fail if the keys are not the same for all elements
+                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
+                _LOGGER.info(
+                    f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                )
+

 class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.
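The refactor above only moves the existing plan-creation and plan-logging code into `_create_plans` and `_log_plans`; behavior is unchanged. For reference, the `tabulate` call in `_log_plans` renders each plan's instructions as a one-column grid, roughly like this (the instructions below are made up):

```python
# Small illustration of the tabulate call used in _log_plans, with placeholder instructions.
from tabulate import tabulate

p_fixed = [
    {"instructions": "Load the image with load_image"},
    {"instructions": "Detect dogs with owl_v2 and return the boxes"},
]
print(tabulate(tabular_data=p_fixed, headers="keys", tablefmt="mixed_grid", maxcolwidths=70))
```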
{vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/agent/vision_agent_prompts.py

@@ -1,7 +1,7 @@
 VA_CODE = """
 **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.

-**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>.
+**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.

 <execute_python>
 print("Hello World!")
@@ -15,7 +15,6 @@ This is the documentation for the different actions you can take:
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---

@@ -26,24 +25,28 @@ Here is an example of how you can interact with a user and Actions to complete a
 **Conversation**:
 Here is the current conversation so far:
 --- START CONVERSATION ---
-[Current directory: {dir}]
-
 {conversation}
 """

+
 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg

-
+OBSERVATION:
+[Artifacts loaded]
+Artifact dog.jpg loaded to /path/to/images/dog.jpg
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[
+[Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image)
 4|    return dogs
-[End of
+[End of artifact]

 AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

@@ -56,18 +59,23 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask

 USER: The the image only has one dog, can you fix this?

-
+[Artifacts loaded]
+Artifact dog.jpg loaded to /path/to/images/dog.jpg
+Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[
+[Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image, threshold=0.24)
 4|    return dogs
-[End of
+[End of artifact]

-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
@@ -76,23 +84,34 @@ OBSERVATION:
 AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
 """

+
 EXAMPLES_CODE2 = """
-USER: Can you create a function to count workers with helmets?
+USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
+
+OBSERVATION:
+[Artifacts loaded]
+[End of artifacts]

 AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}

 USER: Yes you can use workers.png

-
+OBSERVATION:
+[Artifacts loaded]
+Artifact workers.png loaded to /path/to/images/workers.png
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[
-0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
-1|def count_workers_with_helmets(image_path: str):
+[Artifact code.py]
+0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
+1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|
-4|
-5|
+3|    detections = owl_v2("worker, helmet", image)
+4|    workers = [d for d in detections if d['label'] == 'worker']
+5|    helmets = [d for d in detections if d['label'] == 'helmet']
+6|    count = 0
 6|    for worker in workers:
 7|        person_box = worker['bbox']
 8|        person_has_helmet = False
@@ -102,14 +121,16 @@ OBSERVATION:
 12|            break
 13|    if person_has_helmet:
 14|        count += 1
+15|    overlay_bounding_boxes(image, detections)
+16|    save_image(output_path, image)
 15|    return count
-[End of
+[End of artifact]

-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
 2

-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true}
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
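These example transcripts are interpolated into `VA_CODE` by `run_conversation` (see the `vision_agent.py` diff above), which now formats only `documentation`, `examples`, and `conversation` since `{dir}` was dropped. A toy sketch of that assembly, using abbreviated stand-ins rather than the real template and strings:

```python
# Toy sketch of the prompt assembly done by run_conversation; the template and inputs are
# abbreviated stand-ins, not the real VA_CODE / EXAMPLES_CODE strings.
VA_CODE_STUB = (
    "Documentation:\n{documentation}\n\n"
    "--- START EXAMPLES ---\n{examples}\n--- END EXAMPLES ---\n\n"
    "--- START CONVERSATION ---\n{conversation}\n"
)
prompt = VA_CODE_STUB.format(
    documentation="<META_TOOL_DOCSTRING>",
    examples="<EXAMPLES_CODE1>\n<EXAMPLES_CODE2>",
    conversation="USER: Can you detect the dogs in this image? Media name dog.jpg",
)
print(prompt)
```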
{vision_agent-0.2.117 → vision_agent-0.2.119}/vision_agent/clients/landing_public_api.py

@@ -5,9 +5,9 @@ from uuid import UUID
 from requests.exceptions import HTTPError

 from vision_agent.clients.http import BaseHTTP
-from vision_agent.
+from vision_agent.tools.tools_types import BboxInputBase64, JobStatus, PromptTask
 from vision_agent.utils.exceptions import FineTuneModelNotFound
-from vision_agent.
+from vision_agent.utils.type_defs import LandingaiAPIKey


 class LandingPublicAPI(BaseHTTP):
|