vision-agent 0.2.118__py3-none-any.whl → 0.2.120__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/agent.py +1 -1
- vision_agent/agent/vision_agent.py +107 -49
- vision_agent/agent/vision_agent_coder.py +2 -2
- vision_agent/agent/vision_agent_prompts.py +43 -22
- vision_agent/clients/landing_public_api.py +2 -2
- vision_agent/lmm/lmm.py +4 -2
- vision_agent/lmm/types.py +3 -1
- vision_agent/tools/__init__.py +2 -2
- vision_agent/tools/meta_tools.py +281 -273
- vision_agent/tools/tools.py +3 -3
- vision_agent/tools/tools_types.py +3 -3
- vision_agent/utils/execute.py +69 -22
- vision_agent/utils/image_utils.py +2 -2
- {vision_agent-0.2.118.dist-info → vision_agent-0.2.120.dist-info}/METADATA +12 -8
- {vision_agent-0.2.118.dist-info → vision_agent-0.2.120.dist-info}/RECORD +17 -17
- {vision_agent-0.2.118.dist-info → vision_agent-0.2.120.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.118.dist-info → vision_agent-0.2.120.dist-info}/WHEEL +0 -0
vision_agent/agent/agent.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
import copy
|
2
2
|
import logging
|
3
3
|
import os
|
4
|
+
import tempfile
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import Any, Dict, List, Optional, Union, cast
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
6
7
|
|
7
8
|
from vision_agent.agent import Agent
|
8
9
|
from vision_agent.agent.agent_utils import extract_json
|
@@ -13,8 +14,9 @@ from vision_agent.agent.vision_agent_prompts import (
|
|
13
14
|
)
|
14
15
|
from vision_agent.lmm import LMM, Message, OpenAILMM
|
15
16
|
from vision_agent.tools import META_TOOL_DOCSTRING
|
17
|
+
from vision_agent.tools.meta_tools import Artifacts
|
16
18
|
from vision_agent.utils import CodeInterpreterFactory
|
17
|
-
from vision_agent.utils.execute import CodeInterpreter
|
19
|
+
from vision_agent.utils.execute import CodeInterpreter, Execution
|
18
20
|
|
19
21
|
logging.basicConfig(level=logging.INFO)
|
20
22
|
_LOGGER = logging.getLogger(__name__)
|
@@ -24,23 +26,30 @@ if str(WORKSPACE) != "":
|
|
24
26
|
os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}"
|
25
27
|
|
26
28
|
|
27
|
-
class
|
28
|
-
|
29
|
+
class BoilerplateCode:
|
30
|
+
pre_code = [
|
29
31
|
"from typing import *",
|
30
32
|
"from vision_agent.utils.execute import CodeInterpreter",
|
31
|
-
"from vision_agent.tools.meta_tools import
|
33
|
+
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
|
34
|
+
"artifacts = Artifacts('{remote_path}')",
|
35
|
+
"artifacts.load('{remote_path}')",
|
36
|
+
]
|
37
|
+
post_code = [
|
38
|
+
"artifacts.save()",
|
32
39
|
]
|
33
40
|
|
34
41
|
@staticmethod
|
35
|
-
def
|
36
|
-
return "\n".join(DefaultImports.code)
|
37
|
-
|
38
|
-
@staticmethod
|
39
|
-
def prepend_imports(code: str) -> str:
|
42
|
+
def add_boilerplate(code: str, **format: Any) -> str:
|
40
43
|
"""Run this method to prepend the default imports and artifact-loading code to the code, and to append the artifact-saving code after it.
|
41
44
|
NOTE: be sure to run this method after the custom tools have been registered.
|
42
45
|
"""
|
43
|
-
return
|
46
|
+
return (
|
47
|
+
"\n".join([s.format(**format) for s in BoilerplateCode.pre_code])
|
48
|
+
+ "\n\n"
|
49
|
+
+ code
|
50
|
+
+ "\n\n"
|
51
|
+
+ "\n".join([s.format(**format) for s in BoilerplateCode.post_code])
|
52
|
+
)
|
44
53
|
|
45
54
|
|
46
55
|
def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
@@ -60,35 +69,17 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
|
60
69
|
prompt = VA_CODE.format(
|
61
70
|
documentation=META_TOOL_DOCSTRING,
|
62
71
|
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
|
63
|
-
dir=WORKSPACE,
|
64
72
|
conversation=conversation,
|
65
73
|
)
|
66
74
|
return extract_json(orch([{"role": "user", "content": prompt}], stream=False)) # type: ignore
|
67
75
|
|
68
76
|
|
69
|
-
def run_code_action(
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
if result.success:
|
76
|
-
for res in result.results:
|
77
|
-
if res.text is not None:
|
78
|
-
return_str += res.text.replace("\\n", "\n")
|
79
|
-
if result.logs.stdout:
|
80
|
-
return_str += "----- stdout -----\n"
|
81
|
-
for log in result.logs.stdout:
|
82
|
-
return_str += log.replace("\\n", "\n")
|
83
|
-
else:
|
84
|
-
# for log in result.logs.stderr:
|
85
|
-
# return_str += log.replace("\\n", "\n")
|
86
|
-
if result.error:
|
87
|
-
return_str += (
|
88
|
-
"\n" + result.error.value + "\n".join(result.error.traceback_raw)
|
89
|
-
)
|
90
|
-
|
91
|
-
return return_str
|
77
|
+
def run_code_action(
|
78
|
+
code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
|
79
|
+
) -> Execution:
|
80
|
+
return code_interpreter.exec_isolation(
|
81
|
+
BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
|
82
|
+
)
|
92
83
|
|
93
84
|
|
94
85
|
def parse_execution(response: str) -> Optional[str]:
|
@@ -101,8 +92,8 @@ def parse_execution(response: str) -> Optional[str]:
|
|
101
92
|
|
102
93
|
class VisionAgent(Agent):
|
103
94
|
"""Vision Agent is an agent that can chat with the user and call tools or other
|
104
|
-
agents to generate code for it. Vision Agent uses python code to execute actions
|
105
|
-
the user. Vision Agent is inspired by by OpenDev
|
95
|
+
agents to generate code for it. Vision Agent uses python code to execute actions
|
96
|
+
for the user. Vision Agent is inspired by OpenDevin
|
106
97
|
https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030
|
107
98
|
|
108
99
|
Example
|
@@ -118,8 +109,20 @@ class VisionAgent(Agent):
|
|
118
109
|
self,
|
119
110
|
agent: Optional[LMM] = None,
|
120
111
|
verbosity: int = 0,
|
112
|
+
local_artifacts_path: Optional[Union[str, Path]] = None,
|
121
113
|
code_sandbox_runtime: Optional[str] = None,
|
122
114
|
) -> None:
|
115
|
+
"""Initialize the VisionAgent.
|
116
|
+
|
117
|
+
Parameters:
|
118
|
+
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
119
|
+
of other agents.
|
120
|
+
verbosity (int): The verbosity level of the agent.
|
121
|
+
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
122
|
+
artifacts file.
|
123
|
+
code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
|
124
|
+
"""
|
125
|
+
|
123
126
|
self.agent = (
|
124
127
|
OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
|
125
128
|
)
|
@@ -128,12 +131,21 @@ class VisionAgent(Agent):
|
|
128
131
|
self.code_sandbox_runtime = code_sandbox_runtime
|
129
132
|
if self.verbosity >= 1:
|
130
133
|
_LOGGER.setLevel(logging.INFO)
|
134
|
+
self.local_artifacts_path = cast(
|
135
|
+
str,
|
136
|
+
(
|
137
|
+
Path(local_artifacts_path)
|
138
|
+
if local_artifacts_path is not None
|
139
|
+
else Path(tempfile.NamedTemporaryFile(delete=False).name)
|
140
|
+
),
|
141
|
+
)
|
131
142
|
|
132
143
|
def __call__(
|
133
144
|
self,
|
134
145
|
input: Union[str, List[Message]],
|
135
146
|
media: Optional[Union[str, Path]] = None,
|
136
|
-
|
147
|
+
artifacts: Optional[Artifacts] = None,
|
148
|
+
) -> List[Message]:
|
137
149
|
"""Chat with VisionAgent and get the conversation response.
|
138
150
|
|
139
151
|
Parameters:
|
@@ -141,6 +153,7 @@ class VisionAgent(Agent):
|
|
141
153
|
[{"role": "user", "content": "describe your task here..."}, ...] or a
|
142
154
|
string of just the contents.
|
143
155
|
media (Optional[Union[str, Path]]): The media file to be used in the task.
|
156
|
+
artifacts (Optional[Artifacts]): The artifacts to use in the task.
|
144
157
|
|
145
158
|
Returns:
|
146
159
|
List[Message]: The conversation response.
|
@@ -149,22 +162,23 @@ class VisionAgent(Agent):
|
|
149
162
|
input = [{"role": "user", "content": input}]
|
150
163
|
if media is not None:
|
151
164
|
input[0]["media"] = [media]
|
152
|
-
results = self.chat_with_code(input)
|
153
|
-
return results
|
165
|
+
results, _ = self.chat_with_code(input, artifacts)
|
166
|
+
return results
|
154
167
|
|
155
168
|
def chat_with_code(
|
156
169
|
self,
|
157
170
|
chat: List[Message],
|
158
|
-
|
171
|
+
artifacts: Optional[Artifacts] = None,
|
172
|
+
) -> Tuple[List[Message], Artifacts]:
|
159
173
|
"""Chat with VisionAgent, it will use code to execute actions to accomplish
|
160
174
|
its tasks.
|
161
175
|
|
162
176
|
Parameters:
|
163
|
-
chat (List[Message]): A conversation
|
164
|
-
in the format of:
|
177
|
+
chat (List[Message]): A conversation in the format of:
|
165
178
|
[{"role": "user", "content": "describe your task here..."}]
|
166
179
|
or if it contains media files, it should be in the format of:
|
167
180
|
[{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
|
181
|
+
artifacts (Optional[Artifacts]): The artifacts to use in the task.
|
168
182
|
|
169
183
|
Returns:
|
170
184
|
Tuple[List[Message], Artifacts]: The conversation response and the artifacts after the run.
|
@@ -173,6 +187,10 @@ class VisionAgent(Agent):
|
|
173
187
|
if not chat:
|
174
188
|
raise ValueError("chat cannot be empty")
|
175
189
|
|
190
|
+
if not artifacts:
|
191
|
+
# this is setting remote artifacts path
|
192
|
+
artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
|
193
|
+
|
176
194
|
with CodeInterpreterFactory.new_instance(
|
177
195
|
code_sandbox_runtime=self.code_sandbox_runtime
|
178
196
|
) as code_interpreter:
|
@@ -182,9 +200,14 @@ class VisionAgent(Agent):
|
|
182
200
|
for chat_i in int_chat:
|
183
201
|
if "media" in chat_i:
|
184
202
|
for media in chat_i["media"]:
|
185
|
-
media =
|
186
|
-
|
187
|
-
|
203
|
+
media = cast(str, media)
|
204
|
+
artifacts.artifacts[Path(media).name] = open(media, "rb").read()
|
205
|
+
|
206
|
+
media_remote_path = (
|
207
|
+
Path(code_interpreter.remote_path) / Path(media).name
|
208
|
+
)
|
209
|
+
chat_i["content"] += f" Media name {media_remote_path}" # type: ignore
|
210
|
+
media_list.append(media_remote_path)
|
188
211
|
|
189
212
|
int_chat = cast(
|
190
213
|
List[Message],
|
@@ -204,6 +227,22 @@ class VisionAgent(Agent):
|
|
204
227
|
|
205
228
|
finished = False
|
206
229
|
iterations = 0
|
230
|
+
last_response = None
|
231
|
+
|
232
|
+
# Save the current state of artifacts, will include any images the user
|
233
|
+
# passed in.
|
234
|
+
artifacts.save(self.local_artifacts_path)
|
235
|
+
|
236
|
+
# Upload artifacts to remote location and show where they are going
|
237
|
+
# to be loaded to. The actual loading happens in BoilerplateCode as
|
238
|
+
# part of the pre_code.
|
239
|
+
remote_artifacts_path = code_interpreter.upload_file(
|
240
|
+
self.local_artifacts_path
|
241
|
+
)
|
242
|
+
artifacts_loaded = artifacts.show()
|
243
|
+
int_chat.append({"role": "observation", "content": artifacts_loaded})
|
244
|
+
orig_chat.append({"role": "observation", "content": artifacts_loaded})
|
245
|
+
|
207
246
|
while not finished and iterations < self.max_iterations:
|
208
247
|
response = run_conversation(self.agent, int_chat)
|
209
248
|
if self.verbosity >= 1:
|
@@ -211,20 +250,39 @@ class VisionAgent(Agent):
|
|
211
250
|
int_chat.append({"role": "assistant", "content": str(response)})
|
212
251
|
orig_chat.append({"role": "assistant", "content": str(response)})
|
213
252
|
|
253
|
+
# sometimes it gets stuck in a loop, so we force it to exit
|
254
|
+
if last_response == response:
|
255
|
+
response["let_user_respond"] = True
|
256
|
+
|
214
257
|
if response["let_user_respond"]:
|
215
258
|
break
|
216
259
|
|
217
260
|
code_action = parse_execution(response["response"])
|
218
261
|
|
219
262
|
if code_action is not None:
|
220
|
-
|
263
|
+
result = run_code_action(
|
264
|
+
code_action, code_interpreter, str(remote_artifacts_path)
|
265
|
+
)
|
266
|
+
obs = str(result.logs)
|
267
|
+
|
221
268
|
if self.verbosity >= 1:
|
222
269
|
_LOGGER.info(obs)
|
270
|
+
# only the observation text goes into the internal chat; the execution result object is attached to the original chat only
|
223
271
|
int_chat.append({"role": "observation", "content": obs})
|
224
|
-
orig_chat.append(
|
272
|
+
orig_chat.append(
|
273
|
+
{"role": "observation", "content": obs, "execution": result}
|
274
|
+
)
|
225
275
|
|
226
276
|
iterations += 1
|
227
|
-
|
277
|
+
last_response = response
|
278
|
+
|
279
|
+
# after running the agent, download the artifacts locally
|
280
|
+
code_interpreter.download_file(
|
281
|
+
str(remote_artifacts_path.name), str(self.local_artifacts_path)
|
282
|
+
)
|
283
|
+
artifacts.load(self.local_artifacts_path)
|
284
|
+
artifacts.save()
|
285
|
+
return orig_chat, artifacts
|
228
286
|
|
229
287
|
def log_progress(self, data: Dict[str, Any]) -> None:
|
230
288
|
pass
|
@@ -722,10 +722,10 @@ class VisionAgentCoder(Agent):
|
|
722
722
|
media
|
723
723
|
if type(media) is str
|
724
724
|
and media.startswith(("http", "https"))
|
725
|
-
else code_interpreter.upload_file(media)
|
725
|
+
else code_interpreter.upload_file(cast(str, media))
|
726
726
|
)
|
727
727
|
chat_i["content"] += f" Media name {media}" # type: ignore
|
728
|
-
media_list.append(media)
|
728
|
+
media_list.append(str(media))
|
729
729
|
|
730
730
|
int_chat = cast(
|
731
731
|
List[Message],
|
@@ -1,7 +1,7 @@
|
|
1
1
|
VA_CODE = """
|
2
2
|
**Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.
|
3
3
|
|
4
|
-
**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>.
|
4
|
+
**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
|
5
5
|
|
6
6
|
<execute_python>
|
7
7
|
print("Hello World!")
|
@@ -15,7 +15,6 @@ This is the documentation for the different actions you can take:
|
|
15
15
|
**Examples**:
|
16
16
|
Here is an example of how you can interact with a user and Actions to complete a task:
|
17
17
|
--- START EXAMPLES ---
|
18
|
-
[Current directory: /example/workspace]
|
19
18
|
{examples}
|
20
19
|
--- END EXAMPLES ---
|
21
20
|
|
@@ -26,24 +25,28 @@ Here is an example of how you can interact with a user and Actions to complete a
|
|
26
25
|
**Conversation**:
|
27
26
|
Here is the current conversation so far:
|
28
27
|
--- START CONVERSATION ---
|
29
|
-
[Current directory: {dir}]
|
30
|
-
|
31
28
|
{conversation}
|
32
29
|
"""
|
33
30
|
|
31
|
+
|
34
32
|
EXAMPLES_CODE1 = """
|
35
33
|
USER: Can you detect the dogs in this image? Media name dog.jpg
|
36
34
|
|
37
|
-
|
35
|
+
OBSERVATION:
|
36
|
+
[Artifacts loaded]
|
37
|
+
Artifact dog.jpg loaded to /path/to/images/dog.jpg
|
38
|
+
[End of artifacts]
|
39
|
+
|
40
|
+
AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
|
38
41
|
|
39
42
|
OBSERVATION:
|
40
|
-
[
|
43
|
+
[Artifact dog_detector.py]
|
41
44
|
0|from vision_agent.tools import load_image, owl_v2
|
42
45
|
1|def detect_dogs(image_path: str):
|
43
46
|
2| image = load_image(image_path)
|
44
47
|
3| dogs = owl_v2("dog", image)
|
45
48
|
4| return dogs
|
46
|
-
[End of
|
49
|
+
[End of artifact]
|
47
50
|
|
48
51
|
AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
|
49
52
|
|
@@ -56,18 +59,23 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
|
|
56
59
|
|
57
60
|
USER: The the image only has one dog, can you fix this?
|
58
61
|
|
59
|
-
|
62
|
+
[Artifacts loaded]
|
63
|
+
Artifact dog.jpg loaded to /path/to/images/dog.jpg
|
64
|
+
Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
|
65
|
+
[End of artifacts]
|
66
|
+
|
67
|
+
AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
|
60
68
|
|
61
69
|
OBSERVATION:
|
62
|
-
[
|
70
|
+
[Artifact dog_detector.py]
|
63
71
|
0|from vision_agent.tools import load_image, owl_v2
|
64
72
|
1|def detect_dogs(image_path: str):
|
65
73
|
2| image = load_image(image_path)
|
66
74
|
3| dogs = owl_v2("dog", image, threshold=0.24)
|
67
75
|
4| return dogs
|
68
|
-
[End of
|
76
|
+
[End of artifact]
|
69
77
|
|
70
|
-
AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/
|
78
|
+
AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
|
71
79
|
|
72
80
|
OBSERVATION:
|
73
81
|
----- stdout -----
|
@@ -76,23 +84,34 @@ OBSERVATION:
|
|
76
84
|
AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
|
77
85
|
"""
|
78
86
|
|
87
|
+
|
79
88
|
EXAMPLES_CODE2 = """
|
80
|
-
USER: Can you create a function to count workers with helmets?
|
89
|
+
USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
|
90
|
+
|
91
|
+
OBSERVATION:
|
92
|
+
[Artifacts loaded]
|
93
|
+
[End of artifacts]
|
81
94
|
|
82
95
|
AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
|
83
96
|
|
84
97
|
USER: Yes you can use workers.png
|
85
98
|
|
86
|
-
|
99
|
+
OBSERVATION:
|
100
|
+
[Artifacts loaded]
|
101
|
+
Artifact workers.png loaded to /path/to/images/workers.png
|
102
|
+
[End of artifacts]
|
103
|
+
|
104
|
+
AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}
|
87
105
|
|
88
106
|
OBSERVATION:
|
89
|
-
[
|
90
|
-
0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
|
91
|
-
1|def count_workers_with_helmets(image_path: str):
|
107
|
+
[Artifact code.py]
|
108
|
+
0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
|
109
|
+
1|def count_workers_with_helmets(image_path: str, output_path: str):
|
92
110
|
2| image = load_image(image_path)
|
93
|
-
3|
|
94
|
-
4|
|
95
|
-
5|
|
111
|
+
3| detections = owl_v2("worker, helmet", image)
|
112
|
+
4| workers = [d for d in detections if d['label'] == 'worker']
|
113
|
+
5| helmets = [d for d in detections if d['label'] == 'helmet']
|
114
|
+
6| count = 0
|
96
115
|
6| for worker in workers:
|
97
116
|
7| person_box = worker['bbox']
|
98
117
|
8| person_has_helmet = False
|
@@ -102,14 +121,16 @@ OBSERVATION:
|
|
102
121
|
12| break
|
103
122
|
13| if person_has_helmet:
|
104
123
|
14| count += 1
|
124
|
+
15| overlay_bounding_boxes(image, detections)
|
125
|
+
16| save_image(output_path, image)
|
105
126
|
15| return count
|
106
|
-
[End of
|
127
|
+
[End of artifact]
|
107
128
|
|
108
|
-
AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/
|
129
|
+
AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
|
109
130
|
|
110
131
|
OBSERVATION:
|
111
132
|
----- stdout -----
|
112
133
|
2
|
113
134
|
|
114
|
-
AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true}
|
135
|
+
AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
|
115
136
|
"""
|
@@ -5,9 +5,9 @@ from uuid import UUID
|
|
5
5
|
from requests.exceptions import HTTPError
|
6
6
|
|
7
7
|
from vision_agent.clients.http import BaseHTTP
|
8
|
-
from vision_agent.
|
8
|
+
from vision_agent.tools.tools_types import BboxInputBase64, JobStatus, PromptTask
|
9
9
|
from vision_agent.utils.exceptions import FineTuneModelNotFound
|
10
|
-
from vision_agent.
|
10
|
+
from vision_agent.utils.type_defs import LandingaiAPIKey
|
11
11
|
|
12
12
|
|
13
13
|
class LandingPublicAPI(BaseHTTP):
|
vision_agent/lmm/lmm.py
CHANGED
@@ -138,7 +138,7 @@ class OpenAILMM(LMM):
|
|
138
138
|
fixed_c["content"] = [{"type": "text", "text": c["content"]}] # type: ignore
|
139
139
|
if "media" in c:
|
140
140
|
for media in c["media"]:
|
141
|
-
encoded_media = encode_media(media)
|
141
|
+
encoded_media = encode_media(cast(str, media))
|
142
142
|
|
143
143
|
fixed_c["content"].append( # type: ignore
|
144
144
|
{
|
@@ -389,7 +389,9 @@ class OllamaLMM(LMM):
|
|
389
389
|
fixed_chat = []
|
390
390
|
for message in chat:
|
391
391
|
if "media" in message:
|
392
|
-
message["images"] = [
|
392
|
+
message["images"] = [
|
393
|
+
encode_media(cast(str, m)) for m in message["media"]
|
394
|
+
]
|
393
395
|
del message["media"]
|
394
396
|
fixed_chat.append(message)
|
395
397
|
url = f"{self.url}/chat"
|
vision_agent/lmm/types.py
CHANGED
vision_agent/tools/__init__.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from typing import Callable, List, Optional
|
2
2
|
|
3
|
-
from .meta_tools import META_TOOL_DOCSTRING
|
3
|
+
from .meta_tools import META_TOOL_DOCSTRING, Artifacts
|
4
4
|
from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
|
5
5
|
from .tool_utils import get_tool_descriptions_by_names
|
6
6
|
from .tools import (
|
@@ -21,8 +21,8 @@ from .tools import (
|
|
21
21
|
dpt_hybrid_midas,
|
22
22
|
extract_frames,
|
23
23
|
florence2_image_caption,
|
24
|
-
florence2_phrase_grounding,
|
25
24
|
florence2_ocr,
|
25
|
+
florence2_phrase_grounding,
|
26
26
|
florence2_roberta_vqa,
|
27
27
|
florence2_sam2_image,
|
28
28
|
florence2_sam2_video,
|