vision-agent 0.2.117__py3-none-any.whl → 0.2.119__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/agent.py +1 -1
- vision_agent/agent/vision_agent.py +107 -49
- vision_agent/agent/vision_agent_coder.py +46 -23
- vision_agent/agent/vision_agent_prompts.py +43 -22
- vision_agent/clients/landing_public_api.py +2 -2
- vision_agent/lmm/lmm.py +15 -6
- vision_agent/lmm/types.py +3 -1
- vision_agent/tools/__init__.py +2 -2
- vision_agent/tools/meta_tools.py +281 -273
- vision_agent/tools/tools.py +36 -14
- vision_agent/tools/tools_types.py +3 -3
- vision_agent/utils/execute.py +69 -22
- vision_agent/utils/image_utils.py +2 -2
- {vision_agent-0.2.117.dist-info → vision_agent-0.2.119.dist-info}/METADATA +12 -8
- {vision_agent-0.2.117.dist-info → vision_agent-0.2.119.dist-info}/RECORD +17 -17
- {vision_agent-0.2.117.dist-info → vision_agent-0.2.119.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.117.dist-info → vision_agent-0.2.119.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py
CHANGED
@@ -1,8 +1,9 @@
 import copy
 import logging
 import os
+import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 from vision_agent.agent import Agent
 from vision_agent.agent.agent_utils import extract_json
@@ -13,8 +14,9 @@ from vision_agent.agent.vision_agent_prompts import (
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
+from vision_agent.tools.meta_tools import Artifacts
 from vision_agent.utils import CodeInterpreterFactory
-from vision_agent.utils.execute import CodeInterpreter
+from vision_agent.utils.execute import CodeInterpreter, Execution
 
 logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
@@ -24,23 +26,30 @@ if str(WORKSPACE) != "":
     os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}"
 
 
-class DefaultImports:
-    code = [
+class BoilerplateCode:
+    pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import ...
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+        "artifacts = Artifacts('{remote_path}')",
+        "artifacts.load('{remote_path}')",
+    ]
+    post_code = [
+        "artifacts.save()",
     ]
 
     @staticmethod
-    def ...
-        return "\n".join(DefaultImports.code)
-
-    @staticmethod
-    def prepend_imports(code: str) -> str:
+    def add_boilerplate(code: str, **format: Any) -> str:
         """Run this method to prepend the default imports to the code.
         NOTE: be sure to run this method after the custom tools have been registered.
         """
-        return ...
+        return (
+            "\n".join([s.format(**format) for s in BoilerplateCode.pre_code])
+            + "\n\n"
+            + code
+            + "\n\n"
+            + "\n".join([s.format(**format) for s in BoilerplateCode.post_code])
+        )
 
 
 def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
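For orientation, here is a minimal sketch of how the new BoilerplateCode.add_boilerplate wrapper is used. The code string and remote path below are hypothetical; the pre_code/post_code contents come from the hunk above:

    from vision_agent.agent.vision_agent import BoilerplateCode

    # Hypothetical inputs; add_boilerplate formats the templates and concatenates.
    wrapped = BoilerplateCode.add_boilerplate(
        "print('hello')",
        remote_path="/home/user/artifacts.pkl",
    )
    # wrapped begins with the pre_code (imports, then
    # artifacts = Artifacts('/home/user/artifacts.pkl') and artifacts.load(...)),
    # contains the code in the middle, and ends with artifacts.save().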
@@ -60,35 +69,17 @@
     prompt = VA_CODE.format(
         documentation=META_TOOL_DOCSTRING,
         examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
-        dir=WORKSPACE,
         conversation=conversation,
     )
     return extract_json(orch([{"role": "user", "content": prompt}], stream=False))  # type: ignore
 
 
-def run_code_action(code: str, code_interpreter: CodeInterpreter) -> str:
-    result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code))
-    return_str = ""
-    if result.success:
-        for res in result.results:
-            if res.text is not None:
-                return_str += res.text.replace("\\n", "\n")
-        if result.logs.stdout:
-            return_str += "----- stdout -----\n"
-            for log in result.logs.stdout:
-                return_str += log.replace("\\n", "\n")
-    else:
-        # for log in result.logs.stderr:
-        #     return_str += log.replace("\\n", "\n")
-        if result.error:
-            return_str += (
-                "\n" + result.error.value + "\n".join(result.error.traceback_raw)
-            )
-
-    return return_str
+def run_code_action(
+    code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
+) -> Execution:
+    return code_interpreter.exec_isolation(
+        BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
+    )
 
 
 def parse_execution(response: str) -> Optional[str]:
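Note that run_code_action now returns the raw Execution object instead of a pre-formatted string, leaving log formatting to the caller. A sketch of the new call site, mirroring the loop in chat_with_code further down (the variables are stand-ins):

    result = run_code_action(code_action, code_interpreter, str(remote_artifacts_path))
    obs = str(result.logs)  # the caller stringifies the logs itself now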
@@ -101,8 +92,8 @@ def parse_execution(response: str) -> Optional[str]:
 
 class VisionAgent(Agent):
     """Vision Agent is an agent that can chat with the user and call tools or other
-    agents to generate code for it. Vision Agent uses python code to execute actions
-    the user. Vision Agent is inspired by by OpenDev
+    agents to generate code for it. Vision Agent uses python code to execute actions
+    for the user. Vision Agent is inspired by by OpenDev
     https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030
 
     Example
@@ -118,8 +109,20 @@ class VisionAgent(Agent):
         self,
         agent: Optional[LMM] = None,
         verbosity: int = 0,
+        local_artifacts_path: Optional[Union[str, Path]] = None,
         code_sandbox_runtime: Optional[str] = None,
     ) -> None:
+        """Initialize the VisionAgent.
+
+        Parameters:
+            agent (Optional[LMM]): The agent to use for conversation and orchestration
+                of other agents.
+            verbosity (int): The verbosity level of the agent.
+            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
+                artifacts file.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
+        """
+
         self.agent = (
             OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
         )
@@ -128,12 +131,21 @@ class VisionAgent(Agent):
         self.code_sandbox_runtime = code_sandbox_runtime
         if self.verbosity >= 1:
             _LOGGER.setLevel(logging.INFO)
+        self.local_artifacts_path = cast(
+            str,
+            (
+                Path(local_artifacts_path)
+                if local_artifacts_path is not None
+                else Path(tempfile.NamedTemporaryFile(delete=False).name)
+            ),
+        )
 
     def __call__(
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
-    ) -> str:
+        artifacts: Optional[Artifacts] = None,
+    ) -> List[Message]:
         """Chat with VisionAgent and get the conversation response.
 
         Parameters:
@@ -141,6 +153,7 @@ class VisionAgent(Agent):
             [{"role": "user", "content": "describe your task here..."}, ...] or a
                 string of just the contents.
             media (Optional[Union[str, Path]]): The media file to be used in the task.
+            artifacts (Optional[Artifacts]): The artifacts to use in the task.
 
         Returns:
             str: The conversation response.
@@ -149,22 +162,23 @@ class VisionAgent(Agent):
             input = [{"role": "user", "content": input}]
             if media is not None:
                 input[0]["media"] = [media]
-        results = self.chat_with_code(input)
-        return results
+        results, _ = self.chat_with_code(input, artifacts)
+        return results
 
     def chat_with_code(
         self,
         chat: List[Message],
-    ) -> List[Message]:
+        artifacts: Optional[Artifacts] = None,
+    ) -> Tuple[List[Message], Artifacts]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.
 
         Parameters:
-            chat (List[Message]): A conversation
-                in the format of:
+            chat (List[Message]): A conversation in the format of:
                 [{"role": "user", "content": "describe your task here..."}]
                 or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
+            artifacts (Optional[Artifacts]): The artifacts to use in the task.
 
         Returns:
             List[Message]: The conversation response.
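Because chat_with_code now returns a (messages, artifacts) tuple, existing callers need to unpack the result. A minimal sketch under that assumption (the media file name is hypothetical):

    agent = VisionAgent(verbosity=1)
    results, artifacts = agent.chat_with_code(
        [{"role": "user", "content": "Detect the dogs", "media": ["dog.jpg"]}]
    )
    # results is the conversation; artifacts holds files produced during the run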
@@ -173,6 +187,10 @@ class VisionAgent(Agent):
         if not chat:
             raise ValueError("chat cannot be empty")
 
+        if not artifacts:
+            # this is setting remote artifacts path
+            artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
+
         with CodeInterpreterFactory.new_instance(
             code_sandbox_runtime=self.code_sandbox_runtime
         ) as code_interpreter:
@@ -182,9 +200,14 @@ class VisionAgent(Agent):
             for chat_i in int_chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
-                        media = code_interpreter.upload_file(media)
-                        chat_i["content"] += f" Media name {media}"  # type: ignore
-                        media_list.append(media)
+                        media = cast(str, media)
+                        artifacts.artifacts[Path(media).name] = open(media, "rb").read()
+
+                        media_remote_path = (
+                            Path(code_interpreter.remote_path) / Path(media).name
+                        )
+                        chat_i["content"] += f" Media name {media_remote_path}"  # type: ignore
+                        media_list.append(media_remote_path)
 
             int_chat = cast(
                 List[Message],
@@ -204,6 +227,22 @@ class VisionAgent(Agent):
 
             finished = False
             iterations = 0
+            last_response = None
+
+            # Save the current state of artifacts, will include any images the user
+            # passed in.
+            artifacts.save(self.local_artifacts_path)
+
+            # Upload artifacts to remote location and show where they are going
+            # to be loaded to. The actual loading happens in BoilerplateCode as
+            # part of the pre_code.
+            remote_artifacts_path = code_interpreter.upload_file(
+                self.local_artifacts_path
+            )
+            artifacts_loaded = artifacts.show()
+            int_chat.append({"role": "observation", "content": artifacts_loaded})
+            orig_chat.append({"role": "observation", "content": artifacts_loaded})
+
             while not finished and iterations < self.max_iterations:
                 response = run_conversation(self.agent, int_chat)
                 if self.verbosity >= 1:
@@ -211,20 +250,39 @@ class VisionAgent(Agent):
                 int_chat.append({"role": "assistant", "content": str(response)})
                 orig_chat.append({"role": "assistant", "content": str(response)})
 
+                # sometimes it gets stuck in a loop, so we force it to exit
+                if last_response == response:
+                    response["let_user_respond"] = True
+
                 if response["let_user_respond"]:
                     break
 
                 code_action = parse_execution(response["response"])
 
                 if code_action is not None:
-                    obs = run_code_action(code_action, code_interpreter)
+                    result = run_code_action(
+                        code_action, code_interpreter, str(remote_artifacts_path)
+                    )
+                    obs = str(result.logs)
+
                     if self.verbosity >= 1:
                         _LOGGER.info(obs)
+                    # don't add execution results to internal chat
                     int_chat.append({"role": "observation", "content": obs})
-                    orig_chat.append({"role": "observation", "content": obs})
+                    orig_chat.append(
+                        {"role": "observation", "content": obs, "execution": result}
+                    )
 
                 iterations += 1
-        return orig_chat
+                last_response = response
+
+            # after running the agent, download the artifacts locally
+            code_interpreter.download_file(
+                str(remote_artifacts_path.name), str(self.local_artifacts_path)
+            )
+            artifacts.load(self.local_artifacts_path)
+            artifacts.save()
+        return orig_chat, artifacts
 
     def log_progress(self, data: Dict[str, Any]) -> None:
         pass
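Taken together, the loop above gives artifacts a full round trip: snapshot locally, upload into the sandbox, execute (with BoilerplateCode loading and saving remotely around each step), then download and reload. A rough sketch of that lifecycle, with hypothetical paths:

    artifacts.save("artifacts.pkl")                         # snapshot local state
    remote = code_interpreter.upload_file("artifacts.pkl")  # push into the sandbox
    # ... agent iterations run here, each wrapped by BoilerplateCode ...
    code_interpreter.download_file(str(remote.name), "artifacts.pkl")
    artifacts.load("artifacts.pkl")                         # pull results back out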
vision_agent/agent/vision_agent_coder.py
CHANGED
@@ -718,9 +718,14 @@ class VisionAgentCoder(Agent):
         for chat_i in chat:
             if "media" in chat_i:
                 for media in chat_i["media"]:
-                    media = code_interpreter.upload_file(media)
+                    media = (
+                        media
+                        if type(media) is str
+                        and media.startswith(("http", "https"))
+                        else code_interpreter.upload_file(cast(str, media))
+                    )
                     chat_i["content"] += f" Media name {media}"  # type: ignore
-                    media_list.append(media)
+                    media_list.append(str(media))
 
         int_chat = cast(
             List[Message],
|
|
744
749
|
results = {"code": "", "test": "", "plan": []}
|
745
750
|
plan = []
|
746
751
|
success = False
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
"log_content": "Creating plans",
|
751
|
-
"status": "started",
|
752
|
-
}
|
753
|
-
)
|
754
|
-
plans = write_plans(
|
755
|
-
int_chat,
|
756
|
-
T.get_tool_descriptions_by_names(
|
757
|
-
customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
|
758
|
-
),
|
759
|
-
format_memory(working_memory),
|
760
|
-
self.planner,
|
752
|
+
|
753
|
+
plans = self._create_plans(
|
754
|
+
int_chat, customized_tool_names, working_memory, self.planner
|
761
755
|
)
|
762
756
|
|
763
|
-
if
|
764
|
-
|
765
|
-
|
766
|
-
p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
|
767
|
-
_LOGGER.info(
|
768
|
-
f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
769
|
-
)
|
757
|
+
if test_multi_plan:
|
758
|
+
self._log_plans(plans, self.verbosity)
|
759
|
+
|
770
760
|
tool_infos = retrieve_tools(
|
771
761
|
plans,
|
772
762
|
self.tool_recommender,
|
@@ -860,6 +850,39 @@ class VisionAgentCoder(Agent):
         if self.report_progress_callback is not None:
             self.report_progress_callback(data)
 
+    def _create_plans(
+        self,
+        int_chat: List[Message],
+        customized_tool_names: Optional[List[str]],
+        working_memory: List[Dict[str, str]],
+        planner: LMM,
+    ) -> Dict[str, Any]:
+        self.log_progress(
+            {
+                "type": "log",
+                "log_content": "Creating plans",
+                "status": "started",
+            }
+        )
+        plans = write_plans(
+            int_chat,
+            T.get_tool_descriptions_by_names(
+                customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
+            ),
+            format_memory(working_memory),
+            planner,
+        )
+        return plans
+
+    def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
+        if verbosity >= 1:
+            for p in plans:
+                # tabulate will fail if the keys are not the same for all elements
+                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
+                _LOGGER.info(
+                    f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                )
+
 
 class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.
vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -1,7 +1,7 @@
 VA_CODE = """
 **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.
 
-**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>.
+**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
 
 <execute_python>
 print("Hello World!")
@@ -15,7 +15,6 @@ This is the documentation for the different actions you can take:
 
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---
 
@@ -26,24 +25,28 @@ Here is an example of how you can interact with a user and Actions to complete a
 **Conversation**:
 Here is the current conversation so far:
 --- START CONVERSATION ---
-[Current directory: {dir}]
-
 {conversation}
 """
+
 
 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg
 
-AGENT: ...
+OBSERVATION:
+[Artifacts loaded]
+Artifact dog.jpg loaded to /path/to/images/dog.jpg
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /example/workspace/dog_detector.py]
+[Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image)
 4|    return dogs
-[End of file]
+[End of artifact]
 
 AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
@@ -56,18 +59,23 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
 
 USER: The the image only has one dog, can you fix this?
 
-AGENT: ...
+[Artifacts loaded]
+Artifact dog.jpg loaded to /path/to/images/dog.jpg
+Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /example/workspace/dog_detector.py]
+[Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image, threshold=0.24)
 4|    return dogs
-[End of file]
+[End of artifact]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -76,23 +84,34 @@ OBSERVATION:
 
 AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
 """
+
 
 EXAMPLES_CODE2 = """
-USER: Can you create a function to count workers with helmets?
+USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
+
+OBSERVATION:
+[Artifacts loaded]
+[End of artifacts]
 
 AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
 
 USER: Yes you can use workers.png
 
-AGENT: ...
+OBSERVATION:
+[Artifacts loaded]
+Artifact workers.png loaded to /path/to/images/workers.png
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /example/workspace/code.py]
-0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
-1|def count_workers_with_helmets(image_path: str):
+[Artifact code.py]
+0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
+1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3| ...
-4| ...
-5| ...
+3|    detections = owl_v2("worker, helmet", image)
+4|    workers = [d for d in detections if d['label'] == 'worker']
+5|    helmets = [d for d in detections if d['label'] == 'helmet']
+6|    count = 0
 6|    for worker in workers:
 7|        person_box = worker['bbox']
 8|        person_has_helmet = False
@@ -102,14 +121,16 @@ OBSERVATION:
 12|            break
 13|    if person_has_helmet:
 14|        count += 1
+15|    overlay_bounding_boxes(image, detections)
+16|    save_image(output_path, image)
 15|    return count
-[End of file]
+[End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
 2
 
-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true}
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
vision_agent/clients/landing_public_api.py
CHANGED
@@ -5,9 +5,9 @@ from uuid import UUID
 from requests.exceptions import HTTPError
 
 from vision_agent.clients.http import BaseHTTP
-from vision_agent. ...
+from vision_agent.tools.tools_types import BboxInputBase64, JobStatus, PromptTask
 from vision_agent.utils.exceptions import FineTuneModelNotFound
-from vision_agent. ...
+from vision_agent.utils.type_defs import LandingaiAPIKey
 
 
 class LandingPublicAPI(BaseHTTP):
vision_agent/lmm/lmm.py
CHANGED
@@ -30,6 +30,12 @@ def encode_image_bytes(image: bytes) -> str:
 
 
 def encode_media(media: Union[str, Path]) -> str:
+    if type(media) is str and media.startswith(("http", "https")):
+        # for mp4 video url, we assume there is a same url but ends with png
+        # vision-agent-ui will upload this png when uploading the video
+        if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
+            return media[:-4] + ".png"
+        return media
     extension = "png"
     extension = Path(media).suffix
     if extension.lower() not in {
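With the early return above, http(s) media is never base64-encoded, and hosted .mp4/"mov" URLs containing vision-agent-dev.s3 are rewritten to a sibling .png that the UI is assumed to have uploaded. A hedged illustration with hypothetical URLs:

    encode_media("https://vision-agent-dev.s3.amazonaws.com/clip.mp4")
    # -> "https://vision-agent-dev.s3.amazonaws.com/clip.png"
    encode_media("https://example.com/photo.jpg")
    # -> "https://example.com/photo.jpg"  (returned unchanged)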
@@ -132,13 +138,17 @@ class OpenAILMM(LMM):
             fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
             if "media" in c:
                 for media in c["media"]:
-                    encoded_media = encode_media(media)
+                    encoded_media = encode_media(cast(str, media))
 
                     fixed_c["content"].append(  # type: ignore
                         {
                             "type": "image_url",
                             "image_url": {
-                                "url": f"data:image/png;base64,{encoded_media}",
+                                "url": (
+                                    encoded_media
+                                    if encoded_media.startswith(("http", "https"))
+                                    else f"data:image/png;base64,{encoded_media}"
+                                ),
                                 "detail": "low",
                             },
                         },
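As a result, an http(s) media entry now reaches the OpenAI API as a plain image URL rather than a data URI. Roughly, the appended content entry looks like this (URL hypothetical):

    {
        "type": "image_url",
        "image_url": {"url": "https://example.com/dog.jpg", "detail": "low"},
    }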
@@ -379,7 +389,9 @@ class OllamaLMM(LMM):
         fixed_chat = []
         for message in chat:
             if "media" in message:
-                message["images"] = [encode_media(m) for m in message["media"]]
+                message["images"] = [
+                    encode_media(cast(str, m)) for m in message["media"]
+                ]
                 del message["media"]
             fixed_chat.append(message)
         url = f"{self.url}/chat"
@@ -390,7 +402,6 @@ class OllamaLMM(LMM):
         tmp_kwargs = self.kwargs | kwargs
         data.update(tmp_kwargs)
         if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
-
             json_data = json.dumps(data)
 
             def f() -> Iterator[Optional[str]]:
@@ -424,7 +435,6 @@ class OllamaLMM(LMM):
         media: Optional[List[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
-
         url = f"{self.url}/generate"
         data: Dict[str, Any] = {
             "model": self.model_name,
@@ -439,7 +449,6 @@ class OllamaLMM(LMM):
         tmp_kwargs = self.kwargs | kwargs
         data.update(tmp_kwargs)
         if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
-
             json_data = json.dumps(data)
 
             def f() -> Iterator[Optional[str]]:
vision_agent/lmm/types.py
CHANGED
vision_agent/tools/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from typing import Callable, List, Optional
 
-from .meta_tools import META_TOOL_DOCSTRING
+from .meta_tools import META_TOOL_DOCSTRING, Artifacts
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tool_utils import get_tool_descriptions_by_names
 from .tools import (
@@ -21,8 +21,8 @@ from .tools import (
     dpt_hybrid_midas,
     extract_frames,
     florence2_image_caption,
-    florence2_object_detection,
     florence2_ocr,
+    florence2_phrase_grounding,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video,
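Note the export change here: florence2_object_detection is dropped from vision_agent.tools and florence2_phrase_grounding appears to take its place, so downstream imports need updating:

    # 0.2.117
    # from vision_agent.tools import florence2_object_detection
    # 0.2.119
    from vision_agent.tools import florence2_phrase_grounding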