vision-agent 0.2.118__tar.gz → 0.2.119__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. {vision_agent-0.2.118 → vision_agent-0.2.119}/PKG-INFO +12 -8
  2. {vision_agent-0.2.118 → vision_agent-0.2.119}/README.md +11 -7
  3. {vision_agent-0.2.118 → vision_agent-0.2.119}/pyproject.toml +1 -1
  4. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/agent/agent.py +1 -1
  5. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/agent/vision_agent.py +107 -49
  6. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/agent/vision_agent_coder.py +2 -2
  7. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/agent/vision_agent_prompts.py +43 -22
  8. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/clients/landing_public_api.py +2 -2
  9. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/lmm/lmm.py +4 -2
  10. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/lmm/types.py +3 -1
  11. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/tools/__init__.py +2 -2
  12. vision_agent-0.2.119/vision_agent/tools/meta_tools.py +410 -0
  13. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/tools/tools.py +3 -3
  14. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/tools/tools_types.py +3 -3
  15. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/utils/execute.py +69 -22
  16. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/utils/image_utils.py +2 -2
  17. vision_agent-0.2.118/vision_agent/tools/meta_tools.py +0 -402
  18. {vision_agent-0.2.118 → vision_agent-0.2.119}/LICENSE +0 -0
  19. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/__init__.py +0 -0
  20. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/agent/__init__.py +0 -0
  21. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/agent/agent_utils.py +0 -0
  22. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  23. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/clients/__init__.py +0 -0
  24. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/clients/http.py +0 -0
  25. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/fonts/__init__.py +0 -0
  26. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  27. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/lmm/__init__.py +0 -0
  28. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/tools/prompts.py +0 -0
  29. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/tools/tool_utils.py +0 -0
  30. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/utils/__init__.py +0 -0
  31. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/utils/exceptions.py +0 -0
  32. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/utils/sim.py +0 -0
  33. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/utils/type_defs.py +0 -0
  34. {vision_agent-0.2.118 → vision_agent-0.2.119}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.118
3
+ Version: 0.2.119
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -81,15 +81,15 @@ export OPENAI_API_KEY="your-api-key"
81
81
  ```
82
82
 
83
83
  ### Vision Agent
84
- There are two agents that you can use. Vision Agent is a conversational agent that has
84
+ There are two agents that you can use. `VisionAgent` is a conversational agent that has
85
85
  access to tools that allow it to write and navigate python code and file systems. It can
86
- converse with the user in natural language. VisionAgentCoder is an agent that can write
87
- code for vision tasks, such as counting people in an image. However, it cannot converse
88
- and can only respond with code. VisionAgent can call VisionAgentCoder to write vision
89
- code.
86
+ converse with the user in natural language. `VisionAgentCoder` is an agent specifically
87
+ for writing code for vision tasks, such as counting people in an image. However, it
88
+ cannot chat with you and can only respond with code. `VisionAgent` can call
89
+ `VisionAgentCoder` to write vision code.
90
90
 
91
91
  #### Basic Usage
92
- To run the streamlit app locally to chat with Vision Agent, you can run the following
92
+ To run the streamlit app locally to chat with `VisionAgent`, you can run the following
93
93
  command:
94
94
 
95
95
  ```bash
@@ -186,7 +186,7 @@ the code and having it update. You just need to add the code as a response from
186
186
  assistant:
187
187
 
188
188
  ```python
189
- agent = va.agent.VisionAgent(verbosity=2)
189
+ agent = va.agent.VisionAgentCoder(verbosity=2)
190
190
  conv = [
191
191
  {
192
192
  "role": "user",
@@ -252,6 +252,10 @@ function. Make sure the documentation is in the same format above with descripti
252
252
  `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
253
253
  [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
254
254
 
255
+ Can't find the tool you need and want to add it to `VisionAgent`? Check out our
256
+ [vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where
257
+ we add the source code for all the tools used in `VisionAgent`.
258
+
255
259
  ## Additional Backends
256
260
  ### Ollama
257
261
  We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
@@ -41,15 +41,15 @@ export OPENAI_API_KEY="your-api-key"
41
41
  ```
42
42
 
43
43
  ### Vision Agent
44
- There are two agents that you can use. Vision Agent is a conversational agent that has
44
+ There are two agents that you can use. `VisionAgent` is a conversational agent that has
45
45
  access to tools that allow it to write and navigate python code and file systems. It can
46
- converse with the user in natural language. VisionAgentCoder is an agent that can write
47
- code for vision tasks, such as counting people in an image. However, it cannot converse
48
- and can only respond with code. VisionAgent can call VisionAgentCoder to write vision
49
- code.
46
+ converse with the user in natural language. `VisionAgentCoder` is an agent specifically
47
+ for writing code for vision tasks, such as counting people in an image. However, it
48
+ cannot chat with you and can only respond with code. `VisionAgent` can call
49
+ `VisionAgentCoder` to write vision code.
50
50
 
51
51
  #### Basic Usage
52
- To run the streamlit app locally to chat with Vision Agent, you can run the following
52
+ To run the streamlit app locally to chat with `VisionAgent`, you can run the following
53
53
  command:
54
54
 
55
55
  ```bash
@@ -146,7 +146,7 @@ the code and having it update. You just need to add the code as a response from
146
146
  assistant:
147
147
 
148
148
  ```python
149
- agent = va.agent.VisionAgent(verbosity=2)
149
+ agent = va.agent.VisionAgentCoder(verbosity=2)
150
150
  conv = [
151
151
  {
152
152
  "role": "user",
@@ -212,6 +212,10 @@ function. Make sure the documentation is in the same format above with descripti
212
212
  `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
213
213
  [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
214
214
 
215
+ Can't find the tool you need and want to add it to `VisionAgent`? Check out our
216
+ [vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where
217
+ we add the source code for all the tools used in `VisionAgent`.
218
+
215
219
  ## Additional Backends
216
220
  ### Ollama
217
221
  We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.118"
7
+ version = "0.2.119"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -11,7 +11,7 @@ class Agent(ABC):
11
11
  self,
12
12
  input: Union[str, List[Message]],
13
13
  media: Optional[Union[str, Path]] = None,
14
- ) -> str:
14
+ ) -> Union[str, List[Message]]:
15
15
  pass
16
16
 
17
17
  @abstractmethod
@@ -1,8 +1,9 @@
1
1
  import copy
2
2
  import logging
3
3
  import os
4
+ import tempfile
4
5
  from pathlib import Path
5
- from typing import Any, Dict, List, Optional, Union, cast
6
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast
6
7
 
7
8
  from vision_agent.agent import Agent
8
9
  from vision_agent.agent.agent_utils import extract_json
@@ -13,8 +14,9 @@ from vision_agent.agent.vision_agent_prompts import (
13
14
  )
14
15
  from vision_agent.lmm import LMM, Message, OpenAILMM
15
16
  from vision_agent.tools import META_TOOL_DOCSTRING
17
+ from vision_agent.tools.meta_tools import Artifacts
16
18
  from vision_agent.utils import CodeInterpreterFactory
17
- from vision_agent.utils.execute import CodeInterpreter
19
+ from vision_agent.utils.execute import CodeInterpreter, Execution
18
20
 
19
21
  logging.basicConfig(level=logging.INFO)
20
22
  _LOGGER = logging.getLogger(__name__)
@@ -24,23 +26,30 @@ if str(WORKSPACE) != "":
24
26
  os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}"
25
27
 
26
28
 
27
- class DefaultImports:
28
- code = [
29
+ class BoilerplateCode:
30
+ pre_code = [
29
31
  "from typing import *",
30
32
  "from vision_agent.utils.execute import CodeInterpreter",
31
- "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
33
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
34
+ "artifacts = Artifacts('{remote_path}')",
35
+ "artifacts.load('{remote_path}')",
36
+ ]
37
+ post_code = [
38
+ "artifacts.save()",
32
39
  ]
33
40
 
34
41
  @staticmethod
35
- def to_code_string() -> str:
36
- return "\n".join(DefaultImports.code)
37
-
38
- @staticmethod
39
- def prepend_imports(code: str) -> str:
42
+ def add_boilerplate(code: str, **format: Any) -> str:
40
43
  """Run this method to prepend the default imports to the code.
41
44
  NOTE: be sure to run this method after the custom tools have been registered.
42
45
  """
43
- return DefaultImports.to_code_string() + "\n\n" + code
46
+ return (
47
+ "\n".join([s.format(**format) for s in BoilerplateCode.pre_code])
48
+ + "\n\n"
49
+ + code
50
+ + "\n\n"
51
+ + "\n".join([s.format(**format) for s in BoilerplateCode.post_code])
52
+ )
44
53
 
45
54
 
46
55
  def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
@@ -60,35 +69,17 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
60
69
  prompt = VA_CODE.format(
61
70
  documentation=META_TOOL_DOCSTRING,
62
71
  examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
63
- dir=WORKSPACE,
64
72
  conversation=conversation,
65
73
  )
66
74
  return extract_json(orch([{"role": "user", "content": prompt}], stream=False)) # type: ignore
67
75
 
68
76
 
69
- def run_code_action(code: str, code_interpreter: CodeInterpreter) -> str:
70
- # Note the code interpreter needs to keep running in the same environment because
71
- # the SWE tools hold state like line numbers and currently open files.
72
- result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code))
73
-
74
- return_str = ""
75
- if result.success:
76
- for res in result.results:
77
- if res.text is not None:
78
- return_str += res.text.replace("\\n", "\n")
79
- if result.logs.stdout:
80
- return_str += "----- stdout -----\n"
81
- for log in result.logs.stdout:
82
- return_str += log.replace("\\n", "\n")
83
- else:
84
- # for log in result.logs.stderr:
85
- # return_str += log.replace("\\n", "\n")
86
- if result.error:
87
- return_str += (
88
- "\n" + result.error.value + "\n".join(result.error.traceback_raw)
89
- )
90
-
91
- return return_str
77
+ def run_code_action(
78
+ code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
79
+ ) -> Execution:
80
+ return code_interpreter.exec_isolation(
81
+ BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
82
+ )
92
83
 
93
84
 
94
85
  def parse_execution(response: str) -> Optional[str]:
@@ -101,8 +92,8 @@ def parse_execution(response: str) -> Optional[str]:
101
92
 
102
93
  class VisionAgent(Agent):
103
94
  """Vision Agent is an agent that can chat with the user and call tools or other
104
- agents to generate code for it. Vision Agent uses python code to execute actions for
105
- the user. Vision Agent is inspired by by OpenDev
95
+ agents to generate code for it. Vision Agent uses python code to execute actions
96
+ for the user. Vision Agent is inspired by by OpenDev
106
97
  https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030
107
98
 
108
99
  Example
@@ -118,8 +109,20 @@ class VisionAgent(Agent):
118
109
  self,
119
110
  agent: Optional[LMM] = None,
120
111
  verbosity: int = 0,
112
+ local_artifacts_path: Optional[Union[str, Path]] = None,
121
113
  code_sandbox_runtime: Optional[str] = None,
122
114
  ) -> None:
115
+ """Initialize the VisionAgent.
116
+
117
+ Parameters:
118
+ agent (Optional[LMM]): The agent to use for conversation and orchestration
119
+ of other agents.
120
+ verbosity (int): The verbosity level of the agent.
121
+ local_artifacts_path (Optional[Union[str, Path]]): The path to the local
122
+ artifacts file.
123
+ code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
124
+ """
125
+
123
126
  self.agent = (
124
127
  OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
125
128
  )
@@ -128,12 +131,21 @@ class VisionAgent(Agent):
128
131
  self.code_sandbox_runtime = code_sandbox_runtime
129
132
  if self.verbosity >= 1:
130
133
  _LOGGER.setLevel(logging.INFO)
134
+ self.local_artifacts_path = cast(
135
+ str,
136
+ (
137
+ Path(local_artifacts_path)
138
+ if local_artifacts_path is not None
139
+ else Path(tempfile.NamedTemporaryFile(delete=False).name)
140
+ ),
141
+ )
131
142
 
132
143
  def __call__(
133
144
  self,
134
145
  input: Union[str, List[Message]],
135
146
  media: Optional[Union[str, Path]] = None,
136
- ) -> str:
147
+ artifacts: Optional[Artifacts] = None,
148
+ ) -> List[Message]:
137
149
  """Chat with VisionAgent and get the conversation response.
138
150
 
139
151
  Parameters:
@@ -141,6 +153,7 @@ class VisionAgent(Agent):
141
153
  [{"role": "user", "content": "describe your task here..."}, ...] or a
142
154
  string of just the contents.
143
155
  media (Optional[Union[str, Path]]): The media file to be used in the task.
156
+ artifacts (Optional[Artifacts]): The artifacts to use in the task.
144
157
 
145
158
  Returns:
146
159
  str: The conversation response.
@@ -149,22 +162,23 @@ class VisionAgent(Agent):
149
162
  input = [{"role": "user", "content": input}]
150
163
  if media is not None:
151
164
  input[0]["media"] = [media]
152
- results = self.chat_with_code(input)
153
- return results # type: ignore
165
+ results, _ = self.chat_with_code(input, artifacts)
166
+ return results
154
167
 
155
168
  def chat_with_code(
156
169
  self,
157
170
  chat: List[Message],
158
- ) -> List[Message]:
171
+ artifacts: Optional[Artifacts] = None,
172
+ ) -> Tuple[List[Message], Artifacts]:
159
173
  """Chat with VisionAgent, it will use code to execute actions to accomplish
160
174
  its tasks.
161
175
 
162
176
  Parameters:
163
- chat (List[Message]): A conversation
164
- in the format of:
177
+ chat (List[Message]): A conversation in the format of:
165
178
  [{"role": "user", "content": "describe your task here..."}]
166
179
  or if it contains media files, it should be in the format of:
167
180
  [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
181
+ artifacts (Optional[Artifacts]): The artifacts to use in the task.
168
182
 
169
183
  Returns:
170
184
  List[Message]: The conversation response.
@@ -173,6 +187,10 @@ class VisionAgent(Agent):
173
187
  if not chat:
174
188
  raise ValueError("chat cannot be empty")
175
189
 
190
+ if not artifacts:
191
+ # this is setting remote artifacts path
192
+ artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
193
+
176
194
  with CodeInterpreterFactory.new_instance(
177
195
  code_sandbox_runtime=self.code_sandbox_runtime
178
196
  ) as code_interpreter:
@@ -182,9 +200,14 @@ class VisionAgent(Agent):
182
200
  for chat_i in int_chat:
183
201
  if "media" in chat_i:
184
202
  for media in chat_i["media"]:
185
- media = code_interpreter.upload_file(media)
186
- chat_i["content"] += f" Media name {media}" # type: ignore
187
- media_list.append(media)
203
+ media = cast(str, media)
204
+ artifacts.artifacts[Path(media).name] = open(media, "rb").read()
205
+
206
+ media_remote_path = (
207
+ Path(code_interpreter.remote_path) / Path(media).name
208
+ )
209
+ chat_i["content"] += f" Media name {media_remote_path}" # type: ignore
210
+ media_list.append(media_remote_path)
188
211
 
189
212
  int_chat = cast(
190
213
  List[Message],
@@ -204,6 +227,22 @@ class VisionAgent(Agent):
204
227
 
205
228
  finished = False
206
229
  iterations = 0
230
+ last_response = None
231
+
232
+ # Save the current state of artifacts, will include any images the user
233
+ # passed in.
234
+ artifacts.save(self.local_artifacts_path)
235
+
236
+ # Upload artifacts to remote location and show where they are going
237
+ # to be loaded to. The actual loading happens in BoilerplateCode as
238
+ # part of the pre_code.
239
+ remote_artifacts_path = code_interpreter.upload_file(
240
+ self.local_artifacts_path
241
+ )
242
+ artifacts_loaded = artifacts.show()
243
+ int_chat.append({"role": "observation", "content": artifacts_loaded})
244
+ orig_chat.append({"role": "observation", "content": artifacts_loaded})
245
+
207
246
  while not finished and iterations < self.max_iterations:
208
247
  response = run_conversation(self.agent, int_chat)
209
248
  if self.verbosity >= 1:
@@ -211,20 +250,39 @@ class VisionAgent(Agent):
211
250
  int_chat.append({"role": "assistant", "content": str(response)})
212
251
  orig_chat.append({"role": "assistant", "content": str(response)})
213
252
 
253
+ # sometimes it gets stuck in a loop, so we force it to exit
254
+ if last_response == response:
255
+ response["let_user_respond"] = True
256
+
214
257
  if response["let_user_respond"]:
215
258
  break
216
259
 
217
260
  code_action = parse_execution(response["response"])
218
261
 
219
262
  if code_action is not None:
220
- obs = run_code_action(code_action, code_interpreter)
263
+ result = run_code_action(
264
+ code_action, code_interpreter, str(remote_artifacts_path)
265
+ )
266
+ obs = str(result.logs)
267
+
221
268
  if self.verbosity >= 1:
222
269
  _LOGGER.info(obs)
270
+ # don't add execution results to internal chat
223
271
  int_chat.append({"role": "observation", "content": obs})
224
- orig_chat.append({"role": "observation", "content": obs})
272
+ orig_chat.append(
273
+ {"role": "observation", "content": obs, "execution": result}
274
+ )
225
275
 
226
276
  iterations += 1
227
- return orig_chat
277
+ last_response = response
278
+
279
+ # after running the agent, download the artifacts locally
280
+ code_interpreter.download_file(
281
+ str(remote_artifacts_path.name), str(self.local_artifacts_path)
282
+ )
283
+ artifacts.load(self.local_artifacts_path)
284
+ artifacts.save()
285
+ return orig_chat, artifacts
228
286
 
229
287
  def log_progress(self, data: Dict[str, Any]) -> None:
230
288
  pass
@@ -722,10 +722,10 @@ class VisionAgentCoder(Agent):
722
722
  media
723
723
  if type(media) is str
724
724
  and media.startswith(("http", "https"))
725
- else code_interpreter.upload_file(media)
725
+ else code_interpreter.upload_file(cast(str, media))
726
726
  )
727
727
  chat_i["content"] += f" Media name {media}" # type: ignore
728
- media_list.append(media)
728
+ media_list.append(str(media))
729
729
 
730
730
  int_chat = cast(
731
731
  List[Message],
@@ -1,7 +1,7 @@
1
1
  VA_CODE = """
2
2
  **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.
3
3
 
4
- **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>.
4
+ **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
5
5
 
6
6
  <execute_python>
7
7
  print("Hello World!")
@@ -15,7 +15,6 @@ This is the documentation for the different actions you can take:
15
15
  **Examples**:
16
16
  Here is an example of how you can interact with a user and Actions to complete a task:
17
17
  --- START EXAMPLES ---
18
- [Current directory: /example/workspace]
19
18
  {examples}
20
19
  --- END EXAMPLES ---
21
20
 
@@ -26,24 +25,28 @@ Here is an example of how you can interact with a user and Actions to complete a
26
25
  **Conversation**:
27
26
  Here is the current conversation so far:
28
27
  --- START CONVERSATION ---
29
- [Current directory: {dir}]
30
-
31
28
  {conversation}
32
29
  """
33
30
 
31
+
34
32
  EXAMPLES_CODE1 = """
35
33
  USER: Can you detect the dogs in this image? Media name dog.jpg
36
34
 
37
- AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
35
+ OBSERVATION:
36
+ [Artifacts loaded]
37
+ Artifact dog.jpg loaded to /path/to/images/dog.jpg
38
+ [End of artifacts]
39
+
40
+ AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
38
41
 
39
42
  OBSERVATION:
40
- [File /example/workspace/dog_detector.py]
43
+ [Artifact dog_detector.py]
41
44
  0|from vision_agent.tools import load_image, owl_v2
42
45
  1|def detect_dogs(image_path: str):
43
46
  2| image = load_image(image_path)
44
47
  3| dogs = owl_v2("dog", image)
45
48
  4| return dogs
46
- [End of file]
49
+ [End of artifact]
47
50
 
48
51
  AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
49
52
 
@@ -56,18 +59,23 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
56
59
 
57
60
  USER: The the image only has one dog, can you fix this?
58
61
 
59
- AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
62
+ [Artifacts loaded]
63
+ Artifact dog.jpg loaded to /path/to/images/dog.jpg
64
+ Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
65
+ [End of artifacts]
66
+
67
+ AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
60
68
 
61
69
  OBSERVATION:
62
- [File /example/workspace/dog_detector.py]
70
+ [Artifact dog_detector.py]
63
71
  0|from vision_agent.tools import load_image, owl_v2
64
72
  1|def detect_dogs(image_path: str):
65
73
  2| image = load_image(image_path)
66
74
  3| dogs = owl_v2("dog", image, threshold=0.24)
67
75
  4| return dogs
68
- [End of file]
76
+ [End of artifact]
69
77
 
70
- AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
78
+ AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
71
79
 
72
80
  OBSERVATION:
73
81
  ----- stdout -----
@@ -76,23 +84,34 @@ OBSERVATION:
76
84
  AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
77
85
  """
78
86
 
87
+
79
88
  EXAMPLES_CODE2 = """
80
- USER: Can you create a function to count workers with helmets?
89
+ USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
90
+
91
+ OBSERVATION:
92
+ [Artifacts loaded]
93
+ [End of artifacts]
81
94
 
82
95
  AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
83
96
 
84
97
  USER: Yes you can use workers.png
85
98
 
86
- AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}
99
+ OBSERVATION:
100
+ [Artifacts loaded]
101
+ Artifact workers.png loaded to /path/to/images/workers.png
102
+ [End of artifacts]
103
+
104
+ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}
87
105
 
88
106
  OBSERVATION:
89
- [File /example/workspace/code.py]
90
- 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
91
- 1|def count_workers_with_helmets(image_path: str):
107
+ [Artifact code.py]
108
+ 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
109
+ 1|def count_workers_with_helmets(image_path: str, output_path: str):
92
110
  2| image = load_image(image_path)
93
- 3| workers = owl_v2("worker", image)
94
- 4| helmets = owl_v2("helmet", image)
95
- 5| count = 0
111
+ 3| detections = owl_v2("worker, helmet", image)
112
+ 4| workers = [d for d in detections if d['label'] == 'worker']
113
+ 5| helmets = [d for d in detections if d['label'] == 'helmet']
114
+ 6| count = 0
96
115
  6| for worker in workers:
97
116
  7| person_box = worker['bbox']
98
117
  8| person_has_helmet = False
@@ -102,14 +121,16 @@ OBSERVATION:
102
121
  12| break
103
122
  13| if person_has_helmet:
104
123
  14| count += 1
124
+ 15| overlay_bounding_boxes(image, detections)
125
+ 16| save_image(output_path, image)
105
126
  15| return count
106
- [End of file]
127
+ [End of artifact]
107
128
 
108
- AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
129
+ AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
109
130
 
110
131
  OBSERVATION:
111
132
  ----- stdout -----
112
133
  2
113
134
 
114
- AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true}
135
+ AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
115
136
  """
@@ -5,9 +5,9 @@ from uuid import UUID
5
5
  from requests.exceptions import HTTPError
6
6
 
7
7
  from vision_agent.clients.http import BaseHTTP
8
- from vision_agent.utils.type_defs import LandingaiAPIKey
8
+ from vision_agent.tools.tools_types import BboxInputBase64, JobStatus, PromptTask
9
9
  from vision_agent.utils.exceptions import FineTuneModelNotFound
10
- from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
10
+ from vision_agent.utils.type_defs import LandingaiAPIKey
11
11
 
12
12
 
13
13
  class LandingPublicAPI(BaseHTTP):
@@ -138,7 +138,7 @@ class OpenAILMM(LMM):
138
138
  fixed_c["content"] = [{"type": "text", "text": c["content"]}] # type: ignore
139
139
  if "media" in c:
140
140
  for media in c["media"]:
141
- encoded_media = encode_media(media)
141
+ encoded_media = encode_media(cast(str, media))
142
142
 
143
143
  fixed_c["content"].append( # type: ignore
144
144
  {
@@ -389,7 +389,9 @@ class OllamaLMM(LMM):
389
389
  fixed_chat = []
390
390
  for message in chat:
391
391
  if "media" in message:
392
- message["images"] = [encode_media(m) for m in message["media"]]
392
+ message["images"] = [
393
+ encode_media(cast(str, m)) for m in message["media"]
394
+ ]
393
395
  del message["media"]
394
396
  fixed_chat.append(message)
395
397
  url = f"{self.url}/chat"
@@ -1,5 +1,7 @@
1
1
  from pathlib import Path
2
2
  from typing import Dict, Sequence, Union
3
3
 
4
+ from vision_agent.utils.execute import Execution
5
+
4
6
  TextOrImage = Union[str, Sequence[Union[str, Path]]]
5
- Message = Dict[str, TextOrImage]
7
+ Message = Dict[str, Union[TextOrImage, Execution]]
@@ -1,6 +1,6 @@
1
1
  from typing import Callable, List, Optional
2
2
 
3
- from .meta_tools import META_TOOL_DOCSTRING
3
+ from .meta_tools import META_TOOL_DOCSTRING, Artifacts
4
4
  from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
5
5
  from .tool_utils import get_tool_descriptions_by_names
6
6
  from .tools import (
@@ -21,8 +21,8 @@ from .tools import (
21
21
  dpt_hybrid_midas,
22
22
  extract_frames,
23
23
  florence2_image_caption,
24
- florence2_phrase_grounding,
25
24
  florence2_ocr,
25
+ florence2_phrase_grounding,
26
26
  florence2_roberta_vqa,
27
27
  florence2_sam2_image,
28
28
  florence2_sam2_video,