vision-agent 0.2.117__py3-none-any.whl → 0.2.119__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,7 +11,7 @@ class Agent(ABC):
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
-    ) -> str:
+    ) -> Union[str, List[Message]]:
         pass

     @abstractmethod
@@ -1,8 +1,9 @@
 import copy
 import logging
 import os
+import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast

 from vision_agent.agent import Agent
 from vision_agent.agent.agent_utils import extract_json
@@ -13,8 +14,9 @@ from vision_agent.agent.vision_agent_prompts import (
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
+from vision_agent.tools.meta_tools import Artifacts
 from vision_agent.utils import CodeInterpreterFactory
-from vision_agent.utils.execute import CodeInterpreter
+from vision_agent.utils.execute import CodeInterpreter, Execution

 logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
@@ -24,23 +26,30 @@ if str(WORKSPACE) != "":
     os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}"


-class DefaultImports:
-    code = [
+class BoilerplateCode:
+    pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+        "artifacts = Artifacts('{remote_path}')",
+        "artifacts.load('{remote_path}')",
+    ]
+    post_code = [
+        "artifacts.save()",
     ]

     @staticmethod
-    def to_code_string() -> str:
-        return "\n".join(DefaultImports.code)
-
-    @staticmethod
-    def prepend_imports(code: str) -> str:
+    def add_boilerplate(code: str, **format: Any) -> str:
        """Run this method to prepend the default imports to the code.
        NOTE: be sure to run this method after the custom tools have been registered.
        """
-        return DefaultImports.to_code_string() + "\n\n" + code
+        return (
+            "\n".join([s.format(**format) for s in BoilerplateCode.pre_code])
+            + "\n\n"
+            + code
+            + "\n\n"
+            + "\n".join([s.format(**format) for s in BoilerplateCode.post_code])
+        )


 def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
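To make the new wrapping concrete, here is a minimal sketch (not part of the diff; the module path and the remote path are assumptions for illustration) of what BoilerplateCode.add_boilerplate is expected to produce:

from vision_agent.agent.vision_agent import BoilerplateCode  # module path assumed

# Wrap an arbitrary snippet; '/home/user/artifacts.pkl' is an illustrative remote path.
wrapped = BoilerplateCode.add_boilerplate(
    "print(artifacts.show())", remote_path="/home/user/artifacts.pkl"
)
print(wrapped)
# The result starts with the pre_code block (meta-tool imports, then
# artifacts = Artifacts('/home/user/artifacts.pkl') and artifacts.load(...)),
# contains the snippet itself, and ends with the post_code line artifacts.save().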
@@ -60,35 +69,17 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     prompt = VA_CODE.format(
         documentation=META_TOOL_DOCSTRING,
         examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
-        dir=WORKSPACE,
         conversation=conversation,
     )
     return extract_json(orch([{"role": "user", "content": prompt}], stream=False))  # type: ignore


-def run_code_action(code: str, code_interpreter: CodeInterpreter) -> str:
-    # Note the code interpreter needs to keep running in the same environment because
-    # the SWE tools hold state like line numbers and currently open files.
-    result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code))
-
-    return_str = ""
-    if result.success:
-        for res in result.results:
-            if res.text is not None:
-                return_str += res.text.replace("\\n", "\n")
-        if result.logs.stdout:
-            return_str += "----- stdout -----\n"
-            for log in result.logs.stdout:
-                return_str += log.replace("\\n", "\n")
-    else:
-        # for log in result.logs.stderr:
-        #     return_str += log.replace("\\n", "\n")
-        if result.error:
-            return_str += (
-                "\n" + result.error.value + "\n".join(result.error.traceback_raw)
-            )
-
-    return return_str
+def run_code_action(
+    code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
+) -> Execution:
+    return code_interpreter.exec_isolation(
+        BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
+    )


 def parse_execution(response: str) -> Optional[str]:
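As a rough illustration of the new contract, the caller now receives the whole Execution and stringifies its logs for the observation, instead of assembling a string itself. A hedged sketch (module path, default sandbox, and the local artifacts file are assumptions):

from vision_agent.agent.vision_agent import run_code_action  # module path assumed
from vision_agent.utils import CodeInterpreterFactory

with CodeInterpreterFactory.new_instance() as code_interpreter:
    # assumes a local 'artifacts.pkl' exists; it is uploaded so the boilerplate can load it
    remote_artifacts_path = code_interpreter.upload_file("artifacts.pkl")
    result = run_code_action(
        "print(artifacts.show())", code_interpreter, str(remote_artifacts_path)
    )
    obs = str(result.logs)  # stdout/stderr of the isolated run become the observation text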
@@ -101,8 +92,8 @@ def parse_execution(response: str) -> Optional[str]:

 class VisionAgent(Agent):
     """Vision Agent is an agent that can chat with the user and call tools or other
-    agents to generate code for it. Vision Agent uses python code to execute actions for
-    the user. Vision Agent is inspired by by OpenDev
+    agents to generate code for it. Vision Agent uses python code to execute actions
+    for the user. Vision Agent is inspired by by OpenDev
     https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030

     Example
@@ -118,8 +109,20 @@ class VisionAgent(Agent):
         self,
         agent: Optional[LMM] = None,
         verbosity: int = 0,
+        local_artifacts_path: Optional[Union[str, Path]] = None,
         code_sandbox_runtime: Optional[str] = None,
     ) -> None:
+        """Initialize the VisionAgent.
+
+        Parameters:
+            agent (Optional[LMM]): The agent to use for conversation and orchestration
+                of other agents.
+            verbosity (int): The verbosity level of the agent.
+            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
+                artifacts file.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
+        """
+
         self.agent = (
             OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
         )
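A minimal construction sketch for the new parameter (not from the diff; the path is illustrative, and it may be omitted, in which case a temporary file is used per the next hunk):

from vision_agent.agent import VisionAgent

# local_artifacts_path backs the artifacts pickle that is synced with the sandbox
agent = VisionAgent(verbosity=1, local_artifacts_path="my_artifacts.pkl")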
@@ -128,12 +131,21 @@ class VisionAgent(Agent):
         self.code_sandbox_runtime = code_sandbox_runtime
         if self.verbosity >= 1:
             _LOGGER.setLevel(logging.INFO)
+        self.local_artifacts_path = cast(
+            str,
+            (
+                Path(local_artifacts_path)
+                if local_artifacts_path is not None
+                else Path(tempfile.NamedTemporaryFile(delete=False).name)
+            ),
+        )

     def __call__(
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
-    ) -> str:
+        artifacts: Optional[Artifacts] = None,
+    ) -> List[Message]:
         """Chat with VisionAgent and get the conversation response.

         Parameters:
@@ -141,6 +153,7 @@ class VisionAgent(Agent):
                 [{"role": "user", "content": "describe your task here..."}, ...] or a
                 string of just the contents.
             media (Optional[Union[str, Path]]): The media file to be used in the task.
+            artifacts (Optional[Artifacts]): The artifacts to use in the task.

         Returns:
             str: The conversation response.
@@ -149,22 +162,23 @@ class VisionAgent(Agent):
             input = [{"role": "user", "content": input}]
         if media is not None:
             input[0]["media"] = [media]
-        results = self.chat_with_code(input)
-        return results  # type: ignore
+        results, _ = self.chat_with_code(input, artifacts)
+        return results

     def chat_with_code(
         self,
         chat: List[Message],
-    ) -> List[Message]:
+        artifacts: Optional[Artifacts] = None,
+    ) -> Tuple[List[Message], Artifacts]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.

         Parameters:
-            chat (List[Message]): A conversation
-                in the format of:
+            chat (List[Message]): A conversation in the format of:
                 [{"role": "user", "content": "describe your task here..."}]
                 or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
+            artifacts (Optional[Artifacts]): The artifacts to use in the task.

         Returns:
             List[Message]: The conversation response.
@@ -173,6 +187,10 @@ class VisionAgent(Agent):
         if not chat:
             raise ValueError("chat cannot be empty")

+        if not artifacts:
+            # this is setting remote artifacts path
+            artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
+
         with CodeInterpreterFactory.new_instance(
             code_sandbox_runtime=self.code_sandbox_runtime
         ) as code_interpreter:
@@ -182,9 +200,14 @@ class VisionAgent(Agent):
             for chat_i in int_chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
-                        media = code_interpreter.upload_file(media)
-                        chat_i["content"] += f" Media name {media}"  # type: ignore
-                        media_list.append(media)
+                        media = cast(str, media)
+                        artifacts.artifacts[Path(media).name] = open(media, "rb").read()
+
+                        media_remote_path = (
+                            Path(code_interpreter.remote_path) / Path(media).name
+                        )
+                        chat_i["content"] += f" Media name {media_remote_path}"  # type: ignore
+                        media_list.append(media_remote_path)

             int_chat = cast(
                 List[Message],
@@ -204,6 +227,22 @@ class VisionAgent(Agent):

             finished = False
             iterations = 0
+            last_response = None
+
+            # Save the current state of artifacts, will include any images the user
+            # passed in.
+            artifacts.save(self.local_artifacts_path)
+
+            # Upload artifacts to remote location and show where they are going
+            # to be loaded to. The actual loading happens in BoilerplateCode as
+            # part of the pre_code.
+            remote_artifacts_path = code_interpreter.upload_file(
+                self.local_artifacts_path
+            )
+            artifacts_loaded = artifacts.show()
+            int_chat.append({"role": "observation", "content": artifacts_loaded})
+            orig_chat.append({"role": "observation", "content": artifacts_loaded})
+
             while not finished and iterations < self.max_iterations:
                 response = run_conversation(self.agent, int_chat)
                 if self.verbosity >= 1:
@@ -211,20 +250,39 @@ class VisionAgent(Agent):
                 int_chat.append({"role": "assistant", "content": str(response)})
                 orig_chat.append({"role": "assistant", "content": str(response)})

+                # sometimes it gets stuck in a loop, so we force it to exit
+                if last_response == response:
+                    response["let_user_respond"] = True
+
                 if response["let_user_respond"]:
                     break

                 code_action = parse_execution(response["response"])

                 if code_action is not None:
-                    obs = run_code_action(code_action, code_interpreter)
+                    result = run_code_action(
+                        code_action, code_interpreter, str(remote_artifacts_path)
+                    )
+                    obs = str(result.logs)
+
                     if self.verbosity >= 1:
                         _LOGGER.info(obs)
+                    # don't add execution results to internal chat
                     int_chat.append({"role": "observation", "content": obs})
-                    orig_chat.append({"role": "observation", "content": obs})
+                    orig_chat.append(
+                        {"role": "observation", "content": obs, "execution": result}
+                    )

                 iterations += 1
-            return orig_chat
+                last_response = response
+
+            # after running the agent, download the artifacts locally
+            code_interpreter.download_file(
+                str(remote_artifacts_path.name), str(self.local_artifacts_path)
+            )
+            artifacts.load(self.local_artifacts_path)
+            artifacts.save()
+            return orig_chat, artifacts

     def log_progress(self, data: Dict[str, Any]) -> None:
         pass
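Taken together, these hunks change the public flow of VisionAgent: user media are copied into an Artifacts container, the container is uploaded before the conversation loop, each executed action runs against it, and chat_with_code now returns both the conversation and the updated artifacts. A hedged end-to-end sketch (file names and the request are made up):

from vision_agent.agent import VisionAgent
from vision_agent.tools.meta_tools import Artifacts

agent = VisionAgent(verbosity=1)
artifacts = Artifacts("artifacts.pkl")  # local container backing the remote artifacts.pkl
messages, artifacts = agent.chat_with_code(
    [{"role": "user", "content": "Detect the dogs in this image", "media": ["dog.jpg"]}],
    artifacts,
)
# Observation messages now also carry the raw Execution under the "execution" key,
# and the returned artifacts hold any files the agent wrote remotely.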
@@ -718,9 +718,14 @@ class VisionAgentCoder(Agent):
             for chat_i in chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
-                        media = code_interpreter.upload_file(media)
+                        media = (
+                            media
+                            if type(media) is str
+                            and media.startswith(("http", "https"))
+                            else code_interpreter.upload_file(cast(str, media))
+                        )
                         chat_i["content"] += f" Media name {media}"  # type: ignore
-                        media_list.append(media)
+                        media_list.append(str(media))

             int_chat = cast(
                 List[Message],
@@ -744,29 +749,14 @@ class VisionAgentCoder(Agent):
             results = {"code": "", "test": "", "plan": []}
             plan = []
             success = False
-            self.log_progress(
-                {
-                    "type": "log",
-                    "log_content": "Creating plans",
-                    "status": "started",
-                }
-            )
-            plans = write_plans(
-                int_chat,
-                T.get_tool_descriptions_by_names(
-                    customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
-                ),
-                format_memory(working_memory),
-                self.planner,
+
+            plans = self._create_plans(
+                int_chat, customized_tool_names, working_memory, self.planner
             )

-            if self.verbosity >= 1:
-                for p in plans:
-                    # tabulate will fail if the keys are not the same for all elements
-                    p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
-                    _LOGGER.info(
-                        f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
-                    )
+            if test_multi_plan:
+                self._log_plans(plans, self.verbosity)
+
             tool_infos = retrieve_tools(
                 plans,
                 self.tool_recommender,
@@ -860,6 +850,39 @@ class VisionAgentCoder(Agent):
         if self.report_progress_callback is not None:
             self.report_progress_callback(data)

+    def _create_plans(
+        self,
+        int_chat: List[Message],
+        customized_tool_names: Optional[List[str]],
+        working_memory: List[Dict[str, str]],
+        planner: LMM,
+    ) -> Dict[str, Any]:
+        self.log_progress(
+            {
+                "type": "log",
+                "log_content": "Creating plans",
+                "status": "started",
+            }
+        )
+        plans = write_plans(
+            int_chat,
+            T.get_tool_descriptions_by_names(
+                customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
+            ),
+            format_memory(working_memory),
+            planner,
+        )
+        return plans
+
+    def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
+        if verbosity >= 1:
+            for p in plans:
+                # tabulate will fail if the keys are not the same for all elements
+                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
+                _LOGGER.info(
+                    f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                )
+

 class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.
@@ -1,7 +1,7 @@
 VA_CODE = """
 **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.

-**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>.
+**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.

 <execute_python>
 print("Hello World!")
@@ -15,7 +15,6 @@ This is the documentation for the different actions you can take:
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---

@@ -26,24 +25,28 @@ Here is an example of how you can interact with a user and Actions to complete a
 **Conversation**:
 Here is the current conversation so far:
 --- START CONVERSATION ---
-[Current directory: {dir}]
-
 {conversation}
 """

+
 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg

-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
+OBSERVATION:
+[Artifacts loaded]
+Artifact dog.jpg loaded to /path/to/images/dog.jpg
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /example/workspace/dog_detector.py]
+[Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image)
 4|    return dogs
-[End of file]
+[End of artifact]

 AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

@@ -56,18 +59,23 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask

 USER: The the image only has one dog, can you fix this?

-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
+[Artifacts loaded]
+Artifact dog.jpg loaded to /path/to/images/dog.jpg
+Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /example/workspace/dog_detector.py]
+[Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image, threshold=0.24)
 4|    return dogs
-[End of file]
+[End of artifact]

-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
@@ -76,23 +84,34 @@ OBSERVATION:
 AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
 """

+
 EXAMPLES_CODE2 = """
-USER: Can you create a function to count workers with helmets?
+USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
+
+OBSERVATION:
+[Artifacts loaded]
+[End of artifacts]

 AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}

 USER: Yes you can use workers.png

-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}
+OBSERVATION:
+[Artifacts loaded]
+Artifact workers.png loaded to /path/to/images/workers.png
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /example/workspace/code.py]
-0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
-1|def count_workers_with_helmets(image_path: str):
+[Artifact code.py]
+0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
+1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    workers = owl_v2("worker", image)
-4|    helmets = owl_v2("helmet", image)
-5|    count = 0
+3|    detections = owl_v2("worker, helmet", image)
+4|    workers = [d for d in detections if d['label'] == 'worker']
+5|    helmets = [d for d in detections if d['label'] == 'helmet']
+6|    count = 0
 6|    for worker in workers:
 7|        person_box = worker['bbox']
 8|        person_has_helmet = False
@@ -102,14 +121,16 @@ OBSERVATION:
 12|                break
 13|        if person_has_helmet:
 14|            count += 1
+15|    overlay_bounding_boxes(image, detections)
+16|    save_image(output_path, image)
 15|    return count
-[End of file]
+[End of artifact]

-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
 2

-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true}
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
@@ -5,9 +5,9 @@ from uuid import UUID
 from requests.exceptions import HTTPError

 from vision_agent.clients.http import BaseHTTP
-from vision_agent.utils.type_defs import LandingaiAPIKey
+from vision_agent.tools.tools_types import BboxInputBase64, JobStatus, PromptTask
 from vision_agent.utils.exceptions import FineTuneModelNotFound
-from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
+from vision_agent.utils.type_defs import LandingaiAPIKey


 class LandingPublicAPI(BaseHTTP):
vision_agent/lmm/lmm.py CHANGED
@@ -30,6 +30,12 @@ def encode_image_bytes(image: bytes) -> str:


 def encode_media(media: Union[str, Path]) -> str:
+    if type(media) is str and media.startswith(("http", "https")):
+        # for mp4 video url, we assume there is a same url but ends with png
+        # vision-agent-ui will upload this png when uploading the video
+        if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
+            return media[:-4] + ".png"
+        return media
     extension = "png"
     extension = Path(media).suffix
     if extension.lower() not in {
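Behavioral sketch of the new branch above (the URLs are made up): plain http(s) media are passed through untouched, while .mp4/.mov URLs containing "vision-agent-dev.s3" are rewritten to a sibling .png that the UI is assumed to have uploaded alongside the video.

from vision_agent.lmm.lmm import encode_media

print(encode_media("https://example.com/cat.jpg"))
# -> "https://example.com/cat.jpg" (returned as-is, no base64 encoding)
print(encode_media("https://vision-agent-dev.s3.amazonaws.com/videos/clip.mp4"))
# -> "https://vision-agent-dev.s3.amazonaws.com/videos/clip.png"
# Local file paths still fall through to the existing base64 path below this branch.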
@@ -132,13 +138,17 @@ class OpenAILMM(LMM):
             fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
             if "media" in c:
                 for media in c["media"]:
-                    encoded_media = encode_media(media)
+                    encoded_media = encode_media(cast(str, media))

                     fixed_c["content"].append(  # type: ignore
                         {
                             "type": "image_url",
                             "image_url": {
-                                "url": f"data:image/png;base64,{encoded_media}",
+                                "url": (
+                                    encoded_media
+                                    if encoded_media.startswith(("http", "https"))
+                                    else f"data:image/png;base64,{encoded_media}"
+                                ),
                                 "detail": "low",
                             },
                         },
@@ -379,7 +389,9 @@ class OllamaLMM(LMM):
         fixed_chat = []
         for message in chat:
             if "media" in message:
-                message["images"] = [encode_media(m) for m in message["media"]]
+                message["images"] = [
+                    encode_media(cast(str, m)) for m in message["media"]
+                ]
                 del message["media"]
             fixed_chat.append(message)
         url = f"{self.url}/chat"
@@ -390,7 +402,6 @@ class OllamaLMM(LMM):
         tmp_kwargs = self.kwargs | kwargs
         data.update(tmp_kwargs)
         if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
-
             json_data = json.dumps(data)

             def f() -> Iterator[Optional[str]]:
@@ -424,7 +435,6 @@ class OllamaLMM(LMM):
         media: Optional[List[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
-
         url = f"{self.url}/generate"
         data: Dict[str, Any] = {
             "model": self.model_name,
@@ -439,7 +449,6 @@ class OllamaLMM(LMM):
         tmp_kwargs = self.kwargs | kwargs
         data.update(tmp_kwargs)
         if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
-
             json_data = json.dumps(data)

             def f() -> Iterator[Optional[str]]:
vision_agent/lmm/types.py CHANGED
@@ -1,5 +1,7 @@
 from pathlib import Path
 from typing import Dict, Sequence, Union

+from vision_agent.utils.execute import Execution
+
 TextOrImage = Union[str, Sequence[Union[str, Path]]]
-Message = Dict[str, TextOrImage]
+Message = Dict[str, Union[TextOrImage, Execution]]
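Since Message now admits an Execution value, an observation entry can carry the raw sandbox result alongside its text, which is how orig_chat is populated in the VisionAgent hunk above. A small illustrative helper (the helper itself is hypothetical, not part of the package):

from vision_agent.lmm import Message
from vision_agent.utils.execute import Execution

def observation_message(obs_text: str, result: Execution) -> Message:
    # mirrors the dict built in chat_with_code
    return {"role": "observation", "content": obs_text, "execution": result}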
@@ -1,6 +1,6 @@
 from typing import Callable, List, Optional

-from .meta_tools import META_TOOL_DOCSTRING
+from .meta_tools import META_TOOL_DOCSTRING, Artifacts
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tool_utils import get_tool_descriptions_by_names
 from .tools import (
@@ -21,8 +21,8 @@ from .tools import (
     dpt_hybrid_midas,
     extract_frames,
     florence2_image_caption,
-    florence2_object_detection,
     florence2_ocr,
+    florence2_phrase_grounding,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video,
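The last hunk renames the exported Florence-2 grounding tool, so downstream imports need a one-line update. A hedged before/after sketch:

# Before (0.2.117):
# from vision_agent.tools import florence2_object_detection
# After (0.2.119):
from vision_agent.tools import florence2_phrase_grounding
# The call style presumably mirrors the owl_v2 examples above,
# e.g. florence2_phrase_grounding("dog", image).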