vision-agent 0.2.117__py3-none-any.whl → 0.2.119__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -11,7 +11,7 @@ class Agent(ABC):
11
11
  self,
12
12
  input: Union[str, List[Message]],
13
13
  media: Optional[Union[str, Path]] = None,
14
- ) -> str:
14
+ ) -> Union[str, List[Message]]:
15
15
  pass
16
16
 
17
17
  @abstractmethod
@@ -1,8 +1,9 @@
1
1
  import copy
2
2
  import logging
3
3
  import os
4
+ import tempfile
4
5
  from pathlib import Path
5
- from typing import Any, Dict, List, Optional, Union, cast
6
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast
6
7
 
7
8
  from vision_agent.agent import Agent
8
9
  from vision_agent.agent.agent_utils import extract_json
@@ -13,8 +14,9 @@ from vision_agent.agent.vision_agent_prompts import (
13
14
  )
14
15
  from vision_agent.lmm import LMM, Message, OpenAILMM
15
16
  from vision_agent.tools import META_TOOL_DOCSTRING
17
+ from vision_agent.tools.meta_tools import Artifacts
16
18
  from vision_agent.utils import CodeInterpreterFactory
17
- from vision_agent.utils.execute import CodeInterpreter
19
+ from vision_agent.utils.execute import CodeInterpreter, Execution
18
20
 
19
21
  logging.basicConfig(level=logging.INFO)
20
22
  _LOGGER = logging.getLogger(__name__)
@@ -24,23 +26,30 @@ if str(WORKSPACE) != "":
24
26
  os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}"
25
27
 
26
28
 
27
- class DefaultImports:
28
- code = [
29
+ class BoilerplateCode:
30
+ pre_code = [
29
31
  "from typing import *",
30
32
  "from vision_agent.utils.execute import CodeInterpreter",
31
- "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
33
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
34
+ "artifacts = Artifacts('{remote_path}')",
35
+ "artifacts.load('{remote_path}')",
36
+ ]
37
+ post_code = [
38
+ "artifacts.save()",
32
39
  ]
33
40
 
34
41
  @staticmethod
35
- def to_code_string() -> str:
36
- return "\n".join(DefaultImports.code)
37
-
38
- @staticmethod
39
- def prepend_imports(code: str) -> str:
42
+ def add_boilerplate(code: str, **format: Any) -> str:
40
43
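  """Run this method to wrap the code with the boilerplate: pre_code (default imports and artifact loading) is prepended and post_code (artifact saving) is appended.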
41
44
  NOTE: be sure to run this method after the custom tools have been registered.
42
45
  """
43
- return DefaultImports.to_code_string() + "\n\n" + code
46
+ return (
47
+ "\n".join([s.format(**format) for s in BoilerplateCode.pre_code])
48
+ + "\n\n"
49
+ + code
50
+ + "\n\n"
51
+ + "\n".join([s.format(**format) for s in BoilerplateCode.post_code])
52
+ )
44
53
 
45
54
 
46
55
  def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
@@ -60,35 +69,17 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
60
69
  prompt = VA_CODE.format(
61
70
  documentation=META_TOOL_DOCSTRING,
62
71
  examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
63
- dir=WORKSPACE,
64
72
  conversation=conversation,
65
73
  )
66
74
  return extract_json(orch([{"role": "user", "content": prompt}], stream=False)) # type: ignore
67
75
 
68
76
 
69
- def run_code_action(code: str, code_interpreter: CodeInterpreter) -> str:
70
- # Note the code interpreter needs to keep running in the same environment because
71
- # the SWE tools hold state like line numbers and currently open files.
72
- result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code))
73
-
74
- return_str = ""
75
- if result.success:
76
- for res in result.results:
77
- if res.text is not None:
78
- return_str += res.text.replace("\\n", "\n")
79
- if result.logs.stdout:
80
- return_str += "----- stdout -----\n"
81
- for log in result.logs.stdout:
82
- return_str += log.replace("\\n", "\n")
83
- else:
84
- # for log in result.logs.stderr:
85
- # return_str += log.replace("\\n", "\n")
86
- if result.error:
87
- return_str += (
88
- "\n" + result.error.value + "\n".join(result.error.traceback_raw)
89
- )
90
-
91
- return return_str
77
+ def run_code_action(
78
+ code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
79
+ ) -> Execution:
80
+ return code_interpreter.exec_isolation(
81
+ BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
82
+ )
92
83
 
93
84
 
94
85
  def parse_execution(response: str) -> Optional[str]:
@@ -101,8 +92,8 @@ def parse_execution(response: str) -> Optional[str]:
101
92
 
102
93
  class VisionAgent(Agent):
103
94
  """Vision Agent is an agent that can chat with the user and call tools or other
104
- agents to generate code for it. Vision Agent uses python code to execute actions for
105
- the user. Vision Agent is inspired by by OpenDev
95
+ agents to generate code for it. Vision Agent uses python code to execute actions
96
+ for the user. Vision Agent is inspired by OpenDevin
106
97
  https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030
107
98
 
108
99
  Example
@@ -118,8 +109,20 @@ class VisionAgent(Agent):
118
109
  self,
119
110
  agent: Optional[LMM] = None,
120
111
  verbosity: int = 0,
112
+ local_artifacts_path: Optional[Union[str, Path]] = None,
121
113
  code_sandbox_runtime: Optional[str] = None,
122
114
  ) -> None:
115
+ """Initialize the VisionAgent.
116
+
117
+ Parameters:
118
+ agent (Optional[LMM]): The agent to use for conversation and orchestration
119
+ of other agents.
120
+ verbosity (int): The verbosity level of the agent.
121
+ local_artifacts_path (Optional[Union[str, Path]]): The path to the local
122
+ artifacts file.
123
+ code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
124
+ """
125
+
123
126
  self.agent = (
124
127
  OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
125
128
  )
@@ -128,12 +131,21 @@ class VisionAgent(Agent):
128
131
  self.code_sandbox_runtime = code_sandbox_runtime
129
132
  if self.verbosity >= 1:
130
133
  _LOGGER.setLevel(logging.INFO)
134
+ self.local_artifacts_path = cast(
135
+ str,
136
+ (
137
+ Path(local_artifacts_path)
138
+ if local_artifacts_path is not None
139
+ else Path(tempfile.NamedTemporaryFile(delete=False).name)
140
+ ),
141
+ )
131
142
 
132
143
  def __call__(
133
144
  self,
134
145
  input: Union[str, List[Message]],
135
146
  media: Optional[Union[str, Path]] = None,
136
- ) -> str:
147
+ artifacts: Optional[Artifacts] = None,
148
+ ) -> List[Message]:
137
149
  """Chat with VisionAgent and get the conversation response.
138
150
 
139
151
  Parameters:
@@ -141,6 +153,7 @@ class VisionAgent(Agent):
141
153
  [{"role": "user", "content": "describe your task here..."}, ...] or a
142
154
  string of just the contents.
143
155
  media (Optional[Union[str, Path]]): The media file to be used in the task.
156
+ artifacts (Optional[Artifacts]): The artifacts to use in the task.
144
157
 
145
158
  Returns:
146
159
  List[Message]: The conversation response.
@@ -149,22 +162,23 @@ class VisionAgent(Agent):
149
162
  input = [{"role": "user", "content": input}]
150
163
  if media is not None:
151
164
  input[0]["media"] = [media]
152
- results = self.chat_with_code(input)
153
- return results # type: ignore
165
+ results, _ = self.chat_with_code(input, artifacts)
166
+ return results
154
167
 
155
168
  def chat_with_code(
156
169
  self,
157
170
  chat: List[Message],
158
- ) -> List[Message]:
171
+ artifacts: Optional[Artifacts] = None,
172
+ ) -> Tuple[List[Message], Artifacts]:
159
173
  """Chat with VisionAgent, it will use code to execute actions to accomplish
160
174
  its tasks.
161
175
 
162
176
  Parameters:
163
- chat (List[Message]): A conversation
164
- in the format of:
177
+ chat (List[Message]): A conversation in the format of:
165
178
  [{"role": "user", "content": "describe your task here..."}]
166
179
  or if it contains media files, it should be in the format of:
167
180
  [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
181
+ artifacts (Optional[Artifacts]): The artifacts to use in the task.
168
182
 
169
183
  Returns:
170
184
  List[Message]: The conversation response.
@@ -173,6 +187,10 @@ class VisionAgent(Agent):
173
187
  if not chat:
174
188
  raise ValueError("chat cannot be empty")
175
189
 
190
+ if not artifacts:
191
+ # this is setting remote artifacts path
192
+ artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
193
+
176
194
  with CodeInterpreterFactory.new_instance(
177
195
  code_sandbox_runtime=self.code_sandbox_runtime
178
196
  ) as code_interpreter:
@@ -182,9 +200,14 @@ class VisionAgent(Agent):
182
200
  for chat_i in int_chat:
183
201
  if "media" in chat_i:
184
202
  for media in chat_i["media"]:
185
- media = code_interpreter.upload_file(media)
186
- chat_i["content"] += f" Media name {media}" # type: ignore
187
- media_list.append(media)
203
+ media = cast(str, media)
204
+ artifacts.artifacts[Path(media).name] = open(media, "rb").read()
205
+
206
+ media_remote_path = (
207
+ Path(code_interpreter.remote_path) / Path(media).name
208
+ )
209
+ chat_i["content"] += f" Media name {media_remote_path}" # type: ignore
210
+ media_list.append(media_remote_path)
188
211
 
189
212
  int_chat = cast(
190
213
  List[Message],
@@ -204,6 +227,22 @@ class VisionAgent(Agent):
204
227
 
205
228
  finished = False
206
229
  iterations = 0
230
+ last_response = None
231
+
232
+ # Save the current state of artifacts, will include any images the user
233
+ # passed in.
234
+ artifacts.save(self.local_artifacts_path)
235
+
236
+ # Upload artifacts to remote location and show where they are going
237
+ # to be loaded to. The actual loading happens in BoilerplateCode as
238
+ # part of the pre_code.
239
+ remote_artifacts_path = code_interpreter.upload_file(
240
+ self.local_artifacts_path
241
+ )
242
+ artifacts_loaded = artifacts.show()
243
+ int_chat.append({"role": "observation", "content": artifacts_loaded})
244
+ orig_chat.append({"role": "observation", "content": artifacts_loaded})
245
+
207
246
  while not finished and iterations < self.max_iterations:
208
247
  response = run_conversation(self.agent, int_chat)
209
248
  if self.verbosity >= 1:
@@ -211,20 +250,39 @@ class VisionAgent(Agent):
211
250
  int_chat.append({"role": "assistant", "content": str(response)})
212
251
  orig_chat.append({"role": "assistant", "content": str(response)})
213
252
 
253
+ # sometimes it gets stuck in a loop, so we force it to exit
254
+ if last_response == response:
255
+ response["let_user_respond"] = True
256
+
214
257
  if response["let_user_respond"]:
215
258
  break
216
259
 
217
260
  code_action = parse_execution(response["response"])
218
261
 
219
262
  if code_action is not None:
220
- obs = run_code_action(code_action, code_interpreter)
263
+ result = run_code_action(
264
+ code_action, code_interpreter, str(remote_artifacts_path)
265
+ )
266
+ obs = str(result.logs)
267
+
221
268
  if self.verbosity >= 1:
222
269
  _LOGGER.info(obs)
270
+ # the internal chat gets only the text observation; the Execution result is attached to orig_chat only
223
271
  int_chat.append({"role": "observation", "content": obs})
224
- orig_chat.append({"role": "observation", "content": obs})
272
+ orig_chat.append(
273
+ {"role": "observation", "content": obs, "execution": result}
274
+ )
225
275
 
226
276
  iterations += 1
227
- return orig_chat
277
+ last_response = response
278
+
279
+ # after running the agent, download the artifacts locally
280
+ code_interpreter.download_file(
281
+ str(remote_artifacts_path.name), str(self.local_artifacts_path)
282
+ )
283
+ artifacts.load(self.local_artifacts_path)
284
+ artifacts.save()
285
+ return orig_chat, artifacts
228
286
 
229
287
  def log_progress(self, data: Dict[str, Any]) -> None:
230
288
  pass
@@ -718,9 +718,14 @@ class VisionAgentCoder(Agent):
718
718
  for chat_i in chat:
719
719
  if "media" in chat_i:
720
720
  for media in chat_i["media"]:
721
- media = code_interpreter.upload_file(media)
721
+ media = (
722
+ media
723
+ if type(media) is str
724
+ and media.startswith(("http", "https"))
725
+ else code_interpreter.upload_file(cast(str, media))
726
+ )
722
727
  chat_i["content"] += f" Media name {media}" # type: ignore
723
- media_list.append(media)
728
+ media_list.append(str(media))
724
729
 
725
730
  int_chat = cast(
726
731
  List[Message],
@@ -744,29 +749,14 @@ class VisionAgentCoder(Agent):
744
749
  results = {"code": "", "test": "", "plan": []}
745
750
  plan = []
746
751
  success = False
747
- self.log_progress(
748
- {
749
- "type": "log",
750
- "log_content": "Creating plans",
751
- "status": "started",
752
- }
753
- )
754
- plans = write_plans(
755
- int_chat,
756
- T.get_tool_descriptions_by_names(
757
- customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
758
- ),
759
- format_memory(working_memory),
760
- self.planner,
752
+
753
+ plans = self._create_plans(
754
+ int_chat, customized_tool_names, working_memory, self.planner
761
755
  )
762
756
 
763
- if self.verbosity >= 1:
764
- for p in plans:
765
- # tabulate will fail if the keys are not the same for all elements
766
- p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
767
- _LOGGER.info(
768
- f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
769
- )
757
+ if test_multi_plan:
758
+ self._log_plans(plans, self.verbosity)
759
+
770
760
  tool_infos = retrieve_tools(
771
761
  plans,
772
762
  self.tool_recommender,
@@ -860,6 +850,39 @@ class VisionAgentCoder(Agent):
860
850
  if self.report_progress_callback is not None:
861
851
  self.report_progress_callback(data)
862
852
 
853
+ def _create_plans(
854
+ self,
855
+ int_chat: List[Message],
856
+ customized_tool_names: Optional[List[str]],
857
+ working_memory: List[Dict[str, str]],
858
+ planner: LMM,
859
+ ) -> Dict[str, Any]:
860
+ self.log_progress(
861
+ {
862
+ "type": "log",
863
+ "log_content": "Creating plans",
864
+ "status": "started",
865
+ }
866
+ )
867
+ plans = write_plans(
868
+ int_chat,
869
+ T.get_tool_descriptions_by_names(
870
+ customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
871
+ ),
872
+ format_memory(working_memory),
873
+ planner,
874
+ )
875
+ return plans
876
+
877
+ def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
878
+ if verbosity >= 1:
879
+ for p in plans:
880
+ # tabulate will fail if the keys are not the same for all elements
881
+ p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
882
+ _LOGGER.info(
883
+ f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
884
+ )
885
+
863
886
 
864
887
  class OllamaVisionAgentCoder(VisionAgentCoder):
865
888
  """VisionAgentCoder that uses Ollama models for planning, coding, testing.
@@ -1,7 +1,7 @@
1
1
  VA_CODE = """
2
2
  **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.
3
3
 
4
- **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>.
4
+ **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
5
5
 
6
6
  <execute_python>
7
7
  print("Hello World!")
@@ -15,7 +15,6 @@ This is the documentation for the different actions you can take:
15
15
  **Examples**:
16
16
  Here is an example of how you can interact with a user and Actions to complete a task:
17
17
  --- START EXAMPLES ---
18
- [Current directory: /example/workspace]
19
18
  {examples}
20
19
  --- END EXAMPLES ---
21
20
 
@@ -26,24 +25,28 @@ Here is an example of how you can interact with a user and Actions to complete a
26
25
  **Conversation**:
27
26
  Here is the current conversation so far:
28
27
  --- START CONVERSATION ---
29
- [Current directory: {dir}]
30
-
31
28
  {conversation}
32
29
  """
33
30
 
31
+
34
32
  EXAMPLES_CODE1 = """
35
33
  USER: Can you detect the dogs in this image? Media name dog.jpg
36
34
 
37
- AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
35
+ OBSERVATION:
36
+ [Artifacts loaded]
37
+ Artifact dog.jpg loaded to /path/to/images/dog.jpg
38
+ [End of artifacts]
39
+
40
+ AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
38
41
 
39
42
  OBSERVATION:
40
- [File /example/workspace/dog_detector.py]
43
+ [Artifact dog_detector.py]
41
44
  0|from vision_agent.tools import load_image, owl_v2
42
45
  1|def detect_dogs(image_path: str):
43
46
  2| image = load_image(image_path)
44
47
  3| dogs = owl_v2("dog", image)
45
48
  4| return dogs
46
- [End of file]
49
+ [End of artifact]
47
50
 
48
51
  AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
49
52
 
@@ -56,18 +59,23 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
56
59
 
57
60
  USER: The the image only has one dog, can you fix this?
58
61
 
59
- AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
62
+ [Artifacts loaded]
63
+ Artifact dog.jpg loaded to /path/to/images/dog.jpg
64
+ Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
65
+ [End of artifacts]
66
+
67
+ AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
60
68
 
61
69
  OBSERVATION:
62
- [File /example/workspace/dog_detector.py]
70
+ [Artifact dog_detector.py]
63
71
  0|from vision_agent.tools import load_image, owl_v2
64
72
  1|def detect_dogs(image_path: str):
65
73
  2| image = load_image(image_path)
66
74
  3| dogs = owl_v2("dog", image, threshold=0.24)
67
75
  4| return dogs
68
- [End of file]
76
+ [End of artifact]
69
77
 
70
- AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
78
+ AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
71
79
 
72
80
  OBSERVATION:
73
81
  ----- stdout -----
@@ -76,23 +84,34 @@ OBSERVATION:
76
84
  AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
77
85
  """
78
86
 
87
+
79
88
  EXAMPLES_CODE2 = """
80
- USER: Can you create a function to count workers with helmets?
89
+ USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
90
+
91
+ OBSERVATION:
92
+ [Artifacts loaded]
93
+ [End of artifacts]
81
94
 
82
95
  AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
83
96
 
84
97
  USER: Yes you can use workers.png
85
98
 
86
- AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}
99
+ OBSERVATION:
100
+ [Artifacts loaded]
101
+ Artifact workers.png loaded to /path/to/images/workers.png
102
+ [End of artifacts]
103
+
104
+ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}
87
105
 
88
106
  OBSERVATION:
89
- [File /example/workspace/code.py]
90
- 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
91
- 1|def count_workers_with_helmets(image_path: str):
107
+ [Artifact code.py]
108
+ 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
109
+ 1|def count_workers_with_helmets(image_path: str, output_path: str):
92
110
  2| image = load_image(image_path)
93
- 3| workers = owl_v2("worker", image)
94
- 4| helmets = owl_v2("helmet", image)
95
- 5| count = 0
111
+ 3| detections = owl_v2("worker, helmet", image)
112
+ 4| workers = [d for d in detections if d['label'] == 'worker']
113
+ 5| helmets = [d for d in detections if d['label'] == 'helmet']
114
+ 6| count = 0
96
115
  6| for worker in workers:
97
116
  7| person_box = worker['bbox']
98
117
  8| person_has_helmet = False
@@ -102,14 +121,16 @@ OBSERVATION:
102
121
  12| break
103
122
  13| if person_has_helmet:
104
123
  14| count += 1
124
+ 15| overlay_bounding_boxes(image, detections)
125
+ 16| save_image(output_path, image)
105
126
  15| return count
106
- [End of file]
127
+ [End of artifact]
107
128
 
108
- AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
129
+ AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
109
130
 
110
131
  OBSERVATION:
111
132
  ----- stdout -----
112
133
  2
113
134
 
114
- AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true}
135
+ AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
115
136
  """
@@ -5,9 +5,9 @@ from uuid import UUID
5
5
  from requests.exceptions import HTTPError
6
6
 
7
7
  from vision_agent.clients.http import BaseHTTP
8
- from vision_agent.utils.type_defs import LandingaiAPIKey
8
+ from vision_agent.tools.tools_types import BboxInputBase64, JobStatus, PromptTask
9
9
  from vision_agent.utils.exceptions import FineTuneModelNotFound
10
- from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
10
+ from vision_agent.utils.type_defs import LandingaiAPIKey
11
11
 
12
12
 
13
13
  class LandingPublicAPI(BaseHTTP):
vision_agent/lmm/lmm.py CHANGED
@@ -30,6 +30,12 @@ def encode_image_bytes(image: bytes) -> str:
30
30
 
31
31
 
32
32
  def encode_media(media: Union[str, Path]) -> str:
33
+ if type(media) is str and media.startswith(("http", "https")):
34
+ # for an mp4/mov video URL, we assume a sibling URL exists that ends with .png instead
35
+ # vision-agent-ui will upload this png when uploading the video
36
+ if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
37
+ return media[:-4] + ".png"
38
+ return media
33
39
  extension = "png"
34
40
  extension = Path(media).suffix
35
41
  if extension.lower() not in {
@@ -132,13 +138,17 @@ class OpenAILMM(LMM):
132
138
  fixed_c["content"] = [{"type": "text", "text": c["content"]}] # type: ignore
133
139
  if "media" in c:
134
140
  for media in c["media"]:
135
- encoded_media = encode_media(media)
141
+ encoded_media = encode_media(cast(str, media))
136
142
 
137
143
  fixed_c["content"].append( # type: ignore
138
144
  {
139
145
  "type": "image_url",
140
146
  "image_url": {
141
- "url": f"data:image/png;base64,{encoded_media}",
147
+ "url": (
148
+ encoded_media
149
+ if encoded_media.startswith(("http", "https"))
150
+ else f"data:image/png;base64,{encoded_media}"
151
+ ),
142
152
  "detail": "low",
143
153
  },
144
154
  },
@@ -379,7 +389,9 @@ class OllamaLMM(LMM):
379
389
  fixed_chat = []
380
390
  for message in chat:
381
391
  if "media" in message:
382
- message["images"] = [encode_media(m) for m in message["media"]]
392
+ message["images"] = [
393
+ encode_media(cast(str, m)) for m in message["media"]
394
+ ]
383
395
  del message["media"]
384
396
  fixed_chat.append(message)
385
397
  url = f"{self.url}/chat"
@@ -390,7 +402,6 @@ class OllamaLMM(LMM):
390
402
  tmp_kwargs = self.kwargs | kwargs
391
403
  data.update(tmp_kwargs)
392
404
  if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
393
-
394
405
  json_data = json.dumps(data)
395
406
 
396
407
  def f() -> Iterator[Optional[str]]:
@@ -424,7 +435,6 @@ class OllamaLMM(LMM):
424
435
  media: Optional[List[Union[str, Path]]] = None,
425
436
  **kwargs: Any,
426
437
  ) -> Union[str, Iterator[Optional[str]]]:
427
-
428
438
  url = f"{self.url}/generate"
429
439
  data: Dict[str, Any] = {
430
440
  "model": self.model_name,
@@ -439,7 +449,6 @@ class OllamaLMM(LMM):
439
449
  tmp_kwargs = self.kwargs | kwargs
440
450
  data.update(tmp_kwargs)
441
451
  if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
442
-
443
452
  json_data = json.dumps(data)
444
453
 
445
454
  def f() -> Iterator[Optional[str]]:
vision_agent/lmm/types.py CHANGED
@@ -1,5 +1,7 @@
1
1
  from pathlib import Path
2
2
  from typing import Dict, Sequence, Union
3
3
 
4
+ from vision_agent.utils.execute import Execution
5
+
4
6
  TextOrImage = Union[str, Sequence[Union[str, Path]]]
5
- Message = Dict[str, TextOrImage]
7
+ Message = Dict[str, Union[TextOrImage, Execution]]
@@ -1,6 +1,6 @@
1
1
  from typing import Callable, List, Optional
2
2
 
3
- from .meta_tools import META_TOOL_DOCSTRING
3
+ from .meta_tools import META_TOOL_DOCSTRING, Artifacts
4
4
  from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
5
5
  from .tool_utils import get_tool_descriptions_by_names
6
6
  from .tools import (
@@ -21,8 +21,8 @@ from .tools import (
21
21
  dpt_hybrid_midas,
22
22
  extract_frames,
23
23
  florence2_image_caption,
24
- florence2_object_detection,
25
24
  florence2_ocr,
25
+ florence2_phrase_grounding,
26
26
  florence2_roberta_vqa,
27
27
  florence2_sam2_image,
28
28
  florence2_sam2_video,