vision-agent 0.2.174__py3-none-any.whl → 0.2.175__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -72,7 +72,9 @@ def extract_json(json_str: str) -> Dict[str, Any]:
72
72
  if json_dict is None:
73
73
  error_msg = f"Could not extract JSON from the given str: {json_orig}"
74
74
  _LOGGER.exception(error_msg)
75
- raise ValueError(error_msg)
75
+ raise json.JSONDecodeError(
76
+ msg="Could not extract JSON", doc=json_orig, pos=0
77
+ )
76
78
 
77
79
  return json_dict
78
80
 
@@ -2,7 +2,6 @@ import copy
2
2
  import json
3
3
  import logging
4
4
  import os
5
- import tempfile
6
5
  from pathlib import Path
7
6
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
8
7
 
@@ -12,6 +11,7 @@ from vision_agent.agent.vision_agent_prompts import (
12
11
  EXAMPLES_CODE1,
13
12
  EXAMPLES_CODE2,
14
13
  EXAMPLES_CODE3,
14
+ EXAMPLES_CODE3_EXTRA2,
15
15
  VA_CODE,
16
16
  )
17
17
  from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
@@ -19,7 +19,6 @@ from vision_agent.tools.meta_tools import (
19
19
  META_TOOL_DOCSTRING,
20
20
  Artifacts,
21
21
  check_and_load_image,
22
- extract_and_save_files_to_artifacts,
23
22
  use_extra_vision_agent_args,
24
23
  )
25
24
  from vision_agent.utils import CodeInterpreterFactory
@@ -37,11 +36,12 @@ class BoilerplateCode:
37
36
  pre_code = [
38
37
  "from typing import *",
39
38
  "from vision_agent.utils.execute import CodeInterpreter",
40
- "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
41
- "artifacts = Artifacts('{remote_path}')",
39
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, list_artifacts, capture_files_into_artifacts",
40
+ "artifacts = Artifacts('{remote_path}', '{remote_path}')",
42
41
  "artifacts.load('{remote_path}')",
43
42
  ]
44
43
  post_code = [
44
+ "capture_files_into_artifacts(artifacts)",
45
45
  "artifacts.save()",
46
46
  ]
47
47
 
@@ -97,8 +97,9 @@ def _clean_response(response: str) -> str:
97
97
  def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
98
98
  chat = copy.deepcopy(chat)
99
99
 
100
+ # only add 10 most recent messages in the chat to not go over token limit
100
101
  conversation = ""
101
- for chat_i in chat:
102
+ for chat_i in chat[-10:]:
102
103
  if chat_i["role"] == "user":
103
104
  conversation += f"USER: {chat_i['content']}\n\n"
104
105
  elif chat_i["role"] == "observation":
@@ -110,7 +111,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
110
111
 
111
112
  prompt = VA_CODE.format(
112
113
  documentation=META_TOOL_DOCSTRING,
113
- examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
114
+ examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}\n{EXAMPLES_CODE3_EXTRA2}",
114
115
  conversation=conversation,
115
116
  )
116
117
  message: Message = {"role": "user", "content": prompt}
@@ -120,7 +121,9 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
120
121
  and "media" in chat[-1]
121
122
  and len(chat[-1]["media"]) > 0 # type: ignore
122
123
  ):
123
- message["media"] = chat[-1]["media"]
124
+ media_obs = [media for media in chat[-1]["media"] if Path(media).exists()] # type: ignore
125
+ if len(media_obs) > 0:
126
+ message["media"] = media_obs # type: ignore
124
127
  conv_resp = cast(str, orch([message], stream=False))
125
128
 
126
129
  # clean the response first, if we are executing code, do not resond or end
@@ -144,16 +147,16 @@ def execute_code_action(
144
147
  artifacts: Artifacts,
145
148
  code: str,
146
149
  code_interpreter: CodeInterpreter,
147
- artifact_remote_path: str,
148
150
  ) -> Tuple[Execution, str]:
149
151
  result = code_interpreter.exec_isolation(
150
- BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
152
+ BoilerplateCode.add_boilerplate(
153
+ code, remote_path=str(artifacts.remote_save_path)
154
+ )
151
155
  )
152
156
 
153
157
  obs = str(result.logs)
154
158
  if result.error:
155
159
  obs += f"\n{result.error}"
156
- extract_and_save_files_to_artifacts(artifacts, code, obs)
157
160
  return result, obs
158
161
 
159
162
 
@@ -161,7 +164,6 @@ def execute_user_code_action(
161
164
  artifacts: Artifacts,
162
165
  last_user_message: Message,
163
166
  code_interpreter: CodeInterpreter,
164
- artifact_remote_path: str,
165
167
  ) -> Tuple[Optional[Execution], Optional[str]]:
166
168
  user_result = None
167
169
  user_obs = None
@@ -178,11 +180,10 @@ def execute_user_code_action(
178
180
  if user_code_action is not None:
179
181
  user_code_action = use_extra_vision_agent_args(user_code_action, False)
180
182
  user_result, user_obs = execute_code_action(
181
- artifacts, user_code_action, code_interpreter, artifact_remote_path
183
+ artifacts, user_code_action, code_interpreter
182
184
  )
183
185
  if user_result.error:
184
186
  user_obs += f"\n{user_result.error}"
185
- extract_and_save_files_to_artifacts(artifacts, user_code_action, user_obs)
186
187
  return user_result, user_obs
187
188
 
188
189
 
@@ -231,9 +232,18 @@ def old_format_to_new_format(old_format_str: str) -> str:
231
232
  except json.JSONDecodeError:
232
233
  return old_format_str
233
234
 
234
- thinking = old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
235
- let_user_respond = old_format["let_user_respond"]
236
- if "<execute_python>" in old_format["response"]:
235
+ if "thoughts" in old_format:
236
+ thinking = (
237
+ old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
238
+ )
239
+ else:
240
+ thinking = None
241
+
242
+ let_user_respond = (
243
+ old_format["let_user_respond"] if "let_user_respond" in old_format else True
244
+ )
245
+
246
+ if "response" in old_format and "<execute_python>" in old_format["response"]:
237
247
  execute_python = extract_tag(old_format["response"], "execute_python")
238
248
  response = (
239
249
  old_format["response"]
@@ -244,7 +254,7 @@ def old_format_to_new_format(old_format_str: str) -> str:
244
254
  )
245
255
  else:
246
256
  execute_python = None
247
- response = old_format["response"]
257
+ response = old_format["response"] if "response" in old_format else None
248
258
 
249
259
  return json.dumps(
250
260
  {
@@ -275,7 +285,6 @@ class VisionAgent(Agent):
275
285
  self,
276
286
  agent: Optional[LMM] = None,
277
287
  verbosity: int = 0,
278
- local_artifacts_path: Optional[Union[str, Path]] = None,
279
288
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
280
289
  code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
281
290
  ) -> None:
@@ -285,8 +294,6 @@ class VisionAgent(Agent):
285
294
  agent (Optional[LMM]): The agent to use for conversation and orchestration
286
295
  of other agents.
287
296
  verbosity (int): The verbosity level of the agent.
288
- local_artifacts_path (Optional[Union[str, Path]]): The path to the local
289
- artifacts file.
290
297
  callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
291
298
  function to send intermediate update messages.
292
299
  code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
@@ -302,14 +309,6 @@ class VisionAgent(Agent):
302
309
  self.callback_message = callback_message
303
310
  if self.verbosity >= 1:
304
311
  _LOGGER.setLevel(logging.INFO)
305
- self.local_artifacts_path = cast(
306
- str,
307
- (
308
- Path(local_artifacts_path)
309
- if local_artifacts_path is not None
310
- else Path(tempfile.NamedTemporaryFile(delete=False).name)
311
- ),
312
- )
313
312
 
314
313
  def __call__(
315
314
  self,
@@ -386,7 +385,7 @@ class VisionAgent(Agent):
386
385
 
387
386
  if not artifacts:
388
387
  # this is setting remote artifacts path
389
- artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
388
+ artifacts = Artifacts("", "")
390
389
 
391
390
  # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
392
391
  code_interpreter = (
@@ -395,8 +394,15 @@ class VisionAgent(Agent):
395
394
  and not isinstance(self.code_interpreter, str)
396
395
  else CodeInterpreterFactory.new_instance(
397
396
  code_sandbox_runtime=self.code_interpreter,
397
+ remote_path=artifacts.remote_save_path.parent,
398
398
  )
399
399
  )
400
+
401
+ if code_interpreter.remote_path != artifacts.remote_save_path.parent:
402
+ raise ValueError(
403
+ f"Code interpreter remote path {code_interpreter.remote_path} does not match {artifacts.remote_save_path.parent}"
404
+ )
405
+
400
406
  with code_interpreter:
401
407
  orig_chat = copy.deepcopy(chat)
402
408
  int_chat = copy.deepcopy(chat)
@@ -436,15 +442,13 @@ class VisionAgent(Agent):
436
442
 
437
443
  # Save the current state of artifacts, will include any images the user
438
444
  # passed in.
439
- artifacts.save(self.local_artifacts_path)
445
+ artifacts.save()
440
446
 
441
447
  # Upload artifacts to remote location and show where they are going
442
448
  # to be loaded to. The actual loading happens in BoilerplateCode as
443
449
  # part of the pre_code.
444
- remote_artifacts_path = code_interpreter.upload_file(
445
- self.local_artifacts_path
446
- )
447
- artifacts_loaded = artifacts.show(code_interpreter.remote_path)
450
+ code_interpreter.upload_file(artifacts.local_save_path)
451
+ artifacts_loaded = artifacts.show(artifacts.remote_save_path.parent)
448
452
  int_chat.append({"role": "observation", "content": artifacts_loaded})
449
453
  orig_chat.append({"role": "observation", "content": artifacts_loaded})
450
454
  self.streaming_message({"role": "observation", "content": artifacts_loaded})
@@ -453,7 +457,6 @@ class VisionAgent(Agent):
453
457
  artifacts,
454
458
  last_user_message,
455
459
  code_interpreter,
456
- str(remote_artifacts_path),
457
460
  )
458
461
  finished = user_result is not None and user_obs is not None
459
462
  if user_result is not None and user_obs is not None:
@@ -472,7 +475,16 @@ class VisionAgent(Agent):
472
475
  )
473
476
 
474
477
  while not finished and iterations < self.max_iterations:
478
+ # ensure we upload the artifacts before each turn, so any local
479
+ # modifications we made to it will be reflected in the remote
480
+ code_interpreter.upload_file(artifacts.local_save_path)
481
+
475
482
  response = run_conversation(self.agent, int_chat)
483
+ code_action = use_extra_vision_agent_args(
484
+ response.get("execute_python", None),
485
+ test_multi_plan,
486
+ custom_tool_names,
487
+ )
476
488
  if self.verbosity >= 1:
477
489
  _LOGGER.info(response)
478
490
  int_chat.append(
@@ -532,31 +544,20 @@ class VisionAgent(Agent):
532
544
  artifacts,
533
545
  code_action,
534
546
  code_interpreter,
535
- str(remote_artifacts_path),
536
547
  )
537
-
538
- media_obs = check_and_load_image(code_action)
539
-
540
- if self.verbosity >= 1:
541
- _LOGGER.info(obs)
542
-
543
548
  obs_chat_elt: Message = {"role": "observation", "content": obs}
549
+ media_obs = check_and_load_image(code_action)
544
550
  if media_obs and result.success:
545
- # for view_media_artifact, we need to ensure the media is loaded
546
- # locally so the conversation agent can actually see it
547
- code_interpreter.download_file(
548
- str(remote_artifacts_path.name),
549
- str(self.local_artifacts_path),
550
- )
551
- artifacts.load(
552
- self.local_artifacts_path,
553
- Path(self.local_artifacts_path).parent,
554
- )
551
+ # media paths will be under the local_save_path when we download
552
+ # them after each turn
555
553
  obs_chat_elt["media"] = [
556
- Path(self.local_artifacts_path).parent / media_ob
554
+ artifacts.local_save_path.parent / media_ob
557
555
  for media_ob in media_obs
558
556
  ]
559
557
 
558
+ if self.verbosity >= 1:
559
+ _LOGGER.info(obs)
560
+
560
561
  # don't add execution results to internal chat
561
562
  int_chat.append(obs_chat_elt)
562
563
  obs_chat_elt["execution"] = result
@@ -573,13 +574,15 @@ class VisionAgent(Agent):
573
574
  iterations += 1
574
575
  last_response = response
575
576
 
576
- # after running the agent, download the artifacts locally
577
- code_interpreter.download_file(
578
- str(remote_artifacts_path.name), str(self.local_artifacts_path)
579
- )
580
- artifacts.load(
581
- self.local_artifacts_path, Path(self.local_artifacts_path).parent
582
- )
577
+ # after each turn, download the artifacts locally
578
+ code_interpreter.download_file(
579
+ str(artifacts.remote_save_path.name),
580
+ str(artifacts.local_save_path),
581
+ )
582
+ artifacts.load(
583
+ artifacts.local_save_path, artifacts.local_save_path.parent
584
+ )
585
+
583
586
  return orig_chat, artifacts
584
587
 
585
588
  def streaming_message(self, message: Dict[str, Any]) -> None:
@@ -595,7 +598,6 @@ class OpenAIVisionAgent(VisionAgent):
595
598
  self,
596
599
  agent: Optional[LMM] = None,
597
600
  verbosity: int = 0,
598
- local_artifacts_path: Optional[Union[str, Path]] = None,
599
601
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
600
602
  code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
601
603
  ) -> None:
@@ -605,8 +607,6 @@ class OpenAIVisionAgent(VisionAgent):
605
607
  agent (Optional[LMM]): The agent to use for conversation and orchestration
606
608
  of other agents.
607
609
  verbosity (int): The verbosity level of the agent.
608
- local_artifacts_path (Optional[Union[str, Path]]): The path to the local
609
- artifacts file.
610
610
  callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
611
611
  function to send intermediate update messages.
612
612
  code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
@@ -619,7 +619,6 @@ class OpenAIVisionAgent(VisionAgent):
619
619
  super().__init__(
620
620
  agent,
621
621
  verbosity,
622
- local_artifacts_path,
623
622
  callback_message,
624
623
  code_interpreter,
625
624
  )
@@ -630,7 +629,6 @@ class AnthropicVisionAgent(VisionAgent):
630
629
  self,
631
630
  agent: Optional[LMM] = None,
632
631
  verbosity: int = 0,
633
- local_artifacts_path: Optional[Union[str, Path]] = None,
634
632
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
635
633
  code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
636
634
  ) -> None:
@@ -640,8 +638,6 @@ class AnthropicVisionAgent(VisionAgent):
640
638
  agent (Optional[LMM]): The agent to use for conversation and orchestration
641
639
  of other agents.
642
640
  verbosity (int): The verbosity level of the agent.
643
- local_artifacts_path (Optional[Union[str, Path]]): The path to the local
644
- artifacts file.
645
641
  callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
646
642
  function to send intermediate update messages.
647
643
  code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
@@ -654,7 +650,6 @@ class AnthropicVisionAgent(VisionAgent):
654
650
  super().__init__(
655
651
  agent,
656
652
  verbosity,
657
- local_artifacts_path,
658
653
  callback_message,
659
654
  code_interpreter,
660
655
  )
@@ -1,7 +1,7 @@
1
1
  VA_CODE = """
2
2
  **Role**: You are a helpful agent that assists users with writing code.
3
3
 
4
- **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
4
+ **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be saved only AFTER you execute python code with `save_imgae` or `save_video`. The user can see all `artifacts`.
5
5
 
6
6
  <execute_python>
7
7
  print("Hello World!")
@@ -26,10 +26,11 @@ Here is the current conversation so far:
26
26
 
27
27
  **Instructions**:
28
28
  1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
29
- 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
29
+ 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. DO NOT run `edit_vision_code` or `edit_code_artifact` more than 2 times in a row and instead ask the user for help.
30
30
  3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
31
- 4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
32
- 5. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
31
+ 4. **Artifacts**: Files are only saved in `artifacts` after <execute_python>, do not try to access artifacts until you observe that they are loaded.
32
+ 5. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
33
+ 6. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
33
34
 
34
35
  <thinking>Your thoughts here...</thinking>
35
36
  <response>Your response to the user here...</response>
@@ -53,7 +54,7 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect
53
54
  <let_user_respond>false</let_user_respond>
54
55
 
55
56
  OBSERVATION:
56
- [Artifact dog_detector.py]
57
+ [Artifact dog_detector.py (5 lines total)]
57
58
  0|from vision_agent.tools import load_image, owl_v2
58
59
  1|def detect_dogs(image_path: str):
59
60
  2| image = load_image(image_path)
@@ -94,7 +95,7 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do
94
95
  <let_user_respond>false</let_user_respond>
95
96
 
96
97
  OBSERVATION:
97
- [Artifact dog_detector.py]
98
+ [Artifact dog_detector.py (5 lines total)]
98
99
  0|from vision_agent.tools import load_image, owl_v2
99
100
  1|def detect_dogs(image_path: str):
100
101
  2| image = load_image(image_path)
@@ -142,7 +143,7 @@ AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will te
142
143
 
143
144
 
144
145
  EXAMPLES_CODE3 = """
145
- USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
146
+ USER: Can you create a function to count workers with helmets, return the count and a visaulization of the bounding boxes?
146
147
 
147
148
  OBSERVATION:
148
149
  [Artifacts loaded]
@@ -166,9 +167,9 @@ generate_vision_code(artifacts, 'code.py', 'Can you create a function to count w
166
167
  <let_user_respond>false</let_user_respond>
167
168
 
168
169
  OBSERVATION:
169
- [Artifact code.py]
170
+ [Artifact code.py (17 lines total)]
170
171
  0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
171
- 1|def count_workers_with_helmets(image_path: str, output_path: str):
172
+ 1|def count_workers_with_helmets(image_path: str):
172
173
  2| image = load_image(image_path)
173
174
  3| detections = florence2_phrase_grounding("worker, helmet", image)
174
175
  4| workers = [d for d in detections if d['label'] == 'worker']
@@ -182,15 +183,16 @@ OBSERVATION:
182
183
  11| break
183
184
  11| if person_has_helmet:
184
185
  13| count += 1
185
- 14| overlay_bounding_boxes(image, detections)
186
- 15| save_image(output_path, image)
187
- 16| return count
186
+ 14| viz = overlay_bounding_boxes(image, detections)
187
+ 16| return count, viz
188
188
  [End of artifact]
189
189
 
190
- AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code and print the output.</thinking>
190
+ AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code, print the counts and save the visualization.</thinking>
191
191
  <execute_python>
192
192
  from code import count_workers_with_helmets
193
- print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
193
+ count, viz = count_workers_with_helmets('/path/to/images/workers.png')
194
+ save_image(viz, 'workers_viz.png')
195
+ print(count)
194
196
  </execute_python>
195
197
  <let_user_respond>false</let_user_respond>
196
198
 
@@ -201,7 +203,9 @@ OBSERVATION:
201
203
  AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
202
204
  <response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
203
205
  <let_user_respond>true</let_user_respond>
206
+ """
204
207
 
208
+ EXAMPLES_CODE3_EXTRA1 = """
205
209
  USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
206
210
 
207
211
  AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
@@ -249,3 +253,60 @@ AGENT: <thinking>Three workers with helmets are detected, I will show this to th
249
253
  <response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
250
254
  <let_user_respond>true</let_user_respond>
251
255
  """
256
+
257
+ EXAMPLES_CODE3_EXTRA2 = """
258
+ USER: Can you write a streamlit app that allows me to upload an image and count the workers with helmets?
259
+
260
+ AGENT: <thinking>I will use edit_code_artifact to generate a streamlit app that allows the user to upload an image and count the workers with helmets.</thinking>
261
+ <execute_python>
262
+ CODE = '''
263
+ import streamlit as st
264
+ from code imoprt count_workers_with_helmets
265
+ import os
266
+
267
+ def main():
268
+ st.title("Worker with Helmet Counter")
269
+ uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
270
+ if uploaded_file is not None:
271
+ with open("temp_image.jpg", "wb") as f:
272
+ f.write(uploaded_file.read())
273
+
274
+ count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
275
+ st.image("temp_image_viz.jpg")
276
+ st.write(f"Number of workers with helmets: {count}")
277
+ os.remove("temp_image.jpg")
278
+ os.remove("temp_image_viz.jpg")
279
+
280
+ if __name__ == "__main__":
281
+ main()
282
+ '''
283
+ edit_code_artifact(artifacts, 'streamlit_app.py', CODE, 0, 0)
284
+ </execute_python>
285
+
286
+ OBSERVATION:
287
+ [Artifact streamlit_app.py (19 lines total)]
288
+ 0|import streamlit as st
289
+ 1|from code imoprt count_workers_with_helmets
290
+ 2|import os
291
+ 3|
292
+ 4|def main():
293
+ 5| st.title("Worker with Helmet Counter")
294
+ 6| uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
295
+ 7| if uploaded_file is not None:
296
+ 8| with open("temp_image.jpg", "wb") as f:
297
+ 9| f.write(uploaded_file.read())
298
+ 10|
299
+ 11| count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
300
+ 12| st.image("temp_image_viz.jpg")
301
+ 13| st.write(f"Number of workers with helmets: {count}")
302
+ 14| os.remove("temp_image.jpg")
303
+ 15| os.remove("temp_image_viz.jpg")
304
+ 16|
305
+ 17|if __name__ == "__main__":
306
+ 18| main()
307
+ [End of artifact]
308
+
309
+ AGENT: <thinking>I have generated the streamlit app that allows the user to upload an image and count the workers</thinking>
310
+ <response>The streamlit app is saved in streamlit_app.py, you can run the app by running `streamlit run streamlit_app.py`.</response>
311
+ <let_user_respond>true</let_user_respond>
312
+ """
@@ -6,15 +6,13 @@ import re
6
6
  import subprocess
7
7
  import tempfile
8
8
  from pathlib import Path
9
- from typing import Any, Dict, List, Optional, Union
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
10
 
11
11
  import libcst as cst
12
12
  from IPython.display import display
13
13
 
14
14
  import vision_agent as va
15
- from vision_agent.agent.agent_utils import extract_json
16
15
  from vision_agent.clients.landing_public_api import LandingPublicAPI
17
- from vision_agent.lmm import AnthropicLMM
18
16
  from vision_agent.lmm.types import Message
19
17
  from vision_agent.tools.tool_utils import get_tool_documentation
20
18
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
@@ -26,7 +24,6 @@ CURRENT_FILE = None
26
24
  CURRENT_LINE = 0
27
25
  DEFAULT_WINDOW_SIZE = 100
28
26
  ZMQ_PORT = os.environ.get("ZMQ_PORT", None)
29
- VERBOSITY = os.environ.get("VERBOSITY", 0)
30
27
 
31
28
 
32
29
  def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
@@ -38,16 +35,6 @@ def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
38
35
  socket.send_json(inp)
39
36
 
40
37
 
41
- def filter_file(file_name: Union[str, Path]) -> bool:
42
- file_name_p = Path(file_name)
43
- return (
44
- file_name_p.is_file()
45
- and "__pycache__" not in str(file_name_p)
46
- and file_name_p.suffix in [".py", ".txt"]
47
- and not file_name_p.name.startswith(".")
48
- )
49
-
50
-
51
38
  def redisplay_results(execution: Execution) -> None:
52
39
  """This function is used to add previous execution results to the current output.
53
40
  This is handy if you are inside a notebook environment, call it notebook1, and you
@@ -86,8 +73,11 @@ class Artifacts:
86
73
  need to be in sync with the remote environment the VisionAgent is running in.
87
74
  """
88
75
 
89
- def __init__(self, remote_save_path: Union[str, Path]) -> None:
76
+ def __init__(
77
+ self, remote_save_path: Union[str, Path], local_save_path: Union[str, Path]
78
+ ) -> None:
90
79
  self.remote_save_path = Path(remote_save_path)
80
+ self.local_save_path = Path(local_save_path)
91
81
  self.artifacts: Dict[str, Any] = {}
92
82
 
93
83
  self.code_sandbox_runtime = None
@@ -131,9 +121,7 @@ class Artifacts:
131
121
  return output_str
132
122
 
133
123
  def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
134
- save_path = (
135
- Path(local_path) if local_path is not None else self.remote_save_path
136
- )
124
+ save_path = Path(local_path) if local_path is not None else self.local_save_path
137
125
  with open(save_path, "wb") as f:
138
126
  pkl.dump(self.artifacts, f)
139
127
 
@@ -150,6 +138,38 @@ class Artifacts:
150
138
  return name in self.artifacts
151
139
 
152
140
 
141
+ def filter_file(file_name: Union[str, Path]) -> Tuple[bool, bool]:
142
+ file_name_p = Path(file_name)
143
+ return (
144
+ file_name_p.is_file()
145
+ and "__pycache__" not in str(file_name_p)
146
+ and not file_name_p.name.startswith(".")
147
+ and file_name_p.suffix
148
+ in [".png", ".jpeg", ".jpg", ".mp4", ".txt", ".json", ".csv"]
149
+ ), file_name_p.suffix in [".png", ".jpeg", ".jpg", ".mp4"]
150
+
151
+
152
+ def capture_files_into_artifacts(artifacts: Artifacts) -> None:
153
+ """This function is used to capture all files in the current directory into an
154
+ artifact object. This is useful if you want to capture all files in the current
155
+ directory and use them in a different environment where you don't have access to
156
+ the file system.
157
+
158
+ Parameters:
159
+ artifact (Artifacts): The artifact object to save the files to.
160
+ """
161
+ for file in Path(".").glob("**/*"):
162
+ usable_file, is_media = filter_file(file)
163
+ mode = "rb" if is_media else "r"
164
+ if usable_file:
165
+ file_name = file.name
166
+ if file_name.startswith(str(Path(artifacts.remote_save_path).parents)):
167
+ idx = len(Path(artifacts.remote_save_path).parents)
168
+ file_name = file_name[idx:]
169
+ with open(file, mode) as f:
170
+ artifacts[file_name] = f.read()
171
+
172
+
153
173
  # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
154
174
 
155
175
 
@@ -174,9 +194,9 @@ def view_lines(
174
194
  f"[Artifact: {name} ({total_lines} lines total)]\n"
175
195
  + format_lines(lines[start:end], start)
176
196
  + (
177
- "[End of artifact]"
197
+ "\n[End of artifact]"
178
198
  if end == len(lines)
179
- else f"[{len(lines) - end} more lines]"
199
+ else f"\n[{len(lines) - end} more lines]"
180
200
  )
181
201
  )
182
202
 
@@ -256,8 +276,10 @@ def edit_code_artifact(
256
276
  Parameters:
257
277
  artifacts (Artifacts): The artifacts object to edit the artifact from.
258
278
  name (str): The name of the artifact to edit.
259
- start (int): The line number to start the edit.
260
- end (int): The line number to end the edit.
279
+ start (int): The line number to start the edit, can be in [-1, total_lines]
280
+ where -1 represents the end of the file.
281
+ end (int): The line number to end the edit, can be in [-1, total_lines] where
282
+ -1 represents the end of the file.
261
283
  content (str): The content to insert.
262
284
  """
263
285
  # just make the artifact if it doesn't exist instead of forcing agent to call
@@ -266,17 +288,21 @@ def edit_code_artifact(
266
288
  artifacts[name] = ""
267
289
 
268
290
  total_lines = len(artifacts[name].splitlines())
291
+ if start == -1:
292
+ start = total_lines
293
+ if end == -1:
294
+ end = total_lines
295
+
269
296
  if start < 0 or end < 0 or start > end or end > total_lines:
270
297
  print("[Invalid line range]")
271
298
  return "[Invalid line range]"
272
- if start == end:
273
- end += 1
274
299
 
275
300
  new_content_lines = content.splitlines(keepends=True)
276
301
  new_content_lines = [
277
302
  line if line.endswith("\n") else line + "\n" for line in new_content_lines
278
303
  ]
279
304
  lines = artifacts[name].splitlines(keepends=True)
305
+ lines = [line if line.endswith("\n") else line + "\n" for line in lines]
280
306
  edited_lines = lines[:start] + new_content_lines + lines[end:]
281
307
 
282
308
  cur_line = start + len(content.split("\n")) // 2
@@ -371,14 +397,16 @@ def generate_vision_plan(
371
397
  [End Plan Context]
372
398
  """
373
399
 
400
+ # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
374
401
  if ZMQ_PORT is not None:
375
402
  agent = va.agent.VisionAgentPlanner(
376
403
  report_progress_callback=lambda inp: report_progress_callback(
377
404
  int(ZMQ_PORT), inp
378
- )
405
+ ),
406
+ verbosity=0,
379
407
  )
380
408
  else:
381
- agent = va.agent.VisionAgentPlanner()
409
+ agent = va.agent.VisionAgentPlanner(verbosity=0)
382
410
 
383
411
  fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
384
412
  response = agent.generate_plan(
@@ -435,14 +463,16 @@ def generate_vision_code(
435
463
  dogs = owl_v2("dog", image)
436
464
  return dogs
437
465
  """
466
+ # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
438
467
  if ZMQ_PORT is not None:
439
468
  agent = va.agent.VisionAgentCoder(
440
469
  report_progress_callback=lambda inp: report_progress_callback(
441
470
  int(ZMQ_PORT), inp
442
- )
471
+ ),
472
+ verbosity=0,
443
473
  )
444
474
  else:
445
- agent = va.agent.VisionAgentCoder(verbosity=int(VERBOSITY))
475
+ agent = va.agent.VisionAgentCoder(verbosity=0)
446
476
 
447
477
  fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
448
478
  response = agent.generate_code(
@@ -506,7 +536,8 @@ def edit_vision_code(
506
536
  return dogs
507
537
  """
508
538
 
509
- agent = va.agent.VisionAgentCoder(verbosity=int(VERBOSITY))
539
+ # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
540
+ agent = va.agent.VisionAgentCoder(verbosity=0)
510
541
  if name not in artifacts:
511
542
  print(f"[Artifact {name} does not exist]")
512
543
  return f"[Artifact {name} does not exist]"
@@ -570,8 +601,9 @@ def check_and_load_image(code: str) -> List[str]:
570
601
 
571
602
 
572
603
  def view_media_artifact(artifacts: Artifacts, name: str) -> str:
573
- """Allows you to view the media artifact with the given name. This does not show
574
- the media to the user, the user can already see all media saved in the artifacts.
604
+ """Allows only the agent to view the media artifact with the given name. DO NOT use
605
+ this to show media to the user, the user can already see all media saved in the
606
+ artifacts.
575
607
 
576
608
  Parameters:
577
609
  artifacts (Artifacts): The artifacts object to show the image from.
@@ -648,10 +680,10 @@ def get_diff_with_prompts(name: str, before: str, after: str) -> str:
648
680
 
649
681
 
650
682
  def use_extra_vision_agent_args(
651
- code: str,
683
+ code: Optional[str],
652
684
  test_multi_plan: bool = True,
653
685
  custom_tool_names: Optional[List[str]] = None,
654
- ) -> str:
686
+ ) -> Optional[str]:
655
687
  """This is for forcing arguments passed by the user to VisionAgent into the
656
688
  VisionAgentCoder call.
657
689
 
@@ -663,6 +695,8 @@ def use_extra_vision_agent_args(
663
695
  Returns:
664
696
  str: The edited code.
665
697
  """
698
+ if code is None:
699
+ return None
666
700
 
667
701
  class VisionAgentTransformer(cst.CSTTransformer):
668
702
  def __init__(
@@ -815,74 +849,12 @@ def use_object_detection_fine_tuning(
815
849
  return diff
816
850
 
817
851
 
818
- def extract_and_save_files_to_artifacts(
819
- artifacts: Artifacts, code: str, obs: str
820
- ) -> None:
821
- """Extracts and saves files used in the code to the artifacts object.
822
-
823
- Parameters:
824
- artifacts (Artifacts): The artifacts object to save the files to.
825
- code (str): The code to extract the files from.
826
- """
827
- try:
828
- response = extract_json(
829
- AnthropicLMM()( # type: ignore
830
- f"""You are a helpful AI assistant. Your job is to look at a snippet of code and the output of running that code and return the file paths that are being saved in the file. Below is the code snippet:
831
-
832
- ```python
833
- {code}
834
- ```
835
-
836
- ```output
837
- {obs}
838
- ```
839
-
840
- Return the file paths in the following JSON format:
841
- {{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
842
- )
843
- )
844
- except json.JSONDecodeError:
845
- return
846
-
847
- text_file_ext = [
848
- ".txt",
849
- ".md",
850
- "rtf",
851
- ".html",
852
- ".htm",
853
- "xml",
854
- ".json",
855
- ".csv",
856
- ".tsv",
857
- ".yaml",
858
- ".yml",
859
- ".toml",
860
- ".conf",
861
- ".env" ".ini",
862
- ".log",
863
- ".py",
864
- ".java",
865
- ".js",
866
- ".cpp",
867
- ".c" ".sql",
868
- ".sh",
869
- ]
870
-
871
- if "file_paths" in response and isinstance(response["file_paths"], list):
872
- for file_path in response["file_paths"]:
873
- read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb"
874
- if Path(file_path).is_file():
875
- with open(file_path, read_mode) as f:
876
- artifacts[Path(file_path).name] = f.read()
877
-
878
-
879
852
  META_TOOL_DOCSTRING = get_tool_documentation(
880
853
  [
881
854
  get_tool_descriptions,
882
855
  open_code_artifact,
883
856
  create_code_artifact,
884
857
  edit_code_artifact,
885
- generate_vision_plan,
886
858
  generate_vision_code,
887
859
  edit_vision_code,
888
860
  view_media_artifact,
@@ -575,6 +575,7 @@ class LocalCodeInterpreter(CodeInterpreter):
575
575
  super().__init__(timeout=timeout)
576
576
  self.nb = nbformat.v4.new_notebook()
577
577
  # Set the notebook execution path to the remote path
578
+ self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
578
579
  self.resources = {"metadata": {"path": str(self.remote_path)}}
579
580
  self.nb_client = NotebookClient(
580
581
  self.nb,
@@ -591,7 +592,6 @@ Timeout: {self.timeout}"""
591
592
  )
592
593
  sleep(1)
593
594
  self._new_kernel()
594
- self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
595
595
 
596
596
  def _new_kernel(self) -> None:
597
597
  if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)(): # type: ignore
@@ -659,7 +659,7 @@ Timeout: {self.timeout}"""
659
659
  def download_file(
660
660
  self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
661
661
  ) -> Path:
662
- with open(self.remote_path / remote_file_path, "rb") as f:
662
+ with open(self.remote_path / Path(remote_file_path).name, "rb") as f:
663
663
  contents = f.read()
664
664
  with open(local_file_path, "wb") as f:
665
665
  f.write(contents)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.174
3
+ Version: 0.2.175
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -1,13 +1,13 @@
1
1
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
2
2
  vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
3
3
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
4
- vision_agent/agent/agent_utils.py,sha256=eSgg8CwWylX_erLTqTg2pVhEEgVkMLRrQfYRyJzI3so,5443
5
- vision_agent/agent/vision_agent.py,sha256=lEda43d-Ri68FIjmn-MPIgLs8_jMpyDVXslusQedhWA,26222
4
+ vision_agent/agent/agent_utils.py,sha256=WYJF11PfKXlRMPnogGz3s7c2TlWoxoGzuLiIptVYE1s,5524
5
+ vision_agent/agent/vision_agent.py,sha256=YfSYp9UeML-f67mn1TvjwyXDB_CxaaG_4mzNC1py5xU,25882
6
6
  vision_agent/agent/vision_agent_coder.py,sha256=3Q1VWrN-BNUoSD4OAqKazvXkP2c04PXDYu2Z1f5dQb0,31960
7
7
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
8
8
  vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
9
9
  vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
10
- vision_agent/agent/vision_agent_prompts.py,sha256=_xAITNDKcS45tqhEax5i6vDQa4V39f9n55iRGk2R6RM,11218
10
+ vision_agent/agent/vision_agent_prompts.py,sha256=4329ll0kqCznRALIMl-rlKWGjN92p3bcRrz8R-cO744,13748
11
11
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
13
13
  vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -17,19 +17,19 @@ vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,
17
17
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
18
18
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
19
19
  vision_agent/tools/__init__.py,sha256=u-vS5iORB4ccvxoAjbtpvhTALDhXGilcATIq1_eZhKo,2332
20
- vision_agent/tools/meta_tools.py,sha256=SVevrA5yxtSIXR8352QMMJv-jW8MIuA68Nj93DjW-60,30640
20
+ vision_agent/tools/meta_tools.py,sha256=v6e4pnWDIO46ZTWuk-1FkMszfmz2pj-N5wRP8_0WelM,30648
21
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
22
22
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
23
23
  vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
24
24
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
25
25
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
26
26
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
27
- vision_agent/utils/execute.py,sha256=FqSOr5gtBeKB1g2hbV6-bhox6qItDQNn2o9efq1w6f4,28017
27
+ vision_agent/utils/execute.py,sha256=2sIQn45llOENMyrKu3TPINVRLLbOvvZ6SVHFCB9MQUo,28028
28
28
  vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
29
29
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
30
30
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
31
31
  vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
32
- vision_agent-0.2.174.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
- vision_agent-0.2.174.dist-info/METADATA,sha256=V3d-gcpI2IZ4QWygiErgGAeNza-ROlsU-s1eH5Fr9UM,18339
34
- vision_agent-0.2.174.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
- vision_agent-0.2.174.dist-info/RECORD,,
32
+ vision_agent-0.2.175.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
+ vision_agent-0.2.175.dist-info/METADATA,sha256=4qXZ_bRfFYb5fgTP4XcRG4bH9IcVhqE2akIQObcpMSo,18339
34
+ vision_agent-0.2.175.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
+ vision_agent-0.2.175.dist-info/RECORD,,