vision-agent 0.2.174__py3-none-any.whl → 0.2.176__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -72,7 +72,9 @@ def extract_json(json_str: str) -> Dict[str, Any]:
72
72
  if json_dict is None:
73
73
  error_msg = f"Could not extract JSON from the given str: {json_orig}"
74
74
  _LOGGER.exception(error_msg)
75
- raise ValueError(error_msg)
75
+ raise json.JSONDecodeError(
76
+ msg="Could not extract JSON", doc=json_orig, pos=0
77
+ )
76
78
 
77
79
  return json_dict
78
80
 
@@ -2,7 +2,6 @@ import copy
2
2
  import json
3
3
  import logging
4
4
  import os
5
- import tempfile
6
5
  from pathlib import Path
7
6
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
8
7
 
@@ -12,6 +11,7 @@ from vision_agent.agent.vision_agent_prompts import (
12
11
  EXAMPLES_CODE1,
13
12
  EXAMPLES_CODE2,
14
13
  EXAMPLES_CODE3,
14
+ EXAMPLES_CODE3_EXTRA2,
15
15
  VA_CODE,
16
16
  )
17
17
  from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
@@ -19,7 +19,6 @@ from vision_agent.tools.meta_tools import (
19
19
  META_TOOL_DOCSTRING,
20
20
  Artifacts,
21
21
  check_and_load_image,
22
- extract_and_save_files_to_artifacts,
23
22
  use_extra_vision_agent_args,
24
23
  )
25
24
  from vision_agent.utils import CodeInterpreterFactory
@@ -37,11 +36,12 @@ class BoilerplateCode:
37
36
  pre_code = [
38
37
  "from typing import *",
39
38
  "from vision_agent.utils.execute import CodeInterpreter",
40
- "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
41
- "artifacts = Artifacts('{remote_path}')",
39
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, list_artifacts, capture_files_into_artifacts",
40
+ "artifacts = Artifacts('{remote_path}', '{remote_path}')",
42
41
  "artifacts.load('{remote_path}')",
43
42
  ]
44
43
  post_code = [
44
+ "capture_files_into_artifacts(artifacts)",
45
45
  "artifacts.save()",
46
46
  ]
47
47
 
@@ -97,8 +97,9 @@ def _clean_response(response: str) -> str:
97
97
  def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
98
98
  chat = copy.deepcopy(chat)
99
99
 
100
+ # only add 10 most recent messages in the chat to not go over token limit
100
101
  conversation = ""
101
- for chat_i in chat:
102
+ for chat_i in chat[-10:]:
102
103
  if chat_i["role"] == "user":
103
104
  conversation += f"USER: {chat_i['content']}\n\n"
104
105
  elif chat_i["role"] == "observation":
@@ -110,7 +111,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
110
111
 
111
112
  prompt = VA_CODE.format(
112
113
  documentation=META_TOOL_DOCSTRING,
113
- examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
114
+ examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}\n{EXAMPLES_CODE3_EXTRA2}",
114
115
  conversation=conversation,
115
116
  )
116
117
  message: Message = {"role": "user", "content": prompt}
@@ -120,7 +121,9 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
120
121
  and "media" in chat[-1]
121
122
  and len(chat[-1]["media"]) > 0 # type: ignore
122
123
  ):
123
- message["media"] = chat[-1]["media"]
124
+ media_obs = [media for media in chat[-1]["media"] if Path(media).exists()] # type: ignore
125
+ if len(media_obs) > 0:
126
+ message["media"] = media_obs # type: ignore
124
127
  conv_resp = cast(str, orch([message], stream=False))
125
128
 
126
129
  # clean the response first, if we are executing code, do not respond or end
@@ -144,16 +147,16 @@ def execute_code_action(
144
147
  artifacts: Artifacts,
145
148
  code: str,
146
149
  code_interpreter: CodeInterpreter,
147
- artifact_remote_path: str,
148
150
  ) -> Tuple[Execution, str]:
149
151
  result = code_interpreter.exec_isolation(
150
- BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
152
+ BoilerplateCode.add_boilerplate(
153
+ code, remote_path=str(artifacts.remote_save_path)
154
+ )
151
155
  )
152
156
 
153
157
  obs = str(result.logs)
154
158
  if result.error:
155
159
  obs += f"\n{result.error}"
156
- extract_and_save_files_to_artifacts(artifacts, code, obs)
157
160
  return result, obs
158
161
 
159
162
 
@@ -161,7 +164,6 @@ def execute_user_code_action(
161
164
  artifacts: Artifacts,
162
165
  last_user_message: Message,
163
166
  code_interpreter: CodeInterpreter,
164
- artifact_remote_path: str,
165
167
  ) -> Tuple[Optional[Execution], Optional[str]]:
166
168
  user_result = None
167
169
  user_obs = None
@@ -178,11 +180,10 @@ def execute_user_code_action(
178
180
  if user_code_action is not None:
179
181
  user_code_action = use_extra_vision_agent_args(user_code_action, False)
180
182
  user_result, user_obs = execute_code_action(
181
- artifacts, user_code_action, code_interpreter, artifact_remote_path
183
+ artifacts, user_code_action, code_interpreter
182
184
  )
183
185
  if user_result.error:
184
186
  user_obs += f"\n{user_result.error}"
185
- extract_and_save_files_to_artifacts(artifacts, user_code_action, user_obs)
186
187
  return user_result, user_obs
187
188
 
188
189
 
@@ -211,6 +212,19 @@ def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
211
212
  return response
212
213
 
213
214
 
215
def setup_artifacts() -> Artifacts:
    """Create an Artifacts object whose remote path matches the code sandbox.

    The sandbox is selected by the CODE_SANDBOX_RUNTIME environment variable:
    when it is unset or "local" the remote artifacts file lives under
    WORKSPACE, when it is "e2b" it lives under /home/user. The local artifacts
    file is always "artifacts.pkl" in the current working directory.

    Returns:
        Artifacts: An artifacts object built from the remote and local paths.

    Raises:
        ValueError: If CODE_SANDBOX_RUNTIME holds any other value.
    """
    runtime = os.environ.get("CODE_SANDBOX_RUNTIME", None)

    # this is setting remote artifacts path
    if runtime == "e2b":
        remote_pkl = Path("/home/user/artifacts.pkl")
    elif runtime is None or runtime == "local":
        remote_pkl = WORKSPACE / "artifacts.pkl"
    else:
        raise ValueError(f"Unknown code sandbox runtime {runtime}")

    local_pkl = Path(os.getcwd()) / "artifacts.pkl"
    return Artifacts(remote_pkl, local_pkl)
226
+
227
+
214
228
  def new_format_to_old_format(new_format: Dict[str, Any]) -> Dict[str, Any]:
215
229
  thoughts = new_format["thinking"] if new_format["thinking"] is not None else ""
216
230
  response = new_format["response"] if new_format["response"] is not None else ""
@@ -231,9 +245,18 @@ def old_format_to_new_format(old_format_str: str) -> str:
231
245
  except json.JSONDecodeError:
232
246
  return old_format_str
233
247
 
234
- thinking = old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
235
- let_user_respond = old_format["let_user_respond"]
236
- if "<execute_python>" in old_format["response"]:
248
+ if "thoughts" in old_format:
249
+ thinking = (
250
+ old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
251
+ )
252
+ else:
253
+ thinking = None
254
+
255
+ let_user_respond = (
256
+ old_format["let_user_respond"] if "let_user_respond" in old_format else True
257
+ )
258
+
259
+ if "response" in old_format and "<execute_python>" in old_format["response"]:
237
260
  execute_python = extract_tag(old_format["response"], "execute_python")
238
261
  response = (
239
262
  old_format["response"]
@@ -244,7 +267,7 @@ def old_format_to_new_format(old_format_str: str) -> str:
244
267
  )
245
268
  else:
246
269
  execute_python = None
247
- response = old_format["response"]
270
+ response = old_format["response"] if "response" in old_format else None
248
271
 
249
272
  return json.dumps(
250
273
  {
@@ -275,7 +298,6 @@ class VisionAgent(Agent):
275
298
  self,
276
299
  agent: Optional[LMM] = None,
277
300
  verbosity: int = 0,
278
- local_artifacts_path: Optional[Union[str, Path]] = None,
279
301
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
280
302
  code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
281
303
  ) -> None:
@@ -285,8 +307,6 @@ class VisionAgent(Agent):
285
307
  agent (Optional[LMM]): The agent to use for conversation and orchestration
286
308
  of other agents.
287
309
  verbosity (int): The verbosity level of the agent.
288
- local_artifacts_path (Optional[Union[str, Path]]): The path to the local
289
- artifacts file.
290
310
  callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
291
311
  function to send intermediate update messages.
292
312
  code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
@@ -302,14 +322,6 @@ class VisionAgent(Agent):
302
322
  self.callback_message = callback_message
303
323
  if self.verbosity >= 1:
304
324
  _LOGGER.setLevel(logging.INFO)
305
- self.local_artifacts_path = cast(
306
- str,
307
- (
308
- Path(local_artifacts_path)
309
- if local_artifacts_path is not None
310
- else Path(tempfile.NamedTemporaryFile(delete=False).name)
311
- ),
312
- )
313
325
 
314
326
  def __call__(
315
327
  self,
@@ -385,8 +397,7 @@ class VisionAgent(Agent):
385
397
  raise ValueError("chat cannot be empty")
386
398
 
387
399
  if not artifacts:
388
- # this is setting remote artifacts path
389
- artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
400
+ artifacts = setup_artifacts()
390
401
 
391
402
  # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
392
403
  code_interpreter = (
@@ -395,8 +406,15 @@ class VisionAgent(Agent):
395
406
  and not isinstance(self.code_interpreter, str)
396
407
  else CodeInterpreterFactory.new_instance(
397
408
  code_sandbox_runtime=self.code_interpreter,
409
+ remote_path=artifacts.remote_save_path.parent,
398
410
  )
399
411
  )
412
+
413
+ if code_interpreter.remote_path != artifacts.remote_save_path.parent:
414
+ raise ValueError(
415
+ f"Code interpreter remote path {code_interpreter.remote_path} does not match artifacts remote path {artifacts.remote_save_path.parent}"
416
+ )
417
+
400
418
  with code_interpreter:
401
419
  orig_chat = copy.deepcopy(chat)
402
420
  int_chat = copy.deepcopy(chat)
@@ -409,7 +427,7 @@ class VisionAgent(Agent):
409
427
  artifacts.artifacts[Path(media).name] = open(media, "rb").read()
410
428
 
411
429
  media_remote_path = (
412
- Path(code_interpreter.remote_path) / Path(media).name
430
+ Path(artifacts.remote_save_path.parent) / Path(media).name
413
431
  )
414
432
  chat_i["content"] += f" Media name {media_remote_path}" # type: ignore
415
433
  media_list.append(media_remote_path)
@@ -436,15 +454,13 @@ class VisionAgent(Agent):
436
454
 
437
455
  # Save the current state of artifacts, will include any images the user
438
456
  # passed in.
439
- artifacts.save(self.local_artifacts_path)
457
+ artifacts.save()
440
458
 
441
459
  # Upload artifacts to remote location and show where they are going
442
460
  # to be loaded to. The actual loading happens in BoilerplateCode as
443
461
  # part of the pre_code.
444
- remote_artifacts_path = code_interpreter.upload_file(
445
- self.local_artifacts_path
446
- )
447
- artifacts_loaded = artifacts.show(code_interpreter.remote_path)
462
+ code_interpreter.upload_file(artifacts.local_save_path)
463
+ artifacts_loaded = artifacts.show(artifacts.remote_save_path.parent)
448
464
  int_chat.append({"role": "observation", "content": artifacts_loaded})
449
465
  orig_chat.append({"role": "observation", "content": artifacts_loaded})
450
466
  self.streaming_message({"role": "observation", "content": artifacts_loaded})
@@ -453,7 +469,6 @@ class VisionAgent(Agent):
453
469
  artifacts,
454
470
  last_user_message,
455
471
  code_interpreter,
456
- str(remote_artifacts_path),
457
472
  )
458
473
  finished = user_result is not None and user_obs is not None
459
474
  if user_result is not None and user_obs is not None:
@@ -472,7 +487,16 @@ class VisionAgent(Agent):
472
487
  )
473
488
 
474
489
  while not finished and iterations < self.max_iterations:
490
+ # ensure we upload the artifacts before each turn, so any local
491
+ # modifications we made to it will be reflected in the remote
492
+ code_interpreter.upload_file(artifacts.local_save_path)
493
+
475
494
  response = run_conversation(self.agent, int_chat)
495
+ code_action = use_extra_vision_agent_args(
496
+ response.get("execute_python", None),
497
+ test_multi_plan,
498
+ custom_tool_names,
499
+ )
476
500
  if self.verbosity >= 1:
477
501
  _LOGGER.info(response)
478
502
  int_chat.append(
@@ -532,31 +556,20 @@ class VisionAgent(Agent):
532
556
  artifacts,
533
557
  code_action,
534
558
  code_interpreter,
535
- str(remote_artifacts_path),
536
559
  )
537
-
538
- media_obs = check_and_load_image(code_action)
539
-
540
- if self.verbosity >= 1:
541
- _LOGGER.info(obs)
542
-
543
560
  obs_chat_elt: Message = {"role": "observation", "content": obs}
561
+ media_obs = check_and_load_image(code_action)
544
562
  if media_obs and result.success:
545
- # for view_media_artifact, we need to ensure the media is loaded
546
- # locally so the conversation agent can actually see it
547
- code_interpreter.download_file(
548
- str(remote_artifacts_path.name),
549
- str(self.local_artifacts_path),
550
- )
551
- artifacts.load(
552
- self.local_artifacts_path,
553
- Path(self.local_artifacts_path).parent,
554
- )
563
+ # media paths will be under the local_save_path when we download
564
+ # them after each turn
555
565
  obs_chat_elt["media"] = [
556
- Path(self.local_artifacts_path).parent / media_ob
566
+ artifacts.local_save_path.parent / media_ob
557
567
  for media_ob in media_obs
558
568
  ]
559
569
 
570
+ if self.verbosity >= 1:
571
+ _LOGGER.info(obs)
572
+
560
573
  # don't add execution results to internal chat
561
574
  int_chat.append(obs_chat_elt)
562
575
  obs_chat_elt["execution"] = result
@@ -573,13 +586,15 @@ class VisionAgent(Agent):
573
586
  iterations += 1
574
587
  last_response = response
575
588
 
576
- # after running the agent, download the artifacts locally
577
- code_interpreter.download_file(
578
- str(remote_artifacts_path.name), str(self.local_artifacts_path)
579
- )
580
- artifacts.load(
581
- self.local_artifacts_path, Path(self.local_artifacts_path).parent
582
- )
589
+ # after each turn, download the artifacts locally
590
+ code_interpreter.download_file(
591
+ str(artifacts.remote_save_path.name),
592
+ str(artifacts.local_save_path),
593
+ )
594
+ artifacts.load(
595
+ artifacts.local_save_path, artifacts.local_save_path.parent
596
+ )
597
+
583
598
  return orig_chat, artifacts
584
599
 
585
600
  def streaming_message(self, message: Dict[str, Any]) -> None:
@@ -595,7 +610,6 @@ class OpenAIVisionAgent(VisionAgent):
595
610
  self,
596
611
  agent: Optional[LMM] = None,
597
612
  verbosity: int = 0,
598
- local_artifacts_path: Optional[Union[str, Path]] = None,
599
613
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
600
614
  code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
601
615
  ) -> None:
@@ -605,8 +619,6 @@ class OpenAIVisionAgent(VisionAgent):
605
619
  agent (Optional[LMM]): The agent to use for conversation and orchestration
606
620
  of other agents.
607
621
  verbosity (int): The verbosity level of the agent.
608
- local_artifacts_path (Optional[Union[str, Path]]): The path to the local
609
- artifacts file.
610
622
  callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
611
623
  function to send intermediate update messages.
612
624
  code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
@@ -619,7 +631,6 @@ class OpenAIVisionAgent(VisionAgent):
619
631
  super().__init__(
620
632
  agent,
621
633
  verbosity,
622
- local_artifacts_path,
623
634
  callback_message,
624
635
  code_interpreter,
625
636
  )
@@ -630,7 +641,6 @@ class AnthropicVisionAgent(VisionAgent):
630
641
  self,
631
642
  agent: Optional[LMM] = None,
632
643
  verbosity: int = 0,
633
- local_artifacts_path: Optional[Union[str, Path]] = None,
634
644
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
635
645
  code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
636
646
  ) -> None:
@@ -640,8 +650,6 @@ class AnthropicVisionAgent(VisionAgent):
640
650
  agent (Optional[LMM]): The agent to use for conversation and orchestration
641
651
  of other agents.
642
652
  verbosity (int): The verbosity level of the agent.
643
- local_artifacts_path (Optional[Union[str, Path]]): The path to the local
644
- artifacts file.
645
653
  callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
646
654
  function to send intermediate update messages.
647
655
  code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
@@ -654,7 +662,6 @@ class AnthropicVisionAgent(VisionAgent):
654
662
  super().__init__(
655
663
  agent,
656
664
  verbosity,
657
- local_artifacts_path,
658
665
  callback_message,
659
666
  code_interpreter,
660
667
  )
@@ -1,7 +1,7 @@
1
1
  VA_CODE = """
2
2
  **Role**: You are a helpful agent that assists users with writing code.
3
3
 
4
- **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
4
+ **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be saved only AFTER you execute python code with `save_imgae` or `save_video`. The user can see all `artifacts`.
5
5
 
6
6
  <execute_python>
7
7
  print("Hello World!")
@@ -26,10 +26,11 @@ Here is the current conversation so far:
26
26
 
27
27
  **Instructions**:
28
28
  1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
29
- 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
29
+ 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. DO NOT run `edit_vision_code` or `edit_code_artifact` more than 2 times in a row and instead ask the user for help.
30
30
  3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
31
- 4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
32
- 5. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
31
+ 4. **Artifacts**: Files are only saved in `artifacts` after <execute_python>, do not try to access artifacts until you observe that they are loaded.
32
+ 5. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
33
+ 6. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
33
34
 
34
35
  <thinking>Your thoughts here...</thinking>
35
36
  <response>Your response to the user here...</response>
@@ -53,7 +54,7 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect
53
54
  <let_user_respond>false</let_user_respond>
54
55
 
55
56
  OBSERVATION:
56
- [Artifact dog_detector.py]
57
+ [Artifact dog_detector.py (5 lines total)]
57
58
  0|from vision_agent.tools import load_image, owl_v2
58
59
  1|def detect_dogs(image_path: str):
59
60
  2| image = load_image(image_path)
@@ -94,7 +95,7 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do
94
95
  <let_user_respond>false</let_user_respond>
95
96
 
96
97
  OBSERVATION:
97
- [Artifact dog_detector.py]
98
+ [Artifact dog_detector.py (5 lines total)]
98
99
  0|from vision_agent.tools import load_image, owl_v2
99
100
  1|def detect_dogs(image_path: str):
100
101
  2| image = load_image(image_path)
@@ -142,7 +143,7 @@ AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will te
142
143
 
143
144
 
144
145
  EXAMPLES_CODE3 = """
145
- USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
146
+ USER: Can you create a function to count workers with helmets, return the count and a visaulization of the bounding boxes?
146
147
 
147
148
  OBSERVATION:
148
149
  [Artifacts loaded]
@@ -166,9 +167,9 @@ generate_vision_code(artifacts, 'code.py', 'Can you create a function to count w
166
167
  <let_user_respond>false</let_user_respond>
167
168
 
168
169
  OBSERVATION:
169
- [Artifact code.py]
170
+ [Artifact code.py (17 lines total)]
170
171
  0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
171
- 1|def count_workers_with_helmets(image_path: str, output_path: str):
172
+ 1|def count_workers_with_helmets(image_path: str):
172
173
  2| image = load_image(image_path)
173
174
  3| detections = florence2_phrase_grounding("worker, helmet", image)
174
175
  4| workers = [d for d in detections if d['label'] == 'worker']
@@ -182,15 +183,16 @@ OBSERVATION:
182
183
  11| break
183
184
  11| if person_has_helmet:
184
185
  13| count += 1
185
- 14| overlay_bounding_boxes(image, detections)
186
- 15| save_image(output_path, image)
187
- 16| return count
186
+ 14| viz = overlay_bounding_boxes(image, detections)
187
+ 16| return count, viz
188
188
  [End of artifact]
189
189
 
190
- AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code and print the output.</thinking>
190
+ AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code, print the counts and save the visualization.</thinking>
191
191
  <execute_python>
192
192
  from code import count_workers_with_helmets
193
- print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
193
+ count, viz = count_workers_with_helmets('/path/to/images/workers.png')
194
+ save_image(viz, 'workers_viz.png')
195
+ print(count)
194
196
  </execute_python>
195
197
  <let_user_respond>false</let_user_respond>
196
198
 
@@ -201,7 +203,9 @@ OBSERVATION:
201
203
  AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
202
204
  <response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
203
205
  <let_user_respond>true</let_user_respond>
206
+ """
204
207
 
208
+ EXAMPLES_CODE3_EXTRA1 = """
205
209
  USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
206
210
 
207
211
  AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
@@ -249,3 +253,60 @@ AGENT: <thinking>Three workers with helmets are detected, I will show this to th
249
253
  <response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
250
254
  <let_user_respond>true</let_user_respond>
251
255
  """
256
+
257
# Few-shot example showing the agent writing a streamlit app with
# edit_code_artifact. The code inside the example must be valid Python, since
# the model imitates it verbatim.
# NOTE(review): the example still calls count_workers_with_helmets with two
# arguments and treats the result as a single count, while the EXAMPLES_CODE3
# listing in this file defines it with one argument returning (count, viz) --
# confirm which signature the example should follow.
EXAMPLES_CODE3_EXTRA2 = """
USER: Can you write a streamlit app that allows me to upload an image and count the workers with helmets?

AGENT: <thinking>I will use edit_code_artifact to generate a streamlit app that allows the user to upload an image and count the workers with helmets.</thinking>
<execute_python>
CODE = '''
import streamlit as st
from code import count_workers_with_helmets
import os

def main():
    st.title("Worker with Helmet Counter")
    uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
    if uploaded_file is not None:
        with open("temp_image.jpg", "wb") as f:
            f.write(uploaded_file.read())

        count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
        st.image("temp_image_viz.jpg")
        st.write(f"Number of workers with helmets: {count}")
        os.remove("temp_image.jpg")
        os.remove("temp_image_viz.jpg")

if __name__ == "__main__":
    main()
'''
edit_code_artifact(artifacts, 'streamlit_app.py', CODE, 0, 0)
</execute_python>

OBSERVATION:
[Artifact streamlit_app.py (19 lines total)]
0|import streamlit as st
1|from code import count_workers_with_helmets
2|import os
3|
4|def main():
5|    st.title("Worker with Helmet Counter")
6|    uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
7|    if uploaded_file is not None:
8|        with open("temp_image.jpg", "wb") as f:
9|            f.write(uploaded_file.read())
10|
11|        count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
12|        st.image("temp_image_viz.jpg")
13|        st.write(f"Number of workers with helmets: {count}")
14|        os.remove("temp_image.jpg")
15|        os.remove("temp_image_viz.jpg")
16|
17|if __name__ == "__main__":
18|    main()
[End of artifact]

AGENT: <thinking>I have generated the streamlit app that allows the user to upload an image and count the workers</thinking>
<response>The streamlit app is saved in streamlit_app.py, you can run the app by running `streamlit run streamlit_app.py`.</response>
<let_user_respond>true</let_user_respond>
"""
@@ -6,15 +6,13 @@ import re
6
6
  import subprocess
7
7
  import tempfile
8
8
  from pathlib import Path
9
- from typing import Any, Dict, List, Optional, Union
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
10
 
11
11
  import libcst as cst
12
12
  from IPython.display import display
13
13
 
14
14
  import vision_agent as va
15
- from vision_agent.agent.agent_utils import extract_json
16
15
  from vision_agent.clients.landing_public_api import LandingPublicAPI
17
- from vision_agent.lmm import AnthropicLMM
18
16
  from vision_agent.lmm.types import Message
19
17
  from vision_agent.tools.tool_utils import get_tool_documentation
20
18
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
@@ -26,7 +24,6 @@ CURRENT_FILE = None
26
24
  CURRENT_LINE = 0
27
25
  DEFAULT_WINDOW_SIZE = 100
28
26
  ZMQ_PORT = os.environ.get("ZMQ_PORT", None)
29
- VERBOSITY = os.environ.get("VERBOSITY", 0)
30
27
 
31
28
 
32
29
  def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
@@ -38,16 +35,6 @@ def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
38
35
  socket.send_json(inp)
39
36
 
40
37
 
41
- def filter_file(file_name: Union[str, Path]) -> bool:
42
- file_name_p = Path(file_name)
43
- return (
44
- file_name_p.is_file()
45
- and "__pycache__" not in str(file_name_p)
46
- and file_name_p.suffix in [".py", ".txt"]
47
- and not file_name_p.name.startswith(".")
48
- )
49
-
50
-
51
38
  def redisplay_results(execution: Execution) -> None:
52
39
  """This function is used to add previous execution results to the current output.
53
40
  This is handy if you are inside a notebook environment, call it notebook1, and you
@@ -86,8 +73,19 @@ class Artifacts:
86
73
  need to be in sync with the remote environment the VisionAgent is running in.
87
74
  """
88
75
 
89
- def __init__(self, remote_save_path: Union[str, Path]) -> None:
76
+ def __init__(
77
+ self, remote_save_path: Union[str, Path], local_save_path: Union[str, Path]
78
+ ) -> None:
79
+ """Initializes the Artifacts object with its remote and local save paths.
80
+
81
+ Parameters:
82
+ remote_save_path (Union[str, Path]): The path to save the artifacts in the
83
+ remote environment. For example "/home/user/artifacts.pkl".
84
+ local_save_path (Union[str, Path]): The path to save the artifacts in the
85
+ local environment. For example "/Users/my_user/workspace/artifacts.pkl".
86
+ """
90
87
  self.remote_save_path = Path(remote_save_path)
88
+ self.local_save_path = Path(local_save_path)
91
89
  self.artifacts: Dict[str, Any] = {}
92
90
 
93
91
  self.code_sandbox_runtime = None
@@ -95,31 +93,46 @@ class Artifacts:
95
93
  def load(
96
94
  self,
97
95
  artifacts_path: Union[str, Path],
98
- load_to: Optional[Union[str, Path]] = None,
96
+ load_to_dir: Optional[Union[str, Path]] = None,
99
97
  ) -> None:
100
- """Loads are artifacts into the load_to path. If load_to is None, it will load
101
- into remote_save_path. If an artifact value is None it will skip loading it.
98
+ """Loads the artifacts into the load_to_dir directory. If load_to_dir is None,
99
+ it will load into remote_save_path directory. If an artifact value is None it
100
+ will skip loading it.
102
101
 
103
102
  Parameters:
104
- artifacts_path (Union[str, Path]): The file path to load the artifacts from
103
+ artifacts_path (Union[str, Path]): The file path to load the artifacts from.
104
+ If you are in the remote environment this would be remote_save_path, if
105
+ you are in the local environment this would be local_save_path.
106
+ load_to_dir (Optional[Union[str, Path]]): The directory to load the artifacts
107
+ into. If None, it will load into remote_save_path directory.
105
108
  """
106
109
  with open(artifacts_path, "rb") as f:
107
110
  self.artifacts = pkl.load(f)
108
111
 
109
- load_to = self.remote_save_path.parent if load_to is None else Path(load_to)
112
+ load_to_dir = (
113
+ self.remote_save_path.parent if load_to_dir is None else Path(load_to_dir)
114
+ )
110
115
 
111
116
  for k, v in self.artifacts.items():
112
117
  if v is not None:
113
118
  mode = "w" if isinstance(v, str) else "wb"
114
- with open(load_to / k, mode) as f:
119
+ with open(load_to_dir / k, mode) as f:
115
120
  f.write(v)
116
121
 
117
- def show(self, uploaded_file_path: Optional[Union[str, Path]] = None) -> str:
118
- """Shows the artifacts that have been loaded and their remote save paths."""
122
+ def show(self, uploaded_file_dir: Optional[Union[str, Path]] = None) -> str:
123
+ """Returns a string listing the artifacts and the directory they have been
124
+ loaded to. If you pass in uploaded_file_dir, it will show the artifacts as loaded
125
+ to the uploaded_file_dir directory. If you don't pass in uploaded_file_dir, it
126
+ will show the artifacts as loaded to the remote_save_path directory.
127
+
128
+ Parameters:
129
+ uploaded_file_dir (Optional[Union[str, Path]]): The directory the artifacts
130
+ have been loaded to.
131
+ """
119
132
  loaded_path = (
120
- Path(uploaded_file_path)
121
- if uploaded_file_path is not None
122
- else self.remote_save_path
133
+ Path(uploaded_file_dir)
134
+ if uploaded_file_dir is not None
135
+ else self.remote_save_path.parent
123
136
  )
124
137
  output_str = "[Artifacts loaded]\n"
125
138
  for k in self.artifacts.keys():
@@ -131,9 +144,10 @@ class Artifacts:
131
144
  return output_str
132
145
 
133
146
  def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
134
- save_path = (
135
- Path(local_path) if local_path is not None else self.remote_save_path
136
- )
147
+ """Saves the artifacts to local_path. If local_path is None, it will save to
148
+ local_save_path instead.
149
+ """
150
+ save_path = Path(local_path) if local_path is not None else self.local_save_path
137
151
  with open(save_path, "wb") as f:
138
152
  pkl.dump(self.artifacts, f)
139
153
 
@@ -150,6 +164,38 @@ class Artifacts:
150
164
  return name in self.artifacts
151
165
 
152
166
 
167
+ def filter_file(file_name: Union[str, Path]) -> Tuple[bool, bool]:
168
+ file_name_p = Path(file_name)
169
+ return (
170
+ file_name_p.is_file()
171
+ and "__pycache__" not in str(file_name_p)
172
+ and not file_name_p.name.startswith(".")
173
+ and file_name_p.suffix
174
+ in [".png", ".jpeg", ".jpg", ".mp4", ".txt", ".json", ".csv"]
175
+ ), file_name_p.suffix in [".png", ".jpeg", ".jpg", ".mp4"]
176
+
177
+
178
+ def capture_files_into_artifacts(artifacts: Artifacts) -> None:
179
+ """This function is used to capture all files in the current directory into an
180
+ artifact object. This is useful if you want to capture all files in the current
181
+ directory and use them in a different environment where you don't have access to
182
+ the file system.
183
+
184
+ Parameters:
185
+ artifacts (Artifacts): The artifacts object to save the files to.
186
+ """
187
+ for file in Path(".").glob("**/*"):
188
+ usable_file, is_media = filter_file(file)
189
+ mode = "rb" if is_media else "r"
190
+ if usable_file:
191
+ file_name = file.name
192
+ if file_name.startswith(str(Path(artifacts.remote_save_path).parents)):
193
+ idx = len(Path(artifacts.remote_save_path).parents)
194
+ file_name = file_name[idx:]
195
+ with open(file, mode) as f:
196
+ artifacts[file_name] = f.read()
197
+
198
+
153
199
  # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
154
200
 
155
201
 
@@ -174,9 +220,9 @@ def view_lines(
174
220
  f"[Artifact: {name} ({total_lines} lines total)]\n"
175
221
  + format_lines(lines[start:end], start)
176
222
  + (
177
- "[End of artifact]"
223
+ "\n[End of artifact]"
178
224
  if end == len(lines)
179
- else f"[{len(lines) - end} more lines]"
225
+ else f"\n[{len(lines) - end} more lines]"
180
226
  )
181
227
  )
182
228
 
@@ -256,8 +302,10 @@ def edit_code_artifact(
256
302
  Parameters:
257
303
  artifacts (Artifacts): The artifacts object to edit the artifact from.
258
304
  name (str): The name of the artifact to edit.
259
- start (int): The line number to start the edit.
260
- end (int): The line number to end the edit.
305
+ start (int): The line number to start the edit, can be in [-1, total_lines]
306
+ where -1 represents the end of the file.
307
+ end (int): The line number to end the edit, can be in [-1, total_lines] where
308
+ -1 represents the end of the file.
261
309
  content (str): The content to insert.
262
310
  """
263
311
  # just make the artifact if it doesn't exist instead of forcing agent to call
@@ -266,17 +314,21 @@ def edit_code_artifact(
266
314
  artifacts[name] = ""
267
315
 
268
316
  total_lines = len(artifacts[name].splitlines())
317
+ if start == -1:
318
+ start = total_lines
319
+ if end == -1:
320
+ end = total_lines
321
+
269
322
  if start < 0 or end < 0 or start > end or end > total_lines:
270
323
  print("[Invalid line range]")
271
324
  return "[Invalid line range]"
272
- if start == end:
273
- end += 1
274
325
 
275
326
  new_content_lines = content.splitlines(keepends=True)
276
327
  new_content_lines = [
277
328
  line if line.endswith("\n") else line + "\n" for line in new_content_lines
278
329
  ]
279
330
  lines = artifacts[name].splitlines(keepends=True)
331
+ lines = [line if line.endswith("\n") else line + "\n" for line in lines]
280
332
  edited_lines = lines[:start] + new_content_lines + lines[end:]
281
333
 
282
334
  cur_line = start + len(content.split("\n")) // 2
@@ -371,14 +423,16 @@ def generate_vision_plan(
371
423
  [End Plan Context]
372
424
  """
373
425
 
426
+ # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
374
427
  if ZMQ_PORT is not None:
375
428
  agent = va.agent.VisionAgentPlanner(
376
429
  report_progress_callback=lambda inp: report_progress_callback(
377
430
  int(ZMQ_PORT), inp
378
- )
431
+ ),
432
+ verbosity=0,
379
433
  )
380
434
  else:
381
- agent = va.agent.VisionAgentPlanner()
435
+ agent = va.agent.VisionAgentPlanner(verbosity=0)
382
436
 
383
437
  fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
384
438
  response = agent.generate_plan(
@@ -435,14 +489,16 @@ def generate_vision_code(
435
489
  dogs = owl_v2("dog", image)
436
490
  return dogs
437
491
  """
492
+ # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
438
493
  if ZMQ_PORT is not None:
439
494
  agent = va.agent.VisionAgentCoder(
440
495
  report_progress_callback=lambda inp: report_progress_callback(
441
496
  int(ZMQ_PORT), inp
442
- )
497
+ ),
498
+ verbosity=0,
443
499
  )
444
500
  else:
445
- agent = va.agent.VisionAgentCoder(verbosity=int(VERBOSITY))
501
+ agent = va.agent.VisionAgentCoder(verbosity=0)
446
502
 
447
503
  fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
448
504
  response = agent.generate_code(
@@ -506,7 +562,8 @@ def edit_vision_code(
506
562
  return dogs
507
563
  """
508
564
 
509
- agent = va.agent.VisionAgentCoder(verbosity=int(VERBOSITY))
565
+ # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
566
+ agent = va.agent.VisionAgentCoder(verbosity=0)
510
567
  if name not in artifacts:
511
568
  print(f"[Artifact {name} does not exist]")
512
569
  return f"[Artifact {name} does not exist]"
@@ -570,8 +627,9 @@ def check_and_load_image(code: str) -> List[str]:
570
627
 
571
628
 
572
629
  def view_media_artifact(artifacts: Artifacts, name: str) -> str:
573
- """Allows you to view the media artifact with the given name. This does not show
574
- the media to the user, the user can already see all media saved in the artifacts.
630
+ """Allows only the agent to view the media artifact with the given name. DO NOT use
631
+ this to show media to the user, the user can already see all media saved in the
632
+ artifacts.
575
633
 
576
634
  Parameters:
577
635
  artifacts (Artifacts): The artifacts object to show the image from.
@@ -648,10 +706,10 @@ def get_diff_with_prompts(name: str, before: str, after: str) -> str:
648
706
 
649
707
 
650
708
  def use_extra_vision_agent_args(
651
- code: str,
709
+ code: Optional[str],
652
710
  test_multi_plan: bool = True,
653
711
  custom_tool_names: Optional[List[str]] = None,
654
- ) -> str:
712
+ ) -> Optional[str]:
655
713
  """This is for forcing arguments passed by the user to VisionAgent into the
656
714
  VisionAgentCoder call.
657
715
 
@@ -663,6 +721,8 @@ def use_extra_vision_agent_args(
663
721
  Returns:
664
722
  str: The edited code.
665
723
  """
724
+ if code is None:
725
+ return None
666
726
 
667
727
  class VisionAgentTransformer(cst.CSTTransformer):
668
728
  def __init__(
@@ -815,74 +875,12 @@ def use_object_detection_fine_tuning(
815
875
  return diff
816
876
 
817
877
 
818
- def extract_and_save_files_to_artifacts(
819
- artifacts: Artifacts, code: str, obs: str
820
- ) -> None:
821
- """Extracts and saves files used in the code to the artifacts object.
822
-
823
- Parameters:
824
- artifacts (Artifacts): The artifacts object to save the files to.
825
- code (str): The code to extract the files from.
826
- """
827
- try:
828
- response = extract_json(
829
- AnthropicLMM()( # type: ignore
830
- f"""You are a helpful AI assistant. Your job is to look at a snippet of code and the output of running that code and return the file paths that are being saved in the file. Below is the code snippet:
831
-
832
- ```python
833
- {code}
834
- ```
835
-
836
- ```output
837
- {obs}
838
- ```
839
-
840
- Return the file paths in the following JSON format:
841
- {{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
842
- )
843
- )
844
- except json.JSONDecodeError:
845
- return
846
-
847
- text_file_ext = [
848
- ".txt",
849
- ".md",
850
- "rtf",
851
- ".html",
852
- ".htm",
853
- "xml",
854
- ".json",
855
- ".csv",
856
- ".tsv",
857
- ".yaml",
858
- ".yml",
859
- ".toml",
860
- ".conf",
861
- ".env" ".ini",
862
- ".log",
863
- ".py",
864
- ".java",
865
- ".js",
866
- ".cpp",
867
- ".c" ".sql",
868
- ".sh",
869
- ]
870
-
871
- if "file_paths" in response and isinstance(response["file_paths"], list):
872
- for file_path in response["file_paths"]:
873
- read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb"
874
- if Path(file_path).is_file():
875
- with open(file_path, read_mode) as f:
876
- artifacts[Path(file_path).name] = f.read()
877
-
878
-
879
878
  META_TOOL_DOCSTRING = get_tool_documentation(
880
879
  [
881
880
  get_tool_descriptions,
882
881
  open_code_artifact,
883
882
  create_code_artifact,
884
883
  edit_code_artifact,
885
- generate_vision_plan,
886
884
  generate_vision_code,
887
885
  edit_vision_code,
888
886
  view_media_artifact,
@@ -575,6 +575,7 @@ class LocalCodeInterpreter(CodeInterpreter):
575
575
  super().__init__(timeout=timeout)
576
576
  self.nb = nbformat.v4.new_notebook()
577
577
  # Set the notebook execution path to the remote path
578
+ self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
578
579
  self.resources = {"metadata": {"path": str(self.remote_path)}}
579
580
  self.nb_client = NotebookClient(
580
581
  self.nb,
@@ -591,7 +592,6 @@ Timeout: {self.timeout}"""
591
592
  )
592
593
  sleep(1)
593
594
  self._new_kernel()
594
- self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
595
595
 
596
596
  def _new_kernel(self) -> None:
597
597
  if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)(): # type: ignore
@@ -659,7 +659,7 @@ Timeout: {self.timeout}"""
659
659
  def download_file(
660
660
  self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
661
661
  ) -> Path:
662
- with open(self.remote_path / remote_file_path, "rb") as f:
662
+ with open(self.remote_path / Path(remote_file_path).name, "rb") as f:
663
663
  contents = f.read()
664
664
  with open(local_file_path, "wb") as f:
665
665
  f.write(contents)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.174
3
+ Version: 0.2.176
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -53,25 +53,25 @@ Description-Content-Type: text/markdown
53
53
  ![version](https://img.shields.io/pypi/pyversions/vision-agent)
54
54
  </div>
55
55
 
56
- Vision Agent is a library that helps you utilize agent frameworks to generate code to
56
+ VisionAgent is a library that helps you utilize agent frameworks to generate code to
57
57
  solve your vision task. Many current vision problems can easily take hours or days to
58
58
  solve: you need to find the right model, figure out how to use it, and program it to
59
- accomplish the task you want. Vision Agent aims to provide an in-seconds experience by
59
+ accomplish the task you want. VisionAgent aims to provide an in-seconds experience by
60
60
  allowing users to describe their problem in text and have the agent framework generate
61
61
  code to solve the task for them. Check out our discord for updates and roadmaps!
62
62
 
63
63
  ## Table of Contents
64
64
  - [🚀Quick Start](#quick-start)
65
65
  - [📚Documentation](#documentation)
66
- - [🔍🤖Vision Agent](#vision-agent-basic-usage)
66
+ - [🔍🤖VisionAgent](#vision-agent-basic-usage)
67
67
  - [🛠️Tools](#tools)
68
68
  - [🤖LMMs](#lmms)
69
- - [💻🤖Vision Agent Coder](#vision-agent-coder)
69
+ - [💻🤖VisionAgent Coder](#vision-agent-coder)
70
70
  - [🏗️Additional Backends](#additional-backends)
71
71
 
72
72
  ## Quick Start
73
73
  ### Web Application
74
- The fastest way to test out Vision Agent is to use our web application. You can find it
74
+ The fastest way to test out VisionAgent is to use our web application. You can find it
75
75
  [here](https://va.landing.ai/).
76
76
 
77
77
 
@@ -108,9 +108,9 @@ be images or video files.
108
108
 
109
109
  ## Documentation
110
110
 
111
- [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
111
+ [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
112
112
 
113
- ## Vision Agent Basic Usage
113
+ ## VisionAgent Basic Usage
114
114
  ### Chatting and Message Formats
115
115
  `VisionAgent` is an agent that can chat with you and call other tools or agents to
116
116
  write vision code for you. You can interact with it like you would ChatGPT or any other
@@ -292,7 +292,7 @@ response = lmm(
292
292
  )
293
293
  ```
294
294
 
295
- ## Vision Agent Coder
295
+ ## VisionAgent Coder
296
296
  Underneath the hood, `VisionAgent` uses `VisionAgentCoder` to generate code to solve
297
297
  vision tasks. You can use `VisionAgentCoder` directly to generate code if you want:
298
298
 
@@ -488,7 +488,7 @@ export AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME="your_embedding_model_deploy
488
488
 
489
489
  > NOTE: make sure your Azure model deployment has enough quota (tokens per minute) to support it. The default value of 8000 TPM is not enough.
490
490
 
491
- You can then run Vision Agent using the Azure OpenAI models:
491
+ You can then run VisionAgent using the Azure OpenAI models:
492
492
 
493
493
  ```python
494
494
  import vision_agent as va
@@ -1,13 +1,13 @@
1
1
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
2
2
  vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
3
3
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
4
- vision_agent/agent/agent_utils.py,sha256=eSgg8CwWylX_erLTqTg2pVhEEgVkMLRrQfYRyJzI3so,5443
5
- vision_agent/agent/vision_agent.py,sha256=lEda43d-Ri68FIjmn-MPIgLs8_jMpyDVXslusQedhWA,26222
4
+ vision_agent/agent/agent_utils.py,sha256=WYJF11PfKXlRMPnogGz3s7c2TlWoxoGzuLiIptVYE1s,5524
5
+ vision_agent/agent/vision_agent.py,sha256=x0-TElnTRW7abyq2wAwKRiTUExBGg24C-c74wO1oKtI,26336
6
6
  vision_agent/agent/vision_agent_coder.py,sha256=3Q1VWrN-BNUoSD4OAqKazvXkP2c04PXDYu2Z1f5dQb0,31960
7
7
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
8
8
  vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
9
9
  vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
10
- vision_agent/agent/vision_agent_prompts.py,sha256=_xAITNDKcS45tqhEax5i6vDQa4V39f9n55iRGk2R6RM,11218
10
+ vision_agent/agent/vision_agent_prompts.py,sha256=4329ll0kqCznRALIMl-rlKWGjN92p3bcRrz8R-cO744,13748
11
11
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
13
13
  vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -17,19 +17,19 @@ vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,
17
17
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
18
18
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
19
19
  vision_agent/tools/__init__.py,sha256=u-vS5iORB4ccvxoAjbtpvhTALDhXGilcATIq1_eZhKo,2332
20
- vision_agent/tools/meta_tools.py,sha256=SVevrA5yxtSIXR8352QMMJv-jW8MIuA68Nj93DjW-60,30640
20
+ vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
21
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
22
22
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
23
23
  vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
24
24
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
25
25
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
26
26
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
27
- vision_agent/utils/execute.py,sha256=FqSOr5gtBeKB1g2hbV6-bhox6qItDQNn2o9efq1w6f4,28017
27
+ vision_agent/utils/execute.py,sha256=2sIQn45llOENMyrKu3TPINVRLLbOvvZ6SVHFCB9MQUo,28028
28
28
  vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
29
29
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
30
30
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
31
31
  vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
32
- vision_agent-0.2.174.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
- vision_agent-0.2.174.dist-info/METADATA,sha256=V3d-gcpI2IZ4QWygiErgGAeNza-ROlsU-s1eH5Fr9UM,18339
34
- vision_agent-0.2.174.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
- vision_agent-0.2.174.dist-info/RECORD,,
32
+ vision_agent-0.2.176.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
+ vision_agent-0.2.176.dist-info/METADATA,sha256=W86hwl87xkxa59Hn9rw8_FQGiMW3JncY9WQAmC5jxs0,18330
34
+ vision_agent-0.2.176.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
+ vision_agent-0.2.176.dist-info/RECORD,,