vision-agent 0.2.174__py3-none-any.whl → 0.2.176__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/agent_utils.py +3 -1
- vision_agent/agent/vision_agent.py +77 -70
- vision_agent/agent/vision_agent_prompts.py +75 -14
- vision_agent/tools/meta_tools.py +104 -106
- vision_agent/utils/execute.py +2 -2
- {vision_agent-0.2.174.dist-info → vision_agent-0.2.176.dist-info}/METADATA +10 -10
- {vision_agent-0.2.174.dist-info → vision_agent-0.2.176.dist-info}/RECORD +9 -9
- {vision_agent-0.2.174.dist-info → vision_agent-0.2.176.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.174.dist-info → vision_agent-0.2.176.dist-info}/WHEEL +0 -0
@@ -72,7 +72,9 @@ def extract_json(json_str: str) -> Dict[str, Any]:
|
|
72
72
|
if json_dict is None:
|
73
73
|
error_msg = f"Could not extract JSON from the given str: {json_orig}"
|
74
74
|
_LOGGER.exception(error_msg)
|
75
|
-
raise
|
75
|
+
raise json.JSONDecodeError(
|
76
|
+
msg="Could not extract JSON", doc=json_orig, pos=0
|
77
|
+
)
|
76
78
|
|
77
79
|
return json_dict
|
78
80
|
|
@@ -2,7 +2,6 @@ import copy
|
|
2
2
|
import json
|
3
3
|
import logging
|
4
4
|
import os
|
5
|
-
import tempfile
|
6
5
|
from pathlib import Path
|
7
6
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
8
7
|
|
@@ -12,6 +11,7 @@ from vision_agent.agent.vision_agent_prompts import (
|
|
12
11
|
EXAMPLES_CODE1,
|
13
12
|
EXAMPLES_CODE2,
|
14
13
|
EXAMPLES_CODE3,
|
14
|
+
EXAMPLES_CODE3_EXTRA2,
|
15
15
|
VA_CODE,
|
16
16
|
)
|
17
17
|
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
|
@@ -19,7 +19,6 @@ from vision_agent.tools.meta_tools import (
|
|
19
19
|
META_TOOL_DOCSTRING,
|
20
20
|
Artifacts,
|
21
21
|
check_and_load_image,
|
22
|
-
extract_and_save_files_to_artifacts,
|
23
22
|
use_extra_vision_agent_args,
|
24
23
|
)
|
25
24
|
from vision_agent.utils import CodeInterpreterFactory
|
@@ -37,11 +36,12 @@ class BoilerplateCode:
|
|
37
36
|
pre_code = [
|
38
37
|
"from typing import *",
|
39
38
|
"from vision_agent.utils.execute import CodeInterpreter",
|
40
|
-
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
|
41
|
-
"artifacts = Artifacts('{remote_path}')",
|
39
|
+
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, list_artifacts, capture_files_into_artifacts",
|
40
|
+
"artifacts = Artifacts('{remote_path}', '{remote_path}')",
|
42
41
|
"artifacts.load('{remote_path}')",
|
43
42
|
]
|
44
43
|
post_code = [
|
44
|
+
"capture_files_into_artifacts(artifacts)",
|
45
45
|
"artifacts.save()",
|
46
46
|
]
|
47
47
|
|
@@ -97,8 +97,9 @@ def _clean_response(response: str) -> str:
|
|
97
97
|
def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
98
98
|
chat = copy.deepcopy(chat)
|
99
99
|
|
100
|
+
# only add 10 most recent messages in the chat to not go over token limit
|
100
101
|
conversation = ""
|
101
|
-
for chat_i in chat:
|
102
|
+
for chat_i in chat[-10:]:
|
102
103
|
if chat_i["role"] == "user":
|
103
104
|
conversation += f"USER: {chat_i['content']}\n\n"
|
104
105
|
elif chat_i["role"] == "observation":
|
@@ -110,7 +111,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
|
110
111
|
|
111
112
|
prompt = VA_CODE.format(
|
112
113
|
documentation=META_TOOL_DOCSTRING,
|
113
|
-
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
|
114
|
+
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}\n{EXAMPLES_CODE3_EXTRA2}",
|
114
115
|
conversation=conversation,
|
115
116
|
)
|
116
117
|
message: Message = {"role": "user", "content": prompt}
|
@@ -120,7 +121,9 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
|
120
121
|
and "media" in chat[-1]
|
121
122
|
and len(chat[-1]["media"]) > 0 # type: ignore
|
122
123
|
):
|
123
|
-
|
124
|
+
media_obs = [media for media in chat[-1]["media"] if Path(media).exists()] # type: ignore
|
125
|
+
if len(media_obs) > 0:
|
126
|
+
message["media"] = media_obs # type: ignore
|
124
127
|
conv_resp = cast(str, orch([message], stream=False))
|
125
128
|
|
126
129
|
# clean the response first, if we are executing code, do not resond or end
|
@@ -144,16 +147,16 @@ def execute_code_action(
|
|
144
147
|
artifacts: Artifacts,
|
145
148
|
code: str,
|
146
149
|
code_interpreter: CodeInterpreter,
|
147
|
-
artifact_remote_path: str,
|
148
150
|
) -> Tuple[Execution, str]:
|
149
151
|
result = code_interpreter.exec_isolation(
|
150
|
-
BoilerplateCode.add_boilerplate(
|
152
|
+
BoilerplateCode.add_boilerplate(
|
153
|
+
code, remote_path=str(artifacts.remote_save_path)
|
154
|
+
)
|
151
155
|
)
|
152
156
|
|
153
157
|
obs = str(result.logs)
|
154
158
|
if result.error:
|
155
159
|
obs += f"\n{result.error}"
|
156
|
-
extract_and_save_files_to_artifacts(artifacts, code, obs)
|
157
160
|
return result, obs
|
158
161
|
|
159
162
|
|
@@ -161,7 +164,6 @@ def execute_user_code_action(
|
|
161
164
|
artifacts: Artifacts,
|
162
165
|
last_user_message: Message,
|
163
166
|
code_interpreter: CodeInterpreter,
|
164
|
-
artifact_remote_path: str,
|
165
167
|
) -> Tuple[Optional[Execution], Optional[str]]:
|
166
168
|
user_result = None
|
167
169
|
user_obs = None
|
@@ -178,11 +180,10 @@ def execute_user_code_action(
|
|
178
180
|
if user_code_action is not None:
|
179
181
|
user_code_action = use_extra_vision_agent_args(user_code_action, False)
|
180
182
|
user_result, user_obs = execute_code_action(
|
181
|
-
artifacts, user_code_action, code_interpreter
|
183
|
+
artifacts, user_code_action, code_interpreter
|
182
184
|
)
|
183
185
|
if user_result.error:
|
184
186
|
user_obs += f"\n{user_result.error}"
|
185
|
-
extract_and_save_files_to_artifacts(artifacts, user_code_action, user_obs)
|
186
187
|
return user_result, user_obs
|
187
188
|
|
188
189
|
|
@@ -211,6 +212,19 @@ def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
|
|
211
212
|
return response
|
212
213
|
|
213
214
|
|
215
|
+
def setup_artifacts() -> Artifacts:
|
216
|
+
# this is setting remote artifacts path
|
217
|
+
sandbox = os.environ.get("CODE_SANDBOX_RUNTIME", None)
|
218
|
+
if sandbox is None or sandbox == "local":
|
219
|
+
remote = WORKSPACE / "artifacts.pkl"
|
220
|
+
elif sandbox == "e2b":
|
221
|
+
remote = Path("/home/user/artifacts.pkl")
|
222
|
+
else:
|
223
|
+
raise ValueError(f"Unknown code sandbox runtime {sandbox}")
|
224
|
+
artifacts = Artifacts(remote, Path(os.getcwd()) / "artifacts.pkl")
|
225
|
+
return artifacts
|
226
|
+
|
227
|
+
|
214
228
|
def new_format_to_old_format(new_format: Dict[str, Any]) -> Dict[str, Any]:
|
215
229
|
thoughts = new_format["thinking"] if new_format["thinking"] is not None else ""
|
216
230
|
response = new_format["response"] if new_format["response"] is not None else ""
|
@@ -231,9 +245,18 @@ def old_format_to_new_format(old_format_str: str) -> str:
|
|
231
245
|
except json.JSONDecodeError:
|
232
246
|
return old_format_str
|
233
247
|
|
234
|
-
|
235
|
-
|
236
|
-
|
248
|
+
if "thoughts" in old_format:
|
249
|
+
thinking = (
|
250
|
+
old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
|
251
|
+
)
|
252
|
+
else:
|
253
|
+
thinking = None
|
254
|
+
|
255
|
+
let_user_respond = (
|
256
|
+
old_format["let_user_respond"] if "let_user_respond" in old_format else True
|
257
|
+
)
|
258
|
+
|
259
|
+
if "response" in old_format and "<execute_python>" in old_format["response"]:
|
237
260
|
execute_python = extract_tag(old_format["response"], "execute_python")
|
238
261
|
response = (
|
239
262
|
old_format["response"]
|
@@ -244,7 +267,7 @@ def old_format_to_new_format(old_format_str: str) -> str:
|
|
244
267
|
)
|
245
268
|
else:
|
246
269
|
execute_python = None
|
247
|
-
response = old_format["response"]
|
270
|
+
response = old_format["response"] if "response" in old_format else None
|
248
271
|
|
249
272
|
return json.dumps(
|
250
273
|
{
|
@@ -275,7 +298,6 @@ class VisionAgent(Agent):
|
|
275
298
|
self,
|
276
299
|
agent: Optional[LMM] = None,
|
277
300
|
verbosity: int = 0,
|
278
|
-
local_artifacts_path: Optional[Union[str, Path]] = None,
|
279
301
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
280
302
|
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
281
303
|
) -> None:
|
@@ -285,8 +307,6 @@ class VisionAgent(Agent):
|
|
285
307
|
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
286
308
|
of other agents.
|
287
309
|
verbosity (int): The verbosity level of the agent.
|
288
|
-
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
289
|
-
artifacts file.
|
290
310
|
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
291
311
|
function to send intermediate update messages.
|
292
312
|
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
@@ -302,14 +322,6 @@ class VisionAgent(Agent):
|
|
302
322
|
self.callback_message = callback_message
|
303
323
|
if self.verbosity >= 1:
|
304
324
|
_LOGGER.setLevel(logging.INFO)
|
305
|
-
self.local_artifacts_path = cast(
|
306
|
-
str,
|
307
|
-
(
|
308
|
-
Path(local_artifacts_path)
|
309
|
-
if local_artifacts_path is not None
|
310
|
-
else Path(tempfile.NamedTemporaryFile(delete=False).name)
|
311
|
-
),
|
312
|
-
)
|
313
325
|
|
314
326
|
def __call__(
|
315
327
|
self,
|
@@ -385,8 +397,7 @@ class VisionAgent(Agent):
|
|
385
397
|
raise ValueError("chat cannot be empty")
|
386
398
|
|
387
399
|
if not artifacts:
|
388
|
-
|
389
|
-
artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
|
400
|
+
artifacts = setup_artifacts()
|
390
401
|
|
391
402
|
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
392
403
|
code_interpreter = (
|
@@ -395,8 +406,15 @@ class VisionAgent(Agent):
|
|
395
406
|
and not isinstance(self.code_interpreter, str)
|
396
407
|
else CodeInterpreterFactory.new_instance(
|
397
408
|
code_sandbox_runtime=self.code_interpreter,
|
409
|
+
remote_path=artifacts.remote_save_path.parent,
|
398
410
|
)
|
399
411
|
)
|
412
|
+
|
413
|
+
if code_interpreter.remote_path != artifacts.remote_save_path.parent:
|
414
|
+
raise ValueError(
|
415
|
+
f"Code interpreter remote path {code_interpreter.remote_path} does not match artifacts remote path {artifacts.remote_save_path.parent}"
|
416
|
+
)
|
417
|
+
|
400
418
|
with code_interpreter:
|
401
419
|
orig_chat = copy.deepcopy(chat)
|
402
420
|
int_chat = copy.deepcopy(chat)
|
@@ -409,7 +427,7 @@ class VisionAgent(Agent):
|
|
409
427
|
artifacts.artifacts[Path(media).name] = open(media, "rb").read()
|
410
428
|
|
411
429
|
media_remote_path = (
|
412
|
-
Path(
|
430
|
+
Path(artifacts.remote_save_path.parent) / Path(media).name
|
413
431
|
)
|
414
432
|
chat_i["content"] += f" Media name {media_remote_path}" # type: ignore
|
415
433
|
media_list.append(media_remote_path)
|
@@ -436,15 +454,13 @@ class VisionAgent(Agent):
|
|
436
454
|
|
437
455
|
# Save the current state of artifacts, will include any images the user
|
438
456
|
# passed in.
|
439
|
-
artifacts.save(
|
457
|
+
artifacts.save()
|
440
458
|
|
441
459
|
# Upload artifacts to remote location and show where they are going
|
442
460
|
# to be loaded to. The actual loading happens in BoilerplateCode as
|
443
461
|
# part of the pre_code.
|
444
|
-
|
445
|
-
|
446
|
-
)
|
447
|
-
artifacts_loaded = artifacts.show(code_interpreter.remote_path)
|
462
|
+
code_interpreter.upload_file(artifacts.local_save_path)
|
463
|
+
artifacts_loaded = artifacts.show(artifacts.remote_save_path.parent)
|
448
464
|
int_chat.append({"role": "observation", "content": artifacts_loaded})
|
449
465
|
orig_chat.append({"role": "observation", "content": artifacts_loaded})
|
450
466
|
self.streaming_message({"role": "observation", "content": artifacts_loaded})
|
@@ -453,7 +469,6 @@ class VisionAgent(Agent):
|
|
453
469
|
artifacts,
|
454
470
|
last_user_message,
|
455
471
|
code_interpreter,
|
456
|
-
str(remote_artifacts_path),
|
457
472
|
)
|
458
473
|
finished = user_result is not None and user_obs is not None
|
459
474
|
if user_result is not None and user_obs is not None:
|
@@ -472,7 +487,16 @@ class VisionAgent(Agent):
|
|
472
487
|
)
|
473
488
|
|
474
489
|
while not finished and iterations < self.max_iterations:
|
490
|
+
# ensure we upload the artifacts before each turn, so any local
|
491
|
+
# modifications we made to it will be reflected in the remote
|
492
|
+
code_interpreter.upload_file(artifacts.local_save_path)
|
493
|
+
|
475
494
|
response = run_conversation(self.agent, int_chat)
|
495
|
+
code_action = use_extra_vision_agent_args(
|
496
|
+
response.get("execute_python", None),
|
497
|
+
test_multi_plan,
|
498
|
+
custom_tool_names,
|
499
|
+
)
|
476
500
|
if self.verbosity >= 1:
|
477
501
|
_LOGGER.info(response)
|
478
502
|
int_chat.append(
|
@@ -532,31 +556,20 @@ class VisionAgent(Agent):
|
|
532
556
|
artifacts,
|
533
557
|
code_action,
|
534
558
|
code_interpreter,
|
535
|
-
str(remote_artifacts_path),
|
536
559
|
)
|
537
|
-
|
538
|
-
media_obs = check_and_load_image(code_action)
|
539
|
-
|
540
|
-
if self.verbosity >= 1:
|
541
|
-
_LOGGER.info(obs)
|
542
|
-
|
543
560
|
obs_chat_elt: Message = {"role": "observation", "content": obs}
|
561
|
+
media_obs = check_and_load_image(code_action)
|
544
562
|
if media_obs and result.success:
|
545
|
-
#
|
546
|
-
#
|
547
|
-
code_interpreter.download_file(
|
548
|
-
str(remote_artifacts_path.name),
|
549
|
-
str(self.local_artifacts_path),
|
550
|
-
)
|
551
|
-
artifacts.load(
|
552
|
-
self.local_artifacts_path,
|
553
|
-
Path(self.local_artifacts_path).parent,
|
554
|
-
)
|
563
|
+
# media paths will be under the local_save_path when we download
|
564
|
+
# them after each turn
|
555
565
|
obs_chat_elt["media"] = [
|
556
|
-
|
566
|
+
artifacts.local_save_path.parent / media_ob
|
557
567
|
for media_ob in media_obs
|
558
568
|
]
|
559
569
|
|
570
|
+
if self.verbosity >= 1:
|
571
|
+
_LOGGER.info(obs)
|
572
|
+
|
560
573
|
# don't add execution results to internal chat
|
561
574
|
int_chat.append(obs_chat_elt)
|
562
575
|
obs_chat_elt["execution"] = result
|
@@ -573,13 +586,15 @@ class VisionAgent(Agent):
|
|
573
586
|
iterations += 1
|
574
587
|
last_response = response
|
575
588
|
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
589
|
+
# after each turn, download the artifacts locally
|
590
|
+
code_interpreter.download_file(
|
591
|
+
str(artifacts.remote_save_path.name),
|
592
|
+
str(artifacts.local_save_path),
|
593
|
+
)
|
594
|
+
artifacts.load(
|
595
|
+
artifacts.local_save_path, artifacts.local_save_path.parent
|
596
|
+
)
|
597
|
+
|
583
598
|
return orig_chat, artifacts
|
584
599
|
|
585
600
|
def streaming_message(self, message: Dict[str, Any]) -> None:
|
@@ -595,7 +610,6 @@ class OpenAIVisionAgent(VisionAgent):
|
|
595
610
|
self,
|
596
611
|
agent: Optional[LMM] = None,
|
597
612
|
verbosity: int = 0,
|
598
|
-
local_artifacts_path: Optional[Union[str, Path]] = None,
|
599
613
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
600
614
|
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
601
615
|
) -> None:
|
@@ -605,8 +619,6 @@ class OpenAIVisionAgent(VisionAgent):
|
|
605
619
|
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
606
620
|
of other agents.
|
607
621
|
verbosity (int): The verbosity level of the agent.
|
608
|
-
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
609
|
-
artifacts file.
|
610
622
|
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
611
623
|
function to send intermediate update messages.
|
612
624
|
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
@@ -619,7 +631,6 @@ class OpenAIVisionAgent(VisionAgent):
|
|
619
631
|
super().__init__(
|
620
632
|
agent,
|
621
633
|
verbosity,
|
622
|
-
local_artifacts_path,
|
623
634
|
callback_message,
|
624
635
|
code_interpreter,
|
625
636
|
)
|
@@ -630,7 +641,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
630
641
|
self,
|
631
642
|
agent: Optional[LMM] = None,
|
632
643
|
verbosity: int = 0,
|
633
|
-
local_artifacts_path: Optional[Union[str, Path]] = None,
|
634
644
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
635
645
|
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
636
646
|
) -> None:
|
@@ -640,8 +650,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
640
650
|
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
641
651
|
of other agents.
|
642
652
|
verbosity (int): The verbosity level of the agent.
|
643
|
-
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
644
|
-
artifacts file.
|
645
653
|
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
646
654
|
function to send intermediate update messages.
|
647
655
|
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
@@ -654,7 +662,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
654
662
|
super().__init__(
|
655
663
|
agent,
|
656
664
|
verbosity,
|
657
|
-
local_artifacts_path,
|
658
665
|
callback_message,
|
659
666
|
code_interpreter,
|
660
667
|
)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
VA_CODE = """
|
2
2
|
**Role**: You are a helpful agent that assists users with writing code.
|
3
3
|
|
4
|
-
**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <
|
4
|
+
**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be saved only AFTER you execute python code with `save_imgae` or `save_video`. The user can see all `artifacts`.
|
5
5
|
|
6
6
|
<execute_python>
|
7
7
|
print("Hello World!")
|
@@ -26,10 +26,11 @@ Here is the current conversation so far:
|
|
26
26
|
|
27
27
|
**Instructions**:
|
28
28
|
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
|
29
|
-
2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
|
29
|
+
2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. DO NOT run `edit_vision_code` or `edit_code_artifact` more than 2 times in a row and instead ask the user for help.
|
30
30
|
3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
|
31
|
-
4. **
|
32
|
-
5. **
|
31
|
+
4. **Artifacts**: Files are only saved in `artifacts` after <execute_python>, do not try to access artifacts until you observe that they are loaded.
|
32
|
+
5. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
|
33
|
+
6. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
|
33
34
|
|
34
35
|
<thinking>Your thoughts here...</thinking>
|
35
36
|
<response>Your response to the user here...</response>
|
@@ -53,7 +54,7 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect
|
|
53
54
|
<let_user_respond>false</let_user_respond>
|
54
55
|
|
55
56
|
OBSERVATION:
|
56
|
-
[Artifact dog_detector.py]
|
57
|
+
[Artifact dog_detector.py (5 lines total)]
|
57
58
|
0|from vision_agent.tools import load_image, owl_v2
|
58
59
|
1|def detect_dogs(image_path: str):
|
59
60
|
2| image = load_image(image_path)
|
@@ -94,7 +95,7 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do
|
|
94
95
|
<let_user_respond>false</let_user_respond>
|
95
96
|
|
96
97
|
OBSERVATION:
|
97
|
-
[Artifact dog_detector.py]
|
98
|
+
[Artifact dog_detector.py (5 lines total)]
|
98
99
|
0|from vision_agent.tools import load_image, owl_v2
|
99
100
|
1|def detect_dogs(image_path: str):
|
100
101
|
2| image = load_image(image_path)
|
@@ -142,7 +143,7 @@ AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will te
|
|
142
143
|
|
143
144
|
|
144
145
|
EXAMPLES_CODE3 = """
|
145
|
-
USER: Can you create a function to count workers with helmets, return the count and
|
146
|
+
USER: Can you create a function to count workers with helmets, return the count and a visaulization of the bounding boxes?
|
146
147
|
|
147
148
|
OBSERVATION:
|
148
149
|
[Artifacts loaded]
|
@@ -166,9 +167,9 @@ generate_vision_code(artifacts, 'code.py', 'Can you create a function to count w
|
|
166
167
|
<let_user_respond>false</let_user_respond>
|
167
168
|
|
168
169
|
OBSERVATION:
|
169
|
-
[Artifact code.py]
|
170
|
+
[Artifact code.py (17 lines total)]
|
170
171
|
0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
|
171
|
-
1|def count_workers_with_helmets(image_path: str
|
172
|
+
1|def count_workers_with_helmets(image_path: str):
|
172
173
|
2| image = load_image(image_path)
|
173
174
|
3| detections = florence2_phrase_grounding("worker, helmet", image)
|
174
175
|
4| workers = [d for d in detections if d['label'] == 'worker']
|
@@ -182,15 +183,16 @@ OBSERVATION:
|
|
182
183
|
11| break
|
183
184
|
11| if person_has_helmet:
|
184
185
|
13| count += 1
|
185
|
-
14| overlay_bounding_boxes(image, detections)
|
186
|
-
|
187
|
-
16| return count
|
186
|
+
14| viz = overlay_bounding_boxes(image, detections)
|
187
|
+
16| return count, viz
|
188
188
|
[End of artifact]
|
189
189
|
|
190
|
-
AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code and
|
190
|
+
AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code, print the counts and save the visualization.</thinking>
|
191
191
|
<execute_python>
|
192
192
|
from code import count_workers_with_helmets
|
193
|
-
|
193
|
+
count, viz = count_workers_with_helmets('/path/to/images/workers.png')
|
194
|
+
save_image(viz, 'workers_viz.png')
|
195
|
+
print(count)
|
194
196
|
</execute_python>
|
195
197
|
<let_user_respond>false</let_user_respond>
|
196
198
|
|
@@ -201,7 +203,9 @@ OBSERVATION:
|
|
201
203
|
AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
|
202
204
|
<response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
|
203
205
|
<let_user_respond>true</let_user_respond>
|
206
|
+
"""
|
204
207
|
|
208
|
+
EXAMPLES_CODE3_EXTRA1 = """
|
205
209
|
USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
|
206
210
|
|
207
211
|
AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
|
@@ -249,3 +253,60 @@ AGENT: <thinking>Three workers with helmets are detected, I will show this to th
|
|
249
253
|
<response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
|
250
254
|
<let_user_respond>true</let_user_respond>
|
251
255
|
"""
|
256
|
+
|
257
|
+
EXAMPLES_CODE3_EXTRA2 = """
|
258
|
+
USER: Can you write a streamlit app that allows me to upload an image and count the workers with helmets?
|
259
|
+
|
260
|
+
AGENT: <thinking>I will use edit_code_artifact to generate a streamlit app that allows the user to upload an image and count the workers with helmets.</thinking>
|
261
|
+
<execute_python>
|
262
|
+
CODE = '''
|
263
|
+
import streamlit as st
|
264
|
+
from code imoprt count_workers_with_helmets
|
265
|
+
import os
|
266
|
+
|
267
|
+
def main():
|
268
|
+
st.title("Worker with Helmet Counter")
|
269
|
+
uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
|
270
|
+
if uploaded_file is not None:
|
271
|
+
with open("temp_image.jpg", "wb") as f:
|
272
|
+
f.write(uploaded_file.read())
|
273
|
+
|
274
|
+
count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
|
275
|
+
st.image("temp_image_viz.jpg")
|
276
|
+
st.write(f"Number of workers with helmets: {count}")
|
277
|
+
os.remove("temp_image.jpg")
|
278
|
+
os.remove("temp_image_viz.jpg")
|
279
|
+
|
280
|
+
if __name__ == "__main__":
|
281
|
+
main()
|
282
|
+
'''
|
283
|
+
edit_code_artifact(artifacts, 'streamlit_app.py', CODE, 0, 0)
|
284
|
+
</execute_python>
|
285
|
+
|
286
|
+
OBSERVATION:
|
287
|
+
[Artifact streamlit_app.py (19 lines total)]
|
288
|
+
0|import streamlit as st
|
289
|
+
1|from code imoprt count_workers_with_helmets
|
290
|
+
2|import os
|
291
|
+
3|
|
292
|
+
4|def main():
|
293
|
+
5| st.title("Worker with Helmet Counter")
|
294
|
+
6| uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
|
295
|
+
7| if uploaded_file is not None:
|
296
|
+
8| with open("temp_image.jpg", "wb") as f:
|
297
|
+
9| f.write(uploaded_file.read())
|
298
|
+
10|
|
299
|
+
11| count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
|
300
|
+
12| st.image("temp_image_viz.jpg")
|
301
|
+
13| st.write(f"Number of workers with helmets: {count}")
|
302
|
+
14| os.remove("temp_image.jpg")
|
303
|
+
15| os.remove("temp_image_viz.jpg")
|
304
|
+
16|
|
305
|
+
17|if __name__ == "__main__":
|
306
|
+
18| main()
|
307
|
+
[End of artifact]
|
308
|
+
|
309
|
+
AGENT: <thinking>I have generated the streamlit app that allows the user to upload an image and count the workers</thinking>
|
310
|
+
<response>The streamlit app is saved in streamlit_app.py, you can run the app by running `streamlit run streamlit_app.py`.</response>
|
311
|
+
<let_user_respond>true</let_user_respond>
|
312
|
+
"""
|
vision_agent/tools/meta_tools.py
CHANGED
@@ -6,15 +6,13 @@ import re
|
|
6
6
|
import subprocess
|
7
7
|
import tempfile
|
8
8
|
from pathlib import Path
|
9
|
-
from typing import Any, Dict, List, Optional, Union
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
10
10
|
|
11
11
|
import libcst as cst
|
12
12
|
from IPython.display import display
|
13
13
|
|
14
14
|
import vision_agent as va
|
15
|
-
from vision_agent.agent.agent_utils import extract_json
|
16
15
|
from vision_agent.clients.landing_public_api import LandingPublicAPI
|
17
|
-
from vision_agent.lmm import AnthropicLMM
|
18
16
|
from vision_agent.lmm.types import Message
|
19
17
|
from vision_agent.tools.tool_utils import get_tool_documentation
|
20
18
|
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
|
@@ -26,7 +24,6 @@ CURRENT_FILE = None
|
|
26
24
|
CURRENT_LINE = 0
|
27
25
|
DEFAULT_WINDOW_SIZE = 100
|
28
26
|
ZMQ_PORT = os.environ.get("ZMQ_PORT", None)
|
29
|
-
VERBOSITY = os.environ.get("VERBOSITY", 0)
|
30
27
|
|
31
28
|
|
32
29
|
def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
|
@@ -38,16 +35,6 @@ def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
|
|
38
35
|
socket.send_json(inp)
|
39
36
|
|
40
37
|
|
41
|
-
def filter_file(file_name: Union[str, Path]) -> bool:
|
42
|
-
file_name_p = Path(file_name)
|
43
|
-
return (
|
44
|
-
file_name_p.is_file()
|
45
|
-
and "__pycache__" not in str(file_name_p)
|
46
|
-
and file_name_p.suffix in [".py", ".txt"]
|
47
|
-
and not file_name_p.name.startswith(".")
|
48
|
-
)
|
49
|
-
|
50
|
-
|
51
38
|
def redisplay_results(execution: Execution) -> None:
|
52
39
|
"""This function is used to add previous execution results to the current output.
|
53
40
|
This is handy if you are inside a notebook environment, call it notebook1, and you
|
@@ -86,8 +73,19 @@ class Artifacts:
|
|
86
73
|
need to be in sync with the remote environment the VisionAgent is running in.
|
87
74
|
"""
|
88
75
|
|
89
|
-
def __init__(
|
76
|
+
def __init__(
|
77
|
+
self, remote_save_path: Union[str, Path], local_save_path: Union[str, Path]
|
78
|
+
) -> None:
|
79
|
+
"""Initializes the Artifacts object with it's remote and local save paths.
|
80
|
+
|
81
|
+
Parameters:
|
82
|
+
remote_save_path (Union[str, Path]): The path to save the artifacts in the
|
83
|
+
remote environment. For example "/home/user/artifacts.pkl".
|
84
|
+
local_save_path (Union[str, Path]): The path to save the artifacts in the
|
85
|
+
local environment. For example "/Users/my_user/workspace/artifacts.pkl".
|
86
|
+
"""
|
90
87
|
self.remote_save_path = Path(remote_save_path)
|
88
|
+
self.local_save_path = Path(local_save_path)
|
91
89
|
self.artifacts: Dict[str, Any] = {}
|
92
90
|
|
93
91
|
self.code_sandbox_runtime = None
|
@@ -95,31 +93,46 @@ class Artifacts:
|
|
95
93
|
def load(
|
96
94
|
self,
|
97
95
|
artifacts_path: Union[str, Path],
|
98
|
-
|
96
|
+
load_to_dir: Optional[Union[str, Path]] = None,
|
99
97
|
) -> None:
|
100
|
-
"""Loads are artifacts into the
|
101
|
-
into remote_save_path. If an artifact value is None it
|
98
|
+
"""Loads are artifacts into the load_to_dir directory. If load_to_dir is None,
|
99
|
+
it will load into remote_save_path directory. If an artifact value is None it
|
100
|
+
will skip loading it.
|
102
101
|
|
103
102
|
Parameters:
|
104
|
-
artifacts_path (Union[str, Path]): The file path to load the artifacts from
|
103
|
+
artifacts_path (Union[str, Path]): The file path to load the artifacts from.
|
104
|
+
If you are in the remote environment this would be remote_save_path, if
|
105
|
+
you are in the local environment this would be local_save_path.
|
106
|
+
load_to_dir (Optional[Union[str, Path]): The directory to load the artifacts
|
107
|
+
into. If None, it will load into remote_save_path directory.
|
105
108
|
"""
|
106
109
|
with open(artifacts_path, "rb") as f:
|
107
110
|
self.artifacts = pkl.load(f)
|
108
111
|
|
109
|
-
|
112
|
+
load_to_dir = (
|
113
|
+
self.remote_save_path.parent if load_to_dir is None else Path(load_to_dir)
|
114
|
+
)
|
110
115
|
|
111
116
|
for k, v in self.artifacts.items():
|
112
117
|
if v is not None:
|
113
118
|
mode = "w" if isinstance(v, str) else "wb"
|
114
|
-
with open(
|
119
|
+
with open(load_to_dir / k, mode) as f:
|
115
120
|
f.write(v)
|
116
121
|
|
117
|
-
def show(self,
|
118
|
-
"""
|
122
|
+
def show(self, uploaded_file_dir: Optional[Union[str, Path]] = None) -> str:
|
123
|
+
"""Prints out the artifacts and the directory they have been loaded to. If you
|
124
|
+
pass in upload_file_dir, it will show the artifacts have been loaded to the
|
125
|
+
upload_file_dir directory. If you don't pass in upload_file_dir, it will show
|
126
|
+
the artifacts have been loaded to the remote_save_path directory.
|
127
|
+
|
128
|
+
Parameters:
|
129
|
+
uploaded_file_dir (Optional[Union[str, Path]): The directory the artifacts
|
130
|
+
have been loaded to.
|
131
|
+
"""
|
119
132
|
loaded_path = (
|
120
|
-
Path(
|
121
|
-
if
|
122
|
-
else self.remote_save_path
|
133
|
+
Path(uploaded_file_dir)
|
134
|
+
if uploaded_file_dir is not None
|
135
|
+
else self.remote_save_path.parent
|
123
136
|
)
|
124
137
|
output_str = "[Artifacts loaded]\n"
|
125
138
|
for k in self.artifacts.keys():
|
@@ -131,9 +144,10 @@ class Artifacts:
|
|
131
144
|
return output_str
|
132
145
|
|
133
146
|
def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
    """Pickles self.artifacts to a file.

    Parameters:
        local_path (Optional[Union[str, Path]]): The file to write the pickled
            artifacts to. If None, local_save_path is used instead.
    """
    if local_path is None:
        destination = self.local_save_path
    else:
        destination = Path(local_path)

    with open(destination, "wb") as out:
        pkl.dump(self.artifacts, out)
|
139
153
|
|
@@ -150,6 +164,38 @@ class Artifacts:
|
|
150
164
|
return name in self.artifacts
|
151
165
|
|
152
166
|
|
167
|
+
def filter_file(file_name: Union[str, Path]) -> Tuple[bool, bool]:
    """Decides whether a path should be captured and whether it is a media file.

    Parameters:
        file_name (Union[str, Path]): The path to inspect.

    Returns:
        Tuple[bool, bool]: The first value is True when the path is an existing
            file, does not contain "__pycache__", does not have a name starting
            with "." and has one of the supported media or text suffixes. The
            second value is True when the suffix is a media suffix, regardless
            of the first value.
    """
    media_suffixes = (".png", ".jpeg", ".jpg", ".mp4")
    text_suffixes = (".txt", ".json", ".csv")

    path = Path(file_name)
    suffix = path.suffix
    is_media = suffix in media_suffixes

    if not path.is_file():
        return False, is_media
    if "__pycache__" in str(path) or path.name.startswith("."):
        return False, is_media
    return suffix in media_suffixes + text_suffixes, is_media
|
176
|
+
|
177
|
+
|
178
|
+
def capture_files_into_artifacts(artifacts: Artifacts) -> None:
    """Captures the supported files under the current directory into an artifacts
    object. This is useful if you want to use the files in a different environment
    where you don't have access to the file system.

    The current working directory is searched recursively and every file accepted
    by filter_file is read and stored under its bare file name. Media files are
    read as bytes, all other accepted files as text. Because only the file name is
    used as the key, files with the same name in different sub-directories
    overwrite each other.

    Parameters:
        artifacts (Artifacts): The artifact object to save the files to.
    """
    for file in Path(".").glob("**/*"):
        usable_file, is_media = filter_file(file)
        if not usable_file:
            continue

        mode = "rb" if is_media else "r"
        # file.name is already the bare file name, so there is no directory
        # prefix that would need to be stripped before using it as the key.
        with open(file, mode) as f:
            artifacts[file.name] = f.read()
|
197
|
+
|
198
|
+
|
153
199
|
# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
|
154
200
|
|
155
201
|
|
@@ -174,9 +220,9 @@ def view_lines(
|
|
174
220
|
f"[Artifact: {name} ({total_lines} lines total)]\n"
|
175
221
|
+ format_lines(lines[start:end], start)
|
176
222
|
+ (
|
177
|
-
"[End of artifact]"
|
223
|
+
"\n[End of artifact]"
|
178
224
|
if end == len(lines)
|
179
|
-
else f"[{len(lines) - end} more lines]"
|
225
|
+
else f"\n[{len(lines) - end} more lines]"
|
180
226
|
)
|
181
227
|
)
|
182
228
|
|
@@ -256,8 +302,10 @@ def edit_code_artifact(
|
|
256
302
|
Parameters:
|
257
303
|
artifacts (Artifacts): The artifacts object to edit the artifact from.
|
258
304
|
name (str): The name of the artifact to edit.
|
259
|
-
start (int): The line number to start the edit
|
260
|
-
|
305
|
+
start (int): The line number to start the edit, can be in [-1, total_lines]
|
306
|
+
where -1 represents the end of the file.
|
307
|
+
end (int): The line number to end the edit, can be in [-1, total_lines] where
|
308
|
+
-1 represents the end of the file.
|
261
309
|
content (str): The content to insert.
|
262
310
|
"""
|
263
311
|
# just make the artifact if it doesn't exist instead of forcing agent to call
|
@@ -266,17 +314,21 @@ def edit_code_artifact(
|
|
266
314
|
artifacts[name] = ""
|
267
315
|
|
268
316
|
total_lines = len(artifacts[name].splitlines())
|
317
|
+
if start == -1:
|
318
|
+
start = total_lines
|
319
|
+
if end == -1:
|
320
|
+
end = total_lines
|
321
|
+
|
269
322
|
if start < 0 or end < 0 or start > end or end > total_lines:
|
270
323
|
print("[Invalid line range]")
|
271
324
|
return "[Invalid line range]"
|
272
|
-
if start == end:
|
273
|
-
end += 1
|
274
325
|
|
275
326
|
new_content_lines = content.splitlines(keepends=True)
|
276
327
|
new_content_lines = [
|
277
328
|
line if line.endswith("\n") else line + "\n" for line in new_content_lines
|
278
329
|
]
|
279
330
|
lines = artifacts[name].splitlines(keepends=True)
|
331
|
+
lines = [line if line.endswith("\n") else line + "\n" for line in lines]
|
280
332
|
edited_lines = lines[:start] + new_content_lines + lines[end:]
|
281
333
|
|
282
334
|
cur_line = start + len(content.split("\n")) // 2
|
@@ -371,14 +423,16 @@ def generate_vision_plan(
|
|
371
423
|
[End Plan Context]
|
372
424
|
"""
|
373
425
|
|
426
|
+
# verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
|
374
427
|
if ZMQ_PORT is not None:
|
375
428
|
agent = va.agent.VisionAgentPlanner(
|
376
429
|
report_progress_callback=lambda inp: report_progress_callback(
|
377
430
|
int(ZMQ_PORT), inp
|
378
|
-
)
|
431
|
+
),
|
432
|
+
verbosity=0,
|
379
433
|
)
|
380
434
|
else:
|
381
|
-
agent = va.agent.VisionAgentPlanner()
|
435
|
+
agent = va.agent.VisionAgentPlanner(verbosity=0)
|
382
436
|
|
383
437
|
fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
|
384
438
|
response = agent.generate_plan(
|
@@ -435,14 +489,16 @@ def generate_vision_code(
|
|
435
489
|
dogs = owl_v2("dog", image)
|
436
490
|
return dogs
|
437
491
|
"""
|
492
|
+
# verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
|
438
493
|
if ZMQ_PORT is not None:
|
439
494
|
agent = va.agent.VisionAgentCoder(
|
440
495
|
report_progress_callback=lambda inp: report_progress_callback(
|
441
496
|
int(ZMQ_PORT), inp
|
442
|
-
)
|
497
|
+
),
|
498
|
+
verbosity=0,
|
443
499
|
)
|
444
500
|
else:
|
445
|
-
agent = va.agent.VisionAgentCoder(verbosity=
|
501
|
+
agent = va.agent.VisionAgentCoder(verbosity=0)
|
446
502
|
|
447
503
|
fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
|
448
504
|
response = agent.generate_code(
|
@@ -506,7 +562,8 @@ def edit_vision_code(
|
|
506
562
|
return dogs
|
507
563
|
"""
|
508
564
|
|
509
|
-
|
565
|
+
# verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
|
566
|
+
agent = va.agent.VisionAgentCoder(verbosity=0)
|
510
567
|
if name not in artifacts:
|
511
568
|
print(f"[Artifact {name} does not exist]")
|
512
569
|
return f"[Artifact {name} does not exist]"
|
@@ -570,8 +627,9 @@ def check_and_load_image(code: str) -> List[str]:
|
|
570
627
|
|
571
628
|
|
572
629
|
def view_media_artifact(artifacts: Artifacts, name: str) -> str:
|
573
|
-
"""Allows
|
574
|
-
|
630
|
+
"""Allows only the agent to view the media artifact with the given name. DO NOT use
|
631
|
+
this to show media to the user, the user can already see all media saved in the
|
632
|
+
artifacts.
|
575
633
|
|
576
634
|
Parameters:
|
577
635
|
artifacts (Artifacts): The artifacts object to show the image from.
|
@@ -648,10 +706,10 @@ def get_diff_with_prompts(name: str, before: str, after: str) -> str:
|
|
648
706
|
|
649
707
|
|
650
708
|
def use_extra_vision_agent_args(
|
651
|
-
code: str,
|
709
|
+
code: Optional[str],
|
652
710
|
test_multi_plan: bool = True,
|
653
711
|
custom_tool_names: Optional[List[str]] = None,
|
654
|
-
) -> str:
|
712
|
+
) -> Optional[str]:
|
655
713
|
"""This is for forcing arguments passed by the user to VisionAgent into the
|
656
714
|
VisionAgentCoder call.
|
657
715
|
|
@@ -663,6 +721,8 @@ def use_extra_vision_agent_args(
|
|
663
721
|
Returns:
|
664
722
|
str: The edited code.
|
665
723
|
"""
|
724
|
+
if code is None:
|
725
|
+
return None
|
666
726
|
|
667
727
|
class VisionAgentTransformer(cst.CSTTransformer):
|
668
728
|
def __init__(
|
@@ -815,74 +875,12 @@ def use_object_detection_fine_tuning(
|
|
815
875
|
return diff
|
816
876
|
|
817
877
|
|
818
|
-
def extract_and_save_files_to_artifacts(
|
819
|
-
artifacts: Artifacts, code: str, obs: str
|
820
|
-
) -> None:
|
821
|
-
"""Extracts and saves files used in the code to the artifacts object.
|
822
|
-
|
823
|
-
Parameters:
|
824
|
-
artifacts (Artifacts): The artifacts object to save the files to.
|
825
|
-
code (str): The code to extract the files from.
|
826
|
-
"""
|
827
|
-
try:
|
828
|
-
response = extract_json(
|
829
|
-
AnthropicLMM()( # type: ignore
|
830
|
-
f"""You are a helpful AI assistant. Your job is to look at a snippet of code and the output of running that code and return the file paths that are being saved in the file. Below is the code snippet:
|
831
|
-
|
832
|
-
```python
|
833
|
-
{code}
|
834
|
-
```
|
835
|
-
|
836
|
-
```output
|
837
|
-
{obs}
|
838
|
-
```
|
839
|
-
|
840
|
-
Return the file paths in the following JSON format:
|
841
|
-
{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
|
842
|
-
)
|
843
|
-
)
|
844
|
-
except json.JSONDecodeError:
|
845
|
-
return
|
846
|
-
|
847
|
-
text_file_ext = [
|
848
|
-
".txt",
|
849
|
-
".md",
|
850
|
-
"rtf",
|
851
|
-
".html",
|
852
|
-
".htm",
|
853
|
-
"xml",
|
854
|
-
".json",
|
855
|
-
".csv",
|
856
|
-
".tsv",
|
857
|
-
".yaml",
|
858
|
-
".yml",
|
859
|
-
".toml",
|
860
|
-
".conf",
|
861
|
-
".env" ".ini",
|
862
|
-
".log",
|
863
|
-
".py",
|
864
|
-
".java",
|
865
|
-
".js",
|
866
|
-
".cpp",
|
867
|
-
".c" ".sql",
|
868
|
-
".sh",
|
869
|
-
]
|
870
|
-
|
871
|
-
if "file_paths" in response and isinstance(response["file_paths"], list):
|
872
|
-
for file_path in response["file_paths"]:
|
873
|
-
read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb"
|
874
|
-
if Path(file_path).is_file():
|
875
|
-
with open(file_path, read_mode) as f:
|
876
|
-
artifacts[Path(file_path).name] = f.read()
|
877
|
-
|
878
|
-
|
879
878
|
META_TOOL_DOCSTRING = get_tool_documentation(
|
880
879
|
[
|
881
880
|
get_tool_descriptions,
|
882
881
|
open_code_artifact,
|
883
882
|
create_code_artifact,
|
884
883
|
edit_code_artifact,
|
885
|
-
generate_vision_plan,
|
886
884
|
generate_vision_code,
|
887
885
|
edit_vision_code,
|
888
886
|
view_media_artifact,
|
vision_agent/utils/execute.py
CHANGED
@@ -575,6 +575,7 @@ class LocalCodeInterpreter(CodeInterpreter):
|
|
575
575
|
super().__init__(timeout=timeout)
|
576
576
|
self.nb = nbformat.v4.new_notebook()
|
577
577
|
# Set the notebook execution path to the remote path
|
578
|
+
self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
|
578
579
|
self.resources = {"metadata": {"path": str(self.remote_path)}}
|
579
580
|
self.nb_client = NotebookClient(
|
580
581
|
self.nb,
|
@@ -591,7 +592,6 @@ Timeout: {self.timeout}"""
|
|
591
592
|
)
|
592
593
|
sleep(1)
|
593
594
|
self._new_kernel()
|
594
|
-
self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
|
595
595
|
|
596
596
|
def _new_kernel(self) -> None:
|
597
597
|
if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)(): # type: ignore
|
@@ -659,7 +659,7 @@ Timeout: {self.timeout}"""
|
|
659
659
|
def download_file(
|
660
660
|
self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
|
661
661
|
) -> Path:
|
662
|
-
with open(self.remote_path / remote_file_path, "rb") as f:
|
662
|
+
with open(self.remote_path / Path(remote_file_path).name, "rb") as f:
|
663
663
|
contents = f.read()
|
664
664
|
with open(local_file_path, "wb") as f:
|
665
665
|
f.write(contents)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.176
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -53,25 +53,25 @@ Description-Content-Type: text/markdown
|
|
53
53
|

|
54
54
|
</div>
|
55
55
|
|
56
|
-
|
56
|
+
VisionAgent is a library that helps you utilize agent frameworks to generate code to
|
57
57
|
solve your vision task. Many current vision problems can easily take hours or days to
|
58
58
|
solve, you need to find the right model, figure out how to use it and program it to
|
59
|
-
accomplish the task you want.
|
59
|
+
accomplish the task you want. VisionAgent aims to provide an in-seconds experience by
|
60
60
|
allowing users to describe their problem in text and have the agent framework generate
|
61
61
|
code to solve the task for them. Check out our discord for updates and roadmaps!
|
62
62
|
|
63
63
|
## Table of Contents
|
64
64
|
- [🚀Quick Start](#quick-start)
|
65
65
|
- [📚Documentation](#documentation)
|
66
|
-
- [🔍🤖
|
66
|
+
- [🔍🤖VisionAgent](#vision-agent-basic-usage)
|
67
67
|
- [🛠️Tools](#tools)
|
68
68
|
- [🤖LMMs](#lmms)
|
69
|
-
- [💻🤖
|
69
|
+
- [💻🤖VisionAgent Coder](#vision-agent-coder)
|
70
70
|
- [🏗️Additional Backends](#additional-backends)
|
71
71
|
|
72
72
|
## Quick Start
|
73
73
|
### Web Application
|
74
|
-
The fastest way to test out
|
74
|
+
The fastest way to test out VisionAgent is to use our web application. You can find it
|
75
75
|
[here](https://va.landing.ai/).
|
76
76
|
|
77
77
|
|
@@ -108,9 +108,9 @@ be images or video files.
|
|
108
108
|
|
109
109
|
## Documentation
|
110
110
|
|
111
|
-
[
|
111
|
+
[VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
|
112
112
|
|
113
|
-
##
|
113
|
+
## VisionAgent Basic Usage
|
114
114
|
### Chatting and Message Formats
|
115
115
|
`VisionAgent` is an agent that can chat with you and call other tools or agents to
|
116
116
|
write vision code for you. You can interact with it like you would ChatGPT or any other
|
@@ -292,7 +292,7 @@ response = lmm(
|
|
292
292
|
)
|
293
293
|
```
|
294
294
|
|
295
|
-
##
|
295
|
+
## VisionAgent Coder
|
296
296
|
Underneath the hood, `VisionAgent` uses `VisionAgentCoder` to generate code to solve
|
297
297
|
vision tasks. You can use `VisionAgentCoder` directly to generate code if you want:
|
298
298
|
|
@@ -488,7 +488,7 @@ export AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME="your_embedding_model_deploy
|
|
488
488
|
|
489
489
|
> NOTE: make sure your Azure model deployment has enough quota (tokens per minute) to support it. The default value 8000TPM is not enough.
|
490
490
|
|
491
|
-
You can then run
|
491
|
+
You can then run VisionAgent using the Azure OpenAI models:
|
492
492
|
|
493
493
|
```python
|
494
494
|
import vision_agent as va
|
@@ -1,13 +1,13 @@
|
|
1
1
|
vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
2
2
|
vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
|
3
3
|
vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
|
4
|
-
vision_agent/agent/agent_utils.py,sha256=
|
5
|
-
vision_agent/agent/vision_agent.py,sha256=
|
4
|
+
vision_agent/agent/agent_utils.py,sha256=WYJF11PfKXlRMPnogGz3s7c2TlWoxoGzuLiIptVYE1s,5524
|
5
|
+
vision_agent/agent/vision_agent.py,sha256=x0-TElnTRW7abyq2wAwKRiTUExBGg24C-c74wO1oKtI,26336
|
6
6
|
vision_agent/agent/vision_agent_coder.py,sha256=3Q1VWrN-BNUoSD4OAqKazvXkP2c04PXDYu2Z1f5dQb0,31960
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
|
8
8
|
vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
|
9
9
|
vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
|
10
|
-
vision_agent/agent/vision_agent_prompts.py,sha256=
|
10
|
+
vision_agent/agent/vision_agent_prompts.py,sha256=4329ll0kqCznRALIMl-rlKWGjN92p3bcRrz8R-cO744,13748
|
11
11
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
|
13
13
|
vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
|
@@ -17,19 +17,19 @@ vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,
|
|
17
17
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
18
18
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
19
19
|
vision_agent/tools/__init__.py,sha256=u-vS5iORB4ccvxoAjbtpvhTALDhXGilcATIq1_eZhKo,2332
|
20
|
-
vision_agent/tools/meta_tools.py,sha256=
|
20
|
+
vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
23
|
vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
|
24
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
25
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
26
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
27
|
-
vision_agent/utils/execute.py,sha256=
|
27
|
+
vision_agent/utils/execute.py,sha256=2sIQn45llOENMyrKu3TPINVRLLbOvvZ6SVHFCB9MQUo,28028
|
28
28
|
vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
31
|
vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
32
|
+
vision_agent-0.2.176.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.176.dist-info/METADATA,sha256=W86hwl87xkxa59Hn9rw8_FQGiMW3JncY9WQAmC5jxs0,18330
|
34
|
+
vision_agent-0.2.176.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.176.dist-info/RECORD,,
|
File without changes
|
File without changes
|