vision-agent 0.2.174__py3-none-any.whl → 0.2.176__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/agent_utils.py +3 -1
- vision_agent/agent/vision_agent.py +77 -70
- vision_agent/agent/vision_agent_prompts.py +75 -14
- vision_agent/tools/meta_tools.py +104 -106
- vision_agent/utils/execute.py +2 -2
- {vision_agent-0.2.174.dist-info → vision_agent-0.2.176.dist-info}/METADATA +10 -10
- {vision_agent-0.2.174.dist-info → vision_agent-0.2.176.dist-info}/RECORD +9 -9
- {vision_agent-0.2.174.dist-info → vision_agent-0.2.176.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.174.dist-info → vision_agent-0.2.176.dist-info}/WHEEL +0 -0
@@ -72,7 +72,9 @@ def extract_json(json_str: str) -> Dict[str, Any]:
|
|
72
72
|
if json_dict is None:
|
73
73
|
error_msg = f"Could not extract JSON from the given str: {json_orig}"
|
74
74
|
_LOGGER.exception(error_msg)
|
75
|
-
raise
|
75
|
+
raise json.JSONDecodeError(
|
76
|
+
msg="Could not extract JSON", doc=json_orig, pos=0
|
77
|
+
)
|
76
78
|
|
77
79
|
return json_dict
|
78
80
|
|
@@ -2,7 +2,6 @@ import copy
|
|
2
2
|
import json
|
3
3
|
import logging
|
4
4
|
import os
|
5
|
-
import tempfile
|
6
5
|
from pathlib import Path
|
7
6
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
8
7
|
|
@@ -12,6 +11,7 @@ from vision_agent.agent.vision_agent_prompts import (
|
|
12
11
|
EXAMPLES_CODE1,
|
13
12
|
EXAMPLES_CODE2,
|
14
13
|
EXAMPLES_CODE3,
|
14
|
+
EXAMPLES_CODE3_EXTRA2,
|
15
15
|
VA_CODE,
|
16
16
|
)
|
17
17
|
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
|
@@ -19,7 +19,6 @@ from vision_agent.tools.meta_tools import (
|
|
19
19
|
META_TOOL_DOCSTRING,
|
20
20
|
Artifacts,
|
21
21
|
check_and_load_image,
|
22
|
-
extract_and_save_files_to_artifacts,
|
23
22
|
use_extra_vision_agent_args,
|
24
23
|
)
|
25
24
|
from vision_agent.utils import CodeInterpreterFactory
|
@@ -37,11 +36,12 @@ class BoilerplateCode:
|
|
37
36
|
pre_code = [
|
38
37
|
"from typing import *",
|
39
38
|
"from vision_agent.utils.execute import CodeInterpreter",
|
40
|
-
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
|
41
|
-
"artifacts = Artifacts('{remote_path}')",
|
39
|
+
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, list_artifacts, capture_files_into_artifacts",
|
40
|
+
"artifacts = Artifacts('{remote_path}', '{remote_path}')",
|
42
41
|
"artifacts.load('{remote_path}')",
|
43
42
|
]
|
44
43
|
post_code = [
|
44
|
+
"capture_files_into_artifacts(artifacts)",
|
45
45
|
"artifacts.save()",
|
46
46
|
]
|
47
47
|
|
@@ -97,8 +97,9 @@ def _clean_response(response: str) -> str:
|
|
97
97
|
def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
98
98
|
chat = copy.deepcopy(chat)
|
99
99
|
|
100
|
+
# only add 10 most recent messages in the chat to not go over token limit
|
100
101
|
conversation = ""
|
101
|
-
for chat_i in chat:
|
102
|
+
for chat_i in chat[-10:]:
|
102
103
|
if chat_i["role"] == "user":
|
103
104
|
conversation += f"USER: {chat_i['content']}\n\n"
|
104
105
|
elif chat_i["role"] == "observation":
|
@@ -110,7 +111,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
|
110
111
|
|
111
112
|
prompt = VA_CODE.format(
|
112
113
|
documentation=META_TOOL_DOCSTRING,
|
113
|
-
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
|
114
|
+
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}\n{EXAMPLES_CODE3_EXTRA2}",
|
114
115
|
conversation=conversation,
|
115
116
|
)
|
116
117
|
message: Message = {"role": "user", "content": prompt}
|
@@ -120,7 +121,9 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
|
120
121
|
and "media" in chat[-1]
|
121
122
|
and len(chat[-1]["media"]) > 0 # type: ignore
|
122
123
|
):
|
123
|
-
|
124
|
+
media_obs = [media for media in chat[-1]["media"] if Path(media).exists()] # type: ignore
|
125
|
+
if len(media_obs) > 0:
|
126
|
+
message["media"] = media_obs # type: ignore
|
124
127
|
conv_resp = cast(str, orch([message], stream=False))
|
125
128
|
|
126
129
|
# clean the response first, if we are executing code, do not respond or end
|
@@ -144,16 +147,16 @@ def execute_code_action(
|
|
144
147
|
artifacts: Artifacts,
|
145
148
|
code: str,
|
146
149
|
code_interpreter: CodeInterpreter,
|
147
|
-
artifact_remote_path: str,
|
148
150
|
) -> Tuple[Execution, str]:
|
149
151
|
result = code_interpreter.exec_isolation(
|
150
|
-
BoilerplateCode.add_boilerplate(
|
152
|
+
BoilerplateCode.add_boilerplate(
|
153
|
+
code, remote_path=str(artifacts.remote_save_path)
|
154
|
+
)
|
151
155
|
)
|
152
156
|
|
153
157
|
obs = str(result.logs)
|
154
158
|
if result.error:
|
155
159
|
obs += f"\n{result.error}"
|
156
|
-
extract_and_save_files_to_artifacts(artifacts, code, obs)
|
157
160
|
return result, obs
|
158
161
|
|
159
162
|
|
@@ -161,7 +164,6 @@ def execute_user_code_action(
|
|
161
164
|
artifacts: Artifacts,
|
162
165
|
last_user_message: Message,
|
163
166
|
code_interpreter: CodeInterpreter,
|
164
|
-
artifact_remote_path: str,
|
165
167
|
) -> Tuple[Optional[Execution], Optional[str]]:
|
166
168
|
user_result = None
|
167
169
|
user_obs = None
|
@@ -178,11 +180,10 @@ def execute_user_code_action(
|
|
178
180
|
if user_code_action is not None:
|
179
181
|
user_code_action = use_extra_vision_agent_args(user_code_action, False)
|
180
182
|
user_result, user_obs = execute_code_action(
|
181
|
-
artifacts, user_code_action, code_interpreter
|
183
|
+
artifacts, user_code_action, code_interpreter
|
182
184
|
)
|
183
185
|
if user_result.error:
|
184
186
|
user_obs += f"\n{user_result.error}"
|
185
|
-
extract_and_save_files_to_artifacts(artifacts, user_code_action, user_obs)
|
186
187
|
return user_result, user_obs
|
187
188
|
|
188
189
|
|
@@ -211,6 +212,19 @@ def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
|
|
211
212
|
return response
|
212
213
|
|
213
214
|
|
215
|
+
def setup_artifacts() -> Artifacts:
|
216
|
+
# this is setting remote artifacts path
|
217
|
+
sandbox = os.environ.get("CODE_SANDBOX_RUNTIME", None)
|
218
|
+
if sandbox is None or sandbox == "local":
|
219
|
+
remote = WORKSPACE / "artifacts.pkl"
|
220
|
+
elif sandbox == "e2b":
|
221
|
+
remote = Path("/home/user/artifacts.pkl")
|
222
|
+
else:
|
223
|
+
raise ValueError(f"Unknown code sandbox runtime {sandbox}")
|
224
|
+
artifacts = Artifacts(remote, Path(os.getcwd()) / "artifacts.pkl")
|
225
|
+
return artifacts
|
226
|
+
|
227
|
+
|
214
228
|
def new_format_to_old_format(new_format: Dict[str, Any]) -> Dict[str, Any]:
|
215
229
|
thoughts = new_format["thinking"] if new_format["thinking"] is not None else ""
|
216
230
|
response = new_format["response"] if new_format["response"] is not None else ""
|
@@ -231,9 +245,18 @@ def old_format_to_new_format(old_format_str: str) -> str:
|
|
231
245
|
except json.JSONDecodeError:
|
232
246
|
return old_format_str
|
233
247
|
|
234
|
-
|
235
|
-
|
236
|
-
|
248
|
+
if "thoughts" in old_format:
|
249
|
+
thinking = (
|
250
|
+
old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
|
251
|
+
)
|
252
|
+
else:
|
253
|
+
thinking = None
|
254
|
+
|
255
|
+
let_user_respond = (
|
256
|
+
old_format["let_user_respond"] if "let_user_respond" in old_format else True
|
257
|
+
)
|
258
|
+
|
259
|
+
if "response" in old_format and "<execute_python>" in old_format["response"]:
|
237
260
|
execute_python = extract_tag(old_format["response"], "execute_python")
|
238
261
|
response = (
|
239
262
|
old_format["response"]
|
@@ -244,7 +267,7 @@ def old_format_to_new_format(old_format_str: str) -> str:
|
|
244
267
|
)
|
245
268
|
else:
|
246
269
|
execute_python = None
|
247
|
-
response = old_format["response"]
|
270
|
+
response = old_format["response"] if "response" in old_format else None
|
248
271
|
|
249
272
|
return json.dumps(
|
250
273
|
{
|
@@ -275,7 +298,6 @@ class VisionAgent(Agent):
|
|
275
298
|
self,
|
276
299
|
agent: Optional[LMM] = None,
|
277
300
|
verbosity: int = 0,
|
278
|
-
local_artifacts_path: Optional[Union[str, Path]] = None,
|
279
301
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
280
302
|
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
281
303
|
) -> None:
|
@@ -285,8 +307,6 @@ class VisionAgent(Agent):
|
|
285
307
|
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
286
308
|
of other agents.
|
287
309
|
verbosity (int): The verbosity level of the agent.
|
288
|
-
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
289
|
-
artifacts file.
|
290
310
|
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
291
311
|
function to send intermediate update messages.
|
292
312
|
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
@@ -302,14 +322,6 @@ class VisionAgent(Agent):
|
|
302
322
|
self.callback_message = callback_message
|
303
323
|
if self.verbosity >= 1:
|
304
324
|
_LOGGER.setLevel(logging.INFO)
|
305
|
-
self.local_artifacts_path = cast(
|
306
|
-
str,
|
307
|
-
(
|
308
|
-
Path(local_artifacts_path)
|
309
|
-
if local_artifacts_path is not None
|
310
|
-
else Path(tempfile.NamedTemporaryFile(delete=False).name)
|
311
|
-
),
|
312
|
-
)
|
313
325
|
|
314
326
|
def __call__(
|
315
327
|
self,
|
@@ -385,8 +397,7 @@ class VisionAgent(Agent):
|
|
385
397
|
raise ValueError("chat cannot be empty")
|
386
398
|
|
387
399
|
if not artifacts:
|
388
|
-
|
389
|
-
artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
|
400
|
+
artifacts = setup_artifacts()
|
390
401
|
|
391
402
|
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
392
403
|
code_interpreter = (
|
@@ -395,8 +406,15 @@ class VisionAgent(Agent):
|
|
395
406
|
and not isinstance(self.code_interpreter, str)
|
396
407
|
else CodeInterpreterFactory.new_instance(
|
397
408
|
code_sandbox_runtime=self.code_interpreter,
|
409
|
+
remote_path=artifacts.remote_save_path.parent,
|
398
410
|
)
|
399
411
|
)
|
412
|
+
|
413
|
+
if code_interpreter.remote_path != artifacts.remote_save_path.parent:
|
414
|
+
raise ValueError(
|
415
|
+
f"Code interpreter remote path {code_interpreter.remote_path} does not match artifacts remote path {artifacts.remote_save_path.parent}"
|
416
|
+
)
|
417
|
+
|
400
418
|
with code_interpreter:
|
401
419
|
orig_chat = copy.deepcopy(chat)
|
402
420
|
int_chat = copy.deepcopy(chat)
|
@@ -409,7 +427,7 @@ class VisionAgent(Agent):
|
|
409
427
|
artifacts.artifacts[Path(media).name] = open(media, "rb").read()
|
410
428
|
|
411
429
|
media_remote_path = (
|
412
|
-
Path(
|
430
|
+
Path(artifacts.remote_save_path.parent) / Path(media).name
|
413
431
|
)
|
414
432
|
chat_i["content"] += f" Media name {media_remote_path}" # type: ignore
|
415
433
|
media_list.append(media_remote_path)
|
@@ -436,15 +454,13 @@ class VisionAgent(Agent):
|
|
436
454
|
|
437
455
|
# Save the current state of artifacts, will include any images the user
|
438
456
|
# passed in.
|
439
|
-
artifacts.save(
|
457
|
+
artifacts.save()
|
440
458
|
|
441
459
|
# Upload artifacts to remote location and show where they are going
|
442
460
|
# to be loaded to. The actual loading happens in BoilerplateCode as
|
443
461
|
# part of the pre_code.
|
444
|
-
|
445
|
-
|
446
|
-
)
|
447
|
-
artifacts_loaded = artifacts.show(code_interpreter.remote_path)
|
462
|
+
code_interpreter.upload_file(artifacts.local_save_path)
|
463
|
+
artifacts_loaded = artifacts.show(artifacts.remote_save_path.parent)
|
448
464
|
int_chat.append({"role": "observation", "content": artifacts_loaded})
|
449
465
|
orig_chat.append({"role": "observation", "content": artifacts_loaded})
|
450
466
|
self.streaming_message({"role": "observation", "content": artifacts_loaded})
|
@@ -453,7 +469,6 @@ class VisionAgent(Agent):
|
|
453
469
|
artifacts,
|
454
470
|
last_user_message,
|
455
471
|
code_interpreter,
|
456
|
-
str(remote_artifacts_path),
|
457
472
|
)
|
458
473
|
finished = user_result is not None and user_obs is not None
|
459
474
|
if user_result is not None and user_obs is not None:
|
@@ -472,7 +487,16 @@ class VisionAgent(Agent):
|
|
472
487
|
)
|
473
488
|
|
474
489
|
while not finished and iterations < self.max_iterations:
|
490
|
+
# ensure we upload the artifacts before each turn, so any local
|
491
|
+
# modifications we made to it will be reflected in the remote
|
492
|
+
code_interpreter.upload_file(artifacts.local_save_path)
|
493
|
+
|
475
494
|
response = run_conversation(self.agent, int_chat)
|
495
|
+
code_action = use_extra_vision_agent_args(
|
496
|
+
response.get("execute_python", None),
|
497
|
+
test_multi_plan,
|
498
|
+
custom_tool_names,
|
499
|
+
)
|
476
500
|
if self.verbosity >= 1:
|
477
501
|
_LOGGER.info(response)
|
478
502
|
int_chat.append(
|
@@ -532,31 +556,20 @@ class VisionAgent(Agent):
|
|
532
556
|
artifacts,
|
533
557
|
code_action,
|
534
558
|
code_interpreter,
|
535
|
-
str(remote_artifacts_path),
|
536
559
|
)
|
537
|
-
|
538
|
-
media_obs = check_and_load_image(code_action)
|
539
|
-
|
540
|
-
if self.verbosity >= 1:
|
541
|
-
_LOGGER.info(obs)
|
542
|
-
|
543
560
|
obs_chat_elt: Message = {"role": "observation", "content": obs}
|
561
|
+
media_obs = check_and_load_image(code_action)
|
544
562
|
if media_obs and result.success:
|
545
|
-
#
|
546
|
-
#
|
547
|
-
code_interpreter.download_file(
|
548
|
-
str(remote_artifacts_path.name),
|
549
|
-
str(self.local_artifacts_path),
|
550
|
-
)
|
551
|
-
artifacts.load(
|
552
|
-
self.local_artifacts_path,
|
553
|
-
Path(self.local_artifacts_path).parent,
|
554
|
-
)
|
563
|
+
# media paths will be under the local_save_path when we download
|
564
|
+
# them after each turn
|
555
565
|
obs_chat_elt["media"] = [
|
556
|
-
|
566
|
+
artifacts.local_save_path.parent / media_ob
|
557
567
|
for media_ob in media_obs
|
558
568
|
]
|
559
569
|
|
570
|
+
if self.verbosity >= 1:
|
571
|
+
_LOGGER.info(obs)
|
572
|
+
|
560
573
|
# don't add execution results to internal chat
|
561
574
|
int_chat.append(obs_chat_elt)
|
562
575
|
obs_chat_elt["execution"] = result
|
@@ -573,13 +586,15 @@ class VisionAgent(Agent):
|
|
573
586
|
iterations += 1
|
574
587
|
last_response = response
|
575
588
|
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
589
|
+
# after each turn, download the artifacts locally
|
590
|
+
code_interpreter.download_file(
|
591
|
+
str(artifacts.remote_save_path.name),
|
592
|
+
str(artifacts.local_save_path),
|
593
|
+
)
|
594
|
+
artifacts.load(
|
595
|
+
artifacts.local_save_path, artifacts.local_save_path.parent
|
596
|
+
)
|
597
|
+
|
583
598
|
return orig_chat, artifacts
|
584
599
|
|
585
600
|
def streaming_message(self, message: Dict[str, Any]) -> None:
|
@@ -595,7 +610,6 @@ class OpenAIVisionAgent(VisionAgent):
|
|
595
610
|
self,
|
596
611
|
agent: Optional[LMM] = None,
|
597
612
|
verbosity: int = 0,
|
598
|
-
local_artifacts_path: Optional[Union[str, Path]] = None,
|
599
613
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
600
614
|
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
601
615
|
) -> None:
|
@@ -605,8 +619,6 @@ class OpenAIVisionAgent(VisionAgent):
|
|
605
619
|
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
606
620
|
of other agents.
|
607
621
|
verbosity (int): The verbosity level of the agent.
|
608
|
-
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
609
|
-
artifacts file.
|
610
622
|
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
611
623
|
function to send intermediate update messages.
|
612
624
|
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
@@ -619,7 +631,6 @@ class OpenAIVisionAgent(VisionAgent):
|
|
619
631
|
super().__init__(
|
620
632
|
agent,
|
621
633
|
verbosity,
|
622
|
-
local_artifacts_path,
|
623
634
|
callback_message,
|
624
635
|
code_interpreter,
|
625
636
|
)
|
@@ -630,7 +641,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
630
641
|
self,
|
631
642
|
agent: Optional[LMM] = None,
|
632
643
|
verbosity: int = 0,
|
633
|
-
local_artifacts_path: Optional[Union[str, Path]] = None,
|
634
644
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
635
645
|
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
636
646
|
) -> None:
|
@@ -640,8 +650,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
640
650
|
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
641
651
|
of other agents.
|
642
652
|
verbosity (int): The verbosity level of the agent.
|
643
|
-
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
644
|
-
artifacts file.
|
645
653
|
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
646
654
|
function to send intermediate update messages.
|
647
655
|
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
@@ -654,7 +662,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
654
662
|
super().__init__(
|
655
663
|
agent,
|
656
664
|
verbosity,
|
657
|
-
local_artifacts_path,
|
658
665
|
callback_message,
|
659
666
|
code_interpreter,
|
660
667
|
)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
VA_CODE = """
|
2
2
|
**Role**: You are a helpful agent that assists users with writing code.
|
3
3
|
|
4
|
-
**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <
|
4
|
+
**Task**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be saved only AFTER you execute python code with `save_image` or `save_video`. The user can see all `artifacts`.
|
5
5
|
|
6
6
|
<execute_python>
|
7
7
|
print("Hello World!")
|
@@ -26,10 +26,11 @@ Here is the current conversation so far:
|
|
26
26
|
|
27
27
|
**Instructions**:
|
28
28
|
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
|
29
|
-
2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
|
29
|
+
2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. DO NOT run `edit_vision_code` or `edit_code_artifact` more than 2 times in a row and instead ask the user for help.
|
30
30
|
3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
|
31
|
-
4. **
|
32
|
-
5. **
|
31
|
+
4. **Artifacts**: Files are only saved in `artifacts` after <execute_python>, do not try to access artifacts until you observe that they are loaded.
|
32
|
+
5. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
|
33
|
+
6. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
|
33
34
|
|
34
35
|
<thinking>Your thoughts here...</thinking>
|
35
36
|
<response>Your response to the user here...</response>
|
@@ -53,7 +54,7 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect
|
|
53
54
|
<let_user_respond>false</let_user_respond>
|
54
55
|
|
55
56
|
OBSERVATION:
|
56
|
-
[Artifact dog_detector.py]
|
57
|
+
[Artifact dog_detector.py (5 lines total)]
|
57
58
|
0|from vision_agent.tools import load_image, owl_v2
|
58
59
|
1|def detect_dogs(image_path: str):
|
59
60
|
2| image = load_image(image_path)
|
@@ -94,7 +95,7 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do
|
|
94
95
|
<let_user_respond>false</let_user_respond>
|
95
96
|
|
96
97
|
OBSERVATION:
|
97
|
-
[Artifact dog_detector.py]
|
98
|
+
[Artifact dog_detector.py (5 lines total)]
|
98
99
|
0|from vision_agent.tools import load_image, owl_v2
|
99
100
|
1|def detect_dogs(image_path: str):
|
100
101
|
2| image = load_image(image_path)
|
@@ -142,7 +143,7 @@ AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will te
|
|
142
143
|
|
143
144
|
|
144
145
|
EXAMPLES_CODE3 = """
|
145
|
-
USER: Can you create a function to count workers with helmets, return the count and
|
146
|
+
USER: Can you create a function to count workers with helmets, return the count and a visualization of the bounding boxes?
|
146
147
|
|
147
148
|
OBSERVATION:
|
148
149
|
[Artifacts loaded]
|
@@ -166,9 +167,9 @@ generate_vision_code(artifacts, 'code.py', 'Can you create a function to count w
|
|
166
167
|
<let_user_respond>false</let_user_respond>
|
167
168
|
|
168
169
|
OBSERVATION:
|
169
|
-
[Artifact code.py]
|
170
|
+
[Artifact code.py (17 lines total)]
|
170
171
|
0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
|
171
|
-
1|def count_workers_with_helmets(image_path: str
|
172
|
+
1|def count_workers_with_helmets(image_path: str):
|
172
173
|
2| image = load_image(image_path)
|
173
174
|
3| detections = florence2_phrase_grounding("worker, helmet", image)
|
174
175
|
4| workers = [d for d in detections if d['label'] == 'worker']
|
@@ -182,15 +183,16 @@ OBSERVATION:
|
|
182
183
|
11| break
|
183
184
|
11| if person_has_helmet:
|
184
185
|
13| count += 1
|
185
|
-
14| overlay_bounding_boxes(image, detections)
|
186
|
-
|
187
|
-
16| return count
|
186
|
+
14| viz = overlay_bounding_boxes(image, detections)
|
187
|
+
16| return count, viz
|
188
188
|
[End of artifact]
|
189
189
|
|
190
|
-
AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code and
|
190
|
+
AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code, print the counts and save the visualization.</thinking>
|
191
191
|
<execute_python>
|
192
192
|
from code import count_workers_with_helmets
|
193
|
-
|
193
|
+
count, viz = count_workers_with_helmets('/path/to/images/workers.png')
|
194
|
+
save_image(viz, 'workers_viz.png')
|
195
|
+
print(count)
|
194
196
|
</execute_python>
|
195
197
|
<let_user_respond>false</let_user_respond>
|
196
198
|
|
@@ -201,7 +203,9 @@ OBSERVATION:
|
|
201
203
|
AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
|
202
204
|
<response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
|
203
205
|
<let_user_respond>true</let_user_respond>
|
206
|
+
"""
|
204
207
|
|
208
|
+
EXAMPLES_CODE3_EXTRA1 = """
|
205
209
|
USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
|
206
210
|
|
207
211
|
AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
|
@@ -249,3 +253,60 @@ AGENT: <thinking>Three workers with helmets are detected, I will show this to th
|
|
249
253
|
<response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
|
250
254
|
<let_user_respond>true</let_user_respond>
|
251
255
|
"""
|
256
|
+
|
257
|
+
EXAMPLES_CODE3_EXTRA2 = """
|
258
|
+
USER: Can you write a streamlit app that allows me to upload an image and count the workers with helmets?
|
259
|
+
|
260
|
+
AGENT: <thinking>I will use edit_code_artifact to generate a streamlit app that allows the user to upload an image and count the workers with helmets.</thinking>
|
261
|
+
<execute_python>
|
262
|
+
CODE = '''
|
263
|
+
import streamlit as st
|
264
|
+
from code imoprt count_workers_with_helmets
|
265
|
+
import os
|
266
|
+
|
267
|
+
def main():
|
268
|
+
st.title("Worker with Helmet Counter")
|
269
|
+
uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
|
270
|
+
if uploaded_file is not None:
|
271
|
+
with open("temp_image.jpg", "wb") as f:
|
272
|
+
f.write(uploaded_file.read())
|
273
|
+
|
274
|
+
count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
|
275
|
+
st.image("temp_image_viz.jpg")
|
276
|
+
st.write(f"Number of workers with helmets: {count}")
|
277
|
+
os.remove("temp_image.jpg")
|
278
|
+
os.remove("temp_image_viz.jpg")
|
279
|
+
|
280
|
+
if __name__ == "__main__":
|
281
|
+
main()
|
282
|
+
'''
|
283
|
+
edit_code_artifact(artifacts, 'streamlit_app.py', CODE, 0, 0)
|
284
|
+
</execute_python>
|
285
|
+
|
286
|
+
OBSERVATION:
|
287
|
+
[Artifact streamlit_app.py (19 lines total)]
|
288
|
+
0|import streamlit as st
|
289
|
+
1|from code imoprt count_workers_with_helmets
|
290
|
+
2|import os
|
291
|
+
3|
|
292
|
+
4|def main():
|
293
|
+
5| st.title("Worker with Helmet Counter")
|
294
|
+
6| uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
|
295
|
+
7| if uploaded_file is not None:
|
296
|
+
8| with open("temp_image.jpg", "wb") as f:
|
297
|
+
9| f.write(uploaded_file.read())
|
298
|
+
10|
|
299
|
+
11| count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
|
300
|
+
12| st.image("temp_image_viz.jpg")
|
301
|
+
13| st.write(f"Number of workers with helmets: {count}")
|
302
|
+
14| os.remove("temp_image.jpg")
|
303
|
+
15| os.remove("temp_image_viz.jpg")
|
304
|
+
16|
|
305
|
+
17|if __name__ == "__main__":
|
306
|
+
18| main()
|
307
|
+
[End of artifact]
|
308
|
+
|
309
|
+
AGENT: <thinking>I have generated the streamlit app that allows the user to upload an image and count the workers</thinking>
|
310
|
+
<response>The streamlit app is saved in streamlit_app.py, you can run the app by running `streamlit run streamlit_app.py`.</response>
|
311
|
+
<let_user_respond>true</let_user_respond>
|
312
|
+
"""
|
vision_agent/tools/meta_tools.py
CHANGED
@@ -6,15 +6,13 @@ import re
|
|
6
6
|
import subprocess
|
7
7
|
import tempfile
|
8
8
|
from pathlib import Path
|
9
|
-
from typing import Any, Dict, List, Optional, Union
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
10
10
|
|
11
11
|
import libcst as cst
|
12
12
|
from IPython.display import display
|
13
13
|
|
14
14
|
import vision_agent as va
|
15
|
-
from vision_agent.agent.agent_utils import extract_json
|
16
15
|
from vision_agent.clients.landing_public_api import LandingPublicAPI
|
17
|
-
from vision_agent.lmm import AnthropicLMM
|
18
16
|
from vision_agent.lmm.types import Message
|
19
17
|
from vision_agent.tools.tool_utils import get_tool_documentation
|
20
18
|
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
|
@@ -26,7 +24,6 @@ CURRENT_FILE = None
|
|
26
24
|
CURRENT_LINE = 0
|
27
25
|
DEFAULT_WINDOW_SIZE = 100
|
28
26
|
ZMQ_PORT = os.environ.get("ZMQ_PORT", None)
|
29
|
-
VERBOSITY = os.environ.get("VERBOSITY", 0)
|
30
27
|
|
31
28
|
|
32
29
|
def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
|
@@ -38,16 +35,6 @@ def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
|
|
38
35
|
socket.send_json(inp)
|
39
36
|
|
40
37
|
|
41
|
-
def filter_file(file_name: Union[str, Path]) -> bool:
|
42
|
-
file_name_p = Path(file_name)
|
43
|
-
return (
|
44
|
-
file_name_p.is_file()
|
45
|
-
and "__pycache__" not in str(file_name_p)
|
46
|
-
and file_name_p.suffix in [".py", ".txt"]
|
47
|
-
and not file_name_p.name.startswith(".")
|
48
|
-
)
|
49
|
-
|
50
|
-
|
51
38
|
def redisplay_results(execution: Execution) -> None:
|
52
39
|
"""This function is used to add previous execution results to the current output.
|
53
40
|
This is handy if you are inside a notebook environment, call it notebook1, and you
|
@@ -86,8 +73,19 @@ class Artifacts:
|
|
86
73
|
need to be in sync with the remote environment the VisionAgent is running in.
|
87
74
|
"""
|
88
75
|
|
89
|
-
def __init__(
|
76
|
+
def __init__(
|
77
|
+
self, remote_save_path: Union[str, Path], local_save_path: Union[str, Path]
|
78
|
+
) -> None:
|
79
|
+
"""Initializes the Artifacts object with its remote and local save paths.
|
80
|
+
|
81
|
+
Parameters:
|
82
|
+
remote_save_path (Union[str, Path]): The path to save the artifacts in the
|
83
|
+
remote environment. For example "/home/user/artifacts.pkl".
|
84
|
+
local_save_path (Union[str, Path]): The path to save the artifacts in the
|
85
|
+
local environment. For example "/Users/my_user/workspace/artifacts.pkl".
|
86
|
+
"""
|
90
87
|
self.remote_save_path = Path(remote_save_path)
|
88
|
+
self.local_save_path = Path(local_save_path)
|
91
89
|
self.artifacts: Dict[str, Any] = {}
|
92
90
|
|
93
91
|
self.code_sandbox_runtime = None
|
@@ -95,31 +93,46 @@ class Artifacts:
|
|
95
93
|
def load(
|
96
94
|
self,
|
97
95
|
artifacts_path: Union[str, Path],
|
98
|
-
|
96
|
+
load_to_dir: Optional[Union[str, Path]] = None,
|
99
97
|
) -> None:
|
100
|
-
"""Loads are artifacts into the
|
101
|
-
into remote_save_path. If an artifact value is None it
|
98
|
+
"""Loads the artifacts into the load_to_dir directory. If load_to_dir is None,
|
99
|
+
it will load into remote_save_path directory. If an artifact value is None it
|
100
|
+
will skip loading it.
|
102
101
|
|
103
102
|
Parameters:
|
104
|
-
artifacts_path (Union[str, Path]): The file path to load the artifacts from
|
103
|
+
artifacts_path (Union[str, Path]): The file path to load the artifacts from.
|
104
|
+
If you are in the remote environment this would be remote_save_path, if
|
105
|
+
you are in the local environment this would be local_save_path.
|
106
|
+
load_to_dir (Optional[Union[str, Path]]): The directory to load the artifacts
|
107
|
+
into. If None, it will load into remote_save_path directory.
|
105
108
|
"""
|
106
109
|
with open(artifacts_path, "rb") as f:
|
107
110
|
self.artifacts = pkl.load(f)
|
108
111
|
|
109
|
-
|
112
|
+
load_to_dir = (
|
113
|
+
self.remote_save_path.parent if load_to_dir is None else Path(load_to_dir)
|
114
|
+
)
|
110
115
|
|
111
116
|
for k, v in self.artifacts.items():
|
112
117
|
if v is not None:
|
113
118
|
mode = "w" if isinstance(v, str) else "wb"
|
114
|
-
with open(
|
119
|
+
with open(load_to_dir / k, mode) as f:
|
115
120
|
f.write(v)
|
116
121
|
|
117
|
-
def show(self,
|
118
|
-
"""
|
122
|
+
def show(self, uploaded_file_dir: Optional[Union[str, Path]] = None) -> str:
|
123
|
+
"""Prints out the artifacts and the directory they have been loaded to. If you
|
124
|
+
pass in uploaded_file_dir, it will show the artifacts have been loaded to the
|
125
|
+
uploaded_file_dir directory. If you don't pass in uploaded_file_dir, it will show
|
126
|
+
the artifacts have been loaded to the remote_save_path directory.
|
127
|
+
|
128
|
+
Parameters:
|
129
|
+
uploaded_file_dir (Optional[Union[str, Path]]): The directory the artifacts
|
130
|
+
have been loaded to.
|
131
|
+
"""
|
119
132
|
loaded_path = (
|
120
|
-
Path(
|
121
|
-
if
|
122
|
-
else self.remote_save_path
|
133
|
+
Path(uploaded_file_dir)
|
134
|
+
if uploaded_file_dir is not None
|
135
|
+
else self.remote_save_path.parent
|
123
136
|
)
|
124
137
|
output_str = "[Artifacts loaded]\n"
|
125
138
|
for k in self.artifacts.keys():
|
@@ -131,9 +144,10 @@ class Artifacts:
|
|
131
144
|
return output_str
|
132
145
|
|
133
146
|
def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
|
134
|
-
|
135
|
-
|
136
|
-
|
147
|
+
"""Saves the artifacts to local_path. If local_path is None, it will save to
|
148
|
+
local_save_path instead.
|
149
|
+
"""
|
150
|
+
save_path = Path(local_path) if local_path is not None else self.local_save_path
|
137
151
|
with open(save_path, "wb") as f:
|
138
152
|
pkl.dump(self.artifacts, f)
|
139
153
|
|
@@ -150,6 +164,38 @@ class Artifacts:
|
|
150
164
|
return name in self.artifacts
|
151
165
|
|
152
166
|
|
167
|
+
def filter_file(file_name: Union[str, Path]) -> Tuple[bool, bool]:
|
168
|
+
file_name_p = Path(file_name)
|
169
|
+
return (
|
170
|
+
file_name_p.is_file()
|
171
|
+
and "__pycache__" not in str(file_name_p)
|
172
|
+
and not file_name_p.name.startswith(".")
|
173
|
+
and file_name_p.suffix
|
174
|
+
in [".png", ".jpeg", ".jpg", ".mp4", ".txt", ".json", ".csv"]
|
175
|
+
), file_name_p.suffix in [".png", ".jpeg", ".jpg", ".mp4"]
|
176
|
+
|
177
|
+
|
178
|
+
def capture_files_into_artifacts(artifacts: Artifacts) -> None:
|
179
|
+
"""This function is used to capture all files in the current directory into an
|
180
|
+
artifact object. This is useful if you want to capture all files in the current
|
181
|
+
directory and use them in a different environment where you don't have access to
|
182
|
+
the file system.
|
183
|
+
|
184
|
+
Parameters:
|
185
|
+
artifacts (Artifacts): The artifacts object to save the files to.
|
186
|
+
"""
|
187
|
+
for file in Path(".").glob("**/*"):
|
188
|
+
usable_file, is_media = filter_file(file)
|
189
|
+
mode = "rb" if is_media else "r"
|
190
|
+
if usable_file:
|
191
|
+
file_name = file.name
|
192
|
+
if file_name.startswith(str(Path(artifacts.remote_save_path).parents)):
|
193
|
+
idx = len(Path(artifacts.remote_save_path).parents)
|
194
|
+
file_name = file_name[idx:]
|
195
|
+
with open(file, mode) as f:
|
196
|
+
artifacts[file_name] = f.read()
|
197
|
+
|
198
|
+
|
153
199
|
# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
|
154
200
|
|
155
201
|
|
@@ -174,9 +220,9 @@ def view_lines(
|
|
174
220
|
f"[Artifact: {name} ({total_lines} lines total)]\n"
|
175
221
|
+ format_lines(lines[start:end], start)
|
176
222
|
+ (
|
177
|
-
"[End of artifact]"
|
223
|
+
"\n[End of artifact]"
|
178
224
|
if end == len(lines)
|
179
|
-
else f"[{len(lines) - end} more lines]"
|
225
|
+
else f"\n[{len(lines) - end} more lines]"
|
180
226
|
)
|
181
227
|
)
|
182
228
|
|
@@ -256,8 +302,10 @@ def edit_code_artifact(
|
|
256
302
|
Parameters:
|
257
303
|
artifacts (Artifacts): The artifacts object to edit the artifact from.
|
258
304
|
name (str): The name of the artifact to edit.
|
259
|
-
start (int): The line number to start the edit
|
260
|
-
|
305
|
+
start (int): The line number to start the edit, can be in [-1, total_lines]
|
306
|
+
where -1 represents the end of the file.
|
307
|
+
end (int): The line number to end the edit, can be in [-1, total_lines] where
|
308
|
+
-1 represents the end of the file.
|
261
309
|
content (str): The content to insert.
|
262
310
|
"""
|
263
311
|
# just make the artifact if it doesn't exist instead of forcing agent to call
|
@@ -266,17 +314,21 @@ def edit_code_artifact(
|
|
266
314
|
artifacts[name] = ""
|
267
315
|
|
268
316
|
total_lines = len(artifacts[name].splitlines())
|
317
|
+
if start == -1:
|
318
|
+
start = total_lines
|
319
|
+
if end == -1:
|
320
|
+
end = total_lines
|
321
|
+
|
269
322
|
if start < 0 or end < 0 or start > end or end > total_lines:
|
270
323
|
print("[Invalid line range]")
|
271
324
|
return "[Invalid line range]"
|
272
|
-
if start == end:
|
273
|
-
end += 1
|
274
325
|
|
275
326
|
new_content_lines = content.splitlines(keepends=True)
|
276
327
|
new_content_lines = [
|
277
328
|
line if line.endswith("\n") else line + "\n" for line in new_content_lines
|
278
329
|
]
|
279
330
|
lines = artifacts[name].splitlines(keepends=True)
|
331
|
+
lines = [line if line.endswith("\n") else line + "\n" for line in lines]
|
280
332
|
edited_lines = lines[:start] + new_content_lines + lines[end:]
|
281
333
|
|
282
334
|
cur_line = start + len(content.split("\n")) // 2
|
@@ -371,14 +423,16 @@ def generate_vision_plan(
|
|
371
423
|
[End Plan Context]
|
372
424
|
"""
|
373
425
|
|
426
|
+
# verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
|
374
427
|
if ZMQ_PORT is not None:
|
375
428
|
agent = va.agent.VisionAgentPlanner(
|
376
429
|
report_progress_callback=lambda inp: report_progress_callback(
|
377
430
|
int(ZMQ_PORT), inp
|
378
|
-
)
|
431
|
+
),
|
432
|
+
verbosity=0,
|
379
433
|
)
|
380
434
|
else:
|
381
|
-
agent = va.agent.VisionAgentPlanner()
|
435
|
+
agent = va.agent.VisionAgentPlanner(verbosity=0)
|
382
436
|
|
383
437
|
fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
|
384
438
|
response = agent.generate_plan(
|
@@ -435,14 +489,16 @@ def generate_vision_code(
|
|
435
489
|
dogs = owl_v2("dog", image)
|
436
490
|
return dogs
|
437
491
|
"""
|
492
|
+
# verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
|
438
493
|
if ZMQ_PORT is not None:
|
439
494
|
agent = va.agent.VisionAgentCoder(
|
440
495
|
report_progress_callback=lambda inp: report_progress_callback(
|
441
496
|
int(ZMQ_PORT), inp
|
442
|
-
)
|
497
|
+
),
|
498
|
+
verbosity=0,
|
443
499
|
)
|
444
500
|
else:
|
445
|
-
agent = va.agent.VisionAgentCoder(verbosity=
|
501
|
+
agent = va.agent.VisionAgentCoder(verbosity=0)
|
446
502
|
|
447
503
|
fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
|
448
504
|
response = agent.generate_code(
|
@@ -506,7 +562,8 @@ def edit_vision_code(
|
|
506
562
|
return dogs
|
507
563
|
"""
|
508
564
|
|
509
|
-
|
565
|
+
# verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
|
566
|
+
agent = va.agent.VisionAgentCoder(verbosity=0)
|
510
567
|
if name not in artifacts:
|
511
568
|
print(f"[Artifact {name} does not exist]")
|
512
569
|
return f"[Artifact {name} does not exist]"
|
@@ -570,8 +627,9 @@ def check_and_load_image(code: str) -> List[str]:
|
|
570
627
|
|
571
628
|
|
572
629
|
def view_media_artifact(artifacts: Artifacts, name: str) -> str:
|
573
|
-
"""Allows
|
574
|
-
|
630
|
+
"""Allows only the agent to view the media artifact with the given name. DO NOT use
|
631
|
+
this to show media to the user, the user can already see all media saved in the
|
632
|
+
artifacts.
|
575
633
|
|
576
634
|
Parameters:
|
577
635
|
artifacts (Artifacts): The artifacts object to show the image from.
|
@@ -648,10 +706,10 @@ def get_diff_with_prompts(name: str, before: str, after: str) -> str:
|
|
648
706
|
|
649
707
|
|
650
708
|
def use_extra_vision_agent_args(
|
651
|
-
code: str,
|
709
|
+
code: Optional[str],
|
652
710
|
test_multi_plan: bool = True,
|
653
711
|
custom_tool_names: Optional[List[str]] = None,
|
654
|
-
) -> str:
|
712
|
+
) -> Optional[str]:
|
655
713
|
"""This is for forcing arguments passed by the user to VisionAgent into the
|
656
714
|
VisionAgentCoder call.
|
657
715
|
|
@@ -663,6 +721,8 @@ def use_extra_vision_agent_args(
|
|
663
721
|
Returns:
|
664
722
|
Optional[str]: The edited code, or None if code is None.
|
665
723
|
"""
|
724
|
+
if code is None:
|
725
|
+
return None
|
666
726
|
|
667
727
|
class VisionAgentTransformer(cst.CSTTransformer):
|
668
728
|
def __init__(
|
@@ -815,74 +875,12 @@ def use_object_detection_fine_tuning(
|
|
815
875
|
return diff
|
816
876
|
|
817
877
|
|
818
|
-
def extract_and_save_files_to_artifacts(
|
819
|
-
artifacts: Artifacts, code: str, obs: str
|
820
|
-
) -> None:
|
821
|
-
"""Extracts and saves files used in the code to the artifacts object.
|
822
|
-
|
823
|
-
Parameters:
|
824
|
-
artifacts (Artifacts): The artifacts object to save the files to.
|
825
|
-
code (str): The code to extract the files from.
|
826
|
-
"""
|
827
|
-
try:
|
828
|
-
response = extract_json(
|
829
|
-
AnthropicLMM()( # type: ignore
|
830
|
-
f"""You are a helpful AI assistant. Your job is to look at a snippet of code and the output of running that code and return the file paths that are being saved in the file. Below is the code snippet:
|
831
|
-
|
832
|
-
```python
|
833
|
-
{code}
|
834
|
-
```
|
835
|
-
|
836
|
-
```output
|
837
|
-
{obs}
|
838
|
-
```
|
839
|
-
|
840
|
-
Return the file paths in the following JSON format:
|
841
|
-
{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
|
842
|
-
)
|
843
|
-
)
|
844
|
-
except json.JSONDecodeError:
|
845
|
-
return
|
846
|
-
|
847
|
-
text_file_ext = [
|
848
|
-
".txt",
|
849
|
-
".md",
|
850
|
-
"rtf",
|
851
|
-
".html",
|
852
|
-
".htm",
|
853
|
-
"xml",
|
854
|
-
".json",
|
855
|
-
".csv",
|
856
|
-
".tsv",
|
857
|
-
".yaml",
|
858
|
-
".yml",
|
859
|
-
".toml",
|
860
|
-
".conf",
|
861
|
-
".env" ".ini",
|
862
|
-
".log",
|
863
|
-
".py",
|
864
|
-
".java",
|
865
|
-
".js",
|
866
|
-
".cpp",
|
867
|
-
".c" ".sql",
|
868
|
-
".sh",
|
869
|
-
]
|
870
|
-
|
871
|
-
if "file_paths" in response and isinstance(response["file_paths"], list):
|
872
|
-
for file_path in response["file_paths"]:
|
873
|
-
read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb"
|
874
|
-
if Path(file_path).is_file():
|
875
|
-
with open(file_path, read_mode) as f:
|
876
|
-
artifacts[Path(file_path).name] = f.read()
|
877
|
-
|
878
|
-
|
879
878
|
META_TOOL_DOCSTRING = get_tool_documentation(
|
880
879
|
[
|
881
880
|
get_tool_descriptions,
|
882
881
|
open_code_artifact,
|
883
882
|
create_code_artifact,
|
884
883
|
edit_code_artifact,
|
885
|
-
generate_vision_plan,
|
886
884
|
generate_vision_code,
|
887
885
|
edit_vision_code,
|
888
886
|
view_media_artifact,
|
vision_agent/utils/execute.py
CHANGED
@@ -575,6 +575,7 @@ class LocalCodeInterpreter(CodeInterpreter):
|
|
575
575
|
super().__init__(timeout=timeout)
|
576
576
|
self.nb = nbformat.v4.new_notebook()
|
577
577
|
# Set the notebook execution path to the remote path
|
578
|
+
self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
|
578
579
|
self.resources = {"metadata": {"path": str(self.remote_path)}}
|
579
580
|
self.nb_client = NotebookClient(
|
580
581
|
self.nb,
|
@@ -591,7 +592,6 @@ Timeout: {self.timeout}"""
|
|
591
592
|
)
|
592
593
|
sleep(1)
|
593
594
|
self._new_kernel()
|
594
|
-
self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
|
595
595
|
|
596
596
|
def _new_kernel(self) -> None:
|
597
597
|
if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)(): # type: ignore
|
@@ -659,7 +659,7 @@ Timeout: {self.timeout}"""
|
|
659
659
|
def download_file(
|
660
660
|
self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
|
661
661
|
) -> Path:
|
662
|
-
with open(self.remote_path / remote_file_path, "rb") as f:
|
662
|
+
with open(self.remote_path / Path(remote_file_path).name, "rb") as f:
|
663
663
|
contents = f.read()
|
664
664
|
with open(local_file_path, "wb") as f:
|
665
665
|
f.write(contents)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.176
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -53,25 +53,25 @@ Description-Content-Type: text/markdown
|
|
53
53
|
![version](https://img.shields.io/pypi/pyversions/vision-agent)
|
54
54
|
</div>
|
55
55
|
|
56
|
-
|
56
|
+
VisionAgent is a library that helps you utilize agent frameworks to generate code to
|
57
57
|
solve your vision task. Many current vision problems can easily take hours or days to
|
58
58
|
solve, you need to find the right model, figure out how to use it and program it to
|
59
|
-
accomplish the task you want.
|
59
|
+
accomplish the task you want. VisionAgent aims to provide an in-seconds experience by
|
60
60
|
allowing users to describe their problem in text and have the agent framework generate
|
61
61
|
code to solve the task for them. Check out our discord for updates and roadmaps!
|
62
62
|
|
63
63
|
## Table of Contents
|
64
64
|
- [🚀Quick Start](#quick-start)
|
65
65
|
- [📚Documentation](#documentation)
|
66
|
-
- [🔍🤖
|
66
|
+
- [🔍🤖VisionAgent](#vision-agent-basic-usage)
|
67
67
|
- [🛠️Tools](#tools)
|
68
68
|
- [🤖LMMs](#lmms)
|
69
|
-
- [💻🤖
|
69
|
+
- [💻🤖VisionAgent Coder](#vision-agent-coder)
|
70
70
|
- [🏗️Additional Backends](#additional-backends)
|
71
71
|
|
72
72
|
## Quick Start
|
73
73
|
### Web Application
|
74
|
-
The fastest way to test out
|
74
|
+
The fastest way to test out VisionAgent is to use our web application. You can find it
|
75
75
|
[here](https://va.landing.ai/).
|
76
76
|
|
77
77
|
|
@@ -108,9 +108,9 @@ be images or video files.
|
|
108
108
|
|
109
109
|
## Documentation
|
110
110
|
|
111
|
-
[
|
111
|
+
[VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
|
112
112
|
|
113
|
-
##
|
113
|
+
## VisionAgent Basic Usage
|
114
114
|
### Chatting and Message Formats
|
115
115
|
`VisionAgent` is an agent that can chat with you and call other tools or agents to
|
116
116
|
write vision code for you. You can interact with it like you would ChatGPT or any other
|
@@ -292,7 +292,7 @@ response = lmm(
|
|
292
292
|
)
|
293
293
|
```
|
294
294
|
|
295
|
-
##
|
295
|
+
## VisionAgent Coder
|
296
296
|
Underneath the hood, `VisionAgent` uses `VisionAgentCoder` to generate code to solve
|
297
297
|
vision tasks. You can use `VisionAgentCoder` directly to generate code if you want:
|
298
298
|
|
@@ -488,7 +488,7 @@ export AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME="your_embedding_model_deploy
|
|
488
488
|
|
489
489
|
> NOTE: make sure your Azure model deployment has enough quota (tokens per minute) to support it. The default value of 8000 TPM is not enough.
|
490
490
|
|
491
|
-
You can then run
|
491
|
+
You can then run VisionAgent using the Azure OpenAI models:
|
492
492
|
|
493
493
|
```python
|
494
494
|
import vision_agent as va
|
@@ -1,13 +1,13 @@
|
|
1
1
|
vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
2
2
|
vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
|
3
3
|
vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
|
4
|
-
vision_agent/agent/agent_utils.py,sha256=
|
5
|
-
vision_agent/agent/vision_agent.py,sha256=
|
4
|
+
vision_agent/agent/agent_utils.py,sha256=WYJF11PfKXlRMPnogGz3s7c2TlWoxoGzuLiIptVYE1s,5524
|
5
|
+
vision_agent/agent/vision_agent.py,sha256=x0-TElnTRW7abyq2wAwKRiTUExBGg24C-c74wO1oKtI,26336
|
6
6
|
vision_agent/agent/vision_agent_coder.py,sha256=3Q1VWrN-BNUoSD4OAqKazvXkP2c04PXDYu2Z1f5dQb0,31960
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
|
8
8
|
vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
|
9
9
|
vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
|
10
|
-
vision_agent/agent/vision_agent_prompts.py,sha256=
|
10
|
+
vision_agent/agent/vision_agent_prompts.py,sha256=4329ll0kqCznRALIMl-rlKWGjN92p3bcRrz8R-cO744,13748
|
11
11
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
|
13
13
|
vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
|
@@ -17,19 +17,19 @@ vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,
|
|
17
17
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
18
18
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
19
19
|
vision_agent/tools/__init__.py,sha256=u-vS5iORB4ccvxoAjbtpvhTALDhXGilcATIq1_eZhKo,2332
|
20
|
-
vision_agent/tools/meta_tools.py,sha256=
|
20
|
+
vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
23
|
vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
|
24
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
25
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
26
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
27
|
-
vision_agent/utils/execute.py,sha256=
|
27
|
+
vision_agent/utils/execute.py,sha256=2sIQn45llOENMyrKu3TPINVRLLbOvvZ6SVHFCB9MQUo,28028
|
28
28
|
vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
31
|
vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
32
|
+
vision_agent-0.2.176.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.176.dist-info/METADATA,sha256=W86hwl87xkxa59Hn9rw8_FQGiMW3JncY9WQAmC5jxs0,18330
|
34
|
+
vision_agent-0.2.176.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.176.dist-info/RECORD,,
|
File without changes
|
File without changes
|