vision-agent 0.2.173__py3-none-any.whl → 0.2.175__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/agent_utils.py +3 -1
- vision_agent/agent/vision_agent.py +63 -68
- vision_agent/agent/vision_agent_coder.py +101 -31
- vision_agent/agent/vision_agent_prompts.py +75 -14
- vision_agent/tools/meta_tools.py +141 -111
- vision_agent/utils/execute.py +2 -2
- {vision_agent-0.2.173.dist-info → vision_agent-0.2.175.dist-info}/METADATA +2 -2
- {vision_agent-0.2.173.dist-info → vision_agent-0.2.175.dist-info}/RECORD +10 -10
- {vision_agent-0.2.173.dist-info → vision_agent-0.2.175.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.173.dist-info → vision_agent-0.2.175.dist-info}/WHEEL +0 -0
@@ -72,7 +72,9 @@ def extract_json(json_str: str) -> Dict[str, Any]:
|
|
72
72
|
if json_dict is None:
|
73
73
|
error_msg = f"Could not extract JSON from the given str: {json_orig}"
|
74
74
|
_LOGGER.exception(error_msg)
|
75
|
-
raise
|
75
|
+
raise json.JSONDecodeError(
|
76
|
+
msg="Could not extract JSON", doc=json_orig, pos=0
|
77
|
+
)
|
76
78
|
|
77
79
|
return json_dict
|
78
80
|
|
@@ -2,7 +2,6 @@ import copy
|
|
2
2
|
import json
|
3
3
|
import logging
|
4
4
|
import os
|
5
|
-
import tempfile
|
6
5
|
from pathlib import Path
|
7
6
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
8
7
|
|
@@ -12,6 +11,7 @@ from vision_agent.agent.vision_agent_prompts import (
|
|
12
11
|
EXAMPLES_CODE1,
|
13
12
|
EXAMPLES_CODE2,
|
14
13
|
EXAMPLES_CODE3,
|
14
|
+
EXAMPLES_CODE3_EXTRA2,
|
15
15
|
VA_CODE,
|
16
16
|
)
|
17
17
|
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
|
@@ -19,7 +19,6 @@ from vision_agent.tools.meta_tools import (
|
|
19
19
|
META_TOOL_DOCSTRING,
|
20
20
|
Artifacts,
|
21
21
|
check_and_load_image,
|
22
|
-
extract_and_save_files_to_artifacts,
|
23
22
|
use_extra_vision_agent_args,
|
24
23
|
)
|
25
24
|
from vision_agent.utils import CodeInterpreterFactory
|
@@ -37,11 +36,12 @@ class BoilerplateCode:
|
|
37
36
|
pre_code = [
|
38
37
|
"from typing import *",
|
39
38
|
"from vision_agent.utils.execute import CodeInterpreter",
|
40
|
-
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
|
41
|
-
"artifacts = Artifacts('{remote_path}')",
|
39
|
+
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, list_artifacts, capture_files_into_artifacts",
|
40
|
+
"artifacts = Artifacts('{remote_path}', '{remote_path}')",
|
42
41
|
"artifacts.load('{remote_path}')",
|
43
42
|
]
|
44
43
|
post_code = [
|
44
|
+
"capture_files_into_artifacts(artifacts)",
|
45
45
|
"artifacts.save()",
|
46
46
|
]
|
47
47
|
|
@@ -97,8 +97,9 @@ def _clean_response(response: str) -> str:
|
|
97
97
|
def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
98
98
|
chat = copy.deepcopy(chat)
|
99
99
|
|
100
|
+
# only add 10 most recent messages in the chat to not go over token limit
|
100
101
|
conversation = ""
|
101
|
-
for chat_i in chat:
|
102
|
+
for chat_i in chat[-10:]:
|
102
103
|
if chat_i["role"] == "user":
|
103
104
|
conversation += f"USER: {chat_i['content']}\n\n"
|
104
105
|
elif chat_i["role"] == "observation":
|
@@ -110,7 +111,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
|
110
111
|
|
111
112
|
prompt = VA_CODE.format(
|
112
113
|
documentation=META_TOOL_DOCSTRING,
|
113
|
-
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
|
114
|
+
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}\n{EXAMPLES_CODE3_EXTRA2}",
|
114
115
|
conversation=conversation,
|
115
116
|
)
|
116
117
|
message: Message = {"role": "user", "content": prompt}
|
@@ -120,7 +121,9 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
|
120
121
|
and "media" in chat[-1]
|
121
122
|
and len(chat[-1]["media"]) > 0 # type: ignore
|
122
123
|
):
|
123
|
-
|
124
|
+
media_obs = [media for media in chat[-1]["media"] if Path(media).exists()] # type: ignore
|
125
|
+
if len(media_obs) > 0:
|
126
|
+
message["media"] = media_obs # type: ignore
|
124
127
|
conv_resp = cast(str, orch([message], stream=False))
|
125
128
|
|
126
129
|
# clean the response first, if we are executing code, do not respond or end
|
@@ -144,16 +147,16 @@ def execute_code_action(
|
|
144
147
|
artifacts: Artifacts,
|
145
148
|
code: str,
|
146
149
|
code_interpreter: CodeInterpreter,
|
147
|
-
artifact_remote_path: str,
|
148
150
|
) -> Tuple[Execution, str]:
|
149
151
|
result = code_interpreter.exec_isolation(
|
150
|
-
BoilerplateCode.add_boilerplate(
|
152
|
+
BoilerplateCode.add_boilerplate(
|
153
|
+
code, remote_path=str(artifacts.remote_save_path)
|
154
|
+
)
|
151
155
|
)
|
152
156
|
|
153
157
|
obs = str(result.logs)
|
154
158
|
if result.error:
|
155
159
|
obs += f"\n{result.error}"
|
156
|
-
extract_and_save_files_to_artifacts(artifacts, code, obs)
|
157
160
|
return result, obs
|
158
161
|
|
159
162
|
|
@@ -161,7 +164,6 @@ def execute_user_code_action(
|
|
161
164
|
artifacts: Artifacts,
|
162
165
|
last_user_message: Message,
|
163
166
|
code_interpreter: CodeInterpreter,
|
164
|
-
artifact_remote_path: str,
|
165
167
|
) -> Tuple[Optional[Execution], Optional[str]]:
|
166
168
|
user_result = None
|
167
169
|
user_obs = None
|
@@ -178,11 +180,10 @@ def execute_user_code_action(
|
|
178
180
|
if user_code_action is not None:
|
179
181
|
user_code_action = use_extra_vision_agent_args(user_code_action, False)
|
180
182
|
user_result, user_obs = execute_code_action(
|
181
|
-
artifacts, user_code_action, code_interpreter
|
183
|
+
artifacts, user_code_action, code_interpreter
|
182
184
|
)
|
183
185
|
if user_result.error:
|
184
186
|
user_obs += f"\n{user_result.error}"
|
185
|
-
extract_and_save_files_to_artifacts(artifacts, user_code_action, user_obs)
|
186
187
|
return user_result, user_obs
|
187
188
|
|
188
189
|
|
@@ -231,9 +232,18 @@ def old_format_to_new_format(old_format_str: str) -> str:
|
|
231
232
|
except json.JSONDecodeError:
|
232
233
|
return old_format_str
|
233
234
|
|
234
|
-
|
235
|
-
|
236
|
-
|
235
|
+
if "thoughts" in old_format:
|
236
|
+
thinking = (
|
237
|
+
old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
|
238
|
+
)
|
239
|
+
else:
|
240
|
+
thinking = None
|
241
|
+
|
242
|
+
let_user_respond = (
|
243
|
+
old_format["let_user_respond"] if "let_user_respond" in old_format else True
|
244
|
+
)
|
245
|
+
|
246
|
+
if "response" in old_format and "<execute_python>" in old_format["response"]:
|
237
247
|
execute_python = extract_tag(old_format["response"], "execute_python")
|
238
248
|
response = (
|
239
249
|
old_format["response"]
|
@@ -244,7 +254,7 @@ def old_format_to_new_format(old_format_str: str) -> str:
|
|
244
254
|
)
|
245
255
|
else:
|
246
256
|
execute_python = None
|
247
|
-
response = old_format["response"]
|
257
|
+
response = old_format["response"] if "response" in old_format else None
|
248
258
|
|
249
259
|
return json.dumps(
|
250
260
|
{
|
@@ -275,7 +285,6 @@ class VisionAgent(Agent):
|
|
275
285
|
self,
|
276
286
|
agent: Optional[LMM] = None,
|
277
287
|
verbosity: int = 0,
|
278
|
-
local_artifacts_path: Optional[Union[str, Path]] = None,
|
279
288
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
280
289
|
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
281
290
|
) -> None:
|
@@ -285,8 +294,6 @@ class VisionAgent(Agent):
|
|
285
294
|
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
286
295
|
of other agents.
|
287
296
|
verbosity (int): The verbosity level of the agent.
|
288
|
-
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
289
|
-
artifacts file.
|
290
297
|
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
291
298
|
function to send intermediate update messages.
|
292
299
|
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
@@ -302,14 +309,6 @@ class VisionAgent(Agent):
|
|
302
309
|
self.callback_message = callback_message
|
303
310
|
if self.verbosity >= 1:
|
304
311
|
_LOGGER.setLevel(logging.INFO)
|
305
|
-
self.local_artifacts_path = cast(
|
306
|
-
str,
|
307
|
-
(
|
308
|
-
Path(local_artifacts_path)
|
309
|
-
if local_artifacts_path is not None
|
310
|
-
else Path(tempfile.NamedTemporaryFile(delete=False).name)
|
311
|
-
),
|
312
|
-
)
|
313
312
|
|
314
313
|
def __call__(
|
315
314
|
self,
|
@@ -386,7 +385,7 @@ class VisionAgent(Agent):
|
|
386
385
|
|
387
386
|
if not artifacts:
|
388
387
|
# this is setting remote artifacts path
|
389
|
-
artifacts = Artifacts(
|
388
|
+
artifacts = Artifacts("", "")
|
390
389
|
|
391
390
|
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
392
391
|
code_interpreter = (
|
@@ -395,8 +394,15 @@ class VisionAgent(Agent):
|
|
395
394
|
and not isinstance(self.code_interpreter, str)
|
396
395
|
else CodeInterpreterFactory.new_instance(
|
397
396
|
code_sandbox_runtime=self.code_interpreter,
|
397
|
+
remote_path=artifacts.remote_save_path.parent,
|
398
398
|
)
|
399
399
|
)
|
400
|
+
|
401
|
+
if code_interpreter.remote_path != artifacts.remote_save_path.parent:
|
402
|
+
raise ValueError(
|
403
|
+
f"Code interpreter remote path {code_interpreter.remote_path} does not match {artifacts.remote_save_path.parent}"
|
404
|
+
)
|
405
|
+
|
400
406
|
with code_interpreter:
|
401
407
|
orig_chat = copy.deepcopy(chat)
|
402
408
|
int_chat = copy.deepcopy(chat)
|
@@ -436,15 +442,13 @@ class VisionAgent(Agent):
|
|
436
442
|
|
437
443
|
# Save the current state of artifacts, will include any images the user
|
438
444
|
# passed in.
|
439
|
-
artifacts.save(
|
445
|
+
artifacts.save()
|
440
446
|
|
441
447
|
# Upload artifacts to remote location and show where they are going
|
442
448
|
# to be loaded to. The actual loading happens in BoilerplateCode as
|
443
449
|
# part of the pre_code.
|
444
|
-
|
445
|
-
|
446
|
-
)
|
447
|
-
artifacts_loaded = artifacts.show(code_interpreter.remote_path)
|
450
|
+
code_interpreter.upload_file(artifacts.local_save_path)
|
451
|
+
artifacts_loaded = artifacts.show(artifacts.remote_save_path.parent)
|
448
452
|
int_chat.append({"role": "observation", "content": artifacts_loaded})
|
449
453
|
orig_chat.append({"role": "observation", "content": artifacts_loaded})
|
450
454
|
self.streaming_message({"role": "observation", "content": artifacts_loaded})
|
@@ -453,7 +457,6 @@ class VisionAgent(Agent):
|
|
453
457
|
artifacts,
|
454
458
|
last_user_message,
|
455
459
|
code_interpreter,
|
456
|
-
str(remote_artifacts_path),
|
457
460
|
)
|
458
461
|
finished = user_result is not None and user_obs is not None
|
459
462
|
if user_result is not None and user_obs is not None:
|
@@ -472,7 +475,16 @@ class VisionAgent(Agent):
|
|
472
475
|
)
|
473
476
|
|
474
477
|
while not finished and iterations < self.max_iterations:
|
478
|
+
# ensure we upload the artifacts before each turn, so any local
|
479
|
+
# modifications we made to it will be reflected in the remote
|
480
|
+
code_interpreter.upload_file(artifacts.local_save_path)
|
481
|
+
|
475
482
|
response = run_conversation(self.agent, int_chat)
|
483
|
+
code_action = use_extra_vision_agent_args(
|
484
|
+
response.get("execute_python", None),
|
485
|
+
test_multi_plan,
|
486
|
+
custom_tool_names,
|
487
|
+
)
|
476
488
|
if self.verbosity >= 1:
|
477
489
|
_LOGGER.info(response)
|
478
490
|
int_chat.append(
|
@@ -532,31 +544,20 @@ class VisionAgent(Agent):
|
|
532
544
|
artifacts,
|
533
545
|
code_action,
|
534
546
|
code_interpreter,
|
535
|
-
str(remote_artifacts_path),
|
536
547
|
)
|
537
|
-
|
538
|
-
media_obs = check_and_load_image(code_action)
|
539
|
-
|
540
|
-
if self.verbosity >= 1:
|
541
|
-
_LOGGER.info(obs)
|
542
|
-
|
543
548
|
obs_chat_elt: Message = {"role": "observation", "content": obs}
|
549
|
+
media_obs = check_and_load_image(code_action)
|
544
550
|
if media_obs and result.success:
|
545
|
-
#
|
546
|
-
#
|
547
|
-
code_interpreter.download_file(
|
548
|
-
str(remote_artifacts_path.name),
|
549
|
-
str(self.local_artifacts_path),
|
550
|
-
)
|
551
|
-
artifacts.load(
|
552
|
-
self.local_artifacts_path,
|
553
|
-
Path(self.local_artifacts_path).parent,
|
554
|
-
)
|
551
|
+
# media paths will be under the local_save_path when we download
|
552
|
+
# them after each turn
|
555
553
|
obs_chat_elt["media"] = [
|
556
|
-
|
554
|
+
artifacts.local_save_path.parent / media_ob
|
557
555
|
for media_ob in media_obs
|
558
556
|
]
|
559
557
|
|
558
|
+
if self.verbosity >= 1:
|
559
|
+
_LOGGER.info(obs)
|
560
|
+
|
560
561
|
# don't add execution results to internal chat
|
561
562
|
int_chat.append(obs_chat_elt)
|
562
563
|
obs_chat_elt["execution"] = result
|
@@ -573,13 +574,15 @@ class VisionAgent(Agent):
|
|
573
574
|
iterations += 1
|
574
575
|
last_response = response
|
575
576
|
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
577
|
+
# after each turn, download the artifacts locally
|
578
|
+
code_interpreter.download_file(
|
579
|
+
str(artifacts.remote_save_path.name),
|
580
|
+
str(artifacts.local_save_path),
|
581
|
+
)
|
582
|
+
artifacts.load(
|
583
|
+
artifacts.local_save_path, artifacts.local_save_path.parent
|
584
|
+
)
|
585
|
+
|
583
586
|
return orig_chat, artifacts
|
584
587
|
|
585
588
|
def streaming_message(self, message: Dict[str, Any]) -> None:
|
@@ -595,7 +598,6 @@ class OpenAIVisionAgent(VisionAgent):
|
|
595
598
|
self,
|
596
599
|
agent: Optional[LMM] = None,
|
597
600
|
verbosity: int = 0,
|
598
|
-
local_artifacts_path: Optional[Union[str, Path]] = None,
|
599
601
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
600
602
|
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
601
603
|
) -> None:
|
@@ -605,8 +607,6 @@ class OpenAIVisionAgent(VisionAgent):
|
|
605
607
|
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
606
608
|
of other agents.
|
607
609
|
verbosity (int): The verbosity level of the agent.
|
608
|
-
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
609
|
-
artifacts file.
|
610
610
|
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
611
611
|
function to send intermediate update messages.
|
612
612
|
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
@@ -619,7 +619,6 @@ class OpenAIVisionAgent(VisionAgent):
|
|
619
619
|
super().__init__(
|
620
620
|
agent,
|
621
621
|
verbosity,
|
622
|
-
local_artifacts_path,
|
623
622
|
callback_message,
|
624
623
|
code_interpreter,
|
625
624
|
)
|
@@ -630,7 +629,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
630
629
|
self,
|
631
630
|
agent: Optional[LMM] = None,
|
632
631
|
verbosity: int = 0,
|
633
|
-
local_artifacts_path: Optional[Union[str, Path]] = None,
|
634
632
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
635
633
|
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
636
634
|
) -> None:
|
@@ -640,8 +638,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
640
638
|
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
641
639
|
of other agents.
|
642
640
|
verbosity (int): The verbosity level of the agent.
|
643
|
-
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
644
|
-
artifacts file.
|
645
641
|
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
646
642
|
function to send intermediate update messages.
|
647
643
|
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
@@ -654,7 +650,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
654
650
|
super().__init__(
|
655
651
|
agent,
|
656
652
|
verbosity,
|
657
|
-
local_artifacts_path,
|
658
653
|
callback_message,
|
659
654
|
code_interpreter,
|
660
655
|
)
|
@@ -5,7 +5,7 @@ import sys
|
|
5
5
|
from pathlib import Path
|
6
6
|
from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
|
7
7
|
|
8
|
-
|
8
|
+
import libcst as cst
|
9
9
|
from tabulate import tabulate
|
10
10
|
|
11
11
|
import vision_agent.tools as T
|
@@ -49,42 +49,112 @@ WORKSPACE = Path(os.getenv("WORKSPACE", ""))
|
|
49
49
|
_LOGGER = logging.getLogger(__name__)
|
50
50
|
|
51
51
|
|
52
|
-
def strip_function_calls(
|
52
|
+
def strip_function_calls(  # noqa: C901
    code: str, exclusions: Optional[List[str]] = None
) -> str:
    """This will strip out all code that calls functions except for functions included
    in exclusions.

    Only statements outside of any function or class definition are considered:
    expression statements and assignments whose value is a call are removed,
    unless the callee is a bare name listed in ``exclusions``. Everything nested
    inside a ``def`` or ``class`` body is left untouched.

    Parameters:
        code (str): The Python source to transform. It is parsed with
            ``cst.parse_module``.
        exclusions (Optional[List[str]]): Names of functions whose calls are kept.
            ``None`` is treated as an empty list.

    Returns:
        str: The source code of the transformed module.
    """
    if exclusions is None:
        exclusions = []

    def check_and_remove_node(node: cst.CSTNode, exclusions: List[str]) -> cst.CSTNode:
        """Return ``node`` unchanged, or a removal sentinel when its ``value`` is a
        call to a function that is not excluded."""
        if hasattr(node, "value") and isinstance(node.value, cst.Call):
            if (
                isinstance(node.value.func, cst.Name)
                and node.value.func.value in exclusions
            ):
                return node
            return cst.RemoveFromParent()  # type: ignore
        return node

    class StripFunctionCallsTransformer(cst.CSTTransformer):
        def __init__(self, exclusions: List[str]):
            super().__init__()
            # Store exclusions to skip removing certain function calls
            self.exclusions = exclusions
            # Number of function/class definitions currently open. A counter is
            # used instead of a single boolean so that leaving a nested
            # definition (e.g. a method inside a class) does not make the rest
            # of the enclosing body look like module-level code.
            self.scope_depth = 0

        @property
        def in_function_or_class(self) -> bool:
            """True while the traversal is inside any function or class body."""
            return self.scope_depth > 0

        def _strip_if_top_level(self, updated_node: cst.CSTNode) -> cst.CSTNode:
            """Apply ``check_and_remove_node`` only outside of definitions."""
            if not self.in_function_or_class:
                return check_and_remove_node(updated_node, self.exclusions)
            return updated_node

        def visit_FunctionDef(self, node: cst.FunctionDef) -> Optional[bool]:
            self.scope_depth += 1
            return True

        def leave_FunctionDef(
            self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef
        ) -> cst.BaseStatement:
            self.scope_depth -= 1
            return updated_node

        def visit_ClassDef(self, node: cst.ClassDef) -> Optional[bool]:
            self.scope_depth += 1
            return True

        def leave_ClassDef(
            self, node: cst.ClassDef, updated_node: cst.ClassDef
        ) -> cst.BaseStatement:
            self.scope_depth -= 1
            return updated_node

        def leave_Expr(
            self, original_node: cst.Expr, updated_node: cst.Expr
        ) -> cst.Expr:
            return cast(cst.Expr, self._strip_if_top_level(updated_node))

        def leave_Assign(
            self, original_node: cst.Assign, updated_node: cst.Assign
        ) -> cst.Assign:
            return cast(cst.Assign, self._strip_if_top_level(updated_node))

        # NOTE(review): the compound-statement handlers below go through the same
        # check, which only acts on nodes exposing a ``value`` attribute that is
        # a call. These node types do not appear to have such an attribute, so
        # they are presumably passed through unchanged - confirm against libcst.
        def leave_If(self, original_node: cst.If, updated_node: cst.If) -> cst.If:
            return cast(cst.If, self._strip_if_top_level(updated_node))

        def leave_For(self, original_node: cst.For, updated_node: cst.For) -> cst.For:
            return cast(cst.For, self._strip_if_top_level(updated_node))

        def leave_While(
            self, original_node: cst.While, updated_node: cst.While
        ) -> cst.While:
            return cast(cst.While, self._strip_if_top_level(updated_node))

        def leave_With(
            self, original_node: cst.With, updated_node: cst.With
        ) -> cst.With:
            return cast(cst.With, self._strip_if_top_level(updated_node))

        def leave_Try(self, original_node: cst.Try, updated_node: cst.Try) -> cst.Try:
            return cast(cst.Try, self._strip_if_top_level(updated_node))

    tree = cst.parse_module(code)
    transformer = StripFunctionCallsTransformer(exclusions)
    modified_tree = tree.visit(transformer)
    return modified_tree.code
|
88
158
|
|
89
159
|
|
90
160
|
def write_code(
|
@@ -1,7 +1,7 @@
|
|
1
1
|
VA_CODE = """
|
2
2
|
**Role**: You are a helpful agent that assists users with writing code.
|
3
3
|
|
4
|
-
**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <
|
4
|
+
**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be saved only AFTER you execute python code with `save_imgae` or `save_video`. The user can see all `artifacts`.
|
5
5
|
|
6
6
|
<execute_python>
|
7
7
|
print("Hello World!")
|
@@ -26,10 +26,11 @@ Here is the current conversation so far:
|
|
26
26
|
|
27
27
|
**Instructions**:
|
28
28
|
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
|
29
|
-
2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
|
29
|
+
2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. DO NOT run `edit_vision_code` or `edit_code_artifact` more than 2 times in a row and instead ask the user for help.
|
30
30
|
3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
|
31
|
-
4. **
|
32
|
-
5. **
|
31
|
+
4. **Artifacts**: Files are only saved in `artifacts` after <execute_python>, do not try to access artifacts until you observe that they are loaded.
|
32
|
+
5. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
|
33
|
+
6. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
|
33
34
|
|
34
35
|
<thinking>Your thoughts here...</thinking>
|
35
36
|
<response>Your response to the user here...</response>
|
@@ -53,7 +54,7 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect
|
|
53
54
|
<let_user_respond>false</let_user_respond>
|
54
55
|
|
55
56
|
OBSERVATION:
|
56
|
-
[Artifact dog_detector.py]
|
57
|
+
[Artifact dog_detector.py (5 lines total)]
|
57
58
|
0|from vision_agent.tools import load_image, owl_v2
|
58
59
|
1|def detect_dogs(image_path: str):
|
59
60
|
2| image = load_image(image_path)
|
@@ -94,7 +95,7 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do
|
|
94
95
|
<let_user_respond>false</let_user_respond>
|
95
96
|
|
96
97
|
OBSERVATION:
|
97
|
-
[Artifact dog_detector.py]
|
98
|
+
[Artifact dog_detector.py (5 lines total)]
|
98
99
|
0|from vision_agent.tools import load_image, owl_v2
|
99
100
|
1|def detect_dogs(image_path: str):
|
100
101
|
2| image = load_image(image_path)
|
@@ -142,7 +143,7 @@ AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will te
|
|
142
143
|
|
143
144
|
|
144
145
|
EXAMPLES_CODE3 = """
|
145
|
-
USER: Can you create a function to count workers with helmets, return the count and
|
146
|
+
USER: Can you create a function to count workers with helmets, return the count and a visaulization of the bounding boxes?
|
146
147
|
|
147
148
|
OBSERVATION:
|
148
149
|
[Artifacts loaded]
|
@@ -166,9 +167,9 @@ generate_vision_code(artifacts, 'code.py', 'Can you create a function to count w
|
|
166
167
|
<let_user_respond>false</let_user_respond>
|
167
168
|
|
168
169
|
OBSERVATION:
|
169
|
-
[Artifact code.py]
|
170
|
+
[Artifact code.py (17 lines total)]
|
170
171
|
0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
|
171
|
-
1|def count_workers_with_helmets(image_path: str
|
172
|
+
1|def count_workers_with_helmets(image_path: str):
|
172
173
|
2| image = load_image(image_path)
|
173
174
|
3| detections = florence2_phrase_grounding("worker, helmet", image)
|
174
175
|
4| workers = [d for d in detections if d['label'] == 'worker']
|
@@ -182,15 +183,16 @@ OBSERVATION:
|
|
182
183
|
11| break
|
183
184
|
11| if person_has_helmet:
|
184
185
|
13| count += 1
|
185
|
-
14| overlay_bounding_boxes(image, detections)
|
186
|
-
|
187
|
-
16| return count
|
186
|
+
14| viz = overlay_bounding_boxes(image, detections)
|
187
|
+
16| return count, viz
|
188
188
|
[End of artifact]
|
189
189
|
|
190
|
-
AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code and
|
190
|
+
AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code, print the counts and save the visualization.</thinking>
|
191
191
|
<execute_python>
|
192
192
|
from code import count_workers_with_helmets
|
193
|
-
|
193
|
+
count, viz = count_workers_with_helmets('/path/to/images/workers.png')
|
194
|
+
save_image(viz, 'workers_viz.png')
|
195
|
+
print(count)
|
194
196
|
</execute_python>
|
195
197
|
<let_user_respond>false</let_user_respond>
|
196
198
|
|
@@ -201,7 +203,9 @@ OBSERVATION:
|
|
201
203
|
AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
|
202
204
|
<response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
|
203
205
|
<let_user_respond>true</let_user_respond>
|
206
|
+
"""
|
204
207
|
|
208
|
+
EXAMPLES_CODE3_EXTRA1 = """
|
205
209
|
USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
|
206
210
|
|
207
211
|
AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
|
@@ -249,3 +253,60 @@ AGENT: <thinking>Three workers with helmets are detected, I will show this to th
|
|
249
253
|
<response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
|
250
254
|
<let_user_respond>true</let_user_respond>
|
251
255
|
"""
|
256
|
+
|
257
|
+
EXAMPLES_CODE3_EXTRA2 = """
|
258
|
+
USER: Can you write a streamlit app that allows me to upload an image and count the workers with helmets?
|
259
|
+
|
260
|
+
AGENT: <thinking>I will use edit_code_artifact to generate a streamlit app that allows the user to upload an image and count the workers with helmets.</thinking>
|
261
|
+
<execute_python>
|
262
|
+
CODE = '''
|
263
|
+
import streamlit as st
|
264
|
+
from code imoprt count_workers_with_helmets
|
265
|
+
import os
|
266
|
+
|
267
|
+
def main():
|
268
|
+
st.title("Worker with Helmet Counter")
|
269
|
+
uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
|
270
|
+
if uploaded_file is not None:
|
271
|
+
with open("temp_image.jpg", "wb") as f:
|
272
|
+
f.write(uploaded_file.read())
|
273
|
+
|
274
|
+
count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
|
275
|
+
st.image("temp_image_viz.jpg")
|
276
|
+
st.write(f"Number of workers with helmets: {count}")
|
277
|
+
os.remove("temp_image.jpg")
|
278
|
+
os.remove("temp_image_viz.jpg")
|
279
|
+
|
280
|
+
if __name__ == "__main__":
|
281
|
+
main()
|
282
|
+
'''
|
283
|
+
edit_code_artifact(artifacts, 'streamlit_app.py', CODE, 0, 0)
|
284
|
+
</execute_python>
|
285
|
+
|
286
|
+
OBSERVATION:
|
287
|
+
[Artifact streamlit_app.py (19 lines total)]
|
288
|
+
0|import streamlit as st
|
289
|
+
1|from code imoprt count_workers_with_helmets
|
290
|
+
2|import os
|
291
|
+
3|
|
292
|
+
4|def main():
|
293
|
+
5| st.title("Worker with Helmet Counter")
|
294
|
+
6| uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
|
295
|
+
7| if uploaded_file is not None:
|
296
|
+
8| with open("temp_image.jpg", "wb") as f:
|
297
|
+
9| f.write(uploaded_file.read())
|
298
|
+
10|
|
299
|
+
11| count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
|
300
|
+
12| st.image("temp_image_viz.jpg")
|
301
|
+
13| st.write(f"Number of workers with helmets: {count}")
|
302
|
+
14| os.remove("temp_image.jpg")
|
303
|
+
15| os.remove("temp_image_viz.jpg")
|
304
|
+
16|
|
305
|
+
17|if __name__ == "__main__":
|
306
|
+
18| main()
|
307
|
+
[End of artifact]
|
308
|
+
|
309
|
+
AGENT: <thinking>I have generated the streamlit app that allows the user to upload an image and count the workers</thinking>
|
310
|
+
<response>The streamlit app is saved in streamlit_app.py, you can run the app by running `streamlit run streamlit_app.py`.</response>
|
311
|
+
<let_user_respond>true</let_user_respond>
|
312
|
+
"""
|
vision_agent/tools/meta_tools.py
CHANGED
@@ -6,15 +6,13 @@ import re
|
|
6
6
|
import subprocess
|
7
7
|
import tempfile
|
8
8
|
from pathlib import Path
|
9
|
-
from typing import Any, Dict, List, Optional, Union
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
10
10
|
|
11
|
+
import libcst as cst
|
11
12
|
from IPython.display import display
|
12
|
-
from redbaron import RedBaron # type: ignore
|
13
13
|
|
14
14
|
import vision_agent as va
|
15
|
-
from vision_agent.agent.agent_utils import extract_json
|
16
15
|
from vision_agent.clients.landing_public_api import LandingPublicAPI
|
17
|
-
from vision_agent.lmm import AnthropicLMM
|
18
16
|
from vision_agent.lmm.types import Message
|
19
17
|
from vision_agent.tools.tool_utils import get_tool_documentation
|
20
18
|
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
|
@@ -26,7 +24,6 @@ CURRENT_FILE = None
|
|
26
24
|
CURRENT_LINE = 0
|
27
25
|
DEFAULT_WINDOW_SIZE = 100
|
28
26
|
ZMQ_PORT = os.environ.get("ZMQ_PORT", None)
|
29
|
-
VERBOSITY = os.environ.get("VERBOSITY", 0)
|
30
27
|
|
31
28
|
|
32
29
|
def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
|
@@ -38,16 +35,6 @@ def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
|
|
38
35
|
socket.send_json(inp)
|
39
36
|
|
40
37
|
|
41
|
-
def filter_file(file_name: Union[str, Path]) -> bool:
|
42
|
-
file_name_p = Path(file_name)
|
43
|
-
return (
|
44
|
-
file_name_p.is_file()
|
45
|
-
and "__pycache__" not in str(file_name_p)
|
46
|
-
and file_name_p.suffix in [".py", ".txt"]
|
47
|
-
and not file_name_p.name.startswith(".")
|
48
|
-
)
|
49
|
-
|
50
|
-
|
51
38
|
def redisplay_results(execution: Execution) -> None:
|
52
39
|
"""This function is used to add previous execution results to the current output.
|
53
40
|
This is handy if you are inside a notebook environment, call it notebook1, and you
|
@@ -86,8 +73,11 @@ class Artifacts:
|
|
86
73
|
need to be in sync with the remote environment the VisionAgent is running in.
|
87
74
|
"""
|
88
75
|
|
89
|
-
def __init__(
|
76
|
+
def __init__(
|
77
|
+
self, remote_save_path: Union[str, Path], local_save_path: Union[str, Path]
|
78
|
+
) -> None:
|
90
79
|
self.remote_save_path = Path(remote_save_path)
|
80
|
+
self.local_save_path = Path(local_save_path)
|
91
81
|
self.artifacts: Dict[str, Any] = {}
|
92
82
|
|
93
83
|
self.code_sandbox_runtime = None
|
@@ -131,9 +121,7 @@ class Artifacts:
|
|
131
121
|
return output_str
|
132
122
|
|
133
123
|
def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
|
134
|
-
save_path = (
|
135
|
-
Path(local_path) if local_path is not None else self.remote_save_path
|
136
|
-
)
|
124
|
+
save_path = Path(local_path) if local_path is not None else self.local_save_path
|
137
125
|
with open(save_path, "wb") as f:
|
138
126
|
pkl.dump(self.artifacts, f)
|
139
127
|
|
@@ -150,6 +138,38 @@ class Artifacts:
|
|
150
138
|
return name in self.artifacts
|
151
139
|
|
152
140
|
|
141
|
+
def filter_file(file_name: Union[str, Path]) -> Tuple[bool, bool]:
|
142
|
+
file_name_p = Path(file_name)
|
143
|
+
return (
|
144
|
+
file_name_p.is_file()
|
145
|
+
and "__pycache__" not in str(file_name_p)
|
146
|
+
and not file_name_p.name.startswith(".")
|
147
|
+
and file_name_p.suffix
|
148
|
+
in [".png", ".jpeg", ".jpg", ".mp4", ".txt", ".json", ".csv"]
|
149
|
+
), file_name_p.suffix in [".png", ".jpeg", ".jpg", ".mp4"]
|
150
|
+
|
151
|
+
|
152
|
+
def capture_files_into_artifacts(artifacts: Artifacts) -> None:
|
153
|
+
"""This function is used to capture all files in the current directory into an
|
154
|
+
artifact object. This is useful if you want to capture all files in the current
|
155
|
+
directory and use them in a different environment where you don't have access to
|
156
|
+
the file system.
|
157
|
+
|
158
|
+
Parameters:
|
159
|
+
artifact (Artifacts): The artifact object to save the files to.
|
160
|
+
"""
|
161
|
+
for file in Path(".").glob("**/*"):
|
162
|
+
usable_file, is_media = filter_file(file)
|
163
|
+
mode = "rb" if is_media else "r"
|
164
|
+
if usable_file:
|
165
|
+
file_name = file.name
|
166
|
+
if file_name.startswith(str(Path(artifacts.remote_save_path).parents)):
|
167
|
+
idx = len(Path(artifacts.remote_save_path).parents)
|
168
|
+
file_name = file_name[idx:]
|
169
|
+
with open(file, mode) as f:
|
170
|
+
artifacts[file_name] = f.read()
|
171
|
+
|
172
|
+
|
153
173
|
# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
|
154
174
|
|
155
175
|
|
@@ -174,9 +194,9 @@ def view_lines(
|
|
174
194
|
f"[Artifact: {name} ({total_lines} lines total)]\n"
|
175
195
|
+ format_lines(lines[start:end], start)
|
176
196
|
+ (
|
177
|
-
"[End of artifact]"
|
197
|
+
"\n[End of artifact]"
|
178
198
|
if end == len(lines)
|
179
|
-
else f"[{len(lines) - end} more lines]"
|
199
|
+
else f"\n[{len(lines) - end} more lines]"
|
180
200
|
)
|
181
201
|
)
|
182
202
|
|
@@ -256,8 +276,10 @@ def edit_code_artifact(
|
|
256
276
|
Parameters:
|
257
277
|
artifacts (Artifacts): The artifacts object to edit the artifact from.
|
258
278
|
name (str): The name of the artifact to edit.
|
259
|
-
start (int): The line number to start the edit
|
260
|
-
|
279
|
+
start (int): The line number to start the edit, can be in [-1, total_lines]
|
280
|
+
where -1 represents the end of the file.
|
281
|
+
end (int): The line number to end the edit, can be in [-1, total_lines] where
|
282
|
+
-1 represents the end of the file.
|
261
283
|
content (str): The content to insert.
|
262
284
|
"""
|
263
285
|
# just make the artifact if it doesn't exist instead of forcing agent to call
|
@@ -266,17 +288,21 @@ def edit_code_artifact(
|
|
266
288
|
artifacts[name] = ""
|
267
289
|
|
268
290
|
total_lines = len(artifacts[name].splitlines())
|
291
|
+
if start == -1:
|
292
|
+
start = total_lines
|
293
|
+
if end == -1:
|
294
|
+
end = total_lines
|
295
|
+
|
269
296
|
if start < 0 or end < 0 or start > end or end > total_lines:
|
270
297
|
print("[Invalid line range]")
|
271
298
|
return "[Invalid line range]"
|
272
|
-
if start == end:
|
273
|
-
end += 1
|
274
299
|
|
275
300
|
new_content_lines = content.splitlines(keepends=True)
|
276
301
|
new_content_lines = [
|
277
302
|
line if line.endswith("\n") else line + "\n" for line in new_content_lines
|
278
303
|
]
|
279
304
|
lines = artifacts[name].splitlines(keepends=True)
|
305
|
+
lines = [line if line.endswith("\n") else line + "\n" for line in lines]
|
280
306
|
edited_lines = lines[:start] + new_content_lines + lines[end:]
|
281
307
|
|
282
308
|
cur_line = start + len(content.split("\n")) // 2
|
@@ -371,14 +397,16 @@ def generate_vision_plan(
|
|
371
397
|
[End Plan Context]
|
372
398
|
"""
|
373
399
|
|
400
|
+
# verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
|
374
401
|
if ZMQ_PORT is not None:
|
375
402
|
agent = va.agent.VisionAgentPlanner(
|
376
403
|
report_progress_callback=lambda inp: report_progress_callback(
|
377
404
|
int(ZMQ_PORT), inp
|
378
|
-
)
|
405
|
+
),
|
406
|
+
verbosity=0,
|
379
407
|
)
|
380
408
|
else:
|
381
|
-
agent = va.agent.VisionAgentPlanner()
|
409
|
+
agent = va.agent.VisionAgentPlanner(verbosity=0)
|
382
410
|
|
383
411
|
fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
|
384
412
|
response = agent.generate_plan(
|
@@ -435,14 +463,16 @@ def generate_vision_code(
|
|
435
463
|
dogs = owl_v2("dog", image)
|
436
464
|
return dogs
|
437
465
|
"""
|
466
|
+
# verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
|
438
467
|
if ZMQ_PORT is not None:
|
439
468
|
agent = va.agent.VisionAgentCoder(
|
440
469
|
report_progress_callback=lambda inp: report_progress_callback(
|
441
470
|
int(ZMQ_PORT), inp
|
442
|
-
)
|
471
|
+
),
|
472
|
+
verbosity=0,
|
443
473
|
)
|
444
474
|
else:
|
445
|
-
agent = va.agent.VisionAgentCoder(verbosity=
|
475
|
+
agent = va.agent.VisionAgentCoder(verbosity=0)
|
446
476
|
|
447
477
|
fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
|
448
478
|
response = agent.generate_code(
|
@@ -506,7 +536,8 @@ def edit_vision_code(
|
|
506
536
|
return dogs
|
507
537
|
"""
|
508
538
|
|
509
|
-
|
539
|
+
# verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
|
540
|
+
agent = va.agent.VisionAgentCoder(verbosity=0)
|
510
541
|
if name not in artifacts:
|
511
542
|
print(f"[Artifact {name} does not exist]")
|
512
543
|
return f"[Artifact {name} does not exist]"
|
@@ -570,8 +601,9 @@ def check_and_load_image(code: str) -> List[str]:
|
|
570
601
|
|
571
602
|
|
572
603
|
def view_media_artifact(artifacts: Artifacts, name: str) -> str:
|
573
|
-
"""Allows
|
574
|
-
|
604
|
+
"""Allows only the agent to view the media artifact with the given name. DO NOT use
|
605
|
+
this to show media to the user, the user can already see all media saved in the
|
606
|
+
artifacts.
|
575
607
|
|
576
608
|
Parameters:
|
577
609
|
artifacts (Artifacts): The artifacts object to show the image from.
|
@@ -648,10 +680,10 @@ def get_diff_with_prompts(name: str, before: str, after: str) -> str:
|
|
648
680
|
|
649
681
|
|
650
682
|
def use_extra_vision_agent_args(
|
651
|
-
code: str,
|
683
|
+
code: Optional[str],
|
652
684
|
test_multi_plan: bool = True,
|
653
685
|
custom_tool_names: Optional[List[str]] = None,
|
654
|
-
) -> str:
|
686
|
+
) -> Optional[str]:
|
655
687
|
"""This is for forcing arguments passed by the user to VisionAgent into the
|
656
688
|
VisionAgentCoder call.
|
657
689
|
|
@@ -663,21 +695,81 @@ def use_extra_vision_agent_args(
|
|
663
695
|
Returns:
|
664
696
|
str: The edited code.
|
665
697
|
"""
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
698
|
+
if code is None:
|
699
|
+
return None
|
700
|
+
|
701
|
+
class VisionAgentTransformer(cst.CSTTransformer):
|
702
|
+
def __init__(
|
703
|
+
self, test_multi_plan: bool, custom_tool_names: Optional[List[str]]
|
704
|
+
):
|
705
|
+
self.test_multi_plan = test_multi_plan
|
706
|
+
self.custom_tool_names = custom_tool_names
|
707
|
+
|
708
|
+
def leave_Call(
|
709
|
+
self, original_node: cst.Call, updated_node: cst.Call
|
710
|
+
) -> cst.Call:
|
711
|
+
# Check if the function being called is generate_vision_code or edit_vision_code
|
712
|
+
if isinstance(updated_node.func, cst.Name) and updated_node.func.value in [
|
713
|
+
"generate_vision_code",
|
714
|
+
"edit_vision_code",
|
715
|
+
]:
|
716
|
+
# Add test_multi_plan argument to generate_vision_code calls
|
717
|
+
if updated_node.func.value == "generate_vision_code":
|
718
|
+
new_arg = cst.Arg(
|
719
|
+
keyword=cst.Name("test_multi_plan"),
|
720
|
+
value=cst.Name(str(self.test_multi_plan)),
|
721
|
+
equal=cst.AssignEqual(
|
722
|
+
whitespace_before=cst.SimpleWhitespace(""),
|
723
|
+
whitespace_after=cst.SimpleWhitespace(""),
|
724
|
+
),
|
725
|
+
)
|
726
|
+
updated_node = updated_node.with_changes(
|
727
|
+
args=[*updated_node.args, new_arg]
|
728
|
+
)
|
729
|
+
|
730
|
+
# Add custom_tool_names if provided
|
731
|
+
if self.custom_tool_names is not None:
|
732
|
+
list_arg = []
|
733
|
+
for i, tool_name in enumerate(self.custom_tool_names):
|
734
|
+
if i < len(self.custom_tool_names) - 1:
|
735
|
+
list_arg.append(
|
736
|
+
cst._nodes.expression.Element(
|
737
|
+
value=cst.SimpleString(value=f'"{tool_name}"'),
|
738
|
+
comma=cst.Comma(
|
739
|
+
whitespace_before=cst.SimpleWhitespace(""),
|
740
|
+
whitespace_after=cst.SimpleWhitespace(" "),
|
741
|
+
),
|
742
|
+
)
|
743
|
+
)
|
744
|
+
else:
|
745
|
+
list_arg.append(
|
746
|
+
cst._nodes.expression.Element(
|
747
|
+
value=cst.SimpleString(value=f'"{tool_name}"'),
|
748
|
+
)
|
749
|
+
)
|
750
|
+
new_arg = cst.Arg(
|
751
|
+
keyword=cst.Name("custom_tool_names"),
|
752
|
+
value=cst.List(list_arg),
|
753
|
+
equal=cst.AssignEqual(
|
754
|
+
whitespace_before=cst.SimpleWhitespace(""),
|
755
|
+
whitespace_after=cst.SimpleWhitespace(""),
|
756
|
+
),
|
757
|
+
)
|
758
|
+
updated_node = updated_node.with_changes(
|
759
|
+
args=[*updated_node.args, new_arg]
|
760
|
+
)
|
761
|
+
|
762
|
+
return updated_node
|
763
|
+
|
764
|
+
# Parse the input code into a CST node
|
765
|
+
tree = cst.parse_module(code)
|
766
|
+
|
767
|
+
# Apply the transformer to modify the CST
|
768
|
+
transformer = VisionAgentTransformer(test_multi_plan, custom_tool_names)
|
769
|
+
modified_tree = tree.visit(transformer)
|
770
|
+
|
771
|
+
# Return the modified code as a string
|
772
|
+
return modified_tree.code
|
681
773
|
|
682
774
|
|
683
775
|
def use_object_detection_fine_tuning(
|
@@ -757,74 +849,12 @@ def use_object_detection_fine_tuning(
|
|
757
849
|
return diff
|
758
850
|
|
759
851
|
|
760
|
-
def extract_and_save_files_to_artifacts(
|
761
|
-
artifacts: Artifacts, code: str, obs: str
|
762
|
-
) -> None:
|
763
|
-
"""Extracts and saves files used in the code to the artifacts object.
|
764
|
-
|
765
|
-
Parameters:
|
766
|
-
artifacts (Artifacts): The artifacts object to save the files to.
|
767
|
-
code (str): The code to extract the files from.
|
768
|
-
"""
|
769
|
-
try:
|
770
|
-
response = extract_json(
|
771
|
-
AnthropicLMM()( # type: ignore
|
772
|
-
f"""You are a helpful AI assistant. Your job is to look at a snippet of code and the output of running that code and return the file paths that are being saved in the file. Below is the code snippet:
|
773
|
-
|
774
|
-
```python
|
775
|
-
{code}
|
776
|
-
```
|
777
|
-
|
778
|
-
```output
|
779
|
-
{obs}
|
780
|
-
```
|
781
|
-
|
782
|
-
Return the file paths in the following JSON format:
|
783
|
-
{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
|
784
|
-
)
|
785
|
-
)
|
786
|
-
except json.JSONDecodeError:
|
787
|
-
return
|
788
|
-
|
789
|
-
text_file_ext = [
|
790
|
-
".txt",
|
791
|
-
".md",
|
792
|
-
"rtf",
|
793
|
-
".html",
|
794
|
-
".htm",
|
795
|
-
"xml",
|
796
|
-
".json",
|
797
|
-
".csv",
|
798
|
-
".tsv",
|
799
|
-
".yaml",
|
800
|
-
".yml",
|
801
|
-
".toml",
|
802
|
-
".conf",
|
803
|
-
".env" ".ini",
|
804
|
-
".log",
|
805
|
-
".py",
|
806
|
-
".java",
|
807
|
-
".js",
|
808
|
-
".cpp",
|
809
|
-
".c" ".sql",
|
810
|
-
".sh",
|
811
|
-
]
|
812
|
-
|
813
|
-
if "file_paths" in response and isinstance(response["file_paths"], list):
|
814
|
-
for file_path in response["file_paths"]:
|
815
|
-
read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb"
|
816
|
-
if Path(file_path).is_file():
|
817
|
-
with open(file_path, read_mode) as f:
|
818
|
-
artifacts[Path(file_path).name] = f.read()
|
819
|
-
|
820
|
-
|
821
852
|
META_TOOL_DOCSTRING = get_tool_documentation(
|
822
853
|
[
|
823
854
|
get_tool_descriptions,
|
824
855
|
open_code_artifact,
|
825
856
|
create_code_artifact,
|
826
857
|
edit_code_artifact,
|
827
|
-
generate_vision_plan,
|
828
858
|
generate_vision_code,
|
829
859
|
edit_vision_code,
|
830
860
|
view_media_artifact,
|
vision_agent/utils/execute.py
CHANGED
@@ -575,6 +575,7 @@ class LocalCodeInterpreter(CodeInterpreter):
|
|
575
575
|
super().__init__(timeout=timeout)
|
576
576
|
self.nb = nbformat.v4.new_notebook()
|
577
577
|
# Set the notebook execution path to the remote path
|
578
|
+
self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
|
578
579
|
self.resources = {"metadata": {"path": str(self.remote_path)}}
|
579
580
|
self.nb_client = NotebookClient(
|
580
581
|
self.nb,
|
@@ -591,7 +592,6 @@ Timeout: {self.timeout}"""
|
|
591
592
|
)
|
592
593
|
sleep(1)
|
593
594
|
self._new_kernel()
|
594
|
-
self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
|
595
595
|
|
596
596
|
def _new_kernel(self) -> None:
|
597
597
|
if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)(): # type: ignore
|
@@ -659,7 +659,7 @@ Timeout: {self.timeout}"""
|
|
659
659
|
def download_file(
|
660
660
|
self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
|
661
661
|
) -> Path:
|
662
|
-
with open(self.remote_path / remote_file_path, "rb") as f:
|
662
|
+
with open(self.remote_path / Path(remote_file_path).name, "rb") as f:
|
663
663
|
contents = f.read()
|
664
664
|
with open(local_file_path, "wb") as f:
|
665
665
|
f.write(contents)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.173
|
3
|
+
Version: 0.2.175
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -16,6 +16,7 @@ Requires-Dist: e2b-code-interpreter (==0.0.11a37)
|
|
16
16
|
Requires-Dist: flake8 (>=7.0.0,<8.0.0)
|
17
17
|
Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
|
18
18
|
Requires-Dist: langsmith (>=0.1.58,<0.2.0)
|
19
|
+
Requires-Dist: libcst (>=1.5.0,<2.0.0)
|
19
20
|
Requires-Dist: nbclient (>=0.10.0,<0.11.0)
|
20
21
|
Requires-Dist: nbformat (>=5.10.4,<6.0.0)
|
21
22
|
Requires-Dist: numpy (>=1.21.0,<2.0.0)
|
@@ -27,7 +28,6 @@ Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
|
|
27
28
|
Requires-Dist: pydantic (==2.7.4)
|
28
29
|
Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
|
29
30
|
Requires-Dist: pytube (==15.0.0)
|
30
|
-
Requires-Dist: redbaron (>=0.9.2,<0.10.0)
|
31
31
|
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
32
32
|
Requires-Dist: rich (>=13.7.1,<14.0.0)
|
33
33
|
Requires-Dist: scipy (>=1.13.0,<1.14.0)
|
@@ -1,13 +1,13 @@
|
|
1
1
|
vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
2
2
|
vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
|
3
3
|
vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
|
4
|
-
vision_agent/agent/agent_utils.py,sha256=
|
5
|
-
vision_agent/agent/vision_agent.py,sha256=
|
6
|
-
vision_agent/agent/vision_agent_coder.py,sha256=
|
4
|
+
vision_agent/agent/agent_utils.py,sha256=WYJF11PfKXlRMPnogGz3s7c2TlWoxoGzuLiIptVYE1s,5524
|
5
|
+
vision_agent/agent/vision_agent.py,sha256=YfSYp9UeML-f67mn1TvjwyXDB_CxaaG_4mzNC1py5xU,25882
|
6
|
+
vision_agent/agent/vision_agent_coder.py,sha256=3Q1VWrN-BNUoSD4OAqKazvXkP2c04PXDYu2Z1f5dQb0,31960
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
|
8
8
|
vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
|
9
9
|
vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
|
10
|
-
vision_agent/agent/vision_agent_prompts.py,sha256=
|
10
|
+
vision_agent/agent/vision_agent_prompts.py,sha256=4329ll0kqCznRALIMl-rlKWGjN92p3bcRrz8R-cO744,13748
|
11
11
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
|
13
13
|
vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
|
@@ -17,19 +17,19 @@ vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,
|
|
17
17
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
18
18
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
19
19
|
vision_agent/tools/__init__.py,sha256=u-vS5iORB4ccvxoAjbtpvhTALDhXGilcATIq1_eZhKo,2332
|
20
|
-
vision_agent/tools/meta_tools.py,sha256=
|
20
|
+
vision_agent/tools/meta_tools.py,sha256=v6e4pnWDIO46ZTWuk-1FkMszfmz2pj-N5wRP8_0WelM,30648
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
23
|
vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
|
24
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
25
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
26
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
27
|
-
vision_agent/utils/execute.py,sha256=
|
27
|
+
vision_agent/utils/execute.py,sha256=2sIQn45llOENMyrKu3TPINVRLLbOvvZ6SVHFCB9MQUo,28028
|
28
28
|
vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
31
|
vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
|
32
|
-
vision_agent-0.2.173.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.173.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
-
vision_agent-0.2.173.dist-info/RECORD,,
|
32
|
+
vision_agent-0.2.175.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.175.dist-info/METADATA,sha256=4qXZ_bRfFYb5fgTP4XcRG4bH9IcVhqE2akIQObcpMSo,18339
|
34
|
+
vision_agent-0.2.175.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.175.dist-info/RECORD,,
|
File without changes
|
File without changes
|