vision-agent 0.2.158__py3-none-any.whl → 0.2.160__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +51 -4
- vision_agent/tools/meta_tools.py +27 -6
- vision_agent/tools/tools.py +4 -2
- {vision_agent-0.2.158.dist-info → vision_agent-0.2.160.dist-info}/METADATA +1 -1
- {vision_agent-0.2.158.dist-info → vision_agent-0.2.160.dist-info}/RECORD +7 -7
- {vision_agent-0.2.158.dist-info → vision_agent-0.2.160.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.158.dist-info → vision_agent-0.2.160.dist-info}/WHEEL +0 -0
@@ -149,6 +149,32 @@ def execute_user_code_action(
|
|
149
149
|
return user_result, user_obs
|
150
150
|
|
151
151
|
|
152
|
+
def add_step_descriptions(response: Dict[str, str]) -> Dict[str, str]:
|
153
|
+
response = copy.deepcopy(response)
|
154
|
+
if "response" in response:
|
155
|
+
resp_str = response["response"]
|
156
|
+
if "<execute_python>" in resp_str:
|
157
|
+
# only include descriptions for these, the rest will just have executing
|
158
|
+
# code
|
159
|
+
description_map = {
|
160
|
+
"open_code_artifact": "Reading file.",
|
161
|
+
"create_code_artifact": "Creating file.",
|
162
|
+
"edit_code_artifact": "Editing file.",
|
163
|
+
"generate_vision_code": "Generating vision code.",
|
164
|
+
"edit_vision_code": "Editing vision code.",
|
165
|
+
}
|
166
|
+
description = ""
|
167
|
+
for k, v in description_map.items():
|
168
|
+
if k in resp_str:
|
169
|
+
description += v + " "
|
170
|
+
if description == "":
|
171
|
+
description = "Executing code."
|
172
|
+
resp_str = resp_str[resp_str.find("<execute_python>") :]
|
173
|
+
resp_str = description + resp_str
|
174
|
+
response["response"] = resp_str
|
175
|
+
return response
|
176
|
+
|
177
|
+
|
152
178
|
class VisionAgent(Agent):
|
153
179
|
"""Vision Agent is an agent that can chat with the user and call tools or other
|
154
180
|
agents to generate code for it. Vision Agent uses python code to execute actions
|
@@ -335,8 +361,18 @@ class VisionAgent(Agent):
|
|
335
361
|
response = run_conversation(self.agent, int_chat)
|
336
362
|
if self.verbosity >= 1:
|
337
363
|
_LOGGER.info(response)
|
338
|
-
int_chat.append(
|
339
|
-
|
364
|
+
int_chat.append(
|
365
|
+
{
|
366
|
+
"role": "assistant",
|
367
|
+
"content": str(add_step_descriptions(response)),
|
368
|
+
}
|
369
|
+
)
|
370
|
+
orig_chat.append(
|
371
|
+
{
|
372
|
+
"role": "assistant",
|
373
|
+
"content": str(add_step_descriptions(response)),
|
374
|
+
}
|
375
|
+
)
|
340
376
|
|
341
377
|
# sometimes it gets stuck in a loop, so we force it to exit
|
342
378
|
if last_response == response:
|
@@ -382,6 +418,16 @@ class VisionAgent(Agent):
|
|
382
418
|
|
383
419
|
obs_chat_elt: Message = {"role": "observation", "content": obs}
|
384
420
|
if media_obs and result.success:
|
421
|
+
# for view_media_artifact, we need to ensure the media is loaded
|
422
|
+
# locally so the conversation agent can actually see it
|
423
|
+
code_interpreter.download_file(
|
424
|
+
str(remote_artifacts_path.name),
|
425
|
+
str(self.local_artifacts_path),
|
426
|
+
)
|
427
|
+
artifacts.load(
|
428
|
+
self.local_artifacts_path,
|
429
|
+
Path(self.local_artifacts_path).parent,
|
430
|
+
)
|
385
431
|
obs_chat_elt["media"] = [
|
386
432
|
Path(self.local_artifacts_path).parent / media_ob
|
387
433
|
for media_ob in media_obs
|
@@ -407,8 +453,9 @@ class VisionAgent(Agent):
|
|
407
453
|
code_interpreter.download_file(
|
408
454
|
str(remote_artifacts_path.name), str(self.local_artifacts_path)
|
409
455
|
)
|
410
|
-
artifacts.load(
|
411
|
-
|
456
|
+
artifacts.load(
|
457
|
+
self.local_artifacts_path, Path(self.local_artifacts_path).parent
|
458
|
+
)
|
412
459
|
return orig_chat, artifacts
|
413
460
|
|
414
461
|
def streaming_message(self, message: Dict[str, Any]) -> None:
|
vision_agent/tools/meta_tools.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
import base64
|
1
2
|
import difflib
|
2
3
|
import json
|
3
4
|
import os
|
@@ -92,19 +93,26 @@ class Artifacts:
|
|
92
93
|
|
93
94
|
self.code_sandbox_runtime = None
|
94
95
|
|
95
|
-
def load(
|
96
|
-
|
97
|
-
|
96
|
+
def load(
|
97
|
+
self,
|
98
|
+
artifacts_path: Union[str, Path],
|
99
|
+
load_to: Optional[Union[str, Path]] = None,
|
100
|
+
) -> None:
|
101
|
+
"""Loads all artifacts into the load_to path. If load_to is None, it will load
|
102
|
+
into remote_save_path. If an artifact value is None it will skip loading it.
|
98
103
|
|
99
104
|
Parameters:
|
100
|
-
|
105
|
+
artifacts_path (Union[str, Path]): The file path to load the artifacts from
|
101
106
|
"""
|
102
|
-
with open(
|
107
|
+
with open(artifacts_path, "rb") as f:
|
103
108
|
self.artifacts = pkl.load(f)
|
109
|
+
|
110
|
+
load_to = self.remote_save_path.parent if load_to is None else Path(load_to)
|
111
|
+
|
104
112
|
for k, v in self.artifacts.items():
|
105
113
|
if v is not None:
|
106
114
|
mode = "w" if isinstance(v, str) else "wb"
|
107
|
-
with open(
|
115
|
+
with open(load_to / k, mode) as f:
|
108
116
|
f.write(v)
|
109
117
|
|
110
118
|
def show(self, uploaded_file_path: Optional[Union[str, Path]] = None) -> str:
|
@@ -503,6 +511,19 @@ def write_media_artifact(
|
|
503
511
|
return f"[Invalid media type {type(media)}]"
|
504
512
|
artifacts[name] = media_bytes
|
505
513
|
print(f"[Media {name} saved]")
|
514
|
+
display(
|
515
|
+
{
|
516
|
+
MimeType.APPLICATION_ARTIFACT: json.dumps(
|
517
|
+
{
|
518
|
+
"name": name,
|
519
|
+
"action": "create",
|
520
|
+
"content": base64.b64encode(media_bytes).decode("utf-8"),
|
521
|
+
"contentType": "media_output",
|
522
|
+
}
|
523
|
+
)
|
524
|
+
},
|
525
|
+
raw=True,
|
526
|
+
)
|
506
527
|
return f"[Media {name} saved]"
|
507
528
|
|
508
529
|
|
vision_agent/tools/tools.py
CHANGED
@@ -700,6 +700,7 @@ def countgd_counting(
|
|
700
700
|
{'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
|
701
701
|
]
|
702
702
|
"""
|
703
|
+
image_size = image.shape[:2]
|
703
704
|
buffer_bytes = numpy_to_bytes(image)
|
704
705
|
files = [("image", buffer_bytes)]
|
705
706
|
prompt = prompt.replace(", ", " .")
|
@@ -712,7 +713,7 @@ def countgd_counting(
|
|
712
713
|
bboxes_formatted = [
|
713
714
|
ODResponseData(
|
714
715
|
label=bbox["label"],
|
715
|
-
bbox=
|
716
|
+
bbox=normalize_bbox(bbox["bounding_box"], image_size),
|
716
717
|
score=round(bbox["score"], 2),
|
717
718
|
)
|
718
719
|
for bbox in bboxes_per_frame
|
@@ -757,6 +758,7 @@ def countgd_example_based_counting(
|
|
757
758
|
{'score': 0.98, 'label': 'object', 'bounding_box': [0.44, 0.24, 0.49, 0.58]},
|
758
759
|
]
|
759
760
|
"""
|
761
|
+
image_size = image.shape[:2]
|
760
762
|
buffer_bytes = numpy_to_bytes(image)
|
761
763
|
files = [("image", buffer_bytes)]
|
762
764
|
visual_prompts = [
|
@@ -771,7 +773,7 @@ def countgd_example_based_counting(
|
|
771
773
|
bboxes_formatted = [
|
772
774
|
ODResponseData(
|
773
775
|
label=bbox["label"],
|
774
|
-
bbox=
|
776
|
+
bbox=normalize_bbox(bbox["bounding_box"], image_size),
|
775
777
|
score=round(bbox["score"], 2),
|
776
778
|
)
|
777
779
|
for bbox in bboxes_per_frame
|
@@ -2,7 +2,7 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
|
2
2
|
vision_agent/agent/__init__.py,sha256=NF2LABqHixLvbsOIO-fe-VKZ7awvShLtcT0oQT4eWtI,235
|
3
3
|
vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
|
4
4
|
vision_agent/agent/agent_utils.py,sha256=eIpLz2NunEqEsBBrECJaD34-2uY0bsFNnW-XKfqqohs,2518
|
5
|
-
vision_agent/agent/vision_agent.py,sha256=
|
5
|
+
vision_agent/agent/vision_agent.py,sha256=etqyLMZHJJz_A6tkonoYGlYvFvEW0uUHs5D1gsYwkSs,20412
|
6
6
|
vision_agent/agent/vision_agent_coder.py,sha256=2ZoGikn2nakGDfs20XRshZjQUyvbw6l47UhExJAYkqI,38515
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
|
8
8
|
vision_agent/agent/vision_agent_prompts.py,sha256=LZ9Bnx7ZFkqbNOMqwfdiWZU4niND9Z1ArcFHNSn_jzA,11187
|
@@ -15,10 +15,10 @@ vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,
|
|
15
15
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
16
16
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
17
17
|
vision_agent/tools/__init__.py,sha256=PLVbfTMjKxQlHIRWnq9b785W9a52AXQS_tOa0tkQ0ZY,2420
|
18
|
-
vision_agent/tools/meta_tools.py,sha256=
|
18
|
+
vision_agent/tools/meta_tools.py,sha256=TG4vXN7W2M3rOJhzVsCrDAhbctd3RJcuOmbAeXsk2Sw,25798
|
19
19
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
20
20
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
21
|
-
vision_agent/tools/tools.py,sha256=
|
21
|
+
vision_agent/tools/tools.py,sha256=vS1yCk3Fza9eYOTHPFwwroo_ULdw2ztMQMb81x1U5f8,78524
|
22
22
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
23
23
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
24
24
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
|
|
27
27
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
28
28
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
29
29
|
vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
|
30
|
-
vision_agent-0.2.
|
31
|
-
vision_agent-0.2.
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
30
|
+
vision_agent-0.2.160.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
31
|
+
vision_agent-0.2.160.dist-info/METADATA,sha256=yfWiwd_dUVN2AQOkCSDM9A41NPqACMoj8NaVgOHOmNQ,17753
|
32
|
+
vision_agent-0.2.160.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
33
|
+
vision_agent-0.2.160.dist-info/RECORD,,
|
File without changes
|
File without changes
|