vision-agent 0.2.158__py3-none-any.whl → 0.2.160__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/vision_agent.py +51 -4
- vision_agent/tools/meta_tools.py +27 -6
- vision_agent/tools/tools.py +4 -2
- {vision_agent-0.2.158.dist-info → vision_agent-0.2.160.dist-info}/METADATA +1 -1
- {vision_agent-0.2.158.dist-info → vision_agent-0.2.160.dist-info}/RECORD +7 -7
- {vision_agent-0.2.158.dist-info → vision_agent-0.2.160.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.158.dist-info → vision_agent-0.2.160.dist-info}/WHEEL +0 -0
@@ -149,6 +149,32 @@ def execute_user_code_action(
|
|
149
149
|
return user_result, user_obs
|
150
150
|
|
151
151
|
|
152
|
+
# Marker that opens the code section of an agent response.
_EXECUTE_TAG = "<execute_python>"

# Only these tool calls get a dedicated description; any other code falls back
# to the generic "Executing code." message.
_STEP_DESCRIPTIONS = {
    "open_code_artifact": "Reading file.",
    "create_code_artifact": "Creating file.",
    "edit_code_artifact": "Editing file.",
    "generate_vision_code": "Generating vision code.",
    "edit_vision_code": "Editing vision code.",
}


def add_step_descriptions(response: Dict[str, str]) -> Dict[str, str]:
    """Replace the text preceding the code section with a short step description.

    If ``response["response"]`` contains an ``<execute_python>`` tag, everything
    before the tag is dropped and replaced by a human readable summary of the
    tools called in the code (for example ``"Reading file."``). When none of the
    known tools appear, the summary is ``"Executing code."``. Responses without
    a ``"response"`` key, with a non-string value, or without the tag are
    returned unchanged.

    Parameters:
        response (Dict[str, str]): The agent response. It is never mutated.

    Returns:
        Dict[str, str]: A deep copy of ``response`` whose ``"response"`` entry
            has the form ``"<summary> <execute_python>..."`` when the tag is
            present.
    """
    response = copy.deepcopy(response)

    resp_str = response.get("response")
    if not isinstance(resp_str, str):
        # Nothing sensible to rewrite (missing key or non-text payload).
        return response

    tag_start = resp_str.find(_EXECUTE_TAG)
    if tag_start == -1:
        return response

    # Only inspect the part that is kept: tool names mentioned in the discarded
    # preamble must not produce descriptions for code that is not executed.
    code_section = resp_str[tag_start:]
    descriptions = [
        text for tool, text in _STEP_DESCRIPTIONS.items() if tool in code_section
    ]
    summary = " ".join(descriptions) if descriptions else "Executing code."

    # Always separate the summary from the tag with exactly one space, including
    # for the fallback message.
    response["response"] = f"{summary} {code_section}"
    return response
|
176
|
+
|
177
|
+
|
152
178
|
class VisionAgent(Agent):
|
153
179
|
"""Vision Agent is an agent that can chat with the user and call tools or other
|
154
180
|
agents to generate code for it. Vision Agent uses python code to execute actions
|
@@ -335,8 +361,18 @@ class VisionAgent(Agent):
|
|
335
361
|
response = run_conversation(self.agent, int_chat)
|
336
362
|
if self.verbosity >= 1:
|
337
363
|
_LOGGER.info(response)
|
338
|
-
int_chat.append(
|
339
|
-
|
364
|
+
int_chat.append(
|
365
|
+
{
|
366
|
+
"role": "assistant",
|
367
|
+
"content": str(add_step_descriptions(response)),
|
368
|
+
}
|
369
|
+
)
|
370
|
+
orig_chat.append(
|
371
|
+
{
|
372
|
+
"role": "assistant",
|
373
|
+
"content": str(add_step_descriptions(response)),
|
374
|
+
}
|
375
|
+
)
|
340
376
|
|
341
377
|
# sometimes it gets stuck in a loop, so we force it to exit
|
342
378
|
if last_response == response:
|
@@ -382,6 +418,16 @@ class VisionAgent(Agent):
|
|
382
418
|
|
383
419
|
obs_chat_elt: Message = {"role": "observation", "content": obs}
|
384
420
|
if media_obs and result.success:
|
421
|
+
# for view_media_artifact, we need to ensure the media is loaded
|
422
|
+
# locally so the conversation agent can actually see it
|
423
|
+
code_interpreter.download_file(
|
424
|
+
str(remote_artifacts_path.name),
|
425
|
+
str(self.local_artifacts_path),
|
426
|
+
)
|
427
|
+
artifacts.load(
|
428
|
+
self.local_artifacts_path,
|
429
|
+
Path(self.local_artifacts_path).parent,
|
430
|
+
)
|
385
431
|
obs_chat_elt["media"] = [
|
386
432
|
Path(self.local_artifacts_path).parent / media_ob
|
387
433
|
for media_ob in media_obs
|
@@ -407,8 +453,9 @@ class VisionAgent(Agent):
|
|
407
453
|
code_interpreter.download_file(
|
408
454
|
str(remote_artifacts_path.name), str(self.local_artifacts_path)
|
409
455
|
)
|
410
|
-
artifacts.load(
|
411
|
-
|
456
|
+
artifacts.load(
|
457
|
+
self.local_artifacts_path, Path(self.local_artifacts_path).parent
|
458
|
+
)
|
412
459
|
return orig_chat, artifacts
|
413
460
|
|
414
461
|
def streaming_message(self, message: Dict[str, Any]) -> None:
|
vision_agent/tools/meta_tools.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
import base64
|
1
2
|
import difflib
|
2
3
|
import json
|
3
4
|
import os
|
@@ -92,19 +93,26 @@ class Artifacts:
|
|
92
93
|
|
93
94
|
self.code_sandbox_runtime = None
|
94
95
|
|
95
|
-
def load(
|
96
|
-
|
97
|
-
|
96
|
+
def load(
    self,
    artifacts_path: Union[str, Path],
    load_to: Optional[Union[str, Path]] = None,
) -> None:
    """Loads all artifacts from a pickle file and writes each one into the load_to
    directory. If load_to is None, the artifacts are written into the parent
    directory of remote_save_path. If an artifact value is None it is skipped.

    Parameters:
        artifacts_path (Union[str, Path]): The file path to load the artifacts from
        load_to (Optional[Union[str, Path]]): The directory to write the artifact
            files into. Defaults to the parent directory of remote_save_path.
    """
    # NOTE(review): pickle can execute arbitrary code while loading; only call
    # this with artifact files that come from a trusted source.
    with open(artifacts_path, "rb") as f:
        self.artifacts = pkl.load(f)

    target_dir = self.remote_save_path.parent if load_to is None else Path(load_to)

    for name, content in self.artifacts.items():
        if content is None:
            continue
        # str artifacts are written as text, anything else as raw bytes
        mode = "w" if isinstance(content, str) else "wb"
        with open(target_dir / name, mode) as f:
            f.write(content)
|
109
117
|
|
110
118
|
def show(self, uploaded_file_path: Optional[Union[str, Path]] = None) -> str:
|
@@ -503,6 +511,19 @@ def write_media_artifact(
|
|
503
511
|
return f"[Invalid media type {type(media)}]"
|
504
512
|
artifacts[name] = media_bytes
|
505
513
|
print(f"[Media {name} saved]")
|
514
|
+
display(
|
515
|
+
{
|
516
|
+
MimeType.APPLICATION_ARTIFACT: json.dumps(
|
517
|
+
{
|
518
|
+
"name": name,
|
519
|
+
"action": "create",
|
520
|
+
"content": base64.b64encode(media_bytes).decode("utf-8"),
|
521
|
+
"contentType": "media_output",
|
522
|
+
}
|
523
|
+
)
|
524
|
+
},
|
525
|
+
raw=True,
|
526
|
+
)
|
506
527
|
return f"[Media {name} saved]"
|
507
528
|
|
508
529
|
|
vision_agent/tools/tools.py
CHANGED
@@ -700,6 +700,7 @@ def countgd_counting(
|
|
700
700
|
{'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
|
701
701
|
]
|
702
702
|
"""
|
703
|
+
image_size = image.shape[:2]
|
703
704
|
buffer_bytes = numpy_to_bytes(image)
|
704
705
|
files = [("image", buffer_bytes)]
|
705
706
|
prompt = prompt.replace(", ", " .")
|
@@ -712,7 +713,7 @@ def countgd_counting(
|
|
712
713
|
bboxes_formatted = [
|
713
714
|
ODResponseData(
|
714
715
|
label=bbox["label"],
|
715
|
-
bbox=
|
716
|
+
bbox=normalize_bbox(bbox["bounding_box"], image_size),
|
716
717
|
score=round(bbox["score"], 2),
|
717
718
|
)
|
718
719
|
for bbox in bboxes_per_frame
|
@@ -757,6 +758,7 @@ def countgd_example_based_counting(
|
|
757
758
|
{'score': 0.98, 'label': 'object', 'bounding_box': [0.44, 0.24, 0.49, 0.58},
|
758
759
|
]
|
759
760
|
"""
|
761
|
+
image_size = image.shape[:2]
|
760
762
|
buffer_bytes = numpy_to_bytes(image)
|
761
763
|
files = [("image", buffer_bytes)]
|
762
764
|
visual_prompts = [
|
@@ -771,7 +773,7 @@ def countgd_example_based_counting(
|
|
771
773
|
bboxes_formatted = [
|
772
774
|
ODResponseData(
|
773
775
|
label=bbox["label"],
|
774
|
-
bbox=
|
776
|
+
bbox=normalize_bbox(bbox["bounding_box"], image_size),
|
775
777
|
score=round(bbox["score"], 2),
|
776
778
|
)
|
777
779
|
for bbox in bboxes_per_frame
|
@@ -2,7 +2,7 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
|
2
2
|
vision_agent/agent/__init__.py,sha256=NF2LABqHixLvbsOIO-fe-VKZ7awvShLtcT0oQT4eWtI,235
|
3
3
|
vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
|
4
4
|
vision_agent/agent/agent_utils.py,sha256=eIpLz2NunEqEsBBrECJaD34-2uY0bsFNnW-XKfqqohs,2518
|
5
|
-
vision_agent/agent/vision_agent.py,sha256=
|
5
|
+
vision_agent/agent/vision_agent.py,sha256=etqyLMZHJJz_A6tkonoYGlYvFvEW0uUHs5D1gsYwkSs,20412
|
6
6
|
vision_agent/agent/vision_agent_coder.py,sha256=2ZoGikn2nakGDfs20XRshZjQUyvbw6l47UhExJAYkqI,38515
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
|
8
8
|
vision_agent/agent/vision_agent_prompts.py,sha256=LZ9Bnx7ZFkqbNOMqwfdiWZU4niND9Z1ArcFHNSn_jzA,11187
|
@@ -15,10 +15,10 @@ vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,
|
|
15
15
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
16
16
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
17
17
|
vision_agent/tools/__init__.py,sha256=PLVbfTMjKxQlHIRWnq9b785W9a52AXQS_tOa0tkQ0ZY,2420
|
18
|
-
vision_agent/tools/meta_tools.py,sha256=
|
18
|
+
vision_agent/tools/meta_tools.py,sha256=TG4vXN7W2M3rOJhzVsCrDAhbctd3RJcuOmbAeXsk2Sw,25798
|
19
19
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
20
20
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
21
|
-
vision_agent/tools/tools.py,sha256=
|
21
|
+
vision_agent/tools/tools.py,sha256=vS1yCk3Fza9eYOTHPFwwroo_ULdw2ztMQMb81x1U5f8,78524
|
22
22
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
23
23
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
24
24
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
|
|
27
27
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
28
28
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
29
29
|
vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
|
30
|
-
vision_agent-0.2.
|
31
|
-
vision_agent-0.2.
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
30
|
+
vision_agent-0.2.160.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
31
|
+
vision_agent-0.2.160.dist-info/METADATA,sha256=yfWiwd_dUVN2AQOkCSDM9A41NPqACMoj8NaVgOHOmNQ,17753
|
32
|
+
vision_agent-0.2.160.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
33
|
+
vision_agent-0.2.160.dist-info/RECORD,,
|
File without changes
|
File without changes
|