vision-agent 0.2.133__py3-none-any.whl → 0.2.135__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +15 -3
- vision_agent/tools/meta_tools.py +19 -14
- vision_agent/utils/execute.py +21 -1
- {vision_agent-0.2.133.dist-info → vision_agent-0.2.135.dist-info}/METADATA +1 -1
- {vision_agent-0.2.133.dist-info → vision_agent-0.2.135.dist-info}/RECORD +7 -7
- {vision_agent-0.2.133.dist-info → vision_agent-0.2.135.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.133.dist-info → vision_agent-0.2.135.dist-info}/WHEEL +0 -0
@@ -15,6 +15,7 @@ from vision_agent.agent.vision_agent_prompts import (
|
|
15
15
|
from vision_agent.lmm import LMM, Message, OpenAILMM
|
16
16
|
from vision_agent.tools import META_TOOL_DOCSTRING, save_image, load_image
|
17
17
|
from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
|
18
|
+
from vision_agent.tools.tools import extract_frames, save_video
|
18
19
|
from vision_agent.utils import CodeInterpreterFactory
|
19
20
|
from vision_agent.utils.execute import CodeInterpreter, Execution
|
20
21
|
|
@@ -224,9 +225,20 @@ class VisionAgent(Agent):
|
|
224
225
|
for media in chat_i["media"]:
|
225
226
|
if type(media) is str and media.startswith(("http", "https")):
|
226
227
|
# TODO: Ideally we should not call VA.tools here, we should come to revisit how to better support remote image later
|
227
|
-
file_path =
|
228
|
-
|
229
|
-
|
228
|
+
file_path = str(
|
229
|
+
Path(self.local_artifacts_path).parent
|
230
|
+
/ Path(media).name
|
231
|
+
)
|
232
|
+
if file_path.lower().endswith(
|
233
|
+
".mp4"
|
234
|
+
) or file_path.lower().endswith(".mov"):
|
235
|
+
video_frames = extract_frames(media)
|
236
|
+
save_video(
|
237
|
+
[frame for frame, _ in video_frames], file_path
|
238
|
+
)
|
239
|
+
else:
|
240
|
+
ndarray = load_image(media)
|
241
|
+
save_image(ndarray, file_path)
|
230
242
|
media = file_path
|
231
243
|
else:
|
232
244
|
media = cast(str, media)
|
vision_agent/tools/meta_tools.py
CHANGED
@@ -53,25 +53,27 @@ def redisplay_results(execution: Execution) -> None:
|
|
53
53
|
"""
|
54
54
|
for result in execution.results:
|
55
55
|
if result.text is not None:
|
56
|
-
display({MimeType.TEXT_PLAIN: result.text})
|
56
|
+
display({MimeType.TEXT_PLAIN: result.text}, raw=True)
|
57
57
|
if result.html is not None:
|
58
|
-
display({MimeType.TEXT_HTML: result.html})
|
58
|
+
display({MimeType.TEXT_HTML: result.html}, raw=True)
|
59
59
|
if result.markdown is not None:
|
60
|
-
display({MimeType.TEXT_MARKDOWN: result.markdown})
|
60
|
+
display({MimeType.TEXT_MARKDOWN: result.markdown}, raw=True)
|
61
61
|
if result.svg is not None:
|
62
|
-
display({MimeType.IMAGE_SVG: result.svg})
|
62
|
+
display({MimeType.IMAGE_SVG: result.svg}, raw=True)
|
63
63
|
if result.png is not None:
|
64
|
-
display({MimeType.IMAGE_PNG: result.png})
|
64
|
+
display({MimeType.IMAGE_PNG: result.png}, raw=True)
|
65
65
|
if result.jpeg is not None:
|
66
|
-
display({MimeType.IMAGE_JPEG: result.jpeg})
|
66
|
+
display({MimeType.IMAGE_JPEG: result.jpeg}, raw=True)
|
67
67
|
if result.mp4 is not None:
|
68
|
-
display({MimeType.VIDEO_MP4_B64: result.mp4})
|
68
|
+
display({MimeType.VIDEO_MP4_B64: result.mp4}, raw=True)
|
69
69
|
if result.latex is not None:
|
70
|
-
display({MimeType.TEXT_LATEX: result.latex})
|
70
|
+
display({MimeType.TEXT_LATEX: result.latex}, raw=True)
|
71
71
|
if result.json is not None:
|
72
|
-
display({MimeType.APPLICATION_JSON: result.json})
|
72
|
+
display({MimeType.APPLICATION_JSON: result.json}, raw=True)
|
73
|
+
if result.artifact_name is not None:
|
74
|
+
display({MimeType.TEXT_ARTIFACT_NAME: result.artifact_name}, raw=True)
|
73
75
|
if result.extra is not None:
|
74
|
-
display(result.extra)
|
76
|
+
display(result.extra, raw=True)
|
75
77
|
|
76
78
|
|
77
79
|
class Artifacts:
|
@@ -208,7 +210,7 @@ def create_code_artifact(artifacts: Artifacts, name: str) -> str:
|
|
208
210
|
return_str = f"[Artifact {name} created]"
|
209
211
|
print(return_str)
|
210
212
|
|
211
|
-
display({MimeType.
|
213
|
+
display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
|
212
214
|
return return_str
|
213
215
|
|
214
216
|
|
@@ -292,7 +294,7 @@ def edit_code_artifact(
|
|
292
294
|
|
293
295
|
artifacts[name] = "".join(edited_lines)
|
294
296
|
|
295
|
-
display({MimeType.
|
297
|
+
display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
|
296
298
|
return open_code_artifact(artifacts, name, cur_line)
|
297
299
|
|
298
300
|
|
@@ -348,7 +350,7 @@ def generate_vision_code(
|
|
348
350
|
code_lines = code.splitlines(keepends=True)
|
349
351
|
total_lines = len(code_lines)
|
350
352
|
|
351
|
-
display({MimeType.
|
353
|
+
display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
|
352
354
|
return view_lines(code_lines, 0, total_lines, name, total_lines)
|
353
355
|
|
354
356
|
|
@@ -413,7 +415,7 @@ def edit_vision_code(
|
|
413
415
|
code_lines = code.splitlines(keepends=True)
|
414
416
|
total_lines = len(code_lines)
|
415
417
|
|
416
|
-
display({MimeType.
|
418
|
+
display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
|
417
419
|
return view_lines(code_lines, 0, total_lines, name, total_lines)
|
418
420
|
|
419
421
|
|
@@ -427,6 +429,7 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
|
|
427
429
|
with open(local_path, "rb") as f:
|
428
430
|
media = f.read()
|
429
431
|
artifacts[Path(local_path).name] = media
|
432
|
+
display({MimeType.TEXT_ARTIFACT_NAME: Path(local_path).name}, raw=True)
|
430
433
|
return f"[Media {Path(local_path).name} saved]"
|
431
434
|
|
432
435
|
|
@@ -592,6 +595,8 @@ def use_florence2_fine_tuning(
|
|
592
595
|
|
593
596
|
diff = get_diff_with_prompts(name, code, new_code)
|
594
597
|
print(diff)
|
598
|
+
|
599
|
+
display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
|
595
600
|
return diff
|
596
601
|
|
597
602
|
|
vision_agent/utils/execute.py
CHANGED
@@ -56,6 +56,7 @@ class MimeType(str, Enum):
|
|
56
56
|
TEXT_LATEX = "text/latex"
|
57
57
|
APPLICATION_JSON = "application/json"
|
58
58
|
APPLICATION_JAVASCRIPT = "application/javascript"
|
59
|
+
TEXT_ARTIFACT_NAME = "text/artifact/name"
|
59
60
|
|
60
61
|
|
61
62
|
class FileSerializer:
|
@@ -103,6 +104,7 @@ class Result:
|
|
103
104
|
latex: Optional[str] = None
|
104
105
|
json: Optional[Dict[str, Any]] = None
|
105
106
|
javascript: Optional[str] = None
|
107
|
+
artifact_name: Optional[str] = None
|
106
108
|
extra: Optional[Dict[str, Any]] = None
|
107
109
|
"Extra data that can be included. Not part of the standard types."
|
108
110
|
|
@@ -127,6 +129,7 @@ class Result:
|
|
127
129
|
self.latex = data.pop(MimeType.TEXT_LATEX, None)
|
128
130
|
self.json = data.pop(MimeType.APPLICATION_JSON, None)
|
129
131
|
self.javascript = data.pop(MimeType.APPLICATION_JAVASCRIPT, None)
|
132
|
+
self.artifact_name = data.pop(MimeType.TEXT_ARTIFACT_NAME, None)
|
130
133
|
self.extra = data
|
131
134
|
# Only keeping the PNG representation if both PNG and JPEG are present
|
132
135
|
if self.png and self.jpeg:
|
@@ -204,6 +207,8 @@ class Result:
|
|
204
207
|
formats.append("javascript")
|
205
208
|
if self.mp4:
|
206
209
|
formats.append("mp4")
|
210
|
+
if self.artifact_name:
|
211
|
+
formats.append("artifact_name")
|
207
212
|
if self.extra:
|
208
213
|
formats.extend(iter(self.extra))
|
209
214
|
return formats
|
@@ -691,8 +696,9 @@ class CodeInterpreterFactory:
|
|
691
696
|
if not code_sandbox_runtime:
|
692
697
|
code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
|
693
698
|
if code_sandbox_runtime == "e2b":
|
699
|
+
envs = _get_e2b_env()
|
694
700
|
instance: CodeInterpreter = E2BCodeInterpreter(
|
695
|
-
timeout=_SESSION_TIMEOUT, remote_path=remote_path
|
701
|
+
timeout=_SESSION_TIMEOUT, remote_path=remote_path, envs=envs
|
696
702
|
)
|
697
703
|
elif code_sandbox_runtime == "local":
|
698
704
|
instance = LocalCodeInterpreter(
|
@@ -705,6 +711,20 @@ class CodeInterpreterFactory:
|
|
705
711
|
return instance
|
706
712
|
|
707
713
|
|
714
|
+
def _get_e2b_env() -> Union[Dict[str, str], None]:
|
715
|
+
openai_api_key = os.getenv("OPENAI_API_KEY", "")
|
716
|
+
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY", "")
|
717
|
+
if openai_api_key or anthropic_api_key:
|
718
|
+
envs = {}
|
719
|
+
if openai_api_key:
|
720
|
+
envs["OPENAI_API_KEY"] = openai_api_key
|
721
|
+
if anthropic_api_key:
|
722
|
+
envs["ANTHROPIC_API_KEY"] = anthropic_api_key
|
723
|
+
else:
|
724
|
+
envs = None
|
725
|
+
return envs
|
726
|
+
|
727
|
+
|
708
728
|
def _parse_local_code_interpreter_outputs(outputs: List[Dict[str, Any]]) -> Execution:
|
709
729
|
"""Parse notebook cell outputs to Execution object. Output types:
|
710
730
|
https://nbformat.readthedocs.io/en/latest/format_description.html#code-cell-outputs
|
@@ -2,7 +2,7 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
|
2
2
|
vision_agent/agent/__init__.py,sha256=TddDT4e3JVc68Dt0zSk0B4OBORx_R2WhAGK71uqEe2w,204
|
3
3
|
vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
|
4
4
|
vision_agent/agent/agent_utils.py,sha256=qOYQn-wJsa4j4YjFOBQ41xyklCg8Y94CIIGw9ZXmgIU,2053
|
5
|
-
vision_agent/agent/vision_agent.py,sha256=
|
5
|
+
vision_agent/agent/vision_agent.py,sha256=zCgCOPhOBcw9AyoUfyJcJ3HELE0FJvW4X5sWIjd67Bw,13868
|
6
6
|
vision_agent/agent/vision_agent_coder.py,sha256=OI95goKTqVaEEPYwkn6bVsHsHZeifoBC8rjG9nD0Znc,36909
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=a7P19QscKNiaweke0zHPCfi5GQImpG-ZGKv_kXz0seg,13452
|
8
8
|
vision_agent/agent/vision_agent_prompts.py,sha256=-fXiIIb48duXVljWYcJ0Y4ZzfNnRFi3C5cKdF4SdDo8,10075
|
@@ -15,19 +15,19 @@ vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,
|
|
15
15
|
vision_agent/lmm/lmm.py,sha256=soWmEjtleQUSH2G3tYZWxOmteIqkgMVcmuZfx4mxszU,16838
|
16
16
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
17
17
|
vision_agent/tools/__init__.py,sha256=nufZNzbcLTuXwxFmvZNj99qE8EO2qtEPT8wFsuI9vyE,2397
|
18
|
-
vision_agent/tools/meta_tools.py,sha256=
|
18
|
+
vision_agent/tools/meta_tools.py,sha256=Oz-wbsVSjVIH2LkTg5E4Yt2jN2N5W4DbvGkx-yVi0H0,21549
|
19
19
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
20
20
|
vision_agent/tools/tool_utils.py,sha256=ZYqzcw_e937reoNr7gJgyKjQ7Gudxz1ttfIyo7F65w8,7758
|
21
21
|
vision_agent/tools/tools.py,sha256=WKeB99ED0o_ISS_vZc-ch_1Dc8_Fl2fhnGlfVNwNouc,70024
|
22
22
|
vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
|
23
23
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
24
24
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
25
|
-
vision_agent/utils/execute.py,sha256=
|
25
|
+
vision_agent/utils/execute.py,sha256=QY1GFwRDghecue_lz6s2IiRzcG1y8BrYrBohipYs7l4,27982
|
26
26
|
vision_agent/utils/image_utils.py,sha256=zTTOJFOieMzwIquTFnW7T6ssx9o6XfoZ0Unqyk7GJrg,10746
|
27
27
|
vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
|
28
28
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
29
29
|
vision_agent/utils/video.py,sha256=hOjfEOZNcddYdoa0CoviXA4Vo9kwURKuojIJgLLJdp0,4745
|
30
|
-
vision_agent-0.2.
|
31
|
-
vision_agent-0.2.
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
30
|
+
vision_agent-0.2.135.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
31
|
+
vision_agent-0.2.135.dist-info/METADATA,sha256=oL3jpDgWSw8X5Bp_nEzTy93CrKPf0rdz_C7w3KQPc8I,12252
|
32
|
+
vision_agent-0.2.135.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
33
|
+
vision_agent-0.2.135.dist-info/RECORD,,
|
File without changes
|
File without changes
|