vision-agent 0.2.133__py3-none-any.whl → 0.2.135__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,6 +15,7 @@ from vision_agent.agent.vision_agent_prompts import (
15
15
  from vision_agent.lmm import LMM, Message, OpenAILMM
16
16
  from vision_agent.tools import META_TOOL_DOCSTRING, save_image, load_image
17
17
  from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
18
+ from vision_agent.tools.tools import extract_frames, save_video
18
19
  from vision_agent.utils import CodeInterpreterFactory
19
20
  from vision_agent.utils.execute import CodeInterpreter, Execution
20
21
 
@@ -224,9 +225,20 @@ class VisionAgent(Agent):
224
225
  for media in chat_i["media"]:
225
226
  if type(media) is str and media.startswith(("http", "https")):
226
227
  # TODO: Ideally we should not call VA.tools here, we should come to revisit how to better support remote image later
227
- file_path = Path(media).name
228
- ndarray = load_image(media)
229
- save_image(ndarray, file_path)
228
+ file_path = str(
229
+ Path(self.local_artifacts_path).parent
230
+ / Path(media).name
231
+ )
232
+ if file_path.lower().endswith(
233
+ ".mp4"
234
+ ) or file_path.lower().endswith(".mov"):
235
+ video_frames = extract_frames(media)
236
+ save_video(
237
+ [frame for frame, _ in video_frames], file_path
238
+ )
239
+ else:
240
+ ndarray = load_image(media)
241
+ save_image(ndarray, file_path)
230
242
  media = file_path
231
243
  else:
232
244
  media = cast(str, media)
@@ -53,25 +53,27 @@ def redisplay_results(execution: Execution) -> None:
53
53
  """
54
54
  for result in execution.results:
55
55
  if result.text is not None:
56
- display({MimeType.TEXT_PLAIN: result.text})
56
+ display({MimeType.TEXT_PLAIN: result.text}, raw=True)
57
57
  if result.html is not None:
58
- display({MimeType.TEXT_HTML: result.html})
58
+ display({MimeType.TEXT_HTML: result.html}, raw=True)
59
59
  if result.markdown is not None:
60
- display({MimeType.TEXT_MARKDOWN: result.markdown})
60
+ display({MimeType.TEXT_MARKDOWN: result.markdown}, raw=True)
61
61
  if result.svg is not None:
62
- display({MimeType.IMAGE_SVG: result.svg})
62
+ display({MimeType.IMAGE_SVG: result.svg}, raw=True)
63
63
  if result.png is not None:
64
- display({MimeType.IMAGE_PNG: result.png})
64
+ display({MimeType.IMAGE_PNG: result.png}, raw=True)
65
65
  if result.jpeg is not None:
66
- display({MimeType.IMAGE_JPEG: result.jpeg})
66
+ display({MimeType.IMAGE_JPEG: result.jpeg}, raw=True)
67
67
  if result.mp4 is not None:
68
- display({MimeType.VIDEO_MP4_B64: result.mp4})
68
+ display({MimeType.VIDEO_MP4_B64: result.mp4}, raw=True)
69
69
  if result.latex is not None:
70
- display({MimeType.TEXT_LATEX: result.latex})
70
+ display({MimeType.TEXT_LATEX: result.latex}, raw=True)
71
71
  if result.json is not None:
72
- display({MimeType.APPLICATION_JSON: result.json})
72
+ display({MimeType.APPLICATION_JSON: result.json}, raw=True)
73
+ if result.artifact_name is not None:
74
+ display({MimeType.TEXT_ARTIFACT_NAME: result.artifact_name}, raw=True)
73
75
  if result.extra is not None:
74
- display(result.extra)
76
+ display(result.extra, raw=True)
75
77
 
76
78
 
77
79
  class Artifacts:
@@ -208,7 +210,7 @@ def create_code_artifact(artifacts: Artifacts, name: str) -> str:
208
210
  return_str = f"[Artifact {name} created]"
209
211
  print(return_str)
210
212
 
211
- display({MimeType.APPLICATION_JSON: {"last_artifact": name}})
213
+ display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
212
214
  return return_str
213
215
 
214
216
 
@@ -292,7 +294,7 @@ def edit_code_artifact(
292
294
 
293
295
  artifacts[name] = "".join(edited_lines)
294
296
 
295
- display({MimeType.APPLICATION_JSON: {"last_artifact": name}})
297
+ display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
296
298
  return open_code_artifact(artifacts, name, cur_line)
297
299
 
298
300
 
@@ -348,7 +350,7 @@ def generate_vision_code(
348
350
  code_lines = code.splitlines(keepends=True)
349
351
  total_lines = len(code_lines)
350
352
 
351
- display({MimeType.APPLICATION_JSON: {"last_artifact": name}})
353
+ display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
352
354
  return view_lines(code_lines, 0, total_lines, name, total_lines)
353
355
 
354
356
 
@@ -413,7 +415,7 @@ def edit_vision_code(
413
415
  code_lines = code.splitlines(keepends=True)
414
416
  total_lines = len(code_lines)
415
417
 
416
- display({MimeType.APPLICATION_JSON: {"last_artifact": name}})
418
+ display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
417
419
  return view_lines(code_lines, 0, total_lines, name, total_lines)
418
420
 
419
421
 
@@ -427,6 +429,7 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
427
429
  with open(local_path, "rb") as f:
428
430
  media = f.read()
429
431
  artifacts[Path(local_path).name] = media
432
+ display({MimeType.TEXT_ARTIFACT_NAME: Path(local_path).name}, raw=True)
430
433
  return f"[Media {Path(local_path).name} saved]"
431
434
 
432
435
 
@@ -592,6 +595,8 @@ def use_florence2_fine_tuning(
592
595
 
593
596
  diff = get_diff_with_prompts(name, code, new_code)
594
597
  print(diff)
598
+
599
+ display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
595
600
  return diff
596
601
 
597
602
 
@@ -56,6 +56,7 @@ class MimeType(str, Enum):
56
56
  TEXT_LATEX = "text/latex"
57
57
  APPLICATION_JSON = "application/json"
58
58
  APPLICATION_JAVASCRIPT = "application/javascript"
59
+ TEXT_ARTIFACT_NAME = "text/artifact/name"
59
60
 
60
61
 
61
62
  class FileSerializer:
@@ -103,6 +104,7 @@ class Result:
103
104
  latex: Optional[str] = None
104
105
  json: Optional[Dict[str, Any]] = None
105
106
  javascript: Optional[str] = None
107
+ artifact_name: Optional[str] = None
106
108
  extra: Optional[Dict[str, Any]] = None
107
109
  "Extra data that can be included. Not part of the standard types."
108
110
 
@@ -127,6 +129,7 @@ class Result:
127
129
  self.latex = data.pop(MimeType.TEXT_LATEX, None)
128
130
  self.json = data.pop(MimeType.APPLICATION_JSON, None)
129
131
  self.javascript = data.pop(MimeType.APPLICATION_JAVASCRIPT, None)
132
+ self.artifact_name = data.pop(MimeType.TEXT_ARTIFACT_NAME, None)
130
133
  self.extra = data
131
134
  # Only keeping the PNG representation if both PNG and JPEG are present
132
135
  if self.png and self.jpeg:
@@ -204,6 +207,8 @@ class Result:
204
207
  formats.append("javascript")
205
208
  if self.mp4:
206
209
  formats.append("mp4")
210
+ if self.artifact_name:
211
+ formats.append("artifact_name")
207
212
  if self.extra:
208
213
  formats.extend(iter(self.extra))
209
214
  return formats
@@ -691,8 +696,9 @@ class CodeInterpreterFactory:
691
696
  if not code_sandbox_runtime:
692
697
  code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
693
698
  if code_sandbox_runtime == "e2b":
699
+ envs = _get_e2b_env()
694
700
  instance: CodeInterpreter = E2BCodeInterpreter(
695
- timeout=_SESSION_TIMEOUT, remote_path=remote_path
701
+ timeout=_SESSION_TIMEOUT, remote_path=remote_path, envs=envs
696
702
  )
697
703
  elif code_sandbox_runtime == "local":
698
704
  instance = LocalCodeInterpreter(
@@ -705,6 +711,20 @@ class CodeInterpreterFactory:
705
711
  return instance
706
712
 
707
713
 
714
+ def _get_e2b_env() -> Union[Dict[str, str], None]:
715
+ openai_api_key = os.getenv("OPENAI_API_KEY", "")
716
+ anthropic_api_key = os.getenv("ANTHROPIC_API_KEY", "")
717
+ if openai_api_key or anthropic_api_key:
718
+ envs = {}
719
+ if openai_api_key:
720
+ envs["OPENAI_API_KEY"] = openai_api_key
721
+ if anthropic_api_key:
722
+ envs["ANTHROPIC_API_KEY"] = anthropic_api_key
723
+ else:
724
+ envs = None
725
+ return envs
726
+
727
+
708
728
  def _parse_local_code_interpreter_outputs(outputs: List[Dict[str, Any]]) -> Execution:
709
729
  """Parse notebook cell outputs to Execution object. Output types:
710
730
  https://nbformat.readthedocs.io/en/latest/format_description.html#code-cell-outputs
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.133
3
+ Version: 0.2.135
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -2,7 +2,7 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
2
2
  vision_agent/agent/__init__.py,sha256=TddDT4e3JVc68Dt0zSk0B4OBORx_R2WhAGK71uqEe2w,204
3
3
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
4
4
  vision_agent/agent/agent_utils.py,sha256=qOYQn-wJsa4j4YjFOBQ41xyklCg8Y94CIIGw9ZXmgIU,2053
5
- vision_agent/agent/vision_agent.py,sha256=nfxdY5W5UME7JhwFcsB3j2-L5zsYZzJWdlS2R8U_9lE,13224
5
+ vision_agent/agent/vision_agent.py,sha256=zCgCOPhOBcw9AyoUfyJcJ3HELE0FJvW4X5sWIjd67Bw,13868
6
6
  vision_agent/agent/vision_agent_coder.py,sha256=OI95goKTqVaEEPYwkn6bVsHsHZeifoBC8rjG9nD0Znc,36909
7
7
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=a7P19QscKNiaweke0zHPCfi5GQImpG-ZGKv_kXz0seg,13452
8
8
  vision_agent/agent/vision_agent_prompts.py,sha256=-fXiIIb48duXVljWYcJ0Y4ZzfNnRFi3C5cKdF4SdDo8,10075
@@ -15,19 +15,19 @@ vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,
15
15
  vision_agent/lmm/lmm.py,sha256=soWmEjtleQUSH2G3tYZWxOmteIqkgMVcmuZfx4mxszU,16838
16
16
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
17
17
  vision_agent/tools/__init__.py,sha256=nufZNzbcLTuXwxFmvZNj99qE8EO2qtEPT8wFsuI9vyE,2397
18
- vision_agent/tools/meta_tools.py,sha256=qbf_dzVmhf4zhv-xY1zaqRFshDlvj_7ilFQtSr70hdQ,21213
18
+ vision_agent/tools/meta_tools.py,sha256=Oz-wbsVSjVIH2LkTg5E4Yt2jN2N5W4DbvGkx-yVi0H0,21549
19
19
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
20
20
  vision_agent/tools/tool_utils.py,sha256=ZYqzcw_e937reoNr7gJgyKjQ7Gudxz1ttfIyo7F65w8,7758
21
21
  vision_agent/tools/tools.py,sha256=WKeB99ED0o_ISS_vZc-ch_1Dc8_Fl2fhnGlfVNwNouc,70024
22
22
  vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
23
23
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
24
24
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
25
- vision_agent/utils/execute.py,sha256=7MW9GY0gwq1VjOIV07ds4xO11voPZ0Iu_RSfb8K98Y0,27263
25
+ vision_agent/utils/execute.py,sha256=QY1GFwRDghecue_lz6s2IiRzcG1y8BrYrBohipYs7l4,27982
26
26
  vision_agent/utils/image_utils.py,sha256=zTTOJFOieMzwIquTFnW7T6ssx9o6XfoZ0Unqyk7GJrg,10746
27
27
  vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
28
28
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
29
29
  vision_agent/utils/video.py,sha256=hOjfEOZNcddYdoa0CoviXA4Vo9kwURKuojIJgLLJdp0,4745
30
- vision_agent-0.2.133.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
- vision_agent-0.2.133.dist-info/METADATA,sha256=A95zjuoBeW_yaDMLZUU0aITAjXHXSiDBXXvfrmpfhGo,12252
32
- vision_agent-0.2.133.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
- vision_agent-0.2.133.dist-info/RECORD,,
30
+ vision_agent-0.2.135.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
+ vision_agent-0.2.135.dist-info/METADATA,sha256=oL3jpDgWSw8X5Bp_nEzTy93CrKPf0rdz_C7w3KQPc8I,12252
32
+ vision_agent-0.2.135.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
+ vision_agent-0.2.135.dist-info/RECORD,,