PyPI - vision-agent - Versions diffs - 0.2.94__tar.gz → 0.2.96__tar.gz - Mend

vision-agent 0.2.94.tar.gz → 0.2.96.tar.gz

Files changed (29) hide show

{vision_agent-0.2.94 → vision_agent-0.2.96}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.94
+Version: 0.2.96
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -11,7 +11,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: anthropic (>=0.31.0,<0.32.0)
 Requires-Dist: e2b (>=0.17.1,<0.18.0)
-Requires-Dist: e2b-code-interpreter (==0.0.11a2)
+Requires-Dist: e2b-code-interpreter (==0.0.11a17)
 Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
 Requires-Dist: langsmith (>=0.1.58,<0.2.0)
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
@@ -23,6 +23,7 @@ Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
 Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
+Requires-Dist: pydantic (==2.7.4)
 Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
 Requires-Dist: pytube (==15.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)

{vision_agent-0.2.94 → vision_agent-0.2.96}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.94"
+version = "0.2.96"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -35,11 +35,12 @@ rich = "^13.7.1"
 langsmith = "^0.1.58"
 ipykernel = "^6.29.4"
 e2b = "^0.17.1"
-e2b-code-interpreter = "0.0.11a2"
+e2b-code-interpreter = "0.0.11a17"
 tenacity = "^8.3.0"
 pillow-heif = "^0.16.0"
 pytube = "15.0.0"
 anthropic = "^0.31.0"
+pydantic = "2.7.4"
 [tool.poetry.group.dev.dependencies]
 autoflake = "1.*"

{vision_agent-0.2.94 → vision_agent-0.2.96}/vision_agent/tools/tools.py RENAMED Viewed

@@ -9,7 +9,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast
 import cv2
 import numpy as np
 import requests
-from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
@@ -107,6 +106,7 @@ def grounding_dino(
             "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
         ),
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "grounding_dino",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
@@ -162,6 +162,7 @@ def owl_v2(
         "image": image_b64,
         "tool": "open_vocab_detection",
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "owl_v2",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
@@ -226,6 +227,7 @@ def grounding_sam(
         "image": image_b64,
         "tool": "visual_grounding_segment",
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "grounding_sam",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
@@ -365,6 +367,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "zero_shot_counting",
+        "function_name": "loca_zero_shot_counting",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
@@ -400,6 +403,7 @@ def loca_visual_prompt_counting(
         "image": image_b64,
         "prompt": bbox_str,
         "tool": "few_shot_counting",
+        "function_name": "loca_visual_prompt_counting",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
@@ -429,6 +433,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
         "image": image_b64,
         "prompt": prompt,
         "tool": "image_question_answering_with_context",
+        "function_name": "florencev2_roberta_vqa",
     }
     answer = send_inference_request(data, "tools")
@@ -458,6 +463,7 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
         "image": image_b64,
         "prompt": prompt,
         "tool": "image_question_answering",
+        "function_name": "git_vqa_v2",
     }
     answer = send_inference_request(data, "tools")
@@ -488,6 +494,7 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
         "prompt": ",".join(classes),
         "image": image_b64,
         "tool": "closed_set_image_classification",
+        "function_name": "clip",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
@@ -515,6 +522,7 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "image_classification",
+        "function_name": "vit_image_classification",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
@@ -542,6 +550,7 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "nsfw_image_classification",
+        "function_name": "vit_nsfw_classification",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = round(resp_data["scores"], 4)
@@ -568,6 +577,7 @@ def blip_image_caption(image: np.ndarray) -> str:
     data = {
         "image": image_b64,
         "tool": "image_captioning",
+        "function_name": "blip_image_caption",
     }
     answer = send_inference_request(data, "tools")
@@ -596,6 +606,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
         "image": image_b64,
         "tool": "florence2_image_captioning",
         "detail_caption": detail_caption,
+        "function_name": "florencev2_image_caption",
     }
     answer = send_inference_request(data, "tools")
@@ -631,6 +642,7 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "tool": "object_detection",
+        "function_name": "florencev2_object_detection",
     }
     answer = send_inference_request(data, "tools")
@@ -687,6 +699,7 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "tool": "panoptic_segmentation",
+        "function_name": "detr_segmentation",
     }
     answer = send_inference_request(data, "tools")
@@ -729,6 +742,7 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_depth",
+        "function_name": "depth_anything_v2",
     }
     answer = send_inference_request(data, "tools")
@@ -760,6 +774,7 @@ def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_hed",
+        "function_name": "generate_soft_edge_image",
     }
     answer = send_inference_request(data, "tools")
@@ -792,6 +807,7 @@ def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_normal",
+        "function_name": "dpt_hybrid_midas",
     }
     answer = send_inference_request(data, "tools")
@@ -823,6 +839,7 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_pose",
+        "function_name": "generate_pose_image",
     }
     answer = send_inference_request(data, "tools")
@@ -863,6 +880,7 @@ def template_match(
         "image": image_b64,
         "template": template_image_b64,
         "tool": "template_match",
+        "function_name": "template_match",
     }
     answer = send_inference_request(data, "tools")
@@ -1044,15 +1062,21 @@ def save_video(
     if fps <= 0:
         _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
         fps = 4
-    with ImageSequenceClip(frames, fps=fps) as video:
-        if output_video_path:
-            f = open(output_video_path, "wb")
-        else:
-            f = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)  # type: ignore
-        video.write_videofile(f.name, codec="libx264")
-        f.close()
-        _save_video_to_result(f.name)
-        return f.name
+    if not output_video_path:
+        output_video_path = tempfile.NamedTemporaryFile(
+            suffix=".mp4", delete=False
+        ).name
+    height, width, layers = frames[0].shape if frames else (0, 0, 0)
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
+    video = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
+    for frame in frames:
+        video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+    video.release()
+    _save_video_to_result(output_video_path)
+    return output_video_path
 def _save_video_to_result(video_uri: str) -> None:

{vision_agent-0.2.94 → vision_agent-0.2.96}/vision_agent/utils/execute.py RENAMED Viewed

@@ -1,6 +1,5 @@
 import abc
 import base64
-import copy
 import logging
 import os
 import platform
@@ -17,9 +16,14 @@ from typing import Any, Dict, Iterable, List, Optional, Union
 import nbformat
 import tenacity
 from dotenv import load_dotenv
+from e2b.exceptions import SandboxException
 from e2b_code_interpreter import CodeInterpreter as E2BCodeInterpreterImpl
 from e2b_code_interpreter import Execution as E2BExecution
 from e2b_code_interpreter import Result as E2BResult
+from h11._util import LocalProtocolError
+from httpx import ConnectError
+from httpx import RemoteProtocolError as HttpcoreRemoteProtocolError
+from httpx import RemoteProtocolError as HttpxRemoteProtocolError
 from nbclient import NotebookClient
 from nbclient import __version__ as nbclient_version
 from nbclient.exceptions import CellTimeoutError, DeadKernelError
@@ -29,7 +33,6 @@ from pydantic import BaseModel, field_serializer
 from typing_extensions import Self
 from vision_agent.utils.exceptions import (
-    RemoteSandboxClosedError,
     RemoteSandboxCreationError,
     RemoteSandboxExecutionError,
 )
@@ -106,13 +109,8 @@ class Result:
     is_main_result: bool
     "Whether this data is the result of the cell. Data can be produced by display calls of which can be multiple in a cell."
-    raw: Dict[str, str]
-    "Dictionary that maps MIME types to their corresponding string representations of the data."
     def __init__(self, is_main_result: bool, data: Dict[str, Any]):
         self.is_main_result = is_main_result
-        self.raw = copy.deepcopy(data)
         self.text = data.pop(MimeType.TEXT_PLAIN, None)
         if self.text and (self.text.startswith("'") and self.text.endswith("'")):
             # This is a workaround for the issue that str result is wrapped with single quotes by notebook.
@@ -136,13 +134,13 @@ class Result:
     # Allows to iterate over formats()
     def __getitem__(self, key: Any) -> Any:
-        return self.raw[key] if key in self.raw else getattr(self, key)
+        return getattr(self, key)
     def __str__(self) -> str:
         return repr(self)
     def __repr__(self) -> str:
-        return str(self.raw)
+        return str(self.text)
     def _repr_html_(self) -> Optional[str]:
         """Returns the HTML representation of the data."""
@@ -215,9 +213,16 @@ class Result:
         """
         Creates a Result object from an E2BResult object.
         """
+        data = {
+            MimeType.TEXT_PLAIN.value: result.text,
+            MimeType.IMAGE_PNG.value: result.png,
+            MimeType.APPLICATION_JSON.value: result.json,
+        }
+        for k, v in result.extra.items():
+            data[k] = v
         return Result(
             is_main_result=result.is_main_result,
-            data=result.raw,
+            data=data,
         )
@@ -367,7 +372,7 @@ class Execution(BaseModel):
                     value=_remove_escape_and_color_codes(exec.error.value),
                     traceback_raw=[
                         _remove_escape_and_color_codes(line)
-                        for line in exec.error.traceback_raw
+                        for line in exec.error.traceback.split("\n")
                     ],
                 )
                 if exec.error
@@ -436,11 +441,12 @@ va_version = importlib.metadata.version("vision-agent")
 print(f"Vision Agent version: {va_version}")"""
         )
         sys_versions = "\n".join(result.logs.stdout)
-        _LOGGER.info(f"E2BCodeInterpreter initialized:\n{sys_versions}")
+        _LOGGER.info(
+            f"E2BCodeInterpreter (sandbox id: {self.interpreter.sandbox_id}) initialized:\n{sys_versions}"
+        )
     def close(self, *args: Any, **kwargs: Any) -> None:
         try:
-            self.interpreter.notebook.close()
             self.interpreter.kill(request_timeout=2)
             _LOGGER.info(
                 f"The sandbox {self.interpreter.sandbox_id} is closed successfully."
@@ -451,28 +457,67 @@ print(f"Vision Agent version: {va_version}")"""
             )
     def restart_kernel(self) -> None:
-        self._check_sandbox_liveness()
         self.interpreter.notebook.restart_kernel()
     @tenacity.retry(
         wait=tenacity.wait_exponential_jitter(),
-        stop=tenacity.stop_after_attempt(2),
-        # TODO: change TimeoutError to a more specific exception when e2b team provides more granular retryable exceptions
-        retry=tenacity.retry_if_exception_type(TimeoutError),
+        stop=tenacity.stop_after_attempt(3),
+        retry=tenacity.retry_if_exception_type(
+            (
+                LocalProtocolError,
+                HttpxRemoteProtocolError,
+                HttpcoreRemoteProtocolError,
+                ConnectError,
+                SandboxException,
+            )
+        ),
+        before_sleep=tenacity.before_sleep_log(_LOGGER, logging.INFO),
+        after=tenacity.after_log(_LOGGER, logging.INFO),
     )
     def exec_cell(self, code: str) -> Execution:
-        self._check_sandbox_liveness()
         self.interpreter.set_timeout(_SESSION_TIMEOUT)  # Extend the life of the sandbox
         try:
-            execution = self.interpreter.notebook.exec_cell(code, timeout=self.timeout)
+            _LOGGER.info(
+                f"Start code execution in remote sandbox {self.interpreter.sandbox_id}. Timeout: {_SESSION_TIMEOUT}. Code hash: {hash(code)}"
+            )
+            execution = self.interpreter.notebook.exec_cell(
+                code=code,
+                on_stdout=lambda msg: _LOGGER.info(msg),
+                on_stderr=lambda msg: _LOGGER.info(msg),
+            )
+            _LOGGER.info(
+                f"Finished code execution in remote sandbox {self.interpreter.sandbox_id}. Code hash: {hash(code)}"
+            )
             return Execution.from_e2b_execution(execution)
+        except (
+            LocalProtocolError,
+            HttpxRemoteProtocolError,
+            HttpcoreRemoteProtocolError,
+            ConnectError,
+            SandboxException,
+        ) as e:
+            raise e
         except Exception as e:
             raise RemoteSandboxExecutionError(
-                f"Failed executing code in remote sandbox due to {e}: {code}"
+                f"Failed executing code in remote sandbox ({self.interpreter.sandbox_id}) due to error '{type(e).__name__} {str(e)}', code: {code}"
             ) from e
+    @tenacity.retry(
+        wait=tenacity.wait_exponential_jitter(),
+        stop=tenacity.stop_after_attempt(3),
+        retry=tenacity.retry_if_exception_type(
+            (
+                LocalProtocolError,
+                HttpxRemoteProtocolError,
+                HttpcoreRemoteProtocolError,
+                ConnectError,
+                SandboxException,
+            )
+        ),
+        before_sleep=tenacity.before_sleep_log(_LOGGER, logging.INFO),
+        after=tenacity.after_log(_LOGGER, logging.INFO),
+    )
     def upload_file(self, file: Union[str, Path]) -> str:
-        self._check_sandbox_liveness()
         file_name = Path(file).name
         remote_path = f"/home/user/{file_name}"
         with open(file, "rb") as f:
@@ -481,28 +526,18 @@ print(f"Vision Agent version: {va_version}")"""
             return remote_path
     def download_file(self, file_path: str) -> Path:
-        self._check_sandbox_liveness()
         with tempfile.NamedTemporaryFile(mode="w+b", delete=False) as file:
             file.write(self.interpreter.files.read(path=file_path, format="bytes"))
             _LOGGER.info(f"File ({file_path}) is downloaded to: {file.name}")
             return Path(file.name)
-    def _check_sandbox_liveness(self) -> None:
-        try:
-            alive = self.interpreter.is_running(request_timeout=2)
-        except Exception as e:
-            _LOGGER.error(
-                f"Failed to check the health of the remote sandbox ({self.interpreter.sandbox_id}) due to {e}. Consider the sandbox as dead."
-            )
-            alive = False
-        if not alive:
-            raise RemoteSandboxClosedError(
-                "Remote sandbox is closed unexpectedly. Please start a new VisionAgent instance."
-            )
     @staticmethod
     def _new_e2b_interpreter_impl(*args, **kwargs) -> E2BCodeInterpreterImpl:  # type: ignore
-        return E2BCodeInterpreterImpl(template="va-sandbox", *args, **kwargs)
+        template_name = os.environ.get("E2B_TEMPLATE_NAME", "nx3fagq7sgdliww9cvm3")
+        _LOGGER.info(
+            f"Creating a new E2BCodeInterpreter using template: {template_name}"
+        )
+        return E2BCodeInterpreterImpl(template=template_name, *args, **kwargs)
 class LocalCodeInterpreter(CodeInterpreter):

{vision_agent-0.2.94 → vision_agent-0.2.96}/vision_agent/utils/sim.py RENAMED Viewed

@@ -9,6 +9,7 @@ from openai import AzureOpenAI, Client, OpenAI
 from scipy.spatial.distance import cosine  # type: ignore
+@lru_cache(maxsize=512)
 def get_embedding(
     client: Client, text: str, model: str = "text-embedding-3-small"
 ) -> List[float]: