vision-agent 0.2.95.tar.gz → 0.2.97.tar.gz

Files changed (29):
  1. {vision_agent-0.2.95 → vision_agent-0.2.97}/PKG-INFO +2 -2
  2. {vision_agent-0.2.95 → vision_agent-0.2.97}/pyproject.toml +2 -2
  3. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/tools/tools.py +29 -14
  4. {vision_agent-0.2.95 → vision_agent-0.2.97}/LICENSE +0 -0
  5. {vision_agent-0.2.95 → vision_agent-0.2.97}/README.md +0 -0
  6. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/__init__.py +0 -0
  7. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/__init__.py +0 -0
  8. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/agent.py +0 -0
  9. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/agent_utils.py +0 -0
  10. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/vision_agent.py +0 -0
  11. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/vision_agent_coder.py +0 -0
  12. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  13. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/vision_agent_prompts.py +0 -0
  14. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/fonts/__init__.py +0 -0
  15. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  16. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/lmm/__init__.py +0 -0
  17. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/lmm/lmm.py +0 -0
  18. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/lmm/types.py +0 -0
  19. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/tools/__init__.py +0 -0
  20. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/tools/meta_tools.py +0 -0
  21. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/tools/prompts.py +0 -0
  22. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/tools/tool_utils.py +0 -0
  23. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/__init__.py +0 -0
  24. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/exceptions.py +0 -0
  25. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/execute.py +0 -0
  26. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/image_utils.py +0 -0
  27. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/sim.py +0 -0
  28. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/type_defs.py +0 -0
  29. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/video.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.95
+Version: 0.2.97
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -11,7 +11,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: anthropic (>=0.31.0,<0.32.0)
 Requires-Dist: e2b (>=0.17.1,<0.18.0)
-Requires-Dist: e2b-code-interpreter (==0.0.11a17)
+Requires-Dist: e2b-code-interpreter (==0.0.11a27)
 Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
 Requires-Dist: langsmith (>=0.1.58,<0.2.0)
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.95"
+version = "0.2.97"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -35,7 +35,7 @@ rich = "^13.7.1"
 langsmith = "^0.1.58"
 ipykernel = "^6.29.4"
 e2b = "^0.17.1"
-e2b-code-interpreter = "0.0.11a17"
+e2b-code-interpreter = "0.0.11a27"
 tenacity = "^8.3.0"
 pillow-heif = "^0.16.0"
 pytube = "15.0.0"
vision_agent/tools/tools.py
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast
 import cv2
 import numpy as np
 import requests
+from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
@@ -106,6 +107,7 @@ def grounding_dino(
             "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
         ),
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "grounding_dino",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
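
Note: every tool wrapper in tools.py receives the same one-line change shown above; the payload sent to the "tools" endpoint now carries a "function_name" key naming the public tool that made the call. A minimal self-contained sketch of the resulting payload shape (the helper make_grounding_dino_payload and all literal values are illustrative, not part of the package):

import base64

def make_grounding_dino_payload(prompt: str, image_bytes: bytes) -> dict:
    # Payload shape after this change; "function_name" is the new key.
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")
    return {
        "prompt": prompt,
        "image": image_b64,
        "tool": "visual_grounding",
        "kwargs": {"box_threshold": 0.20, "iou_threshold": 0.20},
        "function_name": "grounding_dino",  # new in 0.2.97
    }

payload = make_grounding_dino_payload("dog", b"\x89PNG...")  # bytes illustrative
assert payload["function_name"] == "grounding_dino"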
@@ -161,6 +163,7 @@ def owl_v2(
         "image": image_b64,
         "tool": "open_vocab_detection",
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "owl_v2",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
@@ -225,6 +228,7 @@ def grounding_sam(
         "image": image_b64,
         "tool": "visual_grounding_segment",
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "grounding_sam",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
@@ -364,6 +368,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "zero_shot_counting",
+        "function_name": "loca_zero_shot_counting",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
@@ -399,6 +404,7 @@ def loca_visual_prompt_counting(
         "image": image_b64,
         "prompt": bbox_str,
         "tool": "few_shot_counting",
+        "function_name": "loca_visual_prompt_counting",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
@@ -428,6 +434,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
         "image": image_b64,
         "prompt": prompt,
         "tool": "image_question_answering_with_context",
+        "function_name": "florencev2_roberta_vqa",
     }
 
     answer = send_inference_request(data, "tools")
@@ -457,6 +464,7 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
         "image": image_b64,
         "prompt": prompt,
         "tool": "image_question_answering",
+        "function_name": "git_vqa_v2",
     }
 
     answer = send_inference_request(data, "tools")
@@ -487,6 +495,7 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
         "prompt": ",".join(classes),
         "image": image_b64,
         "tool": "closed_set_image_classification",
+        "function_name": "clip",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
@@ -514,6 +523,7 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "image_classification",
+        "function_name": "vit_image_classification",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
@@ -541,6 +551,7 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "nsfw_image_classification",
+        "function_name": "vit_nsfw_classification",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = round(resp_data["scores"], 4)
@@ -567,6 +578,7 @@ def blip_image_caption(image: np.ndarray) -> str:
     data = {
         "image": image_b64,
         "tool": "image_captioning",
+        "function_name": "blip_image_caption",
     }
 
     answer = send_inference_request(data, "tools")
@@ -595,6 +607,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
         "image": image_b64,
         "tool": "florence2_image_captioning",
         "detail_caption": detail_caption,
+        "function_name": "florencev2_image_caption",
     }
 
     answer = send_inference_request(data, "tools")
@@ -630,6 +643,7 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "tool": "object_detection",
+        "function_name": "florencev2_object_detection",
     }
 
     answer = send_inference_request(data, "tools")
@@ -686,6 +700,7 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "tool": "panoptic_segmentation",
+        "function_name": "detr_segmentation",
     }
 
     answer = send_inference_request(data, "tools")
@@ -728,6 +743,7 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_depth",
+        "function_name": "depth_anything_v2",
     }
 
     answer = send_inference_request(data, "tools")
@@ -759,6 +775,7 @@ def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_hed",
+        "function_name": "generate_soft_edge_image",
     }
 
     answer = send_inference_request(data, "tools")
@@ -791,6 +808,7 @@ def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_normal",
+        "function_name": "dpt_hybrid_midas",
     }
 
     answer = send_inference_request(data, "tools")
@@ -822,6 +840,7 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_pose",
+        "function_name": "generate_pose_image",
     }
 
     answer = send_inference_request(data, "tools")
@@ -862,6 +881,7 @@ def template_match(
         "image": image_b64,
         "template": template_image_b64,
         "tool": "template_match",
+        "function_name": "template_match",
     }
 
     answer = send_inference_request(data, "tools")
@@ -1044,20 +1064,15 @@ def save_video(
         _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
         fps = 4
 
-    if not output_video_path:
-        output_video_path = tempfile.NamedTemporaryFile(
-            suffix=".mp4", delete=False
-        ).name
-
-    height, width, layers = frames[0].shape if frames else (0, 0, 0)
-    fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
-    video = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
-    for frame in frames:
-        video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
-    video.release()
-
-    _save_video_to_result(output_video_path)
-    return output_video_path
+    with ImageSequenceClip(frames, fps=fps) as video:
+        if output_video_path:
+            f = open(output_video_path, "wb")
+        else:
+            f = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)  # type: ignore
+        video.write_videofile(f.name, codec="libx264")
+        f.close()
+        _save_video_to_result(f.name)
+        return f.name
 
 
 def _save_video_to_result(video_uri: str) -> None:
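
Note: the save_video rewrite above swaps the cv2.VideoWriter pipeline for moviepy's ImageSequenceClip (imported at the top of the file), which accepts RGB frames directly, so the per-frame COLOR_RGB2BGR conversion goes away, and encodes with libx264. A standalone sketch of the same write path, assuming moviepy and an ffmpeg binary are installed; the dummy frames are illustrative:

import tempfile
import numpy as np
from moviepy.editor import ImageSequenceClip

# 24 dummy 64x64 RGB frames (uint8), stand-ins for real video frames.
frames = [np.full((64, 64, 3), i * 10, dtype=np.uint8) for i in range(24)]

with ImageSequenceClip(frames, fps=4) as clip:  # fps is required for raw frames
    out = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    clip.write_videofile(out.name, codec="libx264")  # RGB frames in, H.264 MP4 out
    out.close()
print(out.name)  # path to the written MP4

One practical difference: write_videofile shells out to ffmpeg at runtime, whereas the old cv2.VideoWriter path relied on OpenCV's bundled codecs.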