PyPI - vision-agent - Versions diffs - 0.2.165__tar.gz → 0.2.167__tar.gz - Mend

vision-agent 0.2.165.tar.gz → 0.2.167.tar.gz

Files changed (35)

{vision_agent-0.2.165 → vision_agent-0.2.167}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.165
+Version: 0.2.167
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.165 → vision_agent-0.2.167}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.165"
+version = "0.2.167"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.165 → vision_agent-0.2.167}/vision_agent/agent/vision_agent.py RENAMED Viewed

@@ -85,6 +85,15 @@ def format_agent_message(agent_message: str) -> str:
     return output
+def _clean_response(response: str) -> str:
+    # Sometimes the LLM will hallucinate responses to an <execute_python> tag as if it
+    # had already executed the code. This function removes the hallucinated response.
+    if "<execute_python>" in response:
+        end_execute_python = response.find("</execute_python>")
+        response = response[: end_execute_python + len("</execute_python>")]
+    return response
 def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     chat = copy.deepcopy(chat)
@@ -114,6 +123,10 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
         message["media"] = chat[-1]["media"]
     conv_resp = cast(str, orch([message], stream=False))
+    # Clean the response first: if we are executing code, do not respond or end
+    # the conversation before the code has been executed.
+    conv_resp = _clean_response(conv_resp)
     let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
     let_user_respond = (
         "true" in let_user_respond_str.lower() if let_user_respond_str else False
@@ -458,7 +471,7 @@ class VisionAgent(Agent):
                     self.streaming_message(
                         {
                             "role": "assistant",
-                            "content": json.dumps(response),
+                            "content": json.dumps(add_step_descriptions(response)),
                             "finished": finished and code_action is None,
                         }
                     )

{vision_agent-0.2.165 → vision_agent-0.2.167}/vision_agent/tools/meta_tools.py RENAMED Viewed

@@ -676,12 +676,13 @@ def use_extra_vision_agent_args(
     for node in red:
         # seems to always be atomtrailers not call type
         if node.type == "atomtrailers":
+            if node.name.value == "generate_vision_code":
+                node.value[1].value.append(f"test_multi_plan={test_multi_plan}")
             if (
                 node.name.value == "generate_vision_code"
                 or node.name.value == "edit_vision_code"
             ):
-                node.value[1].value.append(f"test_multi_plan={test_multi_plan}")
                 if custom_tool_names is not None:
                     node.value[1].value.append(f"custom_tool_names={custom_tool_names}")
     cleaned_code = red.dumps().strip()

{vision_agent-0.2.165 → vision_agent-0.2.167}/vision_agent/utils/video.py RENAMED Viewed

@@ -11,6 +11,9 @@ import numpy as np
 _LOGGER = logging.getLogger(__name__)
 # The maximum length of the clip to extract frames from, in seconds
+_DEFAULT_VIDEO_FPS = 24
+_DEFAULT_INPUT_FPS = 1.0
 def play_video(video_base64: str) -> None:
     """Play a video file"""
@@ -51,7 +54,9 @@ def _resize_frame(frame: np.ndarray) -> np.ndarray:
 def video_writer(
-    frames: List[np.ndarray], fps: float = 1.0, filename: Optional[str] = None
+    frames: List[np.ndarray],
+    fps: float = _DEFAULT_INPUT_FPS,
+    filename: Optional[str] = None,
 ) -> str:
     if filename is None:
         filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
@@ -78,7 +83,7 @@ def video_writer(
 def frames_to_bytes(
-    frames: List[np.ndarray], fps: float = 1.0, file_ext: str = ".mp4"
+    frames: List[np.ndarray], fps: float = _DEFAULT_INPUT_FPS, file_ext: str = ".mp4"
 ) -> bytes:
     r"""Convert a list of frames to a video file encoded into a byte string.
@@ -101,7 +106,7 @@ def frames_to_bytes(
 # same file name and the time savings are very large.
 @lru_cache(maxsize=8)
 def extract_frames_from_video(
-    video_uri: str, fps: float = 1.0
+    video_uri: str, fps: float = _DEFAULT_INPUT_FPS
 ) -> List[Tuple[np.ndarray, float]]:
     """Extract frames from a video along with the timestamp in seconds.
@@ -118,6 +123,16 @@ def extract_frames_from_video(
     cap = cv2.VideoCapture(video_uri)
     orig_fps = cap.get(cv2.CAP_PROP_FPS)
+    if not orig_fps or orig_fps <= 0:
+        _LOGGER.warning(
+            f"Input video, {video_uri}, has no fps, using the default value {_DEFAULT_VIDEO_FPS}"
+        )
+        orig_fps = _DEFAULT_VIDEO_FPS
+    if not fps or fps <= 0:
+        _LOGGER.warning(
+            f"Input fps, {fps}, is illegal, using the default value: {_DEFAULT_INPUT_FPS}"
+        )
+        fps = _DEFAULT_INPUT_FPS
     orig_frame_time = 1 / orig_fps
     targ_frame_time = 1 / fps
     frames: List[Tuple[np.ndarray, float]] = []
@@ -129,10 +144,15 @@ def extract_frames_from_video(
             break
         elapsed_time += orig_frame_time
+        # This guards against floating-point precision loss, which can leave the
+        # elapsed time slightly below the target frame time and cause the last
+        # frame to be skipped
+        elapsed_time = round(elapsed_time, 8)
         if elapsed_time >= targ_frame_time:
             frames.append((cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), i / orig_fps))
             elapsed_time -= targ_frame_time
         i += 1
     cap.release()
+    _LOGGER.info(f"Extracted {len(frames)} frames from {video_uri}")
     return frames