vision-agent 0.2.131__tar.gz → 0.2.133__tar.gz

Files changed (33)
  1. {vision_agent-0.2.131 → vision_agent-0.2.133}/PKG-INFO +1 -2
  2. {vision_agent-0.2.131 → vision_agent-0.2.133}/pyproject.toml +1 -2
  3. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/agent/__init__.py +1 -0
  4. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/agent/agent_utils.py +30 -18
  5. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/agent/vision_agent.py +26 -3
  6. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/agent/vision_agent_coder.py +86 -26
  7. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/agent/vision_agent_coder_prompts.py +34 -8
  8. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/lmm/lmm.py +1 -1
  9. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/tools/__init__.py +1 -0
  10. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/tools/tools.py +42 -2
  11. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/utils/execute.py +12 -10
  12. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/utils/video.py +22 -11
  13. {vision_agent-0.2.131 → vision_agent-0.2.133}/LICENSE +0 -0
  14. {vision_agent-0.2.131 → vision_agent-0.2.133}/README.md +0 -0
  15. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/__init__.py +0 -0
  16. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/agent/agent.py +0 -0
  17. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/agent/vision_agent_prompts.py +0 -0
  18. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/clients/__init__.py +0 -0
  19. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/clients/http.py +0 -0
  20. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/clients/landing_public_api.py +0 -0
  21. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/fonts/__init__.py +0 -0
  22. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  23. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/lmm/__init__.py +0 -0
  24. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/lmm/types.py +0 -0
  25. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/tools/meta_tools.py +0 -0
  26. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/tools/prompts.py +0 -0
  27. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/tools/tool_utils.py +1 -1
  28. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/tools/tools_types.py +0 -0
  29. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/utils/__init__.py +0 -0
  30. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/utils/exceptions.py +0 -0
  31. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/utils/image_utils.py +0 -0
  32. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/utils/sim.py +0 -0
  33. {vision_agent-0.2.131 → vision_agent-0.2.133}/vision_agent/utils/type_defs.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.131
+Version: 0.2.133
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -13,7 +13,6 @@ Requires-Dist: anthropic (>=0.31.0,<0.32.0)
 Requires-Dist: av (>=11.0.0,<12.0.0)
 Requires-Dist: e2b (>=0.17.2a50,<0.18.0)
 Requires-Dist: e2b-code-interpreter (==0.0.11a37)
-Requires-Dist: eva-decord (>=0.6.1,<0.7.0)
 Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
 Requires-Dist: langsmith (>=0.1.58,<0.2.0)
 Requires-Dist: nbclient (>=0.10.0,<0.11.0)
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.131"
+version = "0.2.133"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -41,7 +41,6 @@ pillow-heif = "^0.16.0"
 pytube = "15.0.0"
 anthropic = "^0.31.0"
 pydantic = "2.7.4"
-eva-decord = "^0.6.1"
 av = "^11.0.0"
 
 [tool.poetry.group.dev.dependencies]
vision_agent/agent/__init__.py
@@ -2,6 +2,7 @@ from .agent import Agent
 from .vision_agent import VisionAgent
 from .vision_agent_coder import (
     AzureVisionAgentCoder,
+    ClaudeVisionAgentCoder,
     OllamaVisionAgentCoder,
     VisionAgentCoder,
 )
vision_agent/agent/agent_utils.py
@@ -14,6 +14,10 @@ def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
     if match:
         json_str = match.group()
         try:
+            # remove trailing comma
+            trailing_bracket_pattern = r",\s+\}"
+            json_str = re.sub(trailing_bracket_pattern, "}", json_str, flags=re.DOTALL)
+
             json_dict = json.loads(json_str)
             return json_dict  # type: ignore
         except json.JSONDecodeError:
@@ -21,29 +25,37 @@ def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
     return None
 
 
+def _find_markdown_json(json_str: str) -> str:
+    pattern = r"```json(.*?)```"
+    match = re.search(pattern, json_str, re.DOTALL)
+    if match:
+        return match.group(1).strip()
+    return json_str
+
+
+def _strip_markdown_code(inp_str: str) -> str:
+    pattern = r"```python.*?```"
+    cleaned_str = re.sub(pattern, "", inp_str, flags=re.DOTALL)
+    return cleaned_str
+
+
 def extract_json(json_str: str) -> Dict[str, Any]:
+    json_str = json_str.replace("\n", " ").strip()
+
     try:
-        json_str = json_str.replace("\n", " ")
-        json_dict = json.loads(json_str)
+        return json.loads(json_str)  # type: ignore
     except json.JSONDecodeError:
-        if "```json" in json_str:
-            json_str = json_str[json_str.find("```json") + len("```json") :]
-            json_str = json_str[: json_str.find("```")]
-        elif "```" in json_str:
-            json_str = json_str[json_str.find("```") + len("```") :]
-            # get the last ``` not one from an intermediate string
-            json_str = json_str[: json_str.find("}```")]
-        try:
-            json_dict = json.loads(json_str)
-        except json.JSONDecodeError as e:
-            json_dict = _extract_sub_json(json_str)
-            if json_dict is not None:
-                return json_dict  # type: ignore
-            error_msg = f"Could not extract JSON from the given str: {json_str}"
+        json_orig = json_str
+        json_str = _strip_markdown_code(json_str)
+        json_str = _find_markdown_json(json_str)
+        json_dict = _extract_sub_json(json_str)
+
+        if json_dict is None:
+            error_msg = f"Could not extract JSON from the given str: {json_orig}"
             _LOGGER.exception(error_msg)
-            raise ValueError(error_msg) from e
+            raise ValueError(error_msg)
 
-    return json_dict  # type: ignore
+        return json_dict
 
 
 def extract_code(code: str) -> str:
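For context, a minimal standalone sketch of the new extraction pipeline (a simplified re-implementation, not the packaged code verbatim; the sample `response` string is fabricated):

```python
import json
import re
from typing import Any, Dict, Optional


def _strip_markdown_code(inp_str: str) -> str:
    # drop fenced ```python blocks so stray code can't be mistaken for JSON
    return re.sub(r"```python.*?```", "", inp_str, flags=re.DOTALL)


def _find_markdown_json(json_str: str) -> str:
    # prefer an explicit ```json fence if the model produced one
    match = re.search(r"```json(.*?)```", json_str, re.DOTALL)
    return match.group(1).strip() if match else json_str


def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
    # fall back to the outermost {...} span, tolerating a trailing comma
    match = re.search(r"\{.*\}", json_str, re.DOTALL)
    if match:
        candidate = re.sub(r",\s+\}", "}", match.group(), flags=re.DOTALL)
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            return None
    return None


response = 'Sure! Here is my plan:\n```json\n{"best_plan": "plan1", }\n```'
flat = response.replace("\n", " ").strip()
print(_extract_sub_json(_find_markdown_json(_strip_markdown_code(flat))))
# {'best_plan': 'plan1'}
```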
vision_agent/agent/vision_agent.py
@@ -3,7 +3,7 @@ import logging
 import os
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast, Callable
 
 from vision_agent.agent import Agent
 from vision_agent.agent.agent_utils import extract_json
@@ -13,7 +13,7 @@ from vision_agent.agent.vision_agent_prompts import (
     VA_CODE,
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
-from vision_agent.tools import META_TOOL_DOCSTRING
+from vision_agent.tools import META_TOOL_DOCSTRING, save_image, load_image
 from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter, Execution
@@ -123,6 +123,7 @@ class VisionAgent(Agent):
         verbosity: int = 0,
         local_artifacts_path: Optional[Union[str, Path]] = None,
         code_sandbox_runtime: Optional[str] = None,
+        callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
     ) -> None:
         """Initialize the VisionAgent.
 
@@ -141,6 +142,7 @@
         self.max_iterations = 100
         self.verbosity = verbosity
         self.code_sandbox_runtime = code_sandbox_runtime
+        self.callback_message = callback_message
         if self.verbosity >= 1:
             _LOGGER.setLevel(logging.INFO)
         self.local_artifacts_path = cast(
@@ -220,7 +222,14 @@
         for chat_i in int_chat:
             if "media" in chat_i:
                 for media in chat_i["media"]:
-                    media = cast(str, media)
+                    if type(media) is str and media.startswith(("http", "https")):
+                        # TODO: Ideally we should not call VA.tools here; revisit
+                        # how to better support remote images later
+                        file_path = Path(media).name
+                        ndarray = load_image(media)
+                        save_image(ndarray, file_path)
+                        media = file_path
+                    else:
+                        media = cast(str, media)
                     artifacts.artifacts[Path(media).name] = open(media, "rb").read()
 
                     media_remote_path = (
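A sketch of what this enables from the caller's side, assuming the standard chat message shape; the URL is a placeholder:

```python
from vision_agent.agent import VisionAgent

agent = VisionAgent()
# Remote URLs in "media" are now fetched with load_image, re-saved locally
# with save_image, and registered as artifacts before the chat loop runs.
chat, artifacts = agent.chat_with_code(
    [
        {
            "role": "user",
            "content": "Count the people in this image.",
            "media": ["https://example.com/people.jpg"],
        }
    ]
)
```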
@@ -262,6 +271,7 @@
         artifacts_loaded = artifacts.show()
         int_chat.append({"role": "observation", "content": artifacts_loaded})
         orig_chat.append({"role": "observation", "content": artifacts_loaded})
+        self.streaming_message({"role": "observation", "content": artifacts_loaded})
 
         while not finished and iterations < self.max_iterations:
             response = run_conversation(self.agent, int_chat)
@@ -274,6 +284,8 @@
             if last_response == response:
                 response["let_user_respond"] = True
 
+            self.streaming_message({"role": "assistant", "content": response})
+
             if response["let_user_respond"]:
                 break
 
@@ -293,6 +305,13 @@
             orig_chat.append(
                 {"role": "observation", "content": obs, "execution": result}
             )
+            self.streaming_message(
+                {
+                    "role": "observation",
+                    "content": obs,
+                    "execution": result,
+                }
+            )
 
             iterations += 1
             last_response = response
@@ -305,5 +324,9 @@
         artifacts.save()
         return orig_chat, artifacts
 
+    def streaming_message(self, message: Dict[str, Any]) -> None:
+        if self.callback_message:
+            self.callback_message(message)
+
     def log_progress(self, data: Dict[str, Any]) -> None:
         pass
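A sketch of how a caller might consume the new `callback_message` hook; `on_message` is a hypothetical handler:

```python
from typing import Any, Dict

from vision_agent.agent import VisionAgent


def on_message(message: Dict[str, Any]) -> None:
    # Messages mirror the chat entries: {"role": ..., "content": ...},
    # plus an "execution" key on observation messages that ran code.
    print(f"[{message['role']}] {str(message['content'])[:80]}")


# Every observation and assistant turn is now pushed through the callback
# while the agent's chat loop runs, enabling streaming UIs.
agent = VisionAgent(callback_message=on_message)
```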
vision_agent/agent/vision_agent_coder.py
@@ -27,7 +27,14 @@ from vision_agent.agent.vision_agent_coder_prompts import (
     TEST_PLANS,
     USER_REQ,
 )
-from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
+from vision_agent.lmm import (
+    LMM,
+    AzureOpenAILMM,
+    ClaudeSonnetLMM,
+    Message,
+    OllamaLMM,
+    OpenAILMM,
+)
 from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
@@ -167,9 +174,10 @@ def pick_plan(
         }
     )
     tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
-    tool_output_str = ""
-    if len(tool_output.logs.stdout) > 0:
-        tool_output_str = tool_output.logs.stdout[0]
+    # Because of the way we trace function calls the trace information ends up in the
+    # results. We don't want to show this info to the LLM so we don't include it in the
+    # tool_output_str.
+    tool_output_str = tool_output.text(include_results=False).strip()
 
     if verbosity == 2:
         _print_code("Initial code and tests:", code)
@@ -196,7 +204,7 @@
             docstring=tool_info,
             plans=plan_str,
             previous_attempts=PREVIOUS_FAILED.format(
-                code=code, error=tool_output.text()
+                code=code, error="\n".join(tool_output_str.splitlines()[-50:])
             ),
             media=media,
         )
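The `splitlines()[-50:]` idiom used here (and again in `debug_code` below) caps how much error context is sent back to the model; a trivial dry-run:

```python
# Only the tail of a long tool output survives the cap:
long_output = "\n".join(f"log line {i}" for i in range(120))
tail = "\n".join(long_output.splitlines()[-50:])
print(tail.splitlines()[0], "...", tail.splitlines()[-1])
# log line 70 ... log line 119
```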
@@ -225,11 +233,11 @@
                 "status": "completed" if tool_output.success else "failed",
             }
         )
-        tool_output_str = tool_output.text().strip()
+        tool_output_str = tool_output.text(include_results=False).strip()
 
         if verbosity == 2:
             _print_code("Code and test after attempted fix:", code)
-            _LOGGER.info(f"Code execution result after attempt {count}")
+            _LOGGER.info(f"Code execution result after attempt {count + 1}")
 
         count += 1
 
@@ -387,7 +395,6 @@ def write_and_test_code(
             "code": DefaultImports.prepend_imports(code),
             "payload": {
                 "test": test,
-                # "result": result.to_json(),
             },
         }
     )
@@ -406,6 +413,7 @@
             working_memory,
             debugger,
             code_interpreter,
+            tool_info,
             code,
             test,
             result,
@@ -431,6 +439,7 @@ def debug_code(
     working_memory: List[Dict[str, str]],
     debugger: LMM,
     code_interpreter: CodeInterpreter,
+    tool_info: str,
    code: str,
     test: str,
     result: Execution,
@@ -451,17 +460,38 @@
     count = 0
     while not success and count < 3:
         try:
-            fixed_code_and_test = extract_json(
-                debugger(  # type: ignore
-                    FIX_BUG.format(
-                        code=code,
-                        tests=test,
-                        result="\n".join(result.text().splitlines()[-50:]),
-                        feedback=format_memory(working_memory + new_working_memory),
+            # LLMs write worse code when it's in JSON, so we have it write JSON
+            # followed by code each wrapped in markdown blocks.
+            fixed_code_and_test_str = debugger(
+                FIX_BUG.format(
+                    docstring=tool_info,
+                    code=code,
+                    tests=test,
+                    # Because of the way we trace function calls the trace information
+                    # ends up in the results. We don't want to show this info to the
+                    # LLM so we don't include it in the tool_output_str.
+                    result="\n".join(
+                        result.text(include_results=False).splitlines()[-50:]
                     ),
-                stream=False,
-                )
+                    feedback=format_memory(working_memory + new_working_memory),
+                ),
+                stream=False,
             )
+            fixed_code_and_test_str = cast(str, fixed_code_and_test_str)
+            fixed_code_and_test = extract_json(fixed_code_and_test_str)
+            code = extract_code(fixed_code_and_test_str)
+            if (
+                "which_code" in fixed_code_and_test
+                and fixed_code_and_test["which_code"] == "test"
+            ):
+                fixed_code_and_test["code"] = ""
+                fixed_code_and_test["test"] = code
+            else:  # for everything else always assume it's updating code
+                fixed_code_and_test["code"] = code
+                fixed_code_and_test["test"] = ""
+            if "which_code" in fixed_code_and_test:
+                del fixed_code_and_test["which_code"]
+
             success = True
         except Exception as e:
             _LOGGER.exception(f"Error while extracting JSON: {e}")
@@ -472,9 +502,9 @@
         old_test = test
 
         if fixed_code_and_test["code"].strip() != "":
-            code = extract_code(fixed_code_and_test["code"])
+            code = fixed_code_and_test["code"]
         if fixed_code_and_test["test"].strip() != "":
-            test = extract_code(fixed_code_and_test["test"])
+            test = fixed_code_and_test["test"]
 
         new_working_memory.append(
             {
@@ -628,9 +658,7 @@ class VisionAgentCoder(Agent):
         )
         self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
         self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
-        self.debugger = (
-            OpenAILMM(temperature=0.0, json_mode=True) if debugger is None else debugger
-        )
+        self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger
         self.verbosity = verbosity
         if self.verbosity > 0:
             _LOGGER.setLevel(logging.INFO)
@@ -876,6 +904,40 @@
     )
 
 
+class ClaudeVisionAgentCoder(VisionAgentCoder):
+    def __init__(
+        self,
+        planner: Optional[LMM] = None,
+        coder: Optional[LMM] = None,
+        tester: Optional[LMM] = None,
+        debugger: Optional[LMM] = None,
+        tool_recommender: Optional[Sim] = None,
+        verbosity: int = 0,
+        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
+        code_sandbox_runtime: Optional[str] = None,
+    ) -> None:
+        # NOTE: Claude doesn't have an official JSON mode
+        self.planner = ClaudeSonnetLMM(temperature=0.0) if planner is None else planner
+        self.coder = ClaudeSonnetLMM(temperature=0.0) if coder is None else coder
+        self.tester = ClaudeSonnetLMM(temperature=0.0) if tester is None else tester
+        self.debugger = (
+            ClaudeSonnetLMM(temperature=0.0) if debugger is None else debugger
+        )
+        self.verbosity = verbosity
+        if self.verbosity > 0:
+            _LOGGER.setLevel(logging.INFO)
+
+        # Anthropic does not offer any embedding models and instead recommends
+        # Voyage; we're using OpenAI's embedder for now.
+        self.tool_recommender = (
+            Sim(T.TOOLS_DF, sim_key="desc")
+            if tool_recommender is None
+            else tool_recommender
+        )
+        self.report_progress_callback = report_progress_callback
+        self.code_sandbox_runtime = code_sandbox_runtime
+
+
 class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.
 
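A usage sketch for the new class, assuming the usual `ANTHROPIC_API_KEY` / `OPENAI_API_KEY` environment variables (the latter for the OpenAI-embedding tool recommender noted above) and a placeholder image path:

```python
from vision_agent.agent import ClaudeVisionAgentCoder

agent = ClaudeVisionAgentCoder(verbosity=1)
# Same interface as VisionAgentCoder, but planning, coding, testing, and
# debugging all go through Claude 3.5 Sonnet by default.
code = agent("Count the number of people in this image.", media="people.jpg")
print(code)
```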
@@ -920,7 +982,7 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
                 else tester
             ),
             debugger=(
-                OllamaLMM(model_name="llama3.1", temperature=0.0, json_mode=True)
+                OllamaLMM(model_name="llama3.1", temperature=0.0)
                 if debugger is None
                 else debugger
             ),
@@ -983,9 +1045,7 @@ class AzureVisionAgentCoder(VisionAgentCoder):
             coder=AzureOpenAILMM(temperature=0.0) if coder is None else coder,
             tester=AzureOpenAILMM(temperature=0.0) if tester is None else tester,
             debugger=(
-                AzureOpenAILMM(temperature=0.0, json_mode=True)
-                if debugger is None
-                else debugger
+                AzureOpenAILMM(temperature=0.0) if debugger is None else debugger
             ),
             tool_recommender=(
                 AzureSim(T.TOOLS_DF, sim_key="desc")
vision_agent/agent/vision_agent_coder_prompts.py
@@ -63,6 +63,7 @@ This is the documentation for the functions you have access to. You may call any
 **Plans**:
 {plans}
 
+**Previous Attempts**:
 {previous_attempts}
 
 **Instructions**:
@@ -108,16 +109,27 @@ plan2:
 - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
-- Use the 'countgd_counting' tool with the prompt 'person' to detect where the people are in the video.
+- Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
 
 
 ```python
-from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, countgd_counting
+import numpy as np
+from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, florence2_sam2_video_tracking
 
 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames("video.mp4", 1)
 frames = [f[0] for f in frames][:10]
 
+def remove_arrays(o):
+    if isinstance(o, list):
+        return [remove_arrays(e) for e in o]
+    elif isinstance(o, dict):
+        return {{k: remove_arrays(v) for k, v in o.items()}}
+    elif isinstance(o, np.ndarray):
+        return "array: " + str(o.shape)
+    else:
+        return o
+
 # plan1
 owl_v2_out = [owl_v2_image("person", f) for f in frames]
 
@@ -125,9 +137,10 @@ owl_v2_out = [owl_v2_image("person", f) for f in frames]
 florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
 
 # plan3
-countgd_out = [countgd_counting(f) for f in frames]
+f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
+remove_arrays(f2s2_tracking_out)
 
-final_out = {{"owl_v2_image": owl_v2_out, "florencev2_object_detection": florencev2_out, "countgd_counting": cgd_out}}
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "florence2_sam2_video_tracking": f2s2_tracking_out}}
 print(final_out)
 ```
 """
@@ -161,9 +174,10 @@ PICK_PLAN = """
 
 **Instructions**:
 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
-2. Try solving the problem yourself given the image and pick the plan that matches your solution the best.
+2. Solve the problem yourself given the image and pick the plan that matches your solution the best.
 3. Output a JSON object with the following format:
 {{
+    "predicted_answer": str # the answer you would expect from the best plan
     "thoughts": str # your thought process for choosing the best plan
     "best_plan": str # the best plan you have chosen
 }}
@@ -311,6 +325,11 @@ This is the documentation for the functions you have access to. You may call any
 FIX_BUG = """
 **Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so you can run !pip install to install missing packages.
 
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
+
+{docstring}
+
 **Instructions**:
 Please re-complete the code to fix the error message. Here is the previous version:
 ```python
@@ -323,17 +342,24 @@ When we run this test code:
 ```
 
 It raises this error:
+```
 {result}
+```
 
 This is previous feedback provided on the code:
 {feedback}
 
-Please fix the bug by follow the error information and return a JSON object with the following format:
+Please fix the bug by correcting the error. Return the following JSON object followed by the fixed code in the below format:
+```json
 {{
     "reflections": str # any thoughts you have about the bug and how you fixed it
-    "code": str # the fixed code if any, else an empty string
-    "test": str # the fixed test code if any, else an empty string
+    "which_code": str # the code that was fixed, can only be 'code' or 'test'
 }}
+```
+
+```python
+# Your fixed code here
+```
 """
 
 
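A rough standalone approximation of how the agent now splits such a reply (the real path goes through `extract_json`/`extract_code`; the sample reply is fabricated):

```python
import json
import re

reply = (
    "```json\n"
    '{"reflections": "off-by-one in the loop bound", "which_code": "code"}\n'
    "```\n"
    "```python\n"
    "for i in range(10):\n"
    "    print(i)\n"
    "```"
)

# The JSON block carries metadata; the python block carries the actual fix.
meta = json.loads(re.search(r"```json\n(.*?)```", reply, re.DOTALL).group(1))
fix = re.search(r"```python\n(.*?)```", reply, re.DOTALL).group(1)
assert meta["which_code"] == "code"  # the fix targets the code, not the test
print(fix)
```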
vision_agent/lmm/lmm.py
@@ -371,7 +371,7 @@ class ClaudeSonnetLMM(LMM):
     def __init__(
         self,
         api_key: Optional[str] = None,
-        model_name: str = "claude-3-sonnet-20240229",
+        model_name: str = "claude-3-5-sonnet-20240620",
         max_tokens: int = 4096,
         **kwargs: Any,
     ):
vision_agent/tools/__init__.py
@@ -37,6 +37,7 @@ from .tools import (
     grounding_dino,
     grounding_sam,
     ixc25_image_vqa,
+    ixc25_temporal_localization,
     ixc25_video_vqa,
     load_image,
     loca_visual_prompt_counting,
vision_agent/tools/tools.py
@@ -468,7 +468,7 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     pil_image = Image.fromarray(image).convert("RGB")
     image_size = pil_image.size[::-1]
-    if image_size[0] < 1 and image_size[1] < 1:
+    if image_size[0] < 1 or image_size[1] < 1:
         return []
     image_buffer = io.BytesIO()
     pil_image.save(image_buffer, format="PNG")
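The old guard only skipped OCR when *both* dimensions were degenerate, so a 0×100 image still reached the endpoint; a quick truth-table check of the fix:

```python
# (height, width) -> does each guard skip the image?
for h, w in [(0, 100), (100, 0), (0, 0), (100, 100)]:
    old_skip = h < 1 and w < 1   # only skipped 0x0
    new_skip = h < 1 or w < 1    # skips any empty dimension
    print((h, w), "old:", old_skip, "new:", new_skip)
```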
@@ -781,6 +781,44 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data["answer"])
 
 
+def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]:
+    """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video
+    given a prompt that can be either an object or a phrase. It returns a list of
+    boolean values indicating whether the object or phrase is present in the
+    corresponding frame.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        List[bool]: A list of boolean values indicating whether the object or phrase is
+            present in the corresponding frame.
+
+    Example
+    -------
+        >>> output = ixc25_temporal_localization('soccer goal', frames)
+        >>> print(output)
+        [False, False, False, True, True, True, False, False, False, False]
+        >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4')
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "chunk_length": 2,
+        "function_name": "ixc25_temporal_localization",
+    }
+    data: List[int] = send_inference_request(
+        payload, "video-temporal-localization", files=files, v2=True
+    )
+    chunk_size = round(len(frames) / len(data))
+    data_explode = [[elt] * chunk_size for elt in data]
+    data_bool = [bool(elt) for sublist in data_explode for elt in sublist]
+    return data_bool[: len(frames)]
+
+
 def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
     """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
     including regular images or images of documents or presentations. It returns text
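A dry-run of the chunk expansion at the end of the new tool, using fabricated service output (one 0/1 score per `chunk_length`-second chunk, exploded back to per-frame booleans):

```python
frames_count = 10
data = [0, 1, 0]  # hypothetical chunk-level predictions from the service

chunk_size = round(frames_count / len(data))  # 3
data_explode = [[elt] * chunk_size for elt in data]
data_bool = [bool(elt) for sublist in data_explode for elt in sublist]
print(data_bool[:frames_count])
# [False, False, False, True, True, True, False, False, False]
# Note: 3 chunks x 3 frames covers only 9 of the 10 frames, so the rounding
# can undershoot the frame count slightly.
```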
@@ -1112,6 +1150,8 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     """
 
     image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
@@ -1467,7 +1507,7 @@
     Parameters:
         video_uri (Union[str, Path]): The path to the video file, url or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
-            to 10.
+            to 1.
 
     Returns:
         List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
vision_agent/utils/execute.py
@@ -292,7 +292,7 @@ class Execution(BaseModel):
     error: Optional[Error] = None
     "Error object if an error occurred, None otherwise."
 
-    def text(self, include_logs: bool = True) -> str:
+    def text(self, include_logs: bool = True, include_results: bool = True) -> str:
         """Returns the text representation of this object, i.e. including the main
         result or the error traceback, optionally along with the logs (stdout, stderr).
         """
@@ -300,15 +300,17 @@ class Execution(BaseModel):
         if self.error:
             return prefix + "\n----- Error -----\n" + self.error.traceback
 
-        result_str = [
-            (
-                f"----- Final output -----\n{res.text}"
-                if res.is_main_result
-                else f"----- Intermediate output-----\n{res.text}"
-            )
-            for res in self.results
-        ]
-        return prefix + "\n" + "\n".join(result_str)
+        if include_results:
+            result_str = [
+                (
+                    f"----- Final output -----\n{res.text}"
+                    if res.is_main_result
+                    else f"----- Intermediate output-----\n{res.text}"
+                )
+                for res in self.results
+            ]
+            return prefix + "\n" + "\n".join(result_str)
+        return prefix
 
     @property
     def success(self) -> bool:
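A minimal stand-in (not the real pydantic model) showing the control flow the new flag introduces:

```python
from typing import List


class MiniExecution:
    """Toy model mirroring Execution.text's new include_results branch."""

    def __init__(self, logs: str, results: List[str]) -> None:
        self.logs = logs
        self.results = results

    def text(self, include_logs: bool = True, include_results: bool = True) -> str:
        prefix = self.logs if include_logs else ""
        if include_results:
            rendered = [f"----- Intermediate output-----\n{r}" for r in self.results]
            return prefix + "\n" + "\n".join(rendered)
        return prefix


ex = MiniExecution("stdout: 12 people detected", ["<function trace blob>"])
print(ex.text(include_results=False))  # logs only, no trace noise for the LLM
```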
vision_agent/utils/video.py
@@ -7,7 +7,6 @@ from typing import List, Optional, Tuple
 import av  # type: ignore
 import cv2
 import numpy as np
-from decord import VideoReader  # type: ignore
 
 _LOGGER = logging.getLogger(__name__)
 # The maximum length of the clip to extract frames from, in seconds
@@ -103,7 +102,7 @@ def frames_to_bytes(
 def extract_frames_from_video(
     video_uri: str, fps: float = 1.0
 ) -> List[Tuple[np.ndarray, float]]:
-    """Extract frames from a video
+    """Extract frames from a video along with the timestamp in seconds.
 
     Parameters:
         video_uri (str): the path to the video file or a video file url
@@ -115,12 +114,24 @@
     from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
     the video. The frames are sorted by the timestamp in ascending order.
     """
-    vr = VideoReader(video_uri)
-    orig_fps = vr.get_avg_fps()
-    if fps > orig_fps:
-        fps = orig_fps
-
-    s = orig_fps / fps
-    samples = [(int(i * s), int(i * s) / orig_fps) for i in range(int(len(vr) / s))]
-    frames = vr.get_batch([s[0] for s in samples]).asnumpy()
-    return [(frames[i, :, :, :], samples[i][1]) for i in range(len(samples))]
+
+    cap = cv2.VideoCapture(video_uri)
+    orig_fps = cap.get(cv2.CAP_PROP_FPS)
+    orig_frame_time = 1 / orig_fps
+    targ_frame_time = 1 / fps
+    frames: List[Tuple[np.ndarray, float]] = []
+    i = 0
+    elapsed_time = 0.0
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        elapsed_time += orig_frame_time
+        if elapsed_time >= targ_frame_time:
+            frames.append((cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), i / orig_fps))
+            elapsed_time -= targ_frame_time
+
+        i += 1
+    cap.release()
+    return frames
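A dry-run of the new elapsed-time sampling loop, with no video required; the FPS values are chosen to divide evenly so the floats stay exact:

```python
# Downsample a 4 FPS stream to 2 FPS: which frame indices are kept?
orig_fps, target_fps, n_frames = 4.0, 2.0, 8
orig_frame_time = 1 / orig_fps    # 0.25s per source frame
targ_frame_time = 1 / target_fps  # want one frame every 0.5s

kept = []
elapsed = 0.0
for i in range(n_frames):
    elapsed += orig_frame_time
    if elapsed >= targ_frame_time:
        kept.append((i, i / orig_fps))  # (frame index, timestamp in seconds)
        elapsed -= targ_frame_time

print(kept)
# [(1, 0.25), (3, 0.75), (5, 1.25), (7, 1.75)] -> every other frame survives
```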
vision_agent/tools/tool_utils.py
@@ -1,7 +1,7 @@
-from base64 import b64encode
 import inspect
 import logging
 import os
+from base64 import b64encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
 import pandas as pd