vision-agent 0.2.132__py3-none-any.whl → 0.2.134__py3-none-any.whl
- vision_agent/agent/__init__.py +1 -0
- vision_agent/agent/agent_utils.py +30 -18
- vision_agent/agent/vision_agent_coder.py +86 -26
- vision_agent/agent/vision_agent_coder_prompts.py +34 -8
- vision_agent/lmm/lmm.py +1 -1
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tool_utils.py +1 -1
- vision_agent/tools/tools.py +42 -2
- vision_agent/utils/execute.py +28 -11
- vision_agent/utils/video.py +22 -11
- {vision_agent-0.2.132.dist-info → vision_agent-0.2.134.dist-info}/METADATA +1 -2
- {vision_agent-0.2.132.dist-info → vision_agent-0.2.134.dist-info}/RECORD +14 -14
- {vision_agent-0.2.132.dist-info → vision_agent-0.2.134.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.132.dist-info → vision_agent-0.2.134.dist-info}/WHEEL +0 -0
vision_agent/agent/__init__.py
CHANGED
vision_agent/agent/agent_utils.py
CHANGED
@@ -14,6 +14,10 @@ def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
     if match:
         json_str = match.group()
         try:
+            # remove trailing comma
+            trailing_bracket_pattern = r",\s+\}"
+            json_str = re.sub(trailing_bracket_pattern, "}", json_str, flags=re.DOTALL)
+
             json_dict = json.loads(json_str)
             return json_dict  # type: ignore
         except json.JSONDecodeError:
@@ -21,29 +25,37 @@ def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
     return None


+def _find_markdown_json(json_str: str) -> str:
+    pattern = r"```json(.*?)```"
+    match = re.search(pattern, json_str, re.DOTALL)
+    if match:
+        return match.group(1).strip()
+    return json_str
+
+
+def _strip_markdown_code(inp_str: str) -> str:
+    pattern = r"```python.*?```"
+    cleaned_str = re.sub(pattern, "", inp_str, flags=re.DOTALL)
+    return cleaned_str
+
+
 def extract_json(json_str: str) -> Dict[str, Any]:
+    json_str = json_str.replace("\n", " ").strip()
+
     try:
-
-        json_dict = json.loads(json_str)
+        return json.loads(json_str)  # type: ignore
     except json.JSONDecodeError:
-
-
-
-
-
-
-
-        try:
-            json_dict = json.loads(json_str)
-        except json.JSONDecodeError as e:
-            json_dict = _extract_sub_json(json_str)
-            if json_dict is not None:
-                return json_dict  # type: ignore
-            error_msg = f"Could not extract JSON from the given str: {json_str}"
+        json_orig = json_str
+        json_str = _strip_markdown_code(json_str)
+        json_str = _find_markdown_json(json_str)
+        json_dict = _extract_sub_json(json_str)
+
+        if json_dict is None:
+            error_msg = f"Could not extract JSON from the given str: {json_orig}"
             _LOGGER.exception(error_msg)
-            raise ValueError(error_msg)
+            raise ValueError(error_msg)

-
+        return json_dict


 def extract_code(code: str) -> str:
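Taken together, the new helpers make `extract_json` resilient to typical LLM formatting: it first tries a plain `json.loads`, then strips fenced python blocks, prefers a fenced json block if one exists, and finally falls back to pulling out the outermost braces while removing trailing commas. A minimal standalone sketch of that pipeline (the brace-grabbing regex in `_extract_sub_json` is simplified here to a greedy search, which is an assumption about the real helper):

```python
import json
import re
from typing import Any, Dict, Optional

FENCE = "`" * 3  # literal triple-backtick, spelled out to keep this example fence-safe


def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
    # grab the outermost {...} span (simplified stand-in for the real search)
    match = re.search(r"\{.*\}", json_str, re.DOTALL)
    if match:
        sub = match.group()
        try:
            # remove trailing commas such as '{"a": 1, }'
            sub = re.sub(r",\s+\}", "}", sub, flags=re.DOTALL)
            return json.loads(sub)
        except json.JSONDecodeError:
            return None
    return None


def extract_json(json_str: str) -> Dict[str, Any]:
    json_str = json_str.replace("\n", " ").strip()
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # drop fenced python blocks, then prefer a fenced json block if present
        json_str = re.sub(FENCE + r"python.*?" + FENCE, "", json_str, flags=re.DOTALL)
        fenced = re.search(FENCE + r"json(.*?)" + FENCE, json_str, re.DOTALL)
        if fenced:
            json_str = fenced.group(1).strip()
        json_dict = _extract_sub_json(json_str)
        if json_dict is None:
            raise ValueError(f"Could not extract JSON from the given str: {json_str}")
        return json_dict


# an LLM reply that plain json.loads would reject
resp = "Sure, here is the plan:\n" + FENCE + 'json\n{"best_plan": "plan1", }\n' + FENCE
print(extract_json(resp))  # {'best_plan': 'plan1'}
```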
vision_agent/agent/vision_agent_coder.py
CHANGED
@@ -27,7 +27,14 @@ from vision_agent.agent.vision_agent_coder_prompts import (
     TEST_PLANS,
     USER_REQ,
 )
-from vision_agent.lmm import
+from vision_agent.lmm import (
+    LMM,
+    AzureOpenAILMM,
+    ClaudeSonnetLMM,
+    Message,
+    OllamaLMM,
+    OpenAILMM,
+)
 from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
@@ -167,9 +174,10 @@ def pick_plan(
         }
     )
     tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
-
-
-
+    # Because of the way we trace function calls the trace information ends up in the
+    # results. We don't want to show this info to the LLM so we don't include it in the
+    # tool_output_str.
+    tool_output_str = tool_output.text(include_results=False).strip()

     if verbosity == 2:
         _print_code("Initial code and tests:", code)
@@ -196,7 +204,7 @@ def pick_plan(
             docstring=tool_info,
             plans=plan_str,
             previous_attempts=PREVIOUS_FAILED.format(
-                code=code, error=
+                code=code, error="\n".join(tool_output_str.splitlines()[-50:])
             ),
             media=media,
         )
@@ -225,11 +233,11 @@ def pick_plan(
                 "status": "completed" if tool_output.success else "failed",
             }
         )
-        tool_output_str = tool_output.text().strip()
+        tool_output_str = tool_output.text(include_results=False).strip()

         if verbosity == 2:
             _print_code("Code and test after attempted fix:", code)
-            _LOGGER.info(f"Code execution result after attempt {count}")
+            _LOGGER.info(f"Code execution result after attempt {count + 1}")

         count += 1

@@ -387,7 +395,6 @@ def write_and_test_code(
             "code": DefaultImports.prepend_imports(code),
             "payload": {
                 "test": test,
-                # "result": result.to_json(),
             },
         }
     )
@@ -406,6 +413,7 @@ def write_and_test_code(
             working_memory,
             debugger,
             code_interpreter,
+            tool_info,
             code,
             test,
             result,
@@ -431,6 +439,7 @@ def debug_code(
     working_memory: List[Dict[str, str]],
     debugger: LMM,
     code_interpreter: CodeInterpreter,
+    tool_info: str,
     code: str,
     test: str,
     result: Execution,
@@ -451,17 +460,38 @@ def debug_code(
     count = 0
     while not success and count < 3:
         try:
-
-
-
-
-
-
-
+            # LLMs write worse code when it's in JSON, so we have it write JSON
+            # followed by code each wrapped in markdown blocks.
+            fixed_code_and_test_str = debugger(
+                FIX_BUG.format(
+                    docstring=tool_info,
+                    code=code,
+                    tests=test,
+                    # Because of the way we trace function calls the trace information
+                    # ends up in the results. We don't want to show this info to the
+                    # LLM so we don't include it in the tool_output_str.
+                    result="\n".join(
+                        result.text(include_results=False).splitlines()[-50:]
                     ),
-
-                )
+                    feedback=format_memory(working_memory + new_working_memory),
+                ),
+                stream=False,
             )
+            fixed_code_and_test_str = cast(str, fixed_code_and_test_str)
+            fixed_code_and_test = extract_json(fixed_code_and_test_str)
+            code = extract_code(fixed_code_and_test_str)
+            if (
+                "which_code" in fixed_code_and_test
+                and fixed_code_and_test["which_code"] == "test"
+            ):
+                fixed_code_and_test["code"] = ""
+                fixed_code_and_test["test"] = code
+            else:  # for everything else always assume it's updating code
+                fixed_code_and_test["code"] = code
+                fixed_code_and_test["test"] = ""
+            if "which_code" in fixed_code_and_test:
+                del fixed_code_and_test["which_code"]
+
             success = True
         except Exception as e:
             _LOGGER.exception(f"Error while extracting JSON: {e}")
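Since the debugger LMM no longer runs in JSON mode, its reply is expected to carry a fenced json block (the reflections plus the `which_code` flag) followed by a fenced python block (the fixed snippet), which are pulled apart with `extract_json` and `extract_code`. A rough standalone sketch of that split, using simplified regex stand-ins for the real helpers in `agent_utils.py`:

```python
import json
import re

FENCE = "`" * 3  # literal triple-backtick, kept out of the source for fence safety

debugger_reply = (
    f"{FENCE}json\n"
    '{"reflections": "the test asserted the wrong length", "which_code": "test"}\n'
    f"{FENCE}\n\n"
    f"{FENCE}python\n"
    "assert len(frames) == 10\n"
    f"{FENCE}"
)

# simplified stand-ins for extract_json / extract_code
json_match = re.search(FENCE + r"json(.*?)" + FENCE, debugger_reply, re.DOTALL)
code_match = re.search(FENCE + r"python(.*?)" + FENCE, debugger_reply, re.DOTALL)

meta = json.loads(json_match.group(1))
code = code_match.group(1).strip()

# which_code == 'test' means the fixed snippet replaces the test, not the code
if meta.get("which_code") == "test":
    fixed = {"code": "", "test": code}
else:
    fixed = {"code": code, "test": ""}
print(fixed)  # {'code': '', 'test': 'assert len(frames) == 10'}
```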
@@ -472,9 +502,9 @@ def debug_code(
     old_test = test

     if fixed_code_and_test["code"].strip() != "":
-        code =
+        code = fixed_code_and_test["code"]
     if fixed_code_and_test["test"].strip() != "":
-        test =
+        test = fixed_code_and_test["test"]

     new_working_memory.append(
         {
@@ -628,9 +658,7 @@ class VisionAgentCoder(Agent):
         )
         self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
         self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
-        self.debugger = (
-            OpenAILMM(temperature=0.0, json_mode=True) if debugger is None else debugger
-        )
+        self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger
         self.verbosity = verbosity
         if self.verbosity > 0:
             _LOGGER.setLevel(logging.INFO)
@@ -876,6 +904,40 @@ class VisionAgentCoder(Agent):
         )


+class ClaudeVisionAgentCoder(VisionAgentCoder):
+    def __init__(
+        self,
+        planner: Optional[LMM] = None,
+        coder: Optional[LMM] = None,
+        tester: Optional[LMM] = None,
+        debugger: Optional[LMM] = None,
+        tool_recommender: Optional[Sim] = None,
+        verbosity: int = 0,
+        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
+        code_sandbox_runtime: Optional[str] = None,
+    ) -> None:
+        # NOTE: Claude doesn't have an official JSON mode
+        self.planner = ClaudeSonnetLMM(temperature=0.0) if planner is None else planner
+        self.coder = ClaudeSonnetLMM(temperature=0.0) if coder is None else coder
+        self.tester = ClaudeSonnetLMM(temperature=0.0) if tester is None else tester
+        self.debugger = (
+            ClaudeSonnetLMM(temperature=0.0) if debugger is None else debugger
+        )
+        self.verbosity = verbosity
+        if self.verbosity > 0:
+            _LOGGER.setLevel(logging.INFO)
+
+        # Anthropic does not offer any embedding models and instead recommends Voyage,
+        # we're using OpenAI's embedder for now.
+        self.tool_recommender = (
+            Sim(T.TOOLS_DF, sim_key="desc")
+            if tool_recommender is None
+            else tool_recommender
+        )
+        self.report_progress_callback = report_progress_callback
+        self.code_sandbox_runtime = code_sandbox_runtime
+
+
 class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.

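A quick usage sketch for the new class. It assumes `ClaudeVisionAgentCoder` is re-exported from `vision_agent.agent` (the one-line `__init__.py` change in this release suggests that export) and that both an Anthropic key (for the Sonnet models) and an OpenAI key (for the embedding-backed tool recommender) are available:

```python
import os

from vision_agent.agent import ClaudeVisionAgentCoder

# Claude Sonnet drives planning/coding/testing/debugging, while the tool
# recommender still uses OpenAI embeddings, so both keys are required.
assert os.getenv("ANTHROPIC_API_KEY") and os.getenv("OPENAI_API_KEY")

agent = ClaudeVisionAgentCoder(verbosity=1)
code = agent(
    "Count the number of people in this image.",
    media="people.jpg",  # hypothetical input image
)
print(code)
```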
@@ -920,7 +982,7 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
                 else tester
             ),
             debugger=(
-                OllamaLMM(model_name="llama3.1", temperature=0.0
+                OllamaLMM(model_name="llama3.1", temperature=0.0)
                 if debugger is None
                 else debugger
             ),
@@ -983,9 +1045,7 @@ class AzureVisionAgentCoder(VisionAgentCoder):
             coder=AzureOpenAILMM(temperature=0.0) if coder is None else coder,
             tester=AzureOpenAILMM(temperature=0.0) if tester is None else tester,
             debugger=(
-                AzureOpenAILMM(temperature=0.0
-                if debugger is None
-                else debugger
+                AzureOpenAILMM(temperature=0.0) if debugger is None else debugger
             ),
             tool_recommender=(
                 AzureSim(T.TOOLS_DF, sim_key="desc")
vision_agent/agent/vision_agent_coder_prompts.py
CHANGED
@@ -63,6 +63,7 @@ This is the documentation for the functions you have access to. You may call any
 **Plans**:
 {plans}

+**Previous Attempts**:
 {previous_attempts}

 **Instructions**:
@@ -108,16 +109,27 @@ plan2:
 - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
-- Use the '
+- Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.


 ```python
-
+import numpy as np
+from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, florence2_sam2_video_tracking

 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames("video.mp4", 1)
 frames = [f[0] for f in frames][:10]

+def remove_arrays(o):
+    if isinstance(o, list):
+        return [remove_arrays(e) for e in o]
+    elif isinstance(o, dict):
+        return {{k: remove_arrays(v) for k, v in o.items()}}
+    elif isinstance(o, np.ndarray):
+        return "array: " + str(o.shape)
+    else:
+        return o
+
 # plan1
 owl_v2_out = [owl_v2_image("person", f) for f in frames]
@@ -125,9 +137,10 @@ owl_v2_out = [owl_v2_image("person", f) for f in frames]
 florence2_out = [florence2_phrase_grounding("person", f) for f in frames]

 # plan3
-
+f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
+remove_arrays(f2s2_tracking_out)

-final_out = {{"owl_v2_image": owl_v2_out, "
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "florence2_sam2_video_tracking": f2s2_tracking_out}}
 print(final_out)
 ```
 """
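The `remove_arrays` helper added to this example prompt keeps raw NumPy arrays out of the printed tool output by replacing each array with a short shape string (the doubled braces above are only prompt-template escaping; plain braces are used below):

```python
import numpy as np


def remove_arrays(o):
    if isinstance(o, list):
        return [remove_arrays(e) for e in o]
    elif isinstance(o, dict):
        return {k: remove_arrays(v) for k, v in o.items()}
    elif isinstance(o, np.ndarray):
        return "array: " + str(o.shape)
    else:
        return o


# e.g. a tracking result whose segmentation mask would otherwise flood the log
track = [{"label": "person", "mask": np.zeros((512, 512)), "score": 0.91}]
print(remove_arrays(track))
# [{'label': 'person', 'mask': 'array: (512, 512)', 'score': 0.91}]
```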
@@ -161,9 +174,10 @@ PICK_PLAN = """

 **Instructions**:
 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
-2.
+2. Solve the problem yourself given the image and pick the plan that matches your solution the best.
 3. Output a JSON object with the following format:
 {{
+    "predicted_answer": str # the answer you would expect from the best plan
     "thoughts": str # your thought process for choosing the best plan
     "best_plan": str # the best plan you have chosen
 }}
@@ -311,6 +325,11 @@ This is the documentation for the functions you have access to. You may call any
 FIX_BUG = """
 **Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so you can run !pip install to install missing packages.

+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
+
+{docstring}
+
 **Instructions**:
 Please re-complete the code to fix the error message. Here is the previous version:
 ```python
@@ -323,17 +342,24 @@ When we run this test code:
 ```

 It raises this error:
+```
 {result}
+```

 This is previous feedback provided on the code:
 {feedback}

-Please fix the bug by
+Please fix the bug by correcting the error. Return the following JSON object followed by the fixed code in the below format:
+```json
 {{
     "reflections": str # any thoughts you have about the bug and how you fixed it
-    "
-    "test": str # the fixed test code if any, else an empty string
+    "which_code": str # the code that was fixed, can only be 'code' or 'test'
 }}
+```
+
+```python
+# Your fixed code here
+```
 """

vision_agent/lmm/lmm.py
CHANGED
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tool_utils.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -468,7 +468,7 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:

     pil_image = Image.fromarray(image).convert("RGB")
     image_size = pil_image.size[::-1]
-    if image_size[0] < 1
+    if image_size[0] < 1 or image_size[1] < 1:
         return []
     image_buffer = io.BytesIO()
     pil_image.save(image_buffer, format="PNG")
@@ -781,6 +781,44 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data["answer"])


+def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]:
+    """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video
+    given a prompt that can be either an object or a phrase. It returns a list of
+    boolean values indicating whether the object or phrase is present in the
+    corresponding frame.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        List[bool]: A list of boolean values indicating whether the object or phrase
+            is present in the corresponding frame.
+
+    Example
+    -------
+        >>> output = ixc25_temporal_localization('soccer goal', frames)
+        >>> print(output)
+        [False, False, False, True, True, True, False, False, False, False]
+        >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4')
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "chunk_length": 2,
+        "function_name": "ixc25_temporal_localization",
+    }
+    data: List[int] = send_inference_request(
+        payload, "video-temporal-localization", files=files, v2=True
+    )
+    chunk_size = round(len(frames) / len(data))
+    data_explode = [[elt] * chunk_size for elt in data]
+    data_bool = [bool(elt) for sublist in data_explode for elt in sublist]
+    return data_bool[: len(frames)]
+
+
 def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
     """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
     including regular images or images of documents or presentations. It returns text
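The tail of `ixc25_temporal_localization` turns the service's per-chunk scores into per-frame booleans: each returned score covers roughly `len(frames) / len(data)` frames, and the final slice absorbs the rounding when the frame count is not an exact multiple. The mapping in isolation, with made-up scores:

```python
frames = list(range(10))  # stand-ins for the np.ndarray frames
data = [0, 1, 1, 0, 0]    # hypothetical per-chunk scores from the API

chunk_size = round(len(frames) / len(data))          # 2 frames per chunk
data_explode = [[elt] * chunk_size for elt in data]  # [[0, 0], [1, 1], [1, 1], [0, 0], [0, 0]]
data_bool = [bool(elt) for sublist in data_explode for elt in sublist]
print(data_bool[: len(frames)])
# [False, False, True, True, True, True, False, False, False, False]
```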
@@ -1112,6 +1150,8 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     """

     image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
@@ -1467,7 +1507,7 @@ def extract_frames(
     Parameters:
         video_uri (Union[str, Path]): The path to the video file, url or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
-            to
+            to 1.

     Returns:
         List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
vision_agent/utils/execute.py
CHANGED
@@ -292,7 +292,7 @@ class Execution(BaseModel):
     error: Optional[Error] = None
     "Error object if an error occurred, None otherwise."

-    def text(self, include_logs: bool = True) -> str:
+    def text(self, include_logs: bool = True, include_results: bool = True) -> str:
         """Returns the text representation of this object, i.e. including the main
         result or the error traceback, optionally along with the logs (stdout, stderr).
         """
@@ -300,15 +300,17 @@ class Execution(BaseModel):
         if self.error:
             return prefix + "\n----- Error -----\n" + self.error.traceback

-
-
-
-
-
-
-
-
-
+        if include_results:
+            result_str = [
+                (
+                    f"----- Final output -----\n{res.text}"
+                    if res.is_main_result
+                    else f"----- Intermediate output-----\n{res.text}"
+                )
+                for res in self.results
+            ]
+            return prefix + "\n" + "\n".join(result_str)
+        return prefix

     @property
     def success(self) -> bool:
|
@@ -689,8 +691,9 @@ class CodeInterpreterFactory:
|
|
689
691
|
if not code_sandbox_runtime:
|
690
692
|
code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
|
691
693
|
if code_sandbox_runtime == "e2b":
|
694
|
+
envs = _get_e2b_env()
|
692
695
|
instance: CodeInterpreter = E2BCodeInterpreter(
|
693
|
-
timeout=_SESSION_TIMEOUT, remote_path=remote_path
|
696
|
+
timeout=_SESSION_TIMEOUT, remote_path=remote_path, envs=envs
|
694
697
|
)
|
695
698
|
elif code_sandbox_runtime == "local":
|
696
699
|
instance = LocalCodeInterpreter(
|
@@ -703,6 +706,20 @@ class CodeInterpreterFactory:
         return instance


+def _get_e2b_env() -> Union[Dict[str, str], None]:
+    openai_api_key = os.getenv("OPENAI_API_KEY", "")
+    anthropic_api_key = os.getenv("ANTHROPIC_API_KEY", "")
+    if openai_api_key or anthropic_api_key:
+        envs = {}
+        if openai_api_key:
+            envs["OPENAI_API_KEY"] = openai_api_key
+        if anthropic_api_key:
+            envs["ANTHROPIC_API_KEY"] = anthropic_api_key
+    else:
+        envs = None
+    return envs
+
+
 def _parse_local_code_interpreter_outputs(outputs: List[Dict[str, Any]]) -> Execution:
     """Parse notebook cell outputs to Execution object. Output types:
     https://nbformat.readthedocs.io/en/latest/format_description.html#code-cell-outputs
vision_agent/utils/video.py
CHANGED
@@ -7,7 +7,6 @@ from typing import List, Optional, Tuple
 import av  # type: ignore
 import cv2
 import numpy as np
-from decord import VideoReader  # type: ignore

 _LOGGER = logging.getLogger(__name__)
 # The maximum length of the clip to extract frames from, in seconds
@@ -103,7 +102,7 @@ def frames_to_bytes(
 def extract_frames_from_video(
     video_uri: str, fps: float = 1.0
 ) -> List[Tuple[np.ndarray, float]]:
-    """Extract frames from a video
+    """Extract frames from a video along with the timestamp in seconds.

     Parameters:
         video_uri (str): the path to the video file or a video file url
@@ -115,12 +114,24 @@ def extract_frames_from_video(
     from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
     the video. The frames are sorted by the timestamp in ascending order.
     """
-
-
-
-
-
-
-
-
-
+
+    cap = cv2.VideoCapture(video_uri)
+    orig_fps = cap.get(cv2.CAP_PROP_FPS)
+    orig_frame_time = 1 / orig_fps
+    targ_frame_time = 1 / fps
+    frames: List[Tuple[np.ndarray, float]] = []
+    i = 0
+    elapsed_time = 0.0
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        elapsed_time += orig_frame_time
+        if elapsed_time >= targ_frame_time:
+            frames.append((cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), i / orig_fps))
+            elapsed_time -= targ_frame_time
+
+        i += 1
+    cap.release()
+    return frames
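The decord reader is replaced with a plain OpenCV loop that accumulates elapsed source time and emits a frame whenever a full target-frame interval has passed. The index selection can be verified without opening a video; the fps values below are powers of two so the float accumulation stays exact:

```python
from typing import List


def sampled_indices(n_frames: int, orig_fps: float, fps: float) -> List[int]:
    # mirrors the accumulator in extract_frames_from_video
    orig_frame_time = 1 / orig_fps
    targ_frame_time = 1 / fps
    elapsed_time = 0.0
    out: List[int] = []
    for i in range(n_frames):
        elapsed_time += orig_frame_time
        if elapsed_time >= targ_frame_time:
            out.append(i)
            elapsed_time -= targ_frame_time
    return out


print(sampled_indices(64, orig_fps=32.0, fps=2.0))  # [15, 31, 47, 63] -> 2 frames/sec
print(sampled_indices(8, orig_fps=32.0, fps=32.0))  # [0, 1, ..., 7] -> every frame kept
```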
{vision_agent-0.2.132.dist-info → vision_agent-0.2.134.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.132
+Version: 0.2.134
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -13,7 +13,6 @@ Requires-Dist: anthropic (>=0.31.0,<0.32.0)
 Requires-Dist: av (>=11.0.0,<12.0.0)
 Requires-Dist: e2b (>=0.17.2a50,<0.18.0)
 Requires-Dist: e2b-code-interpreter (==0.0.11a37)
-Requires-Dist: eva-decord (>=0.6.1,<0.7.0)
 Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
 Requires-Dist: langsmith (>=0.1.58,<0.2.0)
 Requires-Dist: nbclient (>=0.10.0,<0.11.0)
{vision_agent-0.2.132.dist-info → vision_agent-0.2.134.dist-info}/RECORD
CHANGED
@@ -1,10 +1,10 @@
 vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
-vision_agent/agent/__init__.py,sha256=
+vision_agent/agent/__init__.py,sha256=TddDT4e3JVc68Dt0zSk0B4OBORx_R2WhAGK71uqEe2w,204
 vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
-vision_agent/agent/agent_utils.py,sha256=
+vision_agent/agent/agent_utils.py,sha256=qOYQn-wJsa4j4YjFOBQ41xyklCg8Y94CIIGw9ZXmgIU,2053
 vision_agent/agent/vision_agent.py,sha256=nfxdY5W5UME7JhwFcsB3j2-L5zsYZzJWdlS2R8U_9lE,13224
-vision_agent/agent/vision_agent_coder.py,sha256=
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=
+vision_agent/agent/vision_agent_coder.py,sha256=OI95goKTqVaEEPYwkn6bVsHsHZeifoBC8rjG9nD0Znc,36909
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=a7P19QscKNiaweke0zHPCfi5GQImpG-ZGKv_kXz0seg,13452
 vision_agent/agent/vision_agent_prompts.py,sha256=-fXiIIb48duXVljWYcJ0Y4ZzfNnRFi3C5cKdF4SdDo8,10075
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
@@ -12,22 +12,22 @@ vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
-vision_agent/lmm/lmm.py,sha256=
+vision_agent/lmm/lmm.py,sha256=soWmEjtleQUSH2G3tYZWxOmteIqkgMVcmuZfx4mxszU,16838
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=nufZNzbcLTuXwxFmvZNj99qE8EO2qtEPT8wFsuI9vyE,2397
 vision_agent/tools/meta_tools.py,sha256=qbf_dzVmhf4zhv-xY1zaqRFshDlvj_7ilFQtSr70hdQ,21213
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tool_utils.py,sha256=ZYqzcw_e937reoNr7gJgyKjQ7Gudxz1ttfIyo7F65w8,7758
+vision_agent/tools/tools.py,sha256=WKeB99ED0o_ISS_vZc-ch_1Dc8_Fl2fhnGlfVNwNouc,70024
 vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=
+vision_agent/utils/execute.py,sha256=BZ114WuP5oVk45E_uvUkCwYX-nmVQdNgBvhH3GegUnM,27748
 vision_agent/utils/image_utils.py,sha256=zTTOJFOieMzwIquTFnW7T6ssx9o6XfoZ0Unqyk7GJrg,10746
 vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
-vision_agent/utils/video.py,sha256=
-vision_agent-0.2.132.dist-info/LICENSE,sha256=
-vision_agent-0.2.132.dist-info/METADATA,sha256=
-vision_agent-0.2.132.dist-info/WHEEL,sha256=
-vision_agent-0.2.132.dist-info/RECORD,,
+vision_agent/utils/video.py,sha256=hOjfEOZNcddYdoa0CoviXA4Vo9kwURKuojIJgLLJdp0,4745
+vision_agent-0.2.134.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.134.dist-info/METADATA,sha256=nGDpQtIHLCuDkIo4Is_YfgtLhNNPT3_Os35lGn0UyoQ,12252
+vision_agent-0.2.134.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.134.dist-info/RECORD,,
{vision_agent-0.2.132.dist-info → vision_agent-0.2.134.dist-info}/LICENSE
File without changes
{vision_agent-0.2.132.dist-info → vision_agent-0.2.134.dist-info}/WHEEL
File without changes