vision-agent 0.2.225__tar.gz → 0.2.226__tar.gz

Files changed (47)
  1. {vision_agent-0.2.225 → vision_agent-0.2.226}/PKG-INFO +1 -1
  2. {vision_agent-0.2.225 → vision_agent-0.2.226}/pyproject.toml +1 -1
  3. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/.sim_tools/df.csv +49 -91
  4. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/.sim_tools/embs.npy +0 -0
  5. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/agent_utils.py +13 -0
  6. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_coder_prompts_v2.py +1 -1
  7. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_coder_v2.py +6 -1
  8. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_planner_prompts_v2.py +42 -33
  9. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_v2.py +30 -22
  10. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/tools/planner_tools.py +4 -2
  11. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/tools/tools.py +46 -37
  12. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/utils/sim.py +6 -0
  13. {vision_agent-0.2.225 → vision_agent-0.2.226}/LICENSE +0 -0
  14. {vision_agent-0.2.225 → vision_agent-0.2.226}/README.md +0 -0
  15. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/__init__.py +0 -0
  16. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/README.md +0 -0
  17. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/__init__.py +0 -0
  18. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/agent.py +0 -0
  19. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/types.py +0 -0
  20. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/vision_agent.py +0 -0
  21. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_coder.py +0 -0
  22. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  23. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_planner.py +0 -0
  24. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  25. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  26. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_prompts.py +0 -0
  27. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  28. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/clients/__init__.py +0 -0
  29. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/clients/http.py +0 -0
  30. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/clients/landing_public_api.py +0 -0
  31. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/fonts/__init__.py +0 -0
  32. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  33. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/lmm/__init__.py +0 -0
  34. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/lmm/lmm.py +0 -0
  35. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/lmm/types.py +0 -0
  36. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/tools/__init__.py +0 -0
  37. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/tools/meta_tools.py +0 -0
  38. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/tools/prompts.py +0 -0
  39. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/tools/tool_utils.py +0 -0
  40. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/tools/tools_types.py +0 -0
  41. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/utils/__init__.py +0 -0
  42. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/utils/exceptions.py +0 -0
  43. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/utils/execute.py +0 -0
  44. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/utils/image_utils.py +0 -0
  45. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/utils/type_defs.py +0 -0
  46. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/utils/video.py +0 -0
  47. {vision_agent-0.2.225 → vision_agent-0.2.226}/vision_agent/utils/video_tracking.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.225
+Version: 0.2.226
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.225"
+version = "0.2.226"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -65,25 +65,30 @@ desc,doc,name
     },
 ]
 ",owlv2_sam2_instance_segmentation
-"'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
-    'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
-    prompt such as category names or referring expressions. The categories in the text
-    prompt are separated by commas. It returns a list of bounding boxes, label names,
-    mask file names and associated probability scores.
+"'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+    'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
-        image (np.ndarray): The image to ground the prompt to.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
+            new objects.
         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
            fine-tuned model ID here to use it.
 
     Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-        bounding box, and mask of the detected objects with normalized coordinates
-        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-        the background.
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+        label, segmentation mask and bounding boxes. The outer list represents each
+        frame and the inner list is the entities per frame. The detected objects
+        have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+        and ymin are the coordinates of the top-left and xmax and ymax are the
+        coordinates of the bottom-right of the bounding box. The mask is binary 2D
+        numpy array where 1 indicates the object and 0 indicates the background.
+        The label names are prefixed with their ID represent the total count.
 
     Example
     -------
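The return shape documented above (one inner list of detections per frame, with labels prefixed by a persistent track ID) is what makes duplicate-free counting possible. A minimal sketch, not taken from the package, of consuming that structure; the `"<id>: <name>"` label format is assumed from the docstring:

```python
from typing import Any, Dict, List

def count_unique_objects(tracks: List[List[Dict[str, Any]]]) -> int:
    """Count distinct track IDs across all frames of a tracking result."""
    ids = set()
    for frame_dets in tracks:        # outer list: one entry per video frame
        for det in frame_dets:       # inner list: detections in that frame
            track_id = det["label"].split(":")[0].strip()
            ids.add(track_id)
    return len(ids)

# e.g. tracks = owlv2_sam2_video_tracking("person", frames)
# print(count_unique_objects(tracks))
```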
@@ -170,25 +175,28 @@ desc,doc,name
     },
 ]
 ",countgd_sam2_instance_segmentation
-"'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
-    'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
-    prompt such as category names or referring expressions. The categories in the text
-    prompt are separated by commas. It returns a list of bounding boxes, label names,
-    mask file names and associated probability scores.
+"'countgd_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
+    'countgd_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
-        image (np.ndarray): The image to ground the prompt to.
-        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run countgd to find
            new objects.
 
     Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-        bounding box, and mask of the detected objects with normalized coordinates
-        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-        the background.
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+        label, segmentation mask and bounding boxes. The outer list represents each
+        frame and the inner list is the entities per frame. The detected objects
+        have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+        and ymin are the coordinates of the top-left and xmax and ymax are the
+        coordinates of the bottom-right of the bounding box. The mask is binary 2D
+        numpy array where 1 indicates the object and 0 indicates the background.
+        The label names are prefixed with their ID represent the total count.
 
     Example
     -------
@@ -265,12 +273,12 @@ desc,doc,name
     },
 ]
 ",florence2_sam2_instance_segmentation
-'florence2_sam2_video_tracking' is a tool that can segment and track multiple entities in a video given a text prompt such as category names or referring expressions. You can optionally separate the categories in the text with commas. It can find new objects every 'chunk_length' frames and is useful for tracking and counting without duplicating counts and always outputs scores of 1.0.,"florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
-    'florence2_sam2_video_tracking' is a tool that can segment and track multiple
-    entities in a video given a text prompt such as category names or referring
-    expressions. You can optionally separate the categories in the text with commas. It
-    can find new objects every 'chunk_length' frames and is useful for tracking and
-    counting without duplicating counts and always outputs scores of 1.0.
+"'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+    'florence2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
@@ -282,10 +290,13 @@ desc,doc,name
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
-        label, segment mask and bounding boxes. The outer list represents each frame
-        and the inner list is the entities per frame. The label contains the object ID
-        followed by the label name. The objects are only identified in the first framed
-        and tracked throughout the video.
+        label, segmentation mask and bounding boxes. The outer list represents each
+        frame and the inner list is the entities per frame. The detected objects
+        have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+        and ymin are the coordinates of the top-left and xmax and ymax are the
+        coordinates of the bottom-right of the bounding box. The mask is binary 2D
+        numpy array where 1 indicates the object and 0 indicates the background.
+        The label names are prefixed with their ID represent the total count.
 
     Example
     -------
@@ -445,43 +456,6 @@ desc,doc,name
     >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
     'Lionel Messi'
 ",qwen2_vl_video_vqa
-"'detr_segmentation' is a tool that can segment common objects in an image without any text prompt. It returns a list of detected objects as labels, their regions as masks and their scores.","detr_segmentation(image: numpy.ndarray) -> List[Dict[str, Any]]:
-    'detr_segmentation' is a tool that can segment common objects in an
-    image without any text prompt. It returns a list of detected objects
-    as labels, their regions as masks and their scores.
-
-    Parameters:
-        image (np.ndarray): The image used to segment things and objects
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label
-        and mask of the detected objects. The mask is binary 2D numpy array where 1
-        indicates the object and 0 indicates the background.
-
-    Example
-    -------
-    >>> detr_segmentation(image)
-    [
-        {
-            'score': 0.45,
-            'label': 'window',
-            'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0],
-                ...,
-                [0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-        },
-        {
-            'score': 0.70,
-            'label': 'bird',
-            'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0],
-                ...,
-                [0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-        },
-    ]
-",detr_segmentation
 'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intesities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
     'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
     depth image from a given RGB image. The returned depth image is monochrome and
@@ -522,22 +496,6 @@ desc,doc,name
         [10, 11, 15, ..., 202, 202, 205],
         [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
 ",generate_pose_image
-'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
-    'vit_image_classification' is a tool that can classify an image. It returns a
-    list of classes and their probability scores based on image content.
-
-    Parameters:
-        image (np.ndarray): The image to classify or tag
-
-    Returns:
-        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
-        contains a list of labels and other a list of scores.
-
-    Example
-    -------
-    >>> vit_image_classification(image)
-    {""labels"": [""leopard"", ""lemur, otter"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
-",vit_image_classification
 'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'. It returns the predicted label and their probability scores based on image content.,"vit_nsfw_classification(image: numpy.ndarray) -> Dict[str, Any]:
     'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
     It returns the predicted label and their probability scores based on image content.
@@ -566,7 +524,7 @@ desc,doc,name
         prompt (str): The question about the video
         frames (List[np.ndarray]): The reference frames used for the question
         model (str): The model to use for the inference. Valid values are
-            'qwen2vl', 'gpt4o', 'internlm-xcomposer'
+            'qwen2vl', 'gpt4o'.
         chunk_length_frames (Optional[int]): length of each chunk in frames
 
     Returns:
@@ -641,7 +599,7 @@ desc,doc,name
     >>> closest_distance(det1, det2, image_size)
     141.42
 ",minimum_distance
-"'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
+"'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 5) -> List[Dict[str, Union[numpy.ndarray, float]]]:
     'extract_frames_and_timestamps' extracts frames and timestamps from a video
     which can be a file path, url or youtube link, returns a list of dictionaries
     with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is
@@ -651,7 +609,7 @@ desc,doc,name
     Parameters:
         video_uri (Union[str, Path]): The path to the video file, url or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
-            to 1.
+            to 5.
 
     Returns:
         List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
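The fps default changing from 1 to 5 affects any plan that calls the extractor without an explicit rate. A usage sketch, assuming the function is re-exported from `vision_agent.tools` and `workers.mp4` is a placeholder input:

```python
from vision_agent.tools import extract_frames_and_timestamps  # assumed export path

# With the new default, roughly 5 frames are sampled per second of video.
frames_and_ts = extract_frames_and_timestamps("workers.mp4")
frames = [d["frame"] for d in frames_and_ts]          # numpy arrays
timestamps = [d["timestamp"] for d in frames_and_ts]  # seconds from video start
print(f"{len(frames)} frames, first at t={timestamps[0]:.2f}s")

# Pass fps explicitly to keep the old, sparser sampling:
sparse = extract_frames_and_timestamps("workers.mp4", fps=1)
```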
@@ -153,6 +153,19 @@ def format_plan_v2(plan: PlanContext) -> str:
     return plan_str
 
 
+def format_conversation(chat: List[AgentMessage]) -> str:
+    chat = copy.deepcopy(chat)
+    prompt = ""
+    for chat_i in chat:
+        if chat_i.role == "user":
+            prompt += f"USER: {chat_i.content}\n\n"
+        elif chat_i.role == "observation" or chat_i.role == "coder":
+            prompt += f"OBSERVATION: {chat_i.content}\n\n"
+        elif chat_i.role == "conversation":
+            prompt += f"AGENT: {chat_i.content}\n\n"
+    return prompt
+
+
 def format_plans(plans: Dict[str, Any]) -> str:
     plan_str = ""
     for k, v in plans.items():
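`format_conversation` moves into `agent_utils.py` unchanged, so the conversation agent and other callers can share it. An illustrative call, assuming `AgentMessage` accepts `role` and `content` keyword arguments as the code above suggests:

```python
from vision_agent.agent.agent_utils import format_conversation
from vision_agent.agent.types import AgentMessage

chat = [
    AgentMessage(role="user", content="Count the cars in the video."),
    AgentMessage(role="conversation", content="I'll track cars across frames."),
    AgentMessage(role="observation", content="Found 3 unique car tracks."),
]
print(format_conversation(chat))
# USER: Count the cars in the video.
#
# AGENT: I'll track cars across frames.
#
# OBSERVATION: Found 3 unique car tracks.
```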
@@ -65,7 +65,7 @@ This is the documentation for the functions you have access to. You may call any
 7. DO NOT assert the output value, run the code and assert only the output format or data structure.
 8. DO NOT use try except block to handle the error, let the error be raised if the code is incorrect.
 9. DO NOT import the testing function as it will available in the testing environment.
-10. Print the output of the function that is being tested.
+10. Print the output of the function that is being tested and ensure it is not empty.
 11. Use the output of the function that is being tested as the return value of the testing function.
 12. Run the testing function in the end and don't assign a variable to its output.
 13. Output your test code using <code> tags:
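Taken together, the rules above pin down the shape of a valid test. A hypothetical example that satisfies them; `detect_people` is an invented stand-in for whatever function the coder generated, stubbed here so the block runs:

```python
def detect_people(image_path: str) -> list:
    # stand-in for the generated function under test (hypothetical)
    return [{"label": "person", "score": 0.9, "bbox": [0.1, 0.1, 0.5, 0.9]}]

def test_detect_people():
    output = detect_people("people.jpg")  # the function being tested
    print(output)                         # rule 10: print it and ensure non-empty
    assert isinstance(output, list)       # rule 7: assert format, not values
    assert len(output) > 0
    return output                         # rule 11: return the tested output

test_detect_people()                      # rule 12: run last, no assignment
```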
@@ -202,7 +202,12 @@ def write_and_test_code(
         tool_docs=tool_docs,
         plan=plan,
     )
-    code = strip_function_calls(code)
+    try:
+        code = strip_function_calls(code)
+    except Exception:
+        # the code may be malformatted, this will fail in the exec call and the agent
+        # will attempt to debug it
+        pass
     test = write_test(
         tester=tester,
         chat=chat,
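The guard makes sense if `strip_function_calls` rewrites the generated source by parsing it; a syntactically invalid generation would then raise before the code ever executes. A sketch of that failure mode, assuming an AST-based implementation (the helper's source is not shown in this diff):

```python
import ast

malformed = "def main(:\n    pass"  # e.g. a truncated LMM generation
try:
    ast.parse(malformed)            # an AST-based cleanup pass would raise here
except SyntaxError as e:
    # With the change above, the raw code is kept and the later exec step
    # surfaces the error so the debug loop can attempt a repair.
    print(f"parse failed: {e.msg}")
```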
@@ -136,8 +136,9 @@ Tool Documentation:
 countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
     'countgd_object_detection' is a tool that can detect multiple instances of an
     object given a text prompt. It is particularly useful when trying to detect and
-    count a large number of objects. It returns a list of bounding boxes with
-    normalized coordinates, label names and associated confidence scores.
+    count a large number of objects. You can optionally separate object names in the
+    prompt with commas. It returns a list of bounding boxes with normalized
+    coordinates, label names and associated confidence scores.
 
     Parameters:
         prompt (str): The object that needs to be counted.
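The doc tweak advertises multi-class prompts. A hypothetical call showing the comma-separated form; the input file and the export path are assumptions, not package code:

```python
import numpy as np
from PIL import Image
from vision_agent.tools import countgd_object_detection  # assumed export path

image = np.array(Image.open("fruit.jpg"))  # hypothetical input image
dets = countgd_object_detection("apple, orange", image)
# One flat list comes back; split the counts per class via the label field.
apples = [d for d in dets if d["label"] == "apple"]
oranges = [d for d in dets if d["label"] == "orange"]
print(len(apples), "apples,", len(oranges), "oranges")
```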
@@ -272,40 +273,47 @@ OBSERVATION:
 [get_tool_for_task output]
 For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: florence2_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor.
 
-'florence2_sam2_video_tracking' is a tool that can segment and track multiple
-entities in a video given a text prompt such as category names or referring
-expressions. You can optionally separate the categories in the text with commas. It
-can find new objects every 'chunk_length' frames and is useful for tracking and
-counting without duplicating counts and always outputs scores of 1.0.
+Tool Documentation:
+def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
+    'florence2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
-Parameters:
-    prompt (str): The prompt to ground to the video.
-    frames (List[np.ndarray]): The list of frames to ground the prompt to.
-    chunk_length (Optional[int]): The number of frames to re-run florence2 to find
-        new objects.
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+            new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
-Returns:
-    List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
-    label,segment mask and bounding boxes. The outer list represents each frame and
-    the inner list is the entities per frame. The label contains the object ID
-    followed by the label name. The objects are only identified in the first framed
-    and tracked throughout the video.
+    Returns:
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+        label, segmentation mask and bounding boxes. The outer list represents each
+        frame and the inner list is the entities per frame. The detected objects
+        have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+        and ymin are the coordinates of the top-left and xmax and ymax are the
+        coordinates of the bottom-right of the bounding box. The mask is binary 2D
+        numpy array where 1 indicates the object and 0 indicates the background.
+        The label names are prefixed with their ID represent the total count.
 
-Example
--------
->>> florence2_sam2_video("car, dinosaur", frames)
-[
+    Example
+    -------
+    >>> florence2_sam2_video_tracking("car, dinosaur", frames)
     [
-    {
-        'label': '0: dinosaur',
-        'bbox': [0.1, 0.11, 0.35, 0.4],
-        'mask': array([[0, 0, 0, ..., 0, 0, 0],
-            ...,
-            [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-    },
-],
-...
-]
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
 [end of get_tool_for_task output]
 <count>8</count>
 
@@ -691,7 +699,8 @@ FINALIZE_PLAN = """
 4. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
 5. Do not include {excluded_tools} tools in your instructions.
 6. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_file` or `save_video`.
-6. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
+7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
+8. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
 
 <json>
 {{
@@ -1,13 +1,14 @@
 import copy
 import json
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 
 from vision_agent.agent import Agent, AgentCoder, VisionAgentCoderV2
 from vision_agent.agent.agent_utils import (
     add_media_to_chat,
     convert_message_to_agentmessage,
     extract_tag,
+    format_conversation,
 )
 from vision_agent.agent.types import (
     AgentMessage,
@@ -22,19 +23,6 @@ from vision_agent.lmm.types import Message
 from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory
 
 
-def format_conversation(chat: List[AgentMessage]) -> str:
-    chat = copy.deepcopy(chat)
-    prompt = ""
-    for chat_i in chat:
-        if chat_i.role == "user":
-            prompt += f"USER: {chat_i.content}\n\n"
-        elif chat_i.role == "observation" or chat_i.role == "coder":
-            prompt += f"OBSERVATION: {chat_i.content}\n\n"
-        elif chat_i.role == "conversation":
-            prompt += f"AGENT: {chat_i.content}\n\n"
-    return prompt
-
-
 def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
     # only keep last 10 messages
     conv = format_conversation(chat[-10:])
@@ -55,23 +43,39 @@ def check_for_interaction(chat: List[AgentMessage]) -> bool:
 
 def extract_conversation_for_generate_code(
     chat: List[AgentMessage],
-) -> List[AgentMessage]:
+) -> Tuple[List[AgentMessage], Optional[str]]:
     chat = copy.deepcopy(chat)
 
     # if we are in the middle of an interaction, return all the intermediate planning
     # steps
     if check_for_interaction(chat):
-        return chat
+        return chat, None
 
     extracted_chat = []
     for chat_i in chat:
         if chat_i.role == "user":
             extracted_chat.append(chat_i)
         elif chat_i.role == "coder":
-            if "<final_code>" in chat_i.content and "<final_test>" in chat_i.content:
+            if "<final_code>" in chat_i.content:
                 extracted_chat.append(chat_i)
 
-    return extracted_chat
+    # only keep the last <final_code> and <final_test>
+    final_code = None
+    extracted_chat_strip_code: List[AgentMessage] = []
+    for chat_i in reversed(extracted_chat):
+        if "<final_code>" in chat_i.content and final_code is None:
+            extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code
+            final_code = extract_tag(chat_i.content, "final_code")
+            if final_code is not None:
+                test_code = extract_tag(chat_i.content, "final_test")
+                final_code += "\n" + test_code if test_code is not None else ""
+
+        if "<final_code>" in chat_i.content and final_code is not None:
+            continue
+
+        extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code
+
+    return extracted_chat_strip_code[-5:], final_code
 
 
 def maybe_run_action(
@@ -81,7 +85,7 @@ def maybe_run_action(
     code_interpreter: Optional[CodeInterpreter] = None,
 ) -> Optional[List[AgentMessage]]:
     if action == "generate_or_edit_vision_code":
-        extracted_chat = extract_conversation_for_generate_code(chat)
+        extracted_chat, _ = extract_conversation_for_generate_code(chat)
         # there's an issue here because coder.generate_code will send it's code_context
         # to the outside user via it's update_callback, but we don't necessarily have
         # access to that update_callback here, so we re-create the message using
@@ -101,11 +105,15 @@ def maybe_run_action(
             )
         ]
     elif action == "edit_code":
-        extracted_chat = extract_conversation_for_generate_code(chat)
+        extracted_chat, final_code = extract_conversation_for_generate_code(chat)
         plan_context = PlanContext(
             plan="Edit the latest code observed in the fewest steps possible according to the user's feedback.",
-            instructions=[],
-            code="",
+            instructions=[
+                chat_i.content
+                for chat_i in extracted_chat
+                if chat_i.role == "user" and "<final_code>" not in chat_i.content
+            ],
+            code=final_code if final_code is not None else "",
         )
         context = coder.generate_code_from_plan(
             extracted_chat, plan_context, code_interpreter=code_interpreter
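A short walk-through of the new contract, with invented chat contents (import paths follow the diff): only the newest `<final_code>` message survives, its code and test are concatenated into `final_code`, and at most the last five messages come back, which lets the `edit_code` branch seed `PlanContext` with real code instead of an empty string.

```python
from vision_agent.agent.types import AgentMessage
from vision_agent.agent.vision_agent_v2 import extract_conversation_for_generate_code

chat = [
    AgentMessage(role="user", content="Detect dogs."),
    AgentMessage(role="coder", content="<final_code>v1</final_code><final_test>t1</final_test>"),
    AgentMessage(role="user", content="Only count brown dogs now."),
    AgentMessage(role="coder", content="<final_code>v2</final_code><final_test>t2</final_test>"),
]
messages, final_code = extract_conversation_for_generate_code(chat)
# messages keeps both user turns plus only the v2 coder message;
# final_code is "v2\nt2" and the v1 message is dropped entirely.
print(final_code)
```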
@@ -193,8 +193,10 @@ def get_tool_for_task(
         - Depth and pose estimation
         - Video object tracking
 
-    Wait until the documentation is printed to use the function so you know what the
-    input and output signatures are.
+    Only ask for one type of task at a time, for example a task needing to identify
+    text is one OCR task while needing to identify non-text objects is an OD task. Wait
+    until the documentation is printed to use the function so you know what the input
+    and output signatures are.
 
     Parameters:
         task: str: The task to accomplish.
@@ -515,24 +515,29 @@ def owlv2_sam2_video_tracking(
     chunk_length: Optional[int] = 10,
     fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
-    """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
-    prompt such as category names or referring expressions. The categories in the text
-    prompt are separated by commas. It returns a list of bounding boxes, label names,
-    mask file names and associated probability scores.
+    """'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
-        image (np.ndarray): The image to ground the prompt to.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
+            new objects.
         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
            fine-tuned model ID here to use it.
 
     Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-        bounding box, and mask of the detected objects with normalized coordinates
-        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-        the background.
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+        label, segmentation mask and bounding boxes. The outer list represents each
+        frame and the inner list is the entities per frame. The detected objects
+        have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+        and ymin are the coordinates of the top-left and xmax and ymax are the
+        coordinates of the bottom-right of the bounding box. The mask is binary 2D
+        numpy array where 1 indicates the object and 0 indicates the background.
+        The label names are prefixed with their ID represent the total count.
 
     Example
     -------
@@ -742,11 +747,11 @@ def florence2_sam2_video_tracking(
     chunk_length: Optional[int] = 10,
     fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
-    """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
-    entities in a video given a text prompt such as category names or referring
-    expressions. You can optionally separate the categories in the text with commas. It
-    can find new objects every 'chunk_length' frames and is useful for tracking and
-    counting without duplicating counts and always outputs scores of 1.0.
+    """'florence2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
@@ -758,10 +763,13 @@ def florence2_sam2_video_tracking(
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
-        label, segment mask and bounding boxes. The outer list represents each frame
-        and the inner list is the entities per frame. The label contains the object ID
-        followed by the label name. The objects are only identified in the first framed
-        and tracked throughout the video.
+        label, segmentation mask and bounding boxes. The outer list represents each
+        frame and the inner list is the entities per frame. The detected objects
+        have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+        and ymin are the coordinates of the top-left and xmax and ymax are the
+        coordinates of the bottom-right of the bounding box. The mask is binary 2D
+        numpy array where 1 indicates the object and 0 indicates the background.
+        The label names are prefixed with their ID represent the total count.
 
     Example
     -------
@@ -1076,24 +1084,27 @@ def countgd_sam2_video_tracking(
     frames: List[np.ndarray],
     chunk_length: Optional[int] = 10,
 ) -> List[List[Dict[str, Any]]]:
-    """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
-    prompt such as category names or referring expressions. The categories in the text
-    prompt are separated by commas. It returns a list of bounding boxes, label names,
-    mask file names and associated probability scores.
+    """'countgd_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
-        image (np.ndarray): The image to ground the prompt to.
-        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run countgd to find
            new objects.
 
     Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-        bounding box, and mask of the detected objects with normalized coordinates
-        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-        the background.
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+        label, segmentation mask and bounding boxes. The outer list represents each
+        frame and the inner list is the entities per frame. The detected objects
+        have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+        and ymin are the coordinates of the top-left and xmax and ymax are the
+        coordinates of the bottom-right of the bounding box. The mask is binary 2D
+        numpy array where 1 indicates the object and 0 indicates the background.
+        The label names are prefixed with their ID represent the total count.
 
     Example
     -------
@@ -1533,7 +1544,7 @@ def video_temporal_localization(
         prompt (str): The question about the video
         frames (List[np.ndarray]): The reference frames used for the question
         model (str): The model to use for the inference. Valid values are
-            'qwen2vl', 'gpt4o', 'internlm-xcomposer'
+            'qwen2vl', 'gpt4o'.
         chunk_length_frames (Optional[int]): length of each chunk in frames
 
     Returns:
@@ -2102,7 +2113,7 @@ def closest_box_distance(
 
 
 def extract_frames_and_timestamps(
-    video_uri: Union[str, Path], fps: float = 1
+    video_uri: Union[str, Path], fps: float = 5
 ) -> List[Dict[str, Union[np.ndarray, float]]]:
     """'extract_frames_and_timestamps' extracts frames and timestamps from a video
     which can be a file path, url or youtube link, returns a list of dictionaries
@@ -2113,7 +2124,7 @@ def extract_frames_and_timestamps(
     Parameters:
         video_uri (Union[str, Path]): The path to the video file, url or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
-            to 1.
+            to 5.
 
     Returns:
         List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
@@ -2636,10 +2647,8 @@ FUNCTION_TOOLS = [
     ocr,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
-    detr_segmentation,
     depth_anything_v2,
     generate_pose_image,
-    vit_image_classification,
     vit_nsfw_classification,
     video_temporal_localization,
     flux_image_inpainting,
@@ -133,6 +133,12 @@ class Sim:
         df: pd.DataFrame,
     ) -> bool:
         load_dir = Path(load_dir)
+        if (
+            not Path(load_dir / "df.csv").exists()
+            or not Path(load_dir / "embs.npy").exists()
+        ):
+            return False
+
         df_load = pd.read_csv(load_dir / "df.csv")
         if platform.system() == "Windows":
             df_load["doc"] = df_load["doc"].apply(lambda x: x.replace("\r", ""))