PyPI - vision-agent - Versions diffs - 0.2.236__py3-none-any.whl → 0.2.238__py3-none-any.whl - Mend

vision-agent: vision_agent-0.2.236-py3-none-any.whl → vision_agent-0.2.238-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

vision_agent/.sim_tools/df.csv +57 -80
vision_agent/.sim_tools/embs.npy +0 -0
vision_agent/agent/agent.py +2 -2
vision_agent/agent/vision_agent.py +3 -2
vision_agent/agent/vision_agent_coder.py +13 -19
vision_agent/agent/vision_agent_coder_v2.py +17 -17
vision_agent/agent/vision_agent_planner.py +16 -21
vision_agent/agent/vision_agent_planner_prompts_v2.py +19 -20
vision_agent/agent/vision_agent_planner_v2.py +29 -15
vision_agent/agent/vision_agent_v2.py +12 -12
vision_agent/clients/landing_public_api.py +1 -1
vision_agent/configs/anthropic_openai_config.py +17 -3
vision_agent/configs/config.py +17 -3
vision_agent/lmm/__init__.py +0 -1
vision_agent/lmm/lmm.py +4 -3
vision_agent/models/__init__.py +11 -0
vision_agent/{lmm/types.py → models/lmm_types.py} +4 -1
vision_agent/sim/__init__.py +9 -0
vision_agent/{utils → sim}/sim.py +3 -3
vision_agent/tools/__init__.py +10 -23
vision_agent/tools/meta_tools.py +4 -5
vision_agent/tools/planner_tools.py +148 -37
vision_agent/tools/tools.py +388 -302
vision_agent/utils/__init__.py +0 -1
vision_agent/{agent/agent_utils.py → utils/agent.py} +11 -2
vision_agent/utils/image_utils.py +18 -7
vision_agent/{tools/tool_utils.py → utils/tools.py} +1 -93
vision_agent/utils/tools_doc.py +87 -0
vision_agent/utils/video.py +15 -0
vision_agent/utils/video_tracking.py +38 -5
{vision_agent-0.2.236.dist-info → vision_agent-0.2.238.dist-info}/METADATA +2 -3
vision_agent-0.2.238.dist-info/RECORD +55 -0
vision_agent-0.2.236.dist-info/RECORD +0 -52
vision_agent/{agent/types.py → models/agent_types.py} +0 -0
vision_agent/{tools → models}/tools_types.py +0 -0
{vision_agent-0.2.236.dist-info → vision_agent-0.2.238.dist-info}/LICENSE +0 -0
{vision_agent-0.2.236.dist-info → vision_agent-0.2.238.dist-info}/WHEEL +0 -0

vision_agent/.sim_tools/df.csv CHANGED Viewed

@@ -65,7 +65,7 @@ desc,doc,name
             },
         ]
     ",owlv2_sam2_instance_segmentation
-"'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+"'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.1, chunk_length: Optional[int] = 25, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
 'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as category names or referring
     expressions. The categories in the text prompt are separated by commas. It returns
@@ -75,6 +75,8 @@ desc,doc,name
     Parameters:
         prompt (str): The prompt to ground to the image.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.10.
         chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
             new objects.
         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
@@ -175,7 +177,7 @@ desc,doc,name
             },
         ]
     ",countgd_sam2_instance_segmentation
-"'countgd_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
+"'countgd_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.23, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
 'countgd_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as category names or referring
     expressions. The categories in the text prompt are separated by commas. It returns
@@ -185,6 +187,8 @@ desc,doc,name
     Parameters:
         prompt (str): The prompt to ground to the image.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        box_threshold (float, optional): The threshold for detection. Defaults
+            to 0.23.
         chunk_length (Optional[int]): The number of frames to re-run countgd to find
             new objects.
@@ -236,6 +240,34 @@ desc,doc,name
             {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
         ]
     ",florence2_ocr
+"'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+'florence2_object_detection' is a tool that can detect multiple objects given a
+    text prompt which can be object names or caption. You can optionally separate the
+    object names in the text with commas. It returns a list of bounding boxes with
+    normalized coordinates, label names and associated confidence scores of 1.0.
+    Parameters:
+        prompt (str): The prompt to ground to the image. Use exclusive categories that
+            do not overlap such as 'person, car' and NOT 'person, athlete'.
+        image (np.ndarray): The image to used to detect objects
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box. The scores are always 1.0 and cannot be thresholded
+    Example
+    -------
+        >>> florence2_object_detection('person looking at a coyote', image)
+        [
+            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
+        ]
+    ",florence2_object_detection
 "'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
 'florence2_sam2_instance_segmentation' is a tool that can segment multiple
     objects given a text prompt such as category names or referring expressions. The
@@ -274,7 +306,7 @@ desc,doc,name
             },
         ]
     ",florence2_sam2_instance_segmentation
-"'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+"'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 25, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
 'florence2_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as category names or referring
     expressions. The categories in the text prompt are separated by commas. It returns
@@ -318,34 +350,6 @@ desc,doc,name
             ...
         ]
     ",florence2_sam2_video_tracking
-"'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
-'florence2_object_detection' is a tool that can detect multiple objects given a
-    text prompt which can be object names or caption. You can optionally separate the
-    object names in the text with commas. It returns a list of bounding boxes with
-    normalized coordinates, label names and associated confidence scores of 1.0.
-    Parameters:
-        prompt (str): The prompt to ground to the image. Use exclusive categories that
-            do not overlap such as 'person, car' and NOT 'person, athlete'.
-        image (np.ndarray): The image to used to detect objects
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-            bounding box of the detected objects with normalized coordinates between 0
-            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
-            top-left and xmax and ymax are the coordinates of the bottom-right of the
-            bounding box. The scores are always 1.0 and cannot be thresholded
-    Example
-    -------
-        >>> florence2_object_detection('person looking at a coyote', image)
-        [
-            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
-        ]
-    ",florence2_object_detection
 'claude35_text_extraction' is a tool that can extract text from an image. It returns the extracted text as a string and can be used as an alternative to OCR if you do not need to know the exact bounding box of the text.,"claude35_text_extraction(image: numpy.ndarray) -> str:
 'claude35_text_extraction' is a tool that can extract text from an image. It
     returns the extracted text as a string and can be used as an alternative to OCR if
@@ -458,6 +462,28 @@ desc,doc,name
         >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
         'Lionel Messi'
     ",qwen2_vl_video_vqa
+'activity_recognition' is a tool that can recognize activities in a video given a text prompt. It can be used to identify where specific activities or actions happen in a video and returns a list of 0s and 1s to indicate the activity.,"activity_recognition(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 10) -> List[float]:
+'activity_recognition' is a tool that can recognize activities in a video given a
+    text prompt. It can be used to identify where specific activities or actions
+    happen in a video and returns a list of 0s and 1s to indicate the activity.
+    Parameters:
+        prompt (str): The event you want to identify, should be phrased as a question,
+            for example, ""Did a goal happen?"".
+        frames (List[np.ndarray]): The reference frames used for the question
+        model (str): The model to use for the inference. Valid values are
+            'claude-35', 'gpt-4o', 'qwen2vl'.
+        chunk_length_frames (int): length of each chunk in frames
+    Returns:
+        List[float]: A list of floats with a value of 1.0 if the activity is detected in
+            the chunk_length_frames of the video.
+    Example
+    -------
+        >>> activity_recognition('Did a goal happened?', frames)
+        [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
+    ",activity_recognition
 'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intesities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
 'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
     depth image from a given RGB image. The returned depth image is monochrome and
@@ -514,30 +540,6 @@ desc,doc,name
         >>> vit_nsfw_classification(image)
         {""label"": ""normal"", ""scores"": 0.68},
     ",vit_nsfw_classification
-'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 2) -> List[float]:
-'video_temporal_localization' will run qwen2vl on each chunk_length_frames
-    value selected for the video. It can detect multiple objects independently per
-    chunk_length_frames given a text prompt such as a referring expression
-    but does not track objects across frames.
-    It returns a list of floats with a value of 1.0 if the objects are found in a given
-    chunk_length_frames of the video.
-    Parameters:
-        prompt (str): The question about the video
-        frames (List[np.ndarray]): The reference frames used for the question
-        model (str): The model to use for the inference. Valid values are
-            'qwen2vl', 'gpt4o'.
-        chunk_length_frames (int): length of each chunk in frames
-    Returns:
-        List[float]: A list of floats with a value of 1.0 if the objects to be found
-            are present in the chunk_length_frames of the video.
-    Example
-    -------
-        >>> video_temporal_localization('Did a goal happened?', frames)
-        [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
-    ",video_temporal_localization
 "'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
 'flux_image_inpainting' performs image inpainting to fill the masked regions,
     given by mask, in the image, given image based on the text prompt and surrounding
@@ -728,28 +730,3 @@ desc,doc,name
             }],
         )
     ",overlay_segmentation_masks
-'overlay_heat_map' is a utility function that displays a heat map on an image.,"overlay_heat_map(image: numpy.ndarray, heat_map: Dict[str, Any], alpha: float = 0.8) -> numpy.ndarray:
-'overlay_heat_map' is a utility function that displays a heat map on an image.
-    Parameters:
-        image (np.ndarray): The image to display the heat map on.
-        heat_map (Dict[str, Any]): A dictionary containing the heat map under the key
-            'heat_map'.
-        alpha (float, optional): The transparency of the overlay. Defaults to 0.8.
-    Returns:
-        np.ndarray: The image with the heat map displayed.
-    Example
-    -------
-        >>> image_with_heat_map = overlay_heat_map(
-            image,
-            {
-                'heat_map': array([[0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0],
-                    ...,
-                    [0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 125, 125, 125]], dtype=uint8),
-            },
-        )
-    ",overlay_heat_map

vision_agent/.sim_tools/embs.npy CHANGED Viewed

Binary file

vision_agent/agent/agent.py CHANGED Viewed

@@ -2,13 +2,13 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
-from vision_agent.agent.types import (
+from vision_agent.models import (
     AgentMessage,
     CodeContext,
     InteractionContext,
+    Message,
     PlanContext,
 )
-from vision_agent.lmm.types import Message
 from vision_agent.utils.execute import CodeInterpreter

vision_agent/agent/vision_agent.py CHANGED Viewed

@@ -6,7 +6,6 @@ from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 from vision_agent.agent import Agent
-from vision_agent.agent.agent_utils import extract_json, extract_tag
 from vision_agent.agent.vision_agent_prompts import (
     EXAMPLES_CODE1,
     EXAMPLES_CODE2,
@@ -14,7 +13,8 @@ from vision_agent.agent.vision_agent_prompts import (
     EXAMPLES_CODE3_EXTRA2,
     VA_CODE,
 )
-from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
+from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM
+from vision_agent.models import Message
 from vision_agent.tools.meta_tools import (
     META_TOOL_DOCSTRING,
     Artifacts,
@@ -22,6 +22,7 @@ from vision_agent.tools.meta_tools import (
     use_extra_vision_agent_args,
 )
 from vision_agent.utils import CodeInterpreterFactory
+from vision_agent.utils.agent import extract_json, extract_tag
 from vision_agent.utils.execute import CodeInterpreter, Execution
 logging.basicConfig(level=logging.INFO)

vision_agent/agent/vision_agent_coder.py CHANGED Viewed

@@ -9,16 +9,6 @@ from tabulate import tabulate
 import vision_agent.tools as T
 from vision_agent.agent.agent import Agent
-from vision_agent.agent.agent_utils import (
-    _MAX_TABULATE_COL_WIDTH,
-    DefaultImports,
-    extract_code,
-    extract_tag,
-    format_feedback,
-    print_code,
-    remove_installs_from_code,
-    strip_function_calls,
-)
 from vision_agent.agent.vision_agent_coder_prompts import (
     CODE,
     FIX_BUG,
@@ -32,16 +22,20 @@ from vision_agent.agent.vision_agent_planner import (
     OpenAIVisionAgentPlanner,
     PlanContext,
 )
-from vision_agent.lmm import (
-    LMM,
-    AnthropicLMM,
-    AzureOpenAILMM,
-    Message,
-    OllamaLMM,
-    OpenAILMM,
-)
+from vision_agent.lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
+from vision_agent.models import Message
 from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils import CodeInterpreterFactory, Execution
+from vision_agent.utils.agent import (
+    _MAX_TABULATE_COL_WIDTH,
+    DefaultImports,
+    extract_code,
+    extract_tag,
+    format_feedback,
+    print_code,
+    remove_installs_from_code,
+    strip_function_calls,
+)
 from vision_agent.utils.execute import CodeInterpreter
 logging.basicConfig(stream=sys.stdout)
@@ -490,7 +484,7 @@ class VisionAgentCoder(Agent):
                 tool_info=tool_doc,
                 tool_output=tool_output_str,
                 plan_thoughts=plan_thoughts_str,
-                tool_utils=T.UTILITIES_DOCSTRING,
+                tool_utils=T.get_utilties_docstring(),
                 working_memory=working_memory,
                 coder=self.coder,
                 tester=self.tester,

vision_agent/agent/vision_agent_coder_v2.py CHANGED Viewed

@@ -5,9 +5,22 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
 from rich.console import Console
 from rich.markup import escape
-import vision_agent.tools.tools as T
 from vision_agent.agent import AgentCoder, AgentPlanner
-from vision_agent.agent.agent_utils import (
+from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
+from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
+from vision_agent.configs import Config
+from vision_agent.lmm import LMM
+from vision_agent.models import (
+    AgentMessage,
+    CodeContext,
+    InteractionContext,
+    Message,
+    PlanContext,
+)
+from vision_agent.sim import Sim, get_tool_recommender
+from vision_agent.tools.meta_tools import get_diff
+from vision_agent.tools.tools import get_utilties_docstring
+from vision_agent.utils.agent import (
     DefaultImports,
     add_media_to_chat,
     capture_media_from_exec,
@@ -18,24 +31,11 @@ from vision_agent.agent.agent_utils import (
     print_code,
     strip_function_calls,
 )
-from vision_agent.agent.types import (
-    AgentMessage,
-    CodeContext,
-    InteractionContext,
-    PlanContext,
-)
-from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
-from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
-from vision_agent.configs import Config
-from vision_agent.lmm import LMM
-from vision_agent.lmm.types import Message
-from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils.execute import (
     CodeInterpreter,
     CodeInterpreterFactory,
     Execution,
 )
-from vision_agent.utils.sim import Sim, get_tool_recommender
 CONFIG = Config()
 _CONSOLE = Console()
@@ -207,7 +207,7 @@ def test_code(
     test = write_test(
         tester=tester,
         chat=chat,
-        tool_util_docs=T.UTILITIES_DOCSTRING,
+        tool_util_docs=get_utilties_docstring(),
         code=code,
         media_list=media_list,
     )
@@ -227,7 +227,7 @@ def test_code(
     while (not result.success or len(result.logs.stdout) == 0) and count < 3:
         code, test, debug_info = debug_code(
             debugger,
-            T.UTILITIES_DOCSTRING + "\n" + tool_docs,
+            get_utilties_docstring() + "\n" + tool_docs,
             plan,
             code,
             test,

vision_agent/agent/vision_agent_planner.py CHANGED Viewed

@@ -9,15 +9,6 @@ from tabulate import tabulate
 import vision_agent.tools as T
 from vision_agent.agent import Agent
-from vision_agent.agent.agent_utils import (
-    _MAX_TABULATE_COL_WIDTH,
-    DefaultImports,
-    extract_code,
-    extract_json,
-    format_feedback,
-    format_plans,
-    print_code,
-)
 from vision_agent.agent.vision_agent_planner_prompts import (
     PICK_PLAN,
     PLAN,
@@ -25,20 +16,24 @@ from vision_agent.agent.vision_agent_planner_prompts import (
     TEST_PLANS,
     USER_REQ,
 )
-from vision_agent.lmm import (
-    LMM,
-    AnthropicLMM,
-    AzureOpenAILMM,
-    Message,
-    OllamaLMM,
-    OpenAILMM,
+from vision_agent.lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
+from vision_agent.models import Message
+from vision_agent.sim import AzureSim, OllamaSim, Sim
+from vision_agent.utils.agent import (
+    _MAX_TABULATE_COL_WIDTH,
+    DefaultImports,
+    extract_code,
+    extract_json,
+    format_feedback,
+    format_plans,
+    print_code,
 )
 from vision_agent.utils.execute import (
     CodeInterpreter,
     CodeInterpreterFactory,
     Execution,
 )
-from vision_agent.utils.sim import AzureSim, OllamaSim, Sim
+from vision_agent.utils.tools_doc import get_tool_descriptions_by_names
 _LOGGER = logging.getLogger(__name__)
@@ -348,7 +343,7 @@ class VisionAgentPlanner(Agent):
             _LOGGER.setLevel(logging.INFO)
         self.tool_recommender = (
-            Sim(T.TOOLS_DF, sim_key="desc")
+            Sim(T.get_tools_df(), sim_key="desc")
             if tool_recommender is None
             else tool_recommender
         )
@@ -414,7 +409,7 @@ class VisionAgentPlanner(Agent):
             plans = write_plans(
                 chat,
-                T.get_tool_descriptions_by_names(
+                get_tool_descriptions_by_names(
                     custom_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
                 ),
                 format_feedback(working_memory),
@@ -537,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
                 else planner
             ),
             tool_recommender=(
-                OllamaSim(T.TOOLS_DF, sim_key="desc")
+                OllamaSim(T.get_tools_df(), sim_key="desc")
                 if tool_recommender is None
                 else tool_recommender
             ),
@@ -559,7 +554,7 @@ class AzureVisionAgentPlanner(VisionAgentPlanner):
         super().__init__(
             planner=(AzureOpenAILMM(temperature=0.0) if planner is None else planner),
             tool_recommender=(
-                AzureSim(T.TOOLS_DF, sim_key="desc")
+                AzureSim(T.get_tools_df(), sim_key="desc")
                 if tool_recommender is None
                 else tool_recommender
             ),

vision_agent/agent/vision_agent_planner_prompts_v2.py CHANGED Viewed

@@ -9,21 +9,22 @@ PLAN = """
 **Example Planning**: Here are some examples of how you can search for a plan, in the examples the user output is denoted by USER, your output is denoted by AGENT and the observations after your code execution are denoted by OBSERVATION:
 {examples}
-**Current Planning**:
---- START PLANNING ---
+**Current Planning**: This is the plan you are currently working on
+--- START CURRENT PLANNING ---
 {planning}
---- END PLANNING ---
+--- END CURRENT PLANNING ---
 **Instructions**:
 1. Read over the user request and context provided and output <thinking> tags to indicate your thought process. You can <count> number of turns to complete the user's request.
 2. You can execute python code in the ipython notebook using <execute_python> tags. Only output one <execute_python> tag at a time.
 3. Only output <finalize_plan> when you are done planning and want to end the planning process. DO NOT output <finalize_plan> with <execute_python> tags, only after OBSERVATION's.
 4. Only load/save files from {media_list} unless you specifically saved the file previously.
-5. Ensure you always call `suggestion` initially and `get_tool_for_task` to get the right tool for the subtask.
+5. Ensure you always call `suggestion` and `claude35_vqa` initially and `get_tool_for_task` to get the right tool for the subtask.
 6. Calling `plt.imshow` or `save_image` will display the image to you so you can check your results. If you see an image after <execute_python> it's generated from your code.
-7. DO NOT hard code the answer into your code, it should be dynamic and work for any similar request.
-8. DO NOT over index on claude35_vqa, if tool output is close to claude35_vqa's output you do not need to improve the tool.
-9. You can only respond in the following format with a single <thinking>, <execute_python> or <finalize_plan> tag:
+7. Be sure to print results returned for tools so you can see the output.
+8. DO NOT hard code the answer into your code, it should be dynamic and work for any similar request.
+9. DO NOT over index on claude35_vqa, if tool output is close to claude35_vqa's output you do not need to improve the tool output, tools are often better at things like counting and detecting small objects.
+10. You can only respond in the following format with a single <thinking>, <execute_python> or <finalize_plan> tag:
 <thinking>Your thought process...</thinking>
 <execute_python>Your code here</execute_python>
@@ -334,23 +335,21 @@ get_tool_for_task('Identify and track the boxes in the video', frames[:5])
 OBSERVATION:
 [get_tool_for_task output]
-For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: florence2_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor and using the prompt 'box'.
+For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: countgd_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor and using the prompt 'box'.
 Tool Documentation:
-def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
-    'florence2_sam2_video_tracking' is a tool that can track and segment multiple
+def countgd_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
+    'countgd_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as category names or referring
     expressions. The categories in the text prompt are separated by commas. It returns
     a list of bounding boxes, label names, masks and associated probability scores and
     is useful for tracking and counting without duplicating counts.
     Parameters:
-        prompt (str): The prompt to ground to the video.
+        prompt (str): The prompt to ground to the image.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
-        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+        chunk_length (Optional[int]): The number of frames to re-run countgd to find
             new objects.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
@@ -364,7 +363,7 @@ def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_l
     Example
     -------
-        >>> florence2_sam2_video_tracking("car, dinosaur", frames)
+        >>> countgd_sam2_video_tracking("car, dinosaur", frames)
         [
             [
                 {
@@ -382,7 +381,7 @@ def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_l
 AGENT: <thinking>Now that I have the tool and prompt for tracking I need to find a tool to identify if there is tape on each of the boxes.</thinking>
 <execute_python>
-track_predictions = florence2_sam2_video_tracking("box", frames)
+track_predictions = countgd_sam2_video_tracking("box", frames)
 # only use first 5 frames to reduce processing time
 crops = []
@@ -512,16 +511,16 @@ PICK_PLAN = """
 CATEGORIZE_TOOL_REQUEST = """
 You are given a task: "{task}" from the user. You must extract the type of category this task belongs to, it can be one or more of the following:
+- "VQA" - answering questions about an image or video, can be used for most tasks, should generally be included.
 - "object detection and counting" - detecting objects or counting objects from a text prompt in an image.
 - "instance segmentation" - segmenting objects in an image given a text prompt.
 - "classification" - classifying objects in an image given a text prompt.
 - "segmentation" - segmenting objects in an image or video given a text prompt.
 - "OCR" - extracting text from an image.
-- "VQA" - answering questions about an image or video, can also be used for text extraction.
 - "DocQA" - answering questions about a document or extracting information from a document.
 - "video object tracking" - tracking objects in a video.
 - "depth and pose estimation" - estimating the depth or pose of objects in an image.
-- "temporal localization" - localizing the time period an event occurs in a video.
+- "activity recognition" - identifying time period(s) an event occurs in a video.
 - "inpainting" - filling in masked parts of an image.
 Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than fewer.
@@ -718,7 +717,7 @@ PICK_TOOL = """
 FINALIZE_PLAN = """
 **Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agent can write a program to accomplish the user request.
-**Documentation**: You can use these tools to help you visualize or save the output:
+**Documentation**: You can use these tools to help you visualize or save the output (they are imported `from vision_agent.tools import *`):
 {tool_desc}
 **Planning**: Here is chain of thoughts, executions and observations from the planning agent:
@@ -730,7 +729,7 @@ FINALIZE_PLAN = """
 3. Only use tools obtained from calling `get_tool_for_task`.
 4. Do not include {excluded_tools} tools in your instructions.
 5. Ensure the function is well documented and easy to understand.
-6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and save it to a file with `save_image` or `save_video`.
+6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks`, if bounding boxes or segmentation masks are produced, and save it to a file with `save_image` or `save_video`.
 7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
 8. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
 9. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:

vision_agent/agent/vision_agent_planner_v2.py CHANGED Viewed

@@ -13,17 +13,6 @@ from rich.markup import escape
 import vision_agent.tools as T
 import vision_agent.tools.planner_tools as pt
 from vision_agent.agent import AgentPlanner
-from vision_agent.agent.agent_utils import (
-    add_media_to_chat,
-    capture_media_from_exec,
-    convert_message_to_agentmessage,
-    extract_json,
-    extract_tag,
-    print_code,
-    print_table,
-    remove_installs_from_code,
-)
-from vision_agent.agent.types import AgentMessage, InteractionContext, PlanContext
 from vision_agent.agent.vision_agent_planner_prompts_v2 import (
     CRITIQUE_PLAN,
     EXAMPLE_PLAN1,
@@ -34,17 +23,29 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import (
     PLAN,
 )
 from vision_agent.configs import Config
-from vision_agent.lmm import LMM, Message
-from vision_agent.tools.planner_tools import check_function_call, get_tool_documentation
+from vision_agent.lmm import LMM
+from vision_agent.models import AgentMessage, InteractionContext, Message, PlanContext
+from vision_agent.tools.planner_tools import check_function_call
+from vision_agent.utils.agent import (
+    add_media_to_chat,
+    capture_media_from_exec,
+    convert_message_to_agentmessage,
+    extract_json,
+    extract_tag,
+    print_code,
+    print_table,
+    remove_installs_from_code,
+)
 from vision_agent.utils.execute import (
     CodeInterpreter,
     CodeInterpreterFactory,
     Execution,
 )
+from vision_agent.utils.tools_doc import get_tool_documentation
 logging.basicConfig(level=logging.INFO)
 CONFIG = Config()
-UTIL_DOCSTRING = T.get_tool_documentation(
+UTIL_DOCSTRING = get_tool_documentation(
     [
         T.load_image,
         T.extract_frames_and_timestamps,
@@ -360,6 +361,16 @@ def get_steps(chat: List[AgentMessage], max_steps: int) -> int:
     return max_steps
+def format_tool_output(tool_thoughts: str, tool_docstring: str) -> str:
+    return_str = "[get_tool_for_task output]\n"
+    if tool_thoughts.strip() != "":
+        return_str += f"{tool_thoughts}\n\n"
+    return_str += (
+        f"Tool Documentation:\n{tool_docstring}\n[end of get_tool_for_task output]\n"
+    )
+    return return_str
 def replace_interaction_with_obs(chat: List[AgentMessage]) -> List[AgentMessage]:
     chat = copy.deepcopy(chat)
     new_chat = []
@@ -371,7 +382,10 @@ def replace_interaction_with_obs(chat: List[AgentMessage]) -> List[AgentMessage]
             try:
                 response = json.loads(chat[i + 1].content)
                 function_name = response["function_name"]
-                tool_doc = get_tool_documentation(function_name)
+                tools_df = T.get_tools_df()
+                tool_doc = format_tool_output(
+                    "", tools_df[tools_df["name"] == function_name]["doc"].values[0]
+                )
                 if "box_threshold" in response:
                     tool_doc = f"Use the following function with box_threshold={response['box_threshold']}. This tool and its parameters were chosen by the user so do not change them in your planning.\n\n{tool_doc}."
                 new_chat.append(AgentMessage(role="observation", content=tool_doc))

vision-agent 0.2.236__py3-none-any.whl → 0.2.238__py3-none-any.whl

vision-agent 0.2.236py3-none-any.whl → 0.2.238py3-none-any.whl