vision-agent 0.2.229__py3-none-any.whl → 0.2.230__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +10 -8
- vision_agent/agent/agent_utils.py +10 -9
- vision_agent/agent/vision_agent.py +3 -4
- vision_agent/agent/vision_agent_coder_prompts.py +6 -6
- vision_agent/agent/vision_agent_coder_v2.py +41 -26
- vision_agent/agent/vision_agent_planner_prompts.py +6 -6
- vision_agent/agent/vision_agent_planner_prompts_v2.py +16 -50
- vision_agent/agent/vision_agent_planner_v2.py +10 -12
- vision_agent/agent/vision_agent_prompts.py +11 -11
- vision_agent/agent/vision_agent_prompts_v2.py +18 -3
- vision_agent/agent/vision_agent_v2.py +29 -30
- vision_agent/configs/__init__.py +1 -0
- vision_agent/configs/anthropic_config.py +150 -0
- vision_agent/configs/anthropic_openai_config.py +150 -0
- vision_agent/configs/config.py +150 -0
- vision_agent/configs/openai_config.py +160 -0
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +63 -9
- vision_agent/tools/planner_tools.py +60 -40
- vision_agent/tools/tools.py +10 -8
- vision_agent-0.2.230.dist-info/METADATA +156 -0
- {vision_agent-0.2.229.dist-info → vision_agent-0.2.230.dist-info}/RECORD +24 -19
- vision_agent-0.2.229.dist-info/METADATA +0 -562
- {vision_agent-0.2.229.dist-info → vision_agent-0.2.230.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.229.dist-info → vision_agent-0.2.230.dist-info}/WHEEL +0 -0
    
        vision_agent/.sim_tools/df.csv
    CHANGED
    
    | @@ -244,7 +244,8 @@ desc,doc,name | |
| 244 244 | 
             
                1.0.
         | 
| 245 245 |  | 
| 246 246 | 
             
                Parameters:
         | 
| 247 | 
            -
                    prompt (str): The prompt to ground to the image.
         | 
| 247 | 
            +
                    prompt (str): The prompt to ground to the image. Use exclusive categories that
         | 
| 248 | 
            +
                        do not overlap such as 'person, car' and NOT 'person, athlete'.
         | 
| 248 249 | 
             
                    image (np.ndarray): The image to ground the prompt to.
         | 
| 249 250 | 
             
                    fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
         | 
| 250 251 | 
             
                        fine-tuned model ID here to use it.
         | 
| @@ -281,7 +282,8 @@ desc,doc,name | |
| 281 282 | 
             
                is useful for tracking and counting without duplicating counts.
         | 
| 282 283 |  | 
| 283 284 | 
             
                Parameters:
         | 
| 284 | 
            -
                    prompt (str): The prompt to ground to the  | 
| 285 | 
            +
                    prompt (str): The prompt to ground to the image. Use exclusive categories that
         | 
| 286 | 
            +
                        do not overlap such as 'person, car' and NOT 'person, athlete'.
         | 
| 285 287 | 
             
                    frames (List[np.ndarray]): The list of frames to ground the prompt to.
         | 
| 286 288 | 
             
                    chunk_length (Optional[int]): The number of frames to re-run florence2 to find
         | 
| 287 289 | 
             
                        new objects.
         | 
| @@ -317,14 +319,14 @@ desc,doc,name | |
| 317 319 | 
             
                    ]
         | 
| 318 320 | 
             
                ",florence2_sam2_video_tracking
         | 
| 319 321 | 
             
            "'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
         | 
| 320 | 
            -
            'florence2_object_detection' is a tool that can detect multiple
         | 
| 321 | 
            -
                 | 
| 322 | 
            -
                 | 
| 323 | 
            -
                 | 
| 324 | 
            -
                confidence scores of 1.0.
         | 
| 322 | 
            +
            'florence2_object_detection' is a tool that can detect multiple objects given a
         | 
| 323 | 
            +
                text prompt which can be object names or caption. You can optionally separate the
         | 
| 324 | 
            +
                object names in the text with commas. It returns a list of bounding boxes with
         | 
| 325 | 
            +
                normalized coordinates, label names and associated confidence scores of 1.0.
         | 
| 325 326 |  | 
| 326 327 | 
             
                Parameters:
         | 
| 327 | 
            -
                    prompt (str): The prompt to ground to the image.
         | 
| 328 | 
            +
                    prompt (str): The prompt to ground to the image. Use exclusive categories that
         | 
| 329 | 
            +
                        do not overlap such as 'person, car' and NOT 'person, athlete'.
         | 
| 328 330 | 
             
                    image (np.ndarray): The image to used to detect objects
         | 
| 329 331 | 
             
                    fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
         | 
| 330 332 | 
             
                        fine-tuned model ID here to use it.
         | 
| @@ -157,10 +157,11 @@ def format_conversation(chat: List[AgentMessage]) -> str: | |
| 157 157 | 
             
                chat = copy.deepcopy(chat)
         | 
| 158 158 | 
             
                prompt = ""
         | 
| 159 159 | 
             
                for chat_i in chat:
         | 
| 160 | 
            -
                    if chat_i.role == "user":
         | 
| 161 | 
            -
                         | 
| 162 | 
            -
             | 
| 163 | 
            -
                         | 
| 160 | 
            +
                    if chat_i.role == "user" or chat_i.role == "coder":
         | 
| 161 | 
            +
                        if "<final_code>" in chat_i.role:
         | 
| 162 | 
            +
                            prompt += f"OBSERVATION: {chat_i.content}\n\n"
         | 
| 163 | 
            +
                        elif chat_i.role == "user":
         | 
| 164 | 
            +
                            prompt += f"USER: {chat_i.content}\n\n"
         | 
| 164 165 | 
             
                    elif chat_i.role == "conversation":
         | 
| 165 166 | 
             
                        prompt += f"AGENT: {chat_i.content}\n\n"
         | 
| 166 167 | 
             
                return prompt
         | 
| @@ -332,26 +333,26 @@ def strip_function_calls(  # noqa: C901 | |
| 332 333 | 
             
                    def __init__(self, exclusions: List[str]):
         | 
| 333 334 | 
             
                        # Store exclusions to skip removing certain function calls
         | 
| 334 335 | 
             
                        self.exclusions = exclusions
         | 
| 335 | 
            -
                        self.in_function_or_class =  | 
| 336 | 
            +
                        self.in_function_or_class: List[bool] = []
         | 
| 336 337 |  | 
| 337 338 | 
             
                    def visit_FunctionDef(self, node: cst.FunctionDef) -> Optional[bool]:
         | 
| 338 | 
            -
                        self.in_function_or_class | 
| 339 | 
            +
                        self.in_function_or_class.append(True)
         | 
| 339 340 | 
             
                        return True
         | 
| 340 341 |  | 
| 341 342 | 
             
                    def leave_FunctionDef(
         | 
| 342 343 | 
             
                        self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef
         | 
| 343 344 | 
             
                    ) -> cst.BaseStatement:
         | 
| 344 | 
            -
                        self.in_function_or_class | 
| 345 | 
            +
                        self.in_function_or_class.pop()
         | 
| 345 346 | 
             
                        return updated_node
         | 
| 346 347 |  | 
| 347 348 | 
             
                    def visit_ClassDef(self, node: cst.ClassDef) -> Optional[bool]:
         | 
| 348 | 
            -
                        self.in_function_or_class | 
| 349 | 
            +
                        self.in_function_or_class.append(True)
         | 
| 349 350 | 
             
                        return True
         | 
| 350 351 |  | 
| 351 352 | 
             
                    def leave_ClassDef(
         | 
| 352 353 | 
             
                        self, node: cst.ClassDef, updated_node: cst.ClassDef
         | 
| 353 354 | 
             
                    ) -> cst.BaseStatement:
         | 
| 354 | 
            -
                        self.in_function_or_class | 
| 355 | 
            +
                        self.in_function_or_class.pop()
         | 
| 355 356 | 
             
                        return updated_node
         | 
| 356 357 |  | 
| 357 358 | 
             
                    def leave_Expr(
         | 
| @@ -291,10 +291,9 @@ class VisionAgent(Agent): | |
| 291 291 | 
             
                        verbosity (int): The verbosity level of the agent.
         | 
| 292 292 | 
             
                        callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
         | 
| 293 293 | 
             
                            function to send intermediate update messages.
         | 
| 294 | 
            -
                         | 
| 295 | 
            -
                             | 
| 296 | 
            -
                             | 
| 297 | 
            -
                            object is provided it will use that.
         | 
| 294 | 
            +
                        code_sandbox_runtime (Optional[str]): For string values it can be one of:
         | 
| 295 | 
            +
                            None, "local" or "e2b". If None, it will read from the environment
         | 
| 296 | 
            +
                            variable "CODE_SANDBOX_RUNTIME".
         | 
| 298 297 | 
             
                    """
         | 
| 299 298 |  | 
| 300 299 | 
             
                    self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
         | 
| @@ -44,22 +44,22 @@ Can you write a program to check if each person is wearing a helmet? First detec | |
| 44 44 |  | 
| 45 45 | 
             
            ## Subtasks
         | 
| 46 46 |  | 
| 47 | 
            -
            This plan uses the  | 
| 48 | 
            -
            -Use  | 
| 47 | 
            +
            This plan uses the owlv2_object_detection tool to detect both people and helmets in a single pass, which should be efficient and accurate. We can then compare the detections to determine if each person is wearing a helmet.
         | 
| 48 | 
            +
            -Use owlv2_object_detection with prompt 'person, helmet' to detect both people and helmets in the image
         | 
| 49 49 | 
             
            -Process the detections to match helmets with people based on bounding box proximity
         | 
| 50 50 | 
             
            -Count people with and without helmets based on the matching results
         | 
| 51 51 | 
             
            -Return a dictionary with the counts
         | 
| 52 52 |  | 
| 53 53 |  | 
| 54 54 | 
             
            **Tool Tests and Outputs**:
         | 
| 55 | 
            -
            After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using  | 
| 55 | 
            +
            After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using owlv2_object_detection seems to be the most accurate in detecting both people and helmets. However, it needs some modifications to improve accuracy. We should increase the confidence threshold to 0.15 to filter out the lowest confidence box, and implement logic to associate helmets with people based on their bounding box positions. Plan 2 and Plan 3 seem less reliable given the tool outputs, as they either failed to distinguish between people with and without helmets or misclassified all workers as not wearing helmets.
         | 
| 56 56 |  | 
| 57 57 | 
             
            **Tool Output Thoughts**:
         | 
| 58 58 | 
             
            ```python
         | 
| 59 59 | 
             
            ...
         | 
| 60 60 | 
             
            ```
         | 
| 61 61 | 
             
            ----- stdout -----
         | 
| 62 | 
            -
            Plan 1 -  | 
| 62 | 
            +
            Plan 1 - owlv2_object_detection:
         | 
| 63 63 |  | 
| 64 64 | 
             
            [{{'label': 'helmet', 'score': 0.15, 'bbox': [0.85, 0.41, 0.87, 0.45]}}, {{'label': 'helmet', 'score': 0.3, 'bbox': [0.8, 0.43, 0.81, 0.46]}}, {{'label': 'helmet', 'score': 0.31, 'bbox': [0.85, 0.45, 0.86, 0.46]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.84, 0.45, 0.88, 0.58]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.78, 0.43, 0.82, 0.57]}}, {{'label': 'helmet', 'score': 0.33, 'bbox': [0.3, 0.65, 0.32, 0.67]}}, {{'label': 'person', 'score': 0.29, 'bbox': [0.28, 0.65, 0.36, 0.84]}}, {{'label': 'helmet', 'score': 0.29, 'bbox': [0.13, 0.82, 0.15, 0.85]}}, {{'label': 'person', 'score': 0.3, 'bbox': [0.1, 0.82, 0.24, 1.0]}}]
         | 
| 65 65 |  | 
| @@ -67,12 +67,12 @@ Plan 1 - owl_v2_image: | |
| 67 67 |  | 
| 68 68 | 
             
            **Input Code Snippet**:
         | 
| 69 69 | 
             
            ```python
         | 
| 70 | 
            -
            from vision_agent.tools import load_image,  | 
| 70 | 
            +
            from vision_agent.tools import load_image, owlv2_object_detection
         | 
| 71 71 |  | 
| 72 72 | 
             
            def check_helmets(image_path):
         | 
| 73 73 | 
             
                image = load_image(image_path)
         | 
| 74 74 | 
             
                # Detect people and helmets, filter out the lowest confidence helmet score of 0.15
         | 
| 75 | 
            -
                detections =  | 
| 75 | 
            +
                detections = owlv2_object_detection("person, helmet", image, box_threshold=0.15)
         | 
| 76 76 | 
             
                height, width = image.shape[:2]
         | 
| 77 77 |  | 
| 78 78 | 
             
                # Separate people and helmets
         | 
| @@ -26,7 +26,8 @@ from vision_agent.agent.types import ( | |
| 26 26 | 
             
            )
         | 
| 27 27 | 
             
            from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
         | 
| 28 28 | 
             
            from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
         | 
| 29 | 
            -
            from vision_agent. | 
| 29 | 
            +
            from vision_agent.configs import Config
         | 
| 30 | 
            +
            from vision_agent.lmm import LMM
         | 
| 30 31 | 
             
            from vision_agent.lmm.types import Message
         | 
| 31 32 | 
             
            from vision_agent.tools.meta_tools import get_diff
         | 
| 32 33 | 
             
            from vision_agent.utils.execute import (
         | 
| @@ -36,6 +37,7 @@ from vision_agent.utils.execute import ( | |
| 36 37 | 
             
            )
         | 
| 37 38 | 
             
            from vision_agent.utils.sim import Sim, get_tool_recommender
         | 
| 38 39 |  | 
| 40 | 
            +
            CONFIG = Config()
         | 
| 39 41 | 
             
            _CONSOLE = Console()
         | 
| 40 42 |  | 
| 41 43 |  | 
| @@ -185,23 +187,17 @@ def debug_code( | |
| 185 187 | 
             
                return code, test, debug_info
         | 
| 186 188 |  | 
| 187 189 |  | 
| 188 | 
            -
            def  | 
| 189 | 
            -
                coder: LMM,
         | 
| 190 | 
            +
            def test_code(
         | 
| 190 191 | 
             
                tester: LMM,
         | 
| 191 192 | 
             
                debugger: LMM,
         | 
| 192 193 | 
             
                chat: List[AgentMessage],
         | 
| 193 194 | 
             
                plan: str,
         | 
| 195 | 
            +
                code: str,
         | 
| 194 196 | 
             
                tool_docs: str,
         | 
| 195 197 | 
             
                code_interpreter: CodeInterpreter,
         | 
| 196 198 | 
             
                media_list: List[Union[str, Path]],
         | 
| 197 199 | 
             
                verbose: bool,
         | 
| 198 200 | 
             
            ) -> CodeContext:
         | 
| 199 | 
            -
                code = write_code(
         | 
| 200 | 
            -
                    coder=coder,
         | 
| 201 | 
            -
                    chat=chat,
         | 
| 202 | 
            -
                    tool_docs=tool_docs,
         | 
| 203 | 
            -
                    plan=plan,
         | 
| 204 | 
            -
                )
         | 
| 205 201 | 
             
                try:
         | 
| 206 202 | 
             
                    code = strip_function_calls(code)
         | 
| 207 203 | 
             
                except Exception:
         | 
| @@ -257,6 +253,36 @@ def write_and_test_code( | |
| 257 253 | 
             
                )
         | 
| 258 254 |  | 
| 259 255 |  | 
| 256 | 
            +
            def write_and_test_code(
         | 
| 257 | 
            +
                coder: LMM,
         | 
| 258 | 
            +
                tester: LMM,
         | 
| 259 | 
            +
                debugger: LMM,
         | 
| 260 | 
            +
                chat: List[AgentMessage],
         | 
| 261 | 
            +
                plan: str,
         | 
| 262 | 
            +
                tool_docs: str,
         | 
| 263 | 
            +
                code_interpreter: CodeInterpreter,
         | 
| 264 | 
            +
                media_list: List[Union[str, Path]],
         | 
| 265 | 
            +
                verbose: bool,
         | 
| 266 | 
            +
            ) -> CodeContext:
         | 
| 267 | 
            +
                code = write_code(
         | 
| 268 | 
            +
                    coder=coder,
         | 
| 269 | 
            +
                    chat=chat,
         | 
| 270 | 
            +
                    tool_docs=tool_docs,
         | 
| 271 | 
            +
                    plan=plan,
         | 
| 272 | 
            +
                )
         | 
| 273 | 
            +
                return test_code(
         | 
| 274 | 
            +
                    tester,
         | 
| 275 | 
            +
                    debugger,
         | 
| 276 | 
            +
                    chat,
         | 
| 277 | 
            +
                    plan,
         | 
| 278 | 
            +
                    code,
         | 
| 279 | 
            +
                    tool_docs,
         | 
| 280 | 
            +
                    code_interpreter,
         | 
| 281 | 
            +
                    media_list,
         | 
| 282 | 
            +
                    verbose,
         | 
| 283 | 
            +
                )
         | 
| 284 | 
            +
             | 
| 285 | 
            +
             | 
| 260 286 | 
             
            class VisionAgentCoderV2(AgentCoder):
         | 
| 261 287 | 
             
                """VisionAgentCoderV2 is an agent that will write vision code for you."""
         | 
| 262 288 |  | 
| @@ -300,21 +326,9 @@ class VisionAgentCoderV2(AgentCoder): | |
| 300 326 | 
             
                        )
         | 
| 301 327 | 
             
                    )
         | 
| 302 328 |  | 
| 303 | 
            -
                    self.coder = (
         | 
| 304 | 
            -
             | 
| 305 | 
            -
             | 
| 306 | 
            -
                        else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
         | 
| 307 | 
            -
                    )
         | 
| 308 | 
            -
                    self.tester = (
         | 
| 309 | 
            -
                        tester
         | 
| 310 | 
            -
                        if tester is not None
         | 
| 311 | 
            -
                        else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
         | 
| 312 | 
            -
                    )
         | 
| 313 | 
            -
                    self.debugger = (
         | 
| 314 | 
            -
                        debugger
         | 
| 315 | 
            -
                        if debugger is not None
         | 
| 316 | 
            -
                        else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
         | 
| 317 | 
            -
                    )
         | 
| 329 | 
            +
                    self.coder = coder if coder is not None else CONFIG.create_coder()
         | 
| 330 | 
            +
                    self.tester = tester if tester is not None else CONFIG.create_tester()
         | 
| 331 | 
            +
                    self.debugger = debugger if debugger is not None else CONFIG.create_debugger()
         | 
| 318 332 | 
             
                    if tool_recommender is not None:
         | 
| 319 333 | 
             
                        if isinstance(tool_recommender, str):
         | 
| 320 334 | 
             
                            self.tool_recommender = Sim.load(tool_recommender)
         | 
| @@ -440,12 +454,13 @@ class VisionAgentCoderV2(AgentCoder): | |
| 440 454 | 
             
                    ) as code_interpreter:
         | 
| 441 455 | 
             
                        int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
         | 
| 442 456 | 
             
                        tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
         | 
| 443 | 
            -
             | 
| 444 | 
            -
             | 
| 457 | 
            +
             | 
| 458 | 
            +
                        code_context = test_code(
         | 
| 445 459 | 
             
                            tester=self.tester,
         | 
| 446 460 | 
             
                            debugger=self.debugger,
         | 
| 447 461 | 
             
                            chat=int_chat,
         | 
| 448 462 | 
             
                            plan=format_plan_v2(plan_context),
         | 
| 463 | 
            +
                            code=plan_context.code,
         | 
| 449 464 | 
             
                            tool_docs=tool_docs,
         | 
| 450 465 | 
             
                            code_interpreter=code_interpreter,
         | 
| 451 466 | 
             
                            media_list=media_list,
         | 
| @@ -55,27 +55,27 @@ This is the documentation for the functions you have access to. You may call any | |
| 55 55 | 
             
            --- EXAMPLE1 ---
         | 
| 56 56 | 
             
            plan1:
         | 
| 57 57 | 
             
            - Load the image from the provided file path 'image.jpg'.
         | 
| 58 | 
            -
            - Use the ' | 
| 58 | 
            +
            - Use the 'owlv2_object_detection' tool with the prompt 'person' to detect and count the number of people in the image.
         | 
| 59 59 | 
             
            plan2:
         | 
| 60 60 | 
             
            - Load the image from the provided file path 'image.jpg'.
         | 
| 61 | 
            -
            - Use the ' | 
| 61 | 
            +
            - Use the 'florence2_sam2_instance_segmentation' tool with the prompt 'person' to detect and count the number of people in the image.
         | 
| 62 62 | 
             
            - Count the number of detected objects labeled as 'person'.
         | 
| 63 63 | 
             
            plan3:
         | 
| 64 64 | 
             
            - Load the image from the provided file path 'image.jpg'.
         | 
| 65 65 | 
             
            - Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.
         | 
| 66 66 |  | 
| 67 67 | 
             
            ```python
         | 
| 68 | 
            -
            from vision_agent.tools import load_image,  | 
| 68 | 
            +
            from vision_agent.tools import load_image, owlv2_object_detection, florence2_sam2_instance_segmentation, countgd_object_detection
         | 
| 69 69 | 
             
            image = load_image("image.jpg")
         | 
| 70 | 
            -
            owl_v2_out =  | 
| 70 | 
            +
            owl_v2_out = owlv2_object_detection("person", image)
         | 
| 71 71 |  | 
| 72 | 
            -
            f2s2_out =  | 
| 72 | 
            +
            f2s2_out = florence2_sam2_instance_segmentation("person", image)
         | 
| 73 73 | 
             
            # strip out the masks from the output becuase they don't provide useful information when printed
         | 
| 74 74 | 
             
            f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
         | 
| 75 75 |  | 
| 76 76 | 
             
            cgd_out = countgd_object_detection("person", image)
         | 
| 77 77 |  | 
| 78 | 
            -
            final_out = {{" | 
| 78 | 
            +
            final_out = {{"owlv2_object_detection": owl_v2_out, "florence2_sam2_instance_segmentation": f2s2, "countgd_object_detection": cgd_out}}
         | 
| 79 79 | 
             
            print(final_out)
         | 
| 80 80 | 
             
            --- END EXAMPLE1 ---
         | 
| 81 81 |  | 
| @@ -130,7 +130,7 @@ In these aerial images, I can see approximately 5-6 pedestrians walking in vario | |
| 130 130 | 
             
            [end of claude35_vqa_output]
         | 
| 131 131 |  | 
| 132 132 | 
             
            [get_tool_for_task output]
         | 
| 133 | 
            -
            After examining the image, I can see it's an aerial view of a busy urban intersection with multiple lanes of traffic. There are numerous cars visible, each likely containing at least one person (the driver). While it's difficult to count exact individuals, I can estimate based on the number of vehicles. The countgd_object_detection tool is the best choice for this task as it can detect and count multiple instances of an object given a text prompt. I will use this tool to count the number of pedestrians in the image.
         | 
| 133 | 
            +
            After examining the image, I can see it's an aerial view of a busy urban intersection with multiple lanes of traffic. There are numerous cars visible, each likely containing at least one person (the driver). While it's difficult to count exact individuals, I can estimate based on the number of vehicles. The countgd_object_detection tool is the best choice for this task as it can detect and count multiple instances of an object given a text prompt. I will use this tool with the prompt 'pedistrian' to count the number of pedestrians in the image.
         | 
| 134 134 |  | 
| 135 135 | 
             
            Tool Documentation:
         | 
| 136 136 | 
             
            countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
         | 
| @@ -162,7 +162,7 @@ countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float | |
| 162 162 | 
             
            [end of get_tool_for_task output]
         | 
| 163 163 | 
             
            <count>8</count>
         | 
| 164 164 |  | 
| 165 | 
            -
            AGENT: <thinking>I need to now merge the boxes from all  | 
| 165 | 
            +
            AGENT: <thinking>I need to now merge the boxes from all region and use the countgd_object_detection tool with the prompt 'pedestrian' as suggested by get_tool_for_task.</thinking>
         | 
| 166 166 | 
             
            <execute_python>
         | 
| 167 167 | 
             
            def translate_ofset(bbox, offset_x, offset_y):
         | 
| 168 168 | 
             
                return (bbox[0] + offset_x, bbox[1] + offset_y, bbox[2] + offset_x, bbox[3] + offset_y)
         | 
| @@ -271,7 +271,7 @@ get_tool_for_task('Identify and track the boxes in the video', frames[:5]) | |
| 271 271 |  | 
| 272 272 | 
             
            OBSERVATION:
         | 
| 273 273 | 
             
            [get_tool_for_task output]
         | 
| 274 | 
            -
            For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: florence2_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor.
         | 
| 274 | 
            +
            For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: florence2_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor and using the prompt 'box'.
         | 
| 275 275 |  | 
| 276 276 | 
             
            Tool Documentation:
         | 
| 277 277 | 
             
            def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
         | 
| @@ -317,7 +317,7 @@ def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_l | |
| 317 317 | 
             
            [end of get_tool_for_task output]
         | 
| 318 318 | 
             
            <count>8</count>
         | 
| 319 319 |  | 
| 320 | 
            -
            AGENT: <thinking>Now that I have the tool for tracking I need to find a tool to identify if there is tape on each of the boxes.</thinking>
         | 
| 320 | 
            +
            AGENT: <thinking>Now that I have the tool and prompt for tracking I need to find a tool to identify if there is tape on each of the boxes.</thinking>
         | 
| 321 321 | 
             
            <execute_python>
         | 
| 322 322 | 
             
            track_predictions = florence2_sam2_video_tracking("box", frames)
         | 
| 323 323 |  | 
| @@ -338,7 +338,7 @@ get_tool_for_task('Identify if there is tape on the boxes', crops[:3]) | |
| 338 338 |  | 
| 339 339 | 
             
            OBSERVATION:
         | 
| 340 340 | 
             
            [get_tool_for_task output]
         | 
| 341 | 
            -
            owlv2_object_detection performed best as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
         | 
| 341 | 
            +
            owlv2_object_detection performed best with the prompt 'tape', as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
         | 
| 342 342 |  | 
| 343 343 | 
             
            'owlv2_object_detection' is a tool that can detect and count multiple objects given a
         | 
| 344 344 | 
             
            text prompt such as category names or referring expressions on images. The categories
         | 
| @@ -370,7 +370,7 @@ Example | |
| 370 370 | 
             
            [end of get_tool_for_task output]
         | 
| 371 371 | 
             
            <count>7</count>
         | 
| 372 372 |  | 
| 373 | 
            -
            AGENT: <thinking>Now that I have the tools for tracking and identifying tape on the boxes, I will use them to accomplish the user's request.</thinking>
         | 
| 373 | 
            +
            AGENT: <thinking>Now that I have the tools for tracking and identifying 'tape' on the boxes, I will use them to accomplish the user's request.</thinking>
         | 
| 374 374 | 
             
            <execute_python>
         | 
| 375 375 | 
             
            obj_to_info = {{}}
         | 
| 376 376 | 
             
            for frame, frame_predictions in zip(frames, track_predictions):
         | 
| @@ -487,8 +487,9 @@ TEST_TOOLS = """ | |
| 487 487 | 
             
            2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
         | 
| 488 488 | 
             
            3. Your test case MUST run only on the given images which are {media}
         | 
| 489 489 | 
             
            4. For video tracking, use chunk_length=1 and at least 3 frames to ensure the best results when evaluating the tool.
         | 
| 490 | 
            -
            5.  | 
| 491 | 
            -
            6.  | 
| 490 | 
            +
            5. Use mutually exclusive categories for prompts such as 'person, car' and not 'person, athlete' to avoid over counting.
         | 
| 491 | 
            +
            6. Print this final dictionary.
         | 
| 492 | 
            +
            7. Output your code in the following format wrapped in <code> tags:
         | 
| 492 493 | 
             
            <code>
         | 
| 493 494 | 
             
            # Your code here
         | 
| 494 495 | 
             
            </code>
         | 
| @@ -649,41 +650,6 @@ PICK_TOOL = """ | |
| 649 650 | 
             
            </json>
         | 
| 650 651 | 
             
            """
         | 
| 651 652 |  | 
| 652 | 
            -
            PICK_TOOL_V2 = """
         | 
| 653 | 
            -
            **Role**: You are an expert evaluator that can understand user requests and evaluate the output of different tools.
         | 
| 654 | 
            -
             | 
| 655 | 
            -
            **Task**: You are given the output of different tools for a user request along with the image. You must evaluate the output and determine the best tool for the user request.
         | 
| 656 | 
            -
             | 
| 657 | 
            -
            **User Request**:
         | 
| 658 | 
            -
            {user_request}
         | 
| 659 | 
            -
             | 
| 660 | 
            -
            **Tools**: This is the documentation of all the functions that were tested.
         | 
| 661 | 
            -
            {tool_docs}
         | 
| 662 | 
            -
             | 
| 663 | 
            -
            **Testing Code and Tool Output**:
         | 
| 664 | 
            -
            {context}
         | 
| 665 | 
            -
             | 
| 666 | 
            -
            **Previous Attempt**: This is the code and output of the previous attempt, if it is empty then there was no previous attempt.
         | 
| 667 | 
            -
            {previous_attempts}
         | 
| 668 | 
            -
             | 
| 669 | 
            -
            **Instructions**:
         | 
| 670 | 
            -
            1. Re-read the user request, plans, tool outputs and examine the image.
         | 
| 671 | 
            -
            2. Given the user request, try to solve the problem yourself.
         | 
| 672 | 
            -
            3. Pick which tool output best matches your solution first and the user request, then consider other factors like box size, etc. DO NOT worry about low confidence scores if the output is correct.
         | 
| 673 | 
            -
            4. DO NOT modify confidence thresholds unless the tool output is completely wrong.
         | 
| 674 | 
            -
            5. Remember for videos that in order to count objects a video some sort of tracking is needed, or else you will overcount the objects.
         | 
| 675 | 
            -
            6. Assign each tool a score from 0 to 10 based on how well it solves the user request. A score of 8+ means the tool output matches your solution and the tool is the best choice, 5-7 means the tool output is okay but needs some modifications, less than 5 means the tool output is bad and the tool should not be used. Return the the following JSON format inside <json> tags using the exact tool name as the key and the score as the value:
         | 
| 676 | 
            -
            <json>
         | 
| 677 | 
            -
            {{
         | 
| 678 | 
            -
                "predicted_answer": str # the answer you would expect from the best plan
         | 
| 679 | 
            -
                "thoughts": str # your thought process for choosing the best tool over other tools and any modifications you madeas well as the prompt you used with the tool.
         | 
| 680 | 
            -
                "first tool": int # the score for the first tool
         | 
| 681 | 
            -
                "second tool": int # the score for the second tool
         | 
| 682 | 
            -
                ...
         | 
| 683 | 
            -
            }}
         | 
| 684 | 
            -
            </json>
         | 
| 685 | 
            -
            """
         | 
| 686 | 
            -
             | 
| 687 653 | 
             
            FINALIZE_PLAN = """
         | 
| 688 654 | 
             
            **Role**: You are an expert AI model that can understand the user request and construct plans to accomplish it.
         | 
| 689 655 |  | 
| @@ -693,13 +659,13 @@ FINALIZE_PLAN = """ | |
| 693 659 | 
             
            {planning}
         | 
| 694 660 |  | 
| 695 661 | 
             
            **Instructions**:
         | 
| 696 | 
            -
            1.  | 
| 697 | 
            -
            2.  | 
| 698 | 
            -
            3.  | 
| 699 | 
            -
            4.  | 
| 700 | 
            -
            5.  | 
| 701 | 
            -
            6.  | 
| 702 | 
            -
            7.  | 
| 662 | 
            +
            1. Summarize the plan that the planning agent found.
         | 
| 663 | 
            +
            2. Write a single function that solves the problem based on what the planner found.
         | 
| 664 | 
            +
            3. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
         | 
| 665 | 
            +
            4. Do not include {excluded_tools} tools in your instructions.
         | 
| 666 | 
            +
            5. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_image` or `save_video`.
         | 
| 667 | 
            +
            6. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
         | 
| 668 | 
            +
            7. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
         | 
| 703 669 | 
             
            8. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
         | 
| 704 670 |  | 
| 705 671 | 
             
            <json>
         | 
| @@ -32,7 +32,8 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import ( | |
| 32 32 | 
             
                PICK_PLAN,
         | 
| 33 33 | 
             
                PLAN,
         | 
| 34 34 | 
             
            )
         | 
| 35 | 
            -
            from vision_agent. | 
| 35 | 
            +
            from vision_agent.configs import Config
         | 
| 36 | 
            +
            from vision_agent.lmm import LMM, Message
         | 
| 36 37 | 
             
            from vision_agent.tools.planner_tools import check_function_call, get_tool_documentation
         | 
| 37 38 | 
             
            from vision_agent.utils.execute import (
         | 
| 38 39 | 
             
                CodeInterpreter,
         | 
| @@ -41,6 +42,7 @@ from vision_agent.utils.execute import ( | |
| 41 42 | 
             
            )
         | 
| 42 43 |  | 
| 43 44 | 
             
            logging.basicConfig(level=logging.INFO)
         | 
| 45 | 
            +
            CONFIG = Config()
         | 
| 44 46 | 
             
            UTIL_DOCSTRING = T.get_tool_documentation(
         | 
| 45 47 | 
             
                [
         | 
| 46 48 | 
             
                    T.load_image,
         | 
| @@ -315,8 +317,8 @@ def maybe_run_code( | |
| 315 317 |  | 
| 316 318 |  | 
| 317 319 | 
             
            def create_finalize_plan(
         | 
| 318 | 
            -
                chat: List[AgentMessage],
         | 
| 319 320 | 
             
                model: LMM,
         | 
| 321 | 
            +
                chat: List[AgentMessage],
         | 
| 320 322 | 
             
                verbose: bool = False,
         | 
| 321 323 | 
             
            ) -> Tuple[List[AgentMessage], PlanContext]:
         | 
| 322 324 | 
             
                # if we're in the middle of an interaction, don't finalize the plan
         | 
| @@ -385,6 +387,7 @@ class VisionAgentPlannerV2(AgentPlanner): | |
| 385 387 | 
             
                def __init__(
         | 
| 386 388 | 
             
                    self,
         | 
| 387 389 | 
             
                    planner: Optional[LMM] = None,
         | 
| 390 | 
            +
                    summarizer: Optional[LMM] = None,
         | 
| 388 391 | 
             
                    critic: Optional[LMM] = None,
         | 
| 389 392 | 
             
                    max_steps: int = 10,
         | 
| 390 393 | 
             
                    use_multi_trial_planning: bool = False,
         | 
| @@ -414,16 +417,11 @@ class VisionAgentPlannerV2(AgentPlanner): | |
| 414 417 | 
             
                            that will send back intermediate conversation messages.
         | 
| 415 418 | 
             
                    """
         | 
| 416 419 |  | 
| 417 | 
            -
                    self.planner = (
         | 
| 418 | 
            -
             | 
| 419 | 
            -
                        if  | 
| 420 | 
            -
                        else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
         | 
| 421 | 
            -
                    )
         | 
| 422 | 
            -
                    self.critic = (
         | 
| 423 | 
            -
                        critic
         | 
| 424 | 
            -
                        if critic is not None
         | 
| 425 | 
            -
                        else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
         | 
| 420 | 
            +
                    self.planner = planner if planner is not None else CONFIG.create_planner()
         | 
| 421 | 
            +
                    self.summarizer = (
         | 
| 422 | 
            +
                        summarizer if summarizer is not None else CONFIG.create_summarizer()
         | 
| 426 423 | 
             
                    )
         | 
| 424 | 
            +
                    self.critic = critic if critic is not None else CONFIG.create_critic()
         | 
| 427 425 | 
             
                    self.max_steps = max_steps
         | 
| 428 426 | 
             
                    self.use_multi_trial_planning = use_multi_trial_planning
         | 
| 429 427 | 
             
                    self.critique_steps = critique_steps
         | 
| @@ -561,7 +559,7 @@ class VisionAgentPlannerV2(AgentPlanner): | |
| 561 559 | 
             
                            context = InteractionContext(chat=int_chat)
         | 
| 562 560 | 
             
                        else:
         | 
| 563 561 | 
             
                            updated_chat, context = create_finalize_plan(
         | 
| 564 | 
            -
                                 | 
| 562 | 
            +
                                self.summarizer, int_chat, self.verbose
         | 
| 565 563 | 
             
                            )
         | 
| 566 564 | 
             
                            int_chat.extend(updated_chat)
         | 
| 567 565 | 
             
                            for chat_elt in updated_chat:
         | 
| @@ -55,10 +55,10 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect | |
| 55 55 |  | 
| 56 56 | 
             
            OBSERVATION:
         | 
| 57 57 | 
             
            [Artifact dog_detector.py (5 lines total)]
         | 
| 58 | 
            -
            0|from vision_agent.tools import load_image,  | 
| 58 | 
            +
            0|from vision_agent.tools import load_image, owlv2_object_detection
         | 
| 59 59 | 
             
            1|def detect_dogs(image_path: str):
         | 
| 60 60 | 
             
            2|    image = load_image(image_path)
         | 
| 61 | 
            -
            3|    dogs =  | 
| 61 | 
            +
            3|    dogs = owlv2_object_detection("dog", image)
         | 
| 62 62 | 
             
            4|    return dogs
         | 
| 63 63 | 
             
            [End of artifact]
         | 
| 64 64 |  | 
| @@ -96,10 +96,10 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do | |
| 96 96 |  | 
| 97 97 | 
             
            OBSERVATION:
         | 
| 98 98 | 
             
            [Artifact dog_detector.py (5 lines total)]
         | 
| 99 | 
            -
            0|from vision_agent.tools import load_image,  | 
| 99 | 
            +
            0|from vision_agent.tools import load_image, owlv2_object_detection
         | 
| 100 100 | 
             
            1|def detect_dogs(image_path: str):
         | 
| 101 101 | 
             
            2|    image = load_image(image_path)
         | 
| 102 | 
            -
            3|    dogs =  | 
| 102 | 
            +
            3|    dogs = owlv2_object_detection("dog", image, threshold=0.24)
         | 
| 103 103 | 
             
            4|    return dogs
         | 
| 104 104 | 
             
            [End of artifact]
         | 
| 105 105 |  | 
| @@ -168,10 +168,10 @@ generate_vision_code(artifacts, 'code.py', 'Can you create a function to count w | |
| 168 168 |  | 
| 169 169 | 
             
            OBSERVATION:
         | 
| 170 170 | 
             
            [Artifact code.py (17 lines total)]
         | 
| 171 | 
            -
            0|from vision_agent.tools import load_image,  | 
| 171 | 
            +
            0|from vision_agent.tools import load_image, florence2_object_detection, closest_box_distance, overlay_bounding_boxes, save_image
         | 
| 172 172 | 
             
            1|def count_workers_with_helmets(image_path: str):
         | 
| 173 173 | 
             
            2|    image = load_image(image_path)
         | 
| 174 | 
            -
            3|    detections =  | 
| 174 | 
            +
            3|    detections = florence2_object_detection("worker, helmet", image)
         | 
| 175 175 | 
             
            4|    workers = [d for d in detections if d['label'] == 'worker']
         | 
| 176 176 | 
             
            5|    helmets = [d for d in detections if d['label'] == 'helmet']
         | 
| 177 177 | 
             
            6|    count = 0
         | 
| @@ -217,7 +217,7 @@ object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', | |
| 217 217 | 
             
            OBSERVATION:
         | 
| 218 218 | 
             
            [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
         | 
| 219 219 |  | 
| 220 | 
            -
            AGENT: <thinking>The model has finished fine tuning, I will now replace the original  | 
| 220 | 
            +
            AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_object_detection call with the fine tuning id.</thinking>
         | 
| 221 221 | 
             
            <execute_python>
         | 
| 222 222 | 
             
            use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
         | 
| 223 223 | 
             
            </execute_python>
         | 
| @@ -228,11 +228,11 @@ OBSERVATION: | |
| 228 228 | 
             
            ---
         | 
| 229 229 | 
             
            +++
         | 
| 230 230 | 
             
            @@ -1,7 +1,7 @@
         | 
| 231 | 
            -
             from vision_agent.tools import load_image,  | 
| 231 | 
            +
             from vision_agent.tools import load_image, florence2_object_detection, closest_box_distance, overlay_bounding_boxes, save_image
         | 
| 232 232 | 
             
             def count_workers_with_helmets(image_path: str, output_path: str):
         | 
| 233 233 | 
             
                 image = load_image(image_path)
         | 
| 234 | 
            -
            -    detections =  | 
| 235 | 
            -
            +    detections =  | 
| 234 | 
            +
            -    detections = florence2_object_detection("worker, helmet", image)
         | 
| 235 | 
            +
            +    detections = florence2_object_detection("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
         | 
| 236 236 | 
             
                 workers = [d for d in detections if d['label'] == 'worker']
         | 
| 237 237 | 
             
                 helmets = [d for d in detections if d['label'] == 'helmet']
         | 
| 238 238 | 
             
                 count = 0
         | 
| @@ -250,7 +250,7 @@ OBSERVATION: | |
| 250 250 | 
             
            3
         | 
| 251 251 |  | 
| 252 252 | 
             
            AGENT: <thinking>Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
         | 
| 253 | 
            -
            <response>I have updated the code using the fine tuned  | 
| 253 | 
            +
            <response>I have updated the code using the fine tuned florence2_object_detection model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
         | 
| 254 254 | 
             
            <let_user_respond>true</let_user_respond>
         | 
| 255 255 | 
             
            """
         | 
| 256 256 |  | 
| @@ -1,5 +1,5 @@ | |
| 1 1 | 
             
            CONVERSATION = """
         | 
| 2 | 
            -
            **Role**: You are a help agent that assists users  | 
| 2 | 
            +
            **Role**: You are a help agent called VisionAgent, built by LandingAI, that assists users in writing code to solve vision tasks.
         | 
| 3 3 |  | 
| 4 4 | 
             
            **Actions**:
         | 
| 5 5 | 
             
            `generate_or_edit_vision_code` - This action will generate code for the user to solve a vision task. It will also edit vision code for the user, this is useful if the user wants to modify vision-related aspects of the code such as changing the vision model or the image pre-processing steps.
         | 
| @@ -8,7 +8,7 @@ CONVERSATION = """ | |
| 8 8 |  | 
| 9 9 | 
             
            **Examples**:
         | 
| 10 10 | 
             
            Here is an example of how you can interact with a user and Actions:
         | 
| 11 | 
            -
            ---  | 
| 11 | 
            +
            --- EXAMPLE1 ---
         | 
| 12 12 | 
             
            USER: Can you help me write code to detect dogs in this image? Media name images/dogs.jpg
         | 
| 13 13 |  | 
| 14 14 | 
             
            AGENT: <response>Yes, I can help you with that. I will write the code to detect dogs in the image.</response><action>generate_or_edit_vision_code</action>
         | 
| @@ -28,8 +28,23 @@ def test_detect_dogs(): | |
| 28 28 | 
             
            </final_test>
         | 
| 29 29 |  | 
| 30 30 | 
             
            AGENT: <response>Here is the code to detect dogs in the image.</response>
         | 
| 31 | 
            +
            --- END EXAMPLE1 ---
         | 
| 31 32 |  | 
| 32 | 
            -
            ---  | 
| 33 | 
            +
            --- EXAMPLE2 ---
         | 
| 34 | 
            +
            USER: Who are you and what can you do?
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            AGENT: <response>I am VisionAgent, an agent built by LandingAI, to help users write code to solve vision tasks. I can write code to do the following tasks:
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            - Detecting objects in an image with bounding boxes
         | 
| 39 | 
            +
            - Detecting objects in an image with masks
         | 
| 40 | 
            +
            - Tracking objects in a video
         | 
| 41 | 
            +
            - Classifying images
         | 
| 42 | 
            +
            - Understanding documents
         | 
| 43 | 
            +
            - Pose estimation
         | 
| 44 | 
            +
            - Visual question answering for both images and videos
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            How can I help you?</response>
         | 
| 47 | 
            +
            --- END EXAMPLE2 ---
         | 
| 33 48 |  | 
| 34 49 | 
             
            **Conversation**:
         | 
| 35 50 | 
             
            Here is the current conversation so far:
         |