vision-agent 0.2.240__py3-none-any.whl → 0.2.242__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/agent.py +3 -2
 - vision_agent/agent/vision_agent_coder_v2.py +6 -1
 - vision_agent/agent/vision_agent_planner_v2.py +27 -10
 - vision_agent/agent/vision_agent_prompts_v2.py +15 -3
 - vision_agent/agent/vision_agent_v2.py +25 -6
 - vision_agent/models/__init__.py +7 -1
 - vision_agent/models/agent_types.py +16 -1
 - vision_agent/tools/__init__.py +0 -2
 - vision_agent/tools/meta_tools.py +1 -124
 - vision_agent/tools/tools.py +15 -104
 - vision_agent/utils/agent.py +5 -4
 - vision_agent/utils/exceptions.py +0 -7
 - vision_agent/utils/video_tracking.py +8 -3
 - {vision_agent-0.2.240.dist-info → vision_agent-0.2.242.dist-info}/METADATA +1 -1
 - {vision_agent-0.2.240.dist-info → vision_agent-0.2.242.dist-info}/RECORD +17 -18
 - vision_agent/clients/landing_public_api.py +0 -38
 - {vision_agent-0.2.240.dist-info → vision_agent-0.2.242.dist-info}/LICENSE +0 -0
 - {vision_agent-0.2.240.dist-info → vision_agent-0.2.242.dist-info}/WHEEL +0 -0
 
    
        vision_agent/agent/agent.py
    CHANGED
    
    | 
         @@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Union 
     | 
|
| 
       5 
5 
     | 
    
         
             
            from vision_agent.models import (
         
     | 
| 
       6 
6 
     | 
    
         
             
                AgentMessage,
         
     | 
| 
       7 
7 
     | 
    
         
             
                CodeContext,
         
     | 
| 
      
 8 
     | 
    
         
            +
                ErrorContext,
         
     | 
| 
       8 
9 
     | 
    
         
             
                InteractionContext,
         
     | 
| 
       9 
10 
     | 
    
         
             
                Message,
         
     | 
| 
       10 
11 
     | 
    
         
             
                PlanContext,
         
     | 
| 
         @@ -36,7 +37,7 @@ class AgentCoder(Agent): 
     | 
|
| 
       36 
37 
     | 
    
         
             
                    chat: List[AgentMessage],
         
     | 
| 
       37 
38 
     | 
    
         
             
                    max_steps: Optional[int] = None,
         
     | 
| 
       38 
39 
     | 
    
         
             
                    code_interpreter: Optional[CodeInterpreter] = None,
         
     | 
| 
       39 
     | 
    
         
            -
                ) -> Union[CodeContext, InteractionContext]:
         
     | 
| 
      
 40 
     | 
    
         
            +
                ) -> Union[CodeContext, InteractionContext, ErrorContext]:
         
     | 
| 
       40 
41 
     | 
    
         
             
                    pass
         
     | 
| 
       41 
42 
     | 
    
         | 
| 
       42 
43 
     | 
    
         
             
                @abstractmethod
         
     | 
| 
         @@ -56,5 +57,5 @@ class AgentPlanner(Agent): 
     | 
|
| 
       56 
57 
     | 
    
         
             
                    chat: List[AgentMessage],
         
     | 
| 
       57 
58 
     | 
    
         
             
                    max_steps: Optional[int] = None,
         
     | 
| 
       58 
59 
     | 
    
         
             
                    code_interpreter: Optional[CodeInterpreter] = None,
         
     | 
| 
       59 
     | 
    
         
            -
                ) -> Union[PlanContext, InteractionContext]:
         
     | 
| 
      
 60 
     | 
    
         
            +
                ) -> Union[PlanContext, InteractionContext, ErrorContext]:
         
     | 
| 
       60 
61 
     | 
    
         
             
                    pass
         
     | 
| 
         @@ -13,6 +13,7 @@ from vision_agent.lmm import LMM 
     | 
|
| 
       13 
13 
     | 
    
         
             
            from vision_agent.models import (
         
     | 
| 
       14 
14 
     | 
    
         
             
                AgentMessage,
         
     | 
| 
       15 
15 
     | 
    
         
             
                CodeContext,
         
     | 
| 
      
 16 
     | 
    
         
            +
                ErrorContext,
         
     | 
| 
       16 
17 
     | 
    
         
             
                InteractionContext,
         
     | 
| 
       17 
18 
     | 
    
         
             
                Message,
         
     | 
| 
       18 
19 
     | 
    
         
             
                PlanContext,
         
     | 
| 
         @@ -365,6 +366,8 @@ class VisionAgentCoderV2(AgentCoder): 
     | 
|
| 
       365 
366 
     | 
    
         
             
                    code_or_interaction = self.generate_code(input_msg)
         
     | 
| 
       366 
367 
     | 
    
         
             
                    if isinstance(code_or_interaction, InteractionContext):
         
     | 
| 
       367 
368 
     | 
    
         
             
                        return code_or_interaction.chat[-1].content
         
     | 
| 
      
 369 
     | 
    
         
            +
                    elif isinstance(code_or_interaction, ErrorContext):
         
     | 
| 
      
 370 
     | 
    
         
            +
                        return code_or_interaction.error
         
     | 
| 
       368 
371 
     | 
    
         
             
                    return code_or_interaction.code
         
     | 
| 
       369 
372 
     | 
    
         | 
| 
       370 
373 
     | 
    
         
             
                def generate_code(
         
     | 
| 
         @@ -372,7 +375,7 @@ class VisionAgentCoderV2(AgentCoder): 
     | 
|
| 
       372 
375 
     | 
    
         
             
                    chat: List[AgentMessage],
         
     | 
| 
       373 
376 
     | 
    
         
             
                    max_steps: Optional[int] = None,
         
     | 
| 
       374 
377 
     | 
    
         
             
                    code_interpreter: Optional[CodeInterpreter] = None,
         
     | 
| 
       375 
     | 
    
         
            -
                ) -> Union[CodeContext, InteractionContext]:
         
     | 
| 
      
 378 
     | 
    
         
            +
                ) -> Union[CodeContext, InteractionContext, ErrorContext]:
         
     | 
| 
       376 
379 
     | 
    
         
             
                    """Generate vision code from a conversation.
         
     | 
| 
       377 
380 
     | 
    
         | 
| 
       378 
381 
     | 
    
         
             
                    Parameters:
         
     | 
| 
         @@ -404,6 +407,8 @@ class VisionAgentCoderV2(AgentCoder): 
     | 
|
| 
       404 
407 
     | 
    
         
             
                        # the planner needs an interaction, so return before generating code
         
     | 
| 
       405 
408 
     | 
    
         
             
                        if isinstance(plan_context, InteractionContext):
         
     | 
| 
       406 
409 
     | 
    
         
             
                            return plan_context
         
     | 
| 
      
 410 
     | 
    
         
            +
                        elif isinstance(plan_context, ErrorContext):
         
     | 
| 
      
 411 
     | 
    
         
            +
                            return plan_context
         
     | 
| 
       407 
412 
     | 
    
         | 
| 
       408 
413 
     | 
    
         
             
                        code_context = self.generate_code_from_plan(
         
     | 
| 
       409 
414 
     | 
    
         
             
                            orig_chat,
         
     | 
| 
         @@ -24,7 +24,13 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import ( 
     | 
|
| 
       24 
24 
     | 
    
         
             
            )
         
     | 
| 
       25 
25 
     | 
    
         
             
            from vision_agent.configs import Config
         
     | 
| 
       26 
26 
     | 
    
         
             
            from vision_agent.lmm import LMM
         
     | 
| 
       27 
     | 
    
         
            -
            from vision_agent.models import  
     | 
| 
      
 27 
     | 
    
         
            +
            from vision_agent.models import (
         
     | 
| 
      
 28 
     | 
    
         
            +
                AgentMessage,
         
     | 
| 
      
 29 
     | 
    
         
            +
                ErrorContext,
         
     | 
| 
      
 30 
     | 
    
         
            +
                InteractionContext,
         
     | 
| 
      
 31 
     | 
    
         
            +
                Message,
         
     | 
| 
      
 32 
     | 
    
         
            +
                PlanContext,
         
     | 
| 
      
 33 
     | 
    
         
            +
            )
         
     | 
| 
       28 
34 
     | 
    
         
             
            from vision_agent.tools.planner_tools import check_function_call
         
     | 
| 
       29 
35 
     | 
    
         
             
            from vision_agent.utils.agent import (
         
     | 
| 
       30 
36 
     | 
    
         
             
                add_media_to_chat,
         
     | 
| 
         @@ -322,7 +328,7 @@ def create_finalize_plan( 
     | 
|
| 
       322 
328 
     | 
    
         
             
                model: LMM,
         
     | 
| 
       323 
329 
     | 
    
         
             
                chat: List[AgentMessage],
         
     | 
| 
       324 
330 
     | 
    
         
             
                verbose: bool = False,
         
     | 
| 
       325 
     | 
    
         
            -
            ) -> Tuple[List[AgentMessage], PlanContext]:
         
     | 
| 
      
 331 
     | 
    
         
            +
            ) -> Tuple[List[AgentMessage], Union[PlanContext, ErrorContext]]:
         
     | 
| 
       326 
332 
     | 
    
         
             
                # if we're in the middle of an interaction, don't finalize the plan
         
     | 
| 
       327 
333 
     | 
    
         
             
                if chat[-1].role == "interaction":
         
     | 
| 
       328 
334 
     | 
    
         
             
                    return [], PlanContext(plan="", instructions=[], code="")
         
     | 
| 
         @@ -337,11 +343,19 @@ def create_finalize_plan( 
     | 
|
| 
       337 
343 
     | 
    
         
             
                return_chat = [AgentMessage(role="planner", content=plan_str, media=None)]
         
     | 
| 
       338 
344 
     | 
    
         | 
| 
       339 
345 
     | 
    
         
             
                plan_json = extract_tag(plan_str, "json")
         
     | 
| 
       340 
     | 
    
         
            -
             
     | 
| 
       341 
     | 
    
         
            -
             
     | 
| 
       342 
     | 
    
         
            -
             
     | 
| 
       343 
     | 
    
         
            -
             
     | 
| 
       344 
     | 
    
         
            -
                 
     | 
| 
      
 346 
     | 
    
         
            +
             
     | 
| 
      
 347 
     | 
    
         
            +
                # sometimes the planner model will refuse to answer a question because of some
         
     | 
| 
      
 348 
     | 
    
         
            +
                # safety concern, we then won't be able to parse the response so we have to send
         
     | 
| 
      
 349 
     | 
    
         
            +
                # it back to the user/conversation agent
         
     | 
| 
      
 350 
     | 
    
         
            +
                try:
         
     | 
| 
      
 351 
     | 
    
         
            +
                    plan = (
         
     | 
| 
      
 352 
     | 
    
         
            +
                        extract_json(plan_json)
         
     | 
| 
      
 353 
     | 
    
         
            +
                        if plan_json is not None
         
     | 
| 
      
 354 
     | 
    
         
            +
                        else {"plan": plan_str, "instructions": [], "code": ""}
         
     | 
| 
      
 355 
     | 
    
         
            +
                    )
         
     | 
| 
      
 356 
     | 
    
         
            +
                except json.JSONDecodeError:
         
     | 
| 
      
 357 
     | 
    
         
            +
                    return return_chat, ErrorContext(error=plan_str)
         
     | 
| 
      
 358 
     | 
    
         
            +
             
     | 
| 
       345 
359 
     | 
    
         
             
                code_snippets = extract_tag(plan_str, "code")
         
     | 
| 
       346 
360 
     | 
    
         
             
                plan["code"] = code_snippets if code_snippets is not None else ""
         
     | 
| 
       347 
361 
     | 
    
         
             
                if verbose:
         
     | 
| 
         @@ -473,14 +487,17 @@ class VisionAgentPlannerV2(AgentPlanner): 
     | 
|
| 
       473 
487 
     | 
    
         
             
                    plan_or_interaction = self.generate_plan(input_msg)
         
     | 
| 
       474 
488 
     | 
    
         
             
                    if isinstance(plan_or_interaction, InteractionContext):
         
     | 
| 
       475 
489 
     | 
    
         
             
                        return plan_or_interaction.chat[-1].content
         
     | 
| 
       476 
     | 
    
         
            -
                     
     | 
| 
      
 490 
     | 
    
         
            +
                    elif isinstance(plan_or_interaction, PlanContext):
         
     | 
| 
      
 491 
     | 
    
         
            +
                        return plan_or_interaction.plan
         
     | 
| 
      
 492 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 493 
     | 
    
         
            +
                        return plan_or_interaction.error
         
     | 
| 
       477 
494 
     | 
    
         | 
| 
       478 
495 
     | 
    
         
             
                def generate_plan(
         
     | 
| 
       479 
496 
     | 
    
         
             
                    self,
         
     | 
| 
       480 
497 
     | 
    
         
             
                    chat: List[AgentMessage],
         
     | 
| 
       481 
498 
     | 
    
         
             
                    max_steps: Optional[int] = None,
         
     | 
| 
       482 
499 
     | 
    
         
             
                    code_interpreter: Optional[CodeInterpreter] = None,
         
     | 
| 
       483 
     | 
    
         
            -
                ) -> Union[PlanContext, InteractionContext]:
         
     | 
| 
      
 500 
     | 
    
         
            +
                ) -> Union[PlanContext, InteractionContext, ErrorContext]:
         
     | 
| 
       484 
501 
     | 
    
         
             
                    """Generate a plan to solve a vision task.
         
     | 
| 
       485 
502 
     | 
    
         | 
| 
       486 
503 
     | 
    
         
             
                    Parameters:
         
     | 
| 
         @@ -571,7 +588,7 @@ class VisionAgentPlannerV2(AgentPlanner): 
     | 
|
| 
       571 
588 
     | 
    
         
             
                            for chat_elt in updated_chat:
         
     | 
| 
       572 
589 
     | 
    
         
             
                                self.update_callback(chat_elt.model_dump())
         
     | 
| 
       573 
590 
     | 
    
         | 
| 
       574 
     | 
    
         
            -
                        context: Union[PlanContext, InteractionContext]
         
     | 
| 
      
 591 
     | 
    
         
            +
                        context: Union[PlanContext, InteractionContext, ErrorContext]
         
     | 
| 
       575 
592 
     | 
    
         
             
                        if interaction:
         
     | 
| 
       576 
593 
     | 
    
         
             
                            context = InteractionContext(chat=int_chat)
         
     | 
| 
       577 
594 
     | 
    
         
             
                        else:
         
     | 
| 
         @@ -16,17 +16,29 @@ AGENT: <response>Yes, I can help you with that. I will write the code to detect 
     | 
|
| 
       16 
16 
     | 
    
         
             
            OBSERVATION:
         
     | 
| 
       17 
17 
     | 
    
         
             
            <final_code>
         
     | 
| 
       18 
18 
     | 
    
         
             
            from vision_agent.tools import load_image, owl_v2_image
         
     | 
| 
       19 
     | 
    
         
            -
            def detect_dogs(image_path: str):
         
     | 
| 
      
 19 
     | 
    
         
            +
            def detect_dogs(image_path: str) -> int:
         
     | 
| 
       20 
20 
     | 
    
         
             
                image = load_image(image_path)
         
     | 
| 
       21 
21 
     | 
    
         
             
                dogs = owl_v2_image(image)
         
     | 
| 
       22 
     | 
    
         
            -
                return dogs
         
     | 
| 
      
 22 
     | 
    
         
            +
                return len(dogs)
         
     | 
| 
       23 
23 
     | 
    
         
             
            </final_code>
         
     | 
| 
       24 
24 
     | 
    
         
             
            <final_test>
         
     | 
| 
       25 
25 
     | 
    
         
             
            def test_detect_dogs():
         
     | 
| 
       26 
26 
     | 
    
         
             
                dogs = detect_dogs("images/dogs.jpg")
         
     | 
| 
       27 
     | 
    
         
            -
                assert  
     | 
| 
      
 27 
     | 
    
         
            +
                assert isinstance(dogs, int)
         
     | 
| 
      
 28 
     | 
    
         
            +
                print(f"Number of dogs detected: {{dogs}}")
         
     | 
| 
      
 29 
     | 
    
         
            +
                return dogs
         
     | 
| 
       28 
30 
     | 
    
         
             
            </final_test>
         
     | 
| 
       29 
31 
     | 
    
         | 
| 
      
 32 
     | 
    
         
            +
            OBSERVATION: ----- stdout -----
         
     | 
| 
      
 33 
     | 
    
         
            +
            Number of dogs detected: 8
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
            ----- stderr -----
         
     | 
| 
      
 36 
     | 
    
         
            +
             
     | 
| 
      
 37 
     | 
    
         
            +
            ----- Intermediate output-----
         
     | 
| 
      
 38 
     | 
    
         
            +
            None
         
     | 
| 
      
 39 
     | 
    
         
            +
            ----- Final output -----
         
     | 
| 
      
 40 
     | 
    
         
            +
            8
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
       30 
42 
     | 
    
         
             
            AGENT: <response>Here is the code to detect dogs in the image.</response>
         
     | 
| 
       31 
43 
     | 
    
         
             
            --- END EXAMPLE1 ---
         
     | 
| 
       32 
44 
     | 
    
         | 
| 
         @@ -11,6 +11,7 @@ from vision_agent.lmm import LMM 
     | 
|
| 
       11 
11 
     | 
    
         
             
            from vision_agent.models import (
         
     | 
| 
       12 
12 
     | 
    
         
             
                AgentMessage,
         
     | 
| 
       13 
13 
     | 
    
         
             
                CodeContext,
         
     | 
| 
      
 14 
     | 
    
         
            +
                ErrorContext,
         
     | 
| 
       14 
15 
     | 
    
         
             
                InteractionContext,
         
     | 
| 
       15 
16 
     | 
    
         
             
                Message,
         
     | 
| 
       16 
17 
     | 
    
         
             
                PlanContext,
         
     | 
| 
         @@ -27,7 +28,9 @@ CONFIG = Config() 
     | 
|
| 
       27 
28 
     | 
    
         | 
| 
       28 
29 
     | 
    
         | 
| 
       29 
30 
     | 
    
         
             
            def extract_conversation(
         
     | 
| 
       30 
     | 
    
         
            -
                chat: List[AgentMessage], 
     | 
| 
      
 31 
     | 
    
         
            +
                chat: List[AgentMessage],
         
     | 
| 
      
 32 
     | 
    
         
            +
                include_conv: bool = False,
         
     | 
| 
      
 33 
     | 
    
         
            +
                include_errors: bool = False,
         
     | 
| 
       31 
34 
     | 
    
         
             
            ) -> Tuple[List[AgentMessage], Optional[str]]:
         
     | 
| 
       32 
35 
     | 
    
         
             
                chat = copy.deepcopy(chat)
         
     | 
| 
       33 
36 
     | 
    
         | 
| 
         @@ -43,13 +46,18 @@ def extract_conversation( 
     | 
|
| 
       43 
46 
     | 
    
         
             
                    elif chat_i.role == "coder":
         
     | 
| 
       44 
47 
     | 
    
         
             
                        if "<final_code>" in chat_i.content:
         
     | 
| 
       45 
48 
     | 
    
         
             
                            extracted_chat.append(chat_i)
         
     | 
| 
      
 49 
     | 
    
         
            +
                    elif chat_i.role == "final_observation":
         
     | 
| 
      
 50 
     | 
    
         
            +
                        extracted_chat.append(chat_i)
         
     | 
| 
       46 
51 
     | 
    
         
             
                    elif include_conv and chat_i.role == "conversation":
         
     | 
| 
       47 
52 
     | 
    
         
             
                        extracted_chat.append(chat_i)
         
     | 
| 
      
 53 
     | 
    
         
            +
                    elif include_errors and chat_i.role == "error_observation":
         
     | 
| 
      
 54 
     | 
    
         
            +
                        extracted_chat.append(chat_i)
         
     | 
| 
       48 
55 
     | 
    
         | 
| 
       49 
     | 
    
         
            -
                # only keep the last <final_code 
     | 
| 
      
 56 
     | 
    
         
            +
                # only keep the last <final_code>, <final_test>
         
     | 
| 
       50 
57 
     | 
    
         
             
                final_code = None
         
     | 
| 
       51 
58 
     | 
    
         
             
                extracted_chat_strip_code: List[AgentMessage] = []
         
     | 
| 
       52 
     | 
    
         
            -
                for chat_i in reversed(extracted_chat):
         
     | 
| 
      
 59 
     | 
    
         
            +
                for chat_i in reversed((extracted_chat)):
         
     | 
| 
      
 60 
     | 
    
         
            +
                    # don't check role here because user could send updated <final_code>
         
     | 
| 
       53 
61 
     | 
    
         
             
                    if "<final_code>" in chat_i.content and final_code is None:
         
     | 
| 
       54 
62 
     | 
    
         
             
                        extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code
         
     | 
| 
       55 
63 
     | 
    
         
             
                        final_code = extract_tag(chat_i.content, "final_code")
         
     | 
| 
         @@ -66,7 +74,12 @@ def extract_conversation( 
     | 
|
| 
       66 
74 
     | 
    
         | 
| 
       67 
75 
     | 
    
         | 
| 
       68 
76 
     | 
    
         
             
            def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
         
     | 
| 
       69 
     | 
    
         
            -
                 
     | 
| 
      
 77 
     | 
    
         
            +
                # Include conversation and error messages. The error messages can come from one of
         
     | 
| 
      
 78 
     | 
    
         
            +
                # the agents refusing to write a correctly formatted message; we want to inform the
         
     | 
| 
      
 79 
     | 
    
         
            +
                # conversation agent of this.
         
     | 
| 
      
 80 
     | 
    
         
            +
                extracted_chat, _ = extract_conversation(
         
     | 
| 
      
 81 
     | 
    
         
            +
                    chat, include_conv=True, include_errors=True
         
     | 
| 
      
 82 
     | 
    
         
            +
                )
         
     | 
| 
       70 
83 
     | 
    
         | 
| 
       71 
84 
     | 
    
         
             
                conv = format_conversation(extracted_chat)
         
     | 
| 
       72 
85 
     | 
    
         
             
                prompt = CONVERSATION.format(
         
     | 
| 
         @@ -101,7 +114,9 @@ def maybe_run_action( 
     | 
|
| 
       101 
114 
     | 
    
         
             
                    if isinstance(context, CodeContext):
         
     | 
| 
       102 
115 
     | 
    
         
             
                        return [
         
     | 
| 
       103 
116 
     | 
    
         
             
                            AgentMessage(role="coder", content=format_code_context(context)),
         
     | 
| 
       104 
     | 
    
         
            -
                            AgentMessage( 
     | 
| 
      
 117 
     | 
    
         
            +
                            AgentMessage(
         
     | 
| 
      
 118 
     | 
    
         
            +
                                role="final_observation", content=context.test_result.text()
         
     | 
| 
      
 119 
     | 
    
         
            +
                            ),
         
     | 
| 
       105 
120 
     | 
    
         
             
                        ]
         
     | 
| 
       106 
121 
     | 
    
         
             
                    elif isinstance(context, InteractionContext):
         
     | 
| 
       107 
122 
     | 
    
         
             
                        return [
         
     | 
| 
         @@ -110,6 +125,10 @@ def maybe_run_action( 
     | 
|
| 
       110 
125 
     | 
    
         
             
                                content=json.dumps([elt.model_dump() for elt in context.chat]),
         
     | 
| 
       111 
126 
     | 
    
         
             
                            )
         
     | 
| 
       112 
127 
     | 
    
         
             
                        ]
         
     | 
| 
      
 128 
     | 
    
         
            +
                    elif isinstance(context, ErrorContext):
         
     | 
| 
      
 129 
     | 
    
         
            +
                        return [
         
     | 
| 
      
 130 
     | 
    
         
            +
                            AgentMessage(role="error_observation", content=context.error),
         
     | 
| 
      
 131 
     | 
    
         
            +
                        ]
         
     | 
| 
       113 
132 
     | 
    
         
             
                elif action == "edit_code":
         
     | 
| 
       114 
133 
     | 
    
         
             
                    # We don't want to pass code in plan_context.code so the coder will generate
         
     | 
| 
       115 
134 
     | 
    
         
             
                    # new code from plan_context.plan
         
     | 
| 
         @@ -129,7 +148,7 @@ def maybe_run_action( 
     | 
|
| 
       129 
148 
     | 
    
         
             
                    )
         
     | 
| 
       130 
149 
     | 
    
         
             
                    return [
         
     | 
| 
       131 
150 
     | 
    
         
             
                        AgentMessage(role="coder", content=format_code_context(context)),
         
     | 
| 
       132 
     | 
    
         
            -
                        AgentMessage(role=" 
     | 
| 
      
 151 
     | 
    
         
            +
                        AgentMessage(role="final_observation", content=context.test_result.text()),
         
     | 
| 
       133 
152 
     | 
    
         
             
                    ]
         
     | 
| 
       134 
153 
     | 
    
         
             
                elif action == "view_image":
         
     | 
| 
       135 
154 
     | 
    
         
             
                    pass
         
     | 
    
        vision_agent/models/__init__.py
    CHANGED
    
    | 
         @@ -1,4 +1,10 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            from .agent_types import  
     | 
| 
      
 1 
     | 
    
         
            +
            from .agent_types import (
         
     | 
| 
      
 2 
     | 
    
         
            +
                AgentMessage,
         
     | 
| 
      
 3 
     | 
    
         
            +
                CodeContext,
         
     | 
| 
      
 4 
     | 
    
         
            +
                ErrorContext,
         
     | 
| 
      
 5 
     | 
    
         
            +
                InteractionContext,
         
     | 
| 
      
 6 
     | 
    
         
            +
                PlanContext,
         
     | 
| 
      
 7 
     | 
    
         
            +
            )
         
     | 
| 
       2 
8 
     | 
    
         
             
            from .lmm_types import Message, TextOrImage
         
     | 
| 
       3 
9 
     | 
    
         
             
            from .tools_types import (
         
     | 
| 
       4 
10 
     | 
    
         
             
                BboxInput,
         
     | 
| 
         @@ -29,11 +29,15 @@ class AgentMessage(BaseModel): 
     | 
|
| 
       29 
29 
     | 
    
         
             
                    Literal["user"],
         
     | 
| 
       30 
30 
     | 
    
         
             
                    Literal["assistant"],  # planner, coder and conversation are of type assistant
         
     | 
| 
       31 
31 
     | 
    
         
             
                    Literal["observation"],
         
     | 
| 
      
 32 
     | 
    
         
            +
                    Literal["final_observation"],  # the observation from the final code output
         
     | 
| 
      
 33 
     | 
    
         
            +
                    Literal["error_observation"],  # the observation from the error message
         
     | 
| 
       32 
34 
     | 
    
         
             
                    Literal["interaction"],
         
     | 
| 
       33 
35 
     | 
    
         
             
                    Literal["interaction_response"],
         
     | 
| 
       34 
36 
     | 
    
         
             
                    Literal["conversation"],
         
     | 
| 
       35 
37 
     | 
    
         
             
                    Literal["planner"],
         
     | 
| 
       36 
     | 
    
         
            -
                    Literal[ 
     | 
| 
      
 38 
     | 
    
         
            +
                    Literal[
         
     | 
| 
      
 39 
     | 
    
         
            +
                        "planner_update"
         
     | 
| 
      
 40 
     | 
    
         
            +
                    ],  # an intermediate update from the planner to show partial information
         
     | 
| 
       37 
41 
     | 
    
         
             
                    Literal["coder"],
         
     | 
| 
       38 
42 
     | 
    
         
             
                ]
         
     | 
| 
       39 
43 
     | 
    
         
             
                content: str
         
     | 
| 
         @@ -75,3 +79,14 @@ class InteractionContext(BaseModel): 
     | 
|
| 
       75 
79 
     | 
    
         
             
                """
         
     | 
| 
       76 
80 
     | 
    
         | 
| 
       77 
81 
     | 
    
         
             
                chat: List[AgentMessage]
         
     | 
| 
      
 82 
     | 
    
         
            +
             
     | 
| 
      
 83 
     | 
    
         
            +
             
     | 
| 
      
 84 
     | 
    
         
            +
            class ErrorContext(BaseModel):
         
     | 
| 
      
 85 
     | 
    
         
            +
                """ErrorContext is a data model that represents an error message. These errors can
         
     | 
| 
      
 86 
     | 
    
         
            +
                happen in the planning phase when a model does not output correctly formatted
         
     | 
| 
      
 87 
     | 
    
         
            +
                messages (often because it considers some response to be a safety issue).
         
     | 
| 
      
 88 
     | 
    
         
            +
             
     | 
| 
      
 89 
     | 
    
         
            +
                error: The error message.
         
     | 
| 
      
 90 
     | 
    
         
            +
                """
         
     | 
| 
      
 91 
     | 
    
         
            +
             
     | 
| 
      
 92 
     | 
    
         
            +
                error: str
         
     | 
    
        vision_agent/tools/__init__.py
    CHANGED
    
    
    
        vision_agent/tools/meta_tools.py
    CHANGED
    
    | 
         @@ -11,11 +11,9 @@ import libcst as cst 
     | 
|
| 
       11 
11 
     | 
    
         
             
            from IPython.display import display
         
     | 
| 
       12 
12 
     | 
    
         | 
| 
       13 
13 
     | 
    
         
             
            import vision_agent as va
         
     | 
| 
       14 
     | 
    
         
            -
            from vision_agent. 
     | 
| 
       15 
     | 
    
         
            -
            from vision_agent.models import BboxInput, BboxInputBase64, Message, PromptTask
         
     | 
| 
      
 14 
     | 
    
         
            +
            from vision_agent.models import Message
         
     | 
| 
       16 
15 
     | 
    
         
             
            from vision_agent.tools.tools import get_tools_descriptions as _get_tool_descriptions
         
     | 
| 
       17 
16 
     | 
    
         
             
            from vision_agent.utils.execute import Execution, MimeType
         
     | 
| 
       18 
     | 
    
         
            -
            from vision_agent.utils.image_utils import convert_to_b64
         
     | 
| 
       19 
17 
     | 
    
         
             
            from vision_agent.utils.tools_doc import get_tool_documentation
         
     | 
| 
       20 
18 
     | 
    
         | 
| 
       21 
19 
     | 
    
         
             
            CURRENT_FILE = None
         
     | 
| 
         @@ -573,48 +571,6 @@ def get_tool_descriptions() -> str: 
     | 
|
| 
       573 
571 
     | 
    
         
             
                return _get_tool_descriptions()
         
     | 
| 
       574 
572 
     | 
    
         | 
| 
       575 
573 
     | 
    
         | 
| 
       576 
     | 
    
         
            -
            def object_detection_fine_tuning(bboxes: List[Dict[str, Any]]) -> str:
         
     | 
| 
       577 
     | 
    
         
            -
                """DO NOT use this function unless the user has supplied you with bboxes.
         
     | 
| 
       578 
     | 
    
         
            -
                'object_detection_fine_tuning' is a tool that fine-tunes object detection models to
         
     | 
| 
       579 
     | 
    
         
            -
                be able to detect objects in an image based on a given dataset. It returns the fine
         
     | 
| 
       580 
     | 
    
         
            -
                tuning job id.
         
     | 
| 
       581 
     | 
    
         
            -
             
     | 
| 
       582 
     | 
    
         
            -
                Parameters:
         
     | 
| 
       583 
     | 
    
         
            -
                    bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels
         
     | 
| 
       584 
     | 
    
         
            -
                        and bounding boxes. The coordinates are unnormalized.
         
     | 
| 
       585 
     | 
    
         
            -
             
     | 
| 
       586 
     | 
    
         
            -
                Returns:
         
     | 
| 
       587 
     | 
    
         
            -
                    str: The fine tuning job id, this id will used to retrieve the fine tuned
         
     | 
| 
       588 
     | 
    
         
            -
                        model.
         
     | 
| 
       589 
     | 
    
         
            -
             
     | 
| 
       590 
     | 
    
         
            -
                Example
         
     | 
| 
       591 
     | 
    
         
            -
                -------
         
     | 
| 
       592 
     | 
    
         
            -
                    >>> fine_tuning_job_id = object_detection_fine_tuning(
         
     | 
| 
       593 
     | 
    
         
            -
                        [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
         
     | 
| 
       594 
     | 
    
         
            -
                         {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
         
     | 
| 
       595 
     | 
    
         
            -
                         "phrase_grounding"
         
     | 
| 
       596 
     | 
    
         
            -
                    )
         
     | 
| 
       597 
     | 
    
         
            -
                """
         
     | 
| 
       598 
     | 
    
         
            -
                task = "phrase_grounding"
         
     | 
| 
       599 
     | 
    
         
            -
                bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
         
     | 
| 
       600 
     | 
    
         
            -
                task_type = PromptTask[task.upper()]
         
     | 
| 
       601 
     | 
    
         
            -
                fine_tuning_request = [
         
     | 
| 
       602 
     | 
    
         
            -
                    BboxInputBase64(
         
     | 
| 
       603 
     | 
    
         
            -
                        image=convert_to_b64(bbox_input.image_path),
         
     | 
| 
       604 
     | 
    
         
            -
                        filename=Path(bbox_input.image_path).name,
         
     | 
| 
       605 
     | 
    
         
            -
                        labels=bbox_input.labels,
         
     | 
| 
       606 
     | 
    
         
            -
                        bboxes=bbox_input.bboxes,
         
     | 
| 
       607 
     | 
    
         
            -
                    )
         
     | 
| 
       608 
     | 
    
         
            -
                    for bbox_input in bboxes_input
         
     | 
| 
       609 
     | 
    
         
            -
                ]
         
     | 
| 
       610 
     | 
    
         
            -
                landing_api = LandingPublicAPI()
         
     | 
| 
       611 
     | 
    
         
            -
                fine_tune_id = str(
         
     | 
| 
       612 
     | 
    
         
            -
                    landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
         
     | 
| 
       613 
     | 
    
         
            -
                )
         
     | 
| 
       614 
     | 
    
         
            -
                print(f"[Fine tuning id: {fine_tune_id}]")
         
     | 
| 
       615 
     | 
    
         
            -
                return fine_tune_id
         
     | 
| 
       616 
     | 
    
         
            -
             
     | 
| 
       617 
     | 
    
         
            -
             
     | 
| 
       618 
574 
     | 
    
         
             
            def get_diff(before: str, after: str) -> str:
         
     | 
| 
       619 
575 
     | 
    
         
             
                return "".join(
         
     | 
| 
       620 
576 
     | 
    
         
             
                    difflib.unified_diff(
         
     | 
| 
         @@ -721,83 +677,6 @@ def use_extra_vision_agent_args( 
     | 
|
| 
       721 
677 
     | 
    
         
             
                return modified_tree.code
         
     | 
| 
       722 
678 
     | 
    
         | 
| 
       723 
679 
     | 
    
         | 
| 
       724 
     | 
    
         
            -
            def use_object_detection_fine_tuning(
         
     | 
| 
       725 
     | 
    
         
            -
                artifacts: Artifacts, name: str, fine_tune_id: str
         
     | 
| 
       726 
     | 
    
         
            -
            ) -> str:
         
     | 
| 
       727 
     | 
    
         
            -
                """Replaces calls to 'owl_v2_image', 'florence2_phrase_detection' and
         
     | 
| 
       728 
     | 
    
         
            -
                'florence2_sam2_image' with the fine tuning id. This ensures that the code utilizes
         
     | 
| 
       729 
     | 
    
         
            -
                the fined tuned florence2 model. Returns the diff between the original code and the
         
     | 
| 
       730 
     | 
    
         
            -
                new code.
         
     | 
| 
       731 
     | 
    
         
            -
             
     | 
| 
       732 
     | 
    
         
            -
                Parameters:
         
     | 
| 
       733 
     | 
    
         
            -
                    artifacts (Artifacts): The artifacts object to edit the code from.
         
     | 
| 
       734 
     | 
    
         
            -
                    name (str): The name of the artifact to edit.
         
     | 
| 
       735 
     | 
    
         
            -
                    fine_tune_id (str): The fine tuning job id.
         
     | 
| 
       736 
     | 
    
         
            -
             
     | 
| 
       737 
     | 
    
         
            -
                Examples
         
     | 
| 
       738 
     | 
    
         
            -
                --------
         
     | 
| 
       739 
     | 
    
         
            -
                    >>> diff = use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
         
     | 
| 
       740 
     | 
    
         
            -
                """
         
     | 
| 
       741 
     | 
    
         
            -
             
     | 
| 
       742 
     | 
    
         
            -
                if name not in artifacts:
         
     | 
| 
       743 
     | 
    
         
            -
                    output_str = f"[Artifact {name} does not exist]"
         
     | 
| 
       744 
     | 
    
         
            -
                    print(output_str)
         
     | 
| 
       745 
     | 
    
         
            -
                    return output_str
         
     | 
| 
       746 
     | 
    
         
            -
             
     | 
| 
       747 
     | 
    
         
            -
                code = artifacts[name]
         
     | 
| 
       748 
     | 
    
         
            -
             
     | 
| 
       749 
     | 
    
         
            -
                patterns_with_fine_tune_id = [
         
     | 
| 
       750 
     | 
    
         
            -
                    (
         
     | 
| 
       751 
     | 
    
         
            -
                        r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
         
     | 
| 
       752 
     | 
    
         
            -
                        lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         
     | 
| 
       753 
     | 
    
         
            -
                    ),
         
     | 
| 
       754 
     | 
    
         
            -
                    (
         
     | 
| 
       755 
     | 
    
         
            -
                        r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
         
     | 
| 
       756 
     | 
    
         
            -
                        lambda match: f'florence2_phrase_grounding_video("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         
     | 
| 
       757 
     | 
    
         
            -
                    ),
         
     | 
| 
       758 
     | 
    
         
            -
                    (
         
     | 
| 
       759 
     | 
    
         
            -
                        r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
         
     | 
| 
       760 
     | 
    
         
            -
                        lambda match: f'owl_v2_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         
     | 
| 
       761 
     | 
    
         
            -
                    ),
         
     | 
| 
       762 
     | 
    
         
            -
                    (
         
     | 
| 
       763 
     | 
    
         
            -
                        r'florence2_sam2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
         
     | 
| 
       764 
     | 
    
         
            -
                        lambda match: f'florence2_sam2_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         
     | 
| 
       765 
     | 
    
         
            -
                    ),
         
     | 
| 
       766 
     | 
    
         
            -
                ]
         
     | 
| 
       767 
     | 
    
         
            -
             
     | 
| 
       768 
     | 
    
         
            -
                new_code = code
         
     | 
| 
       769 
     | 
    
         
            -
                for (
         
     | 
| 
       770 
     | 
    
         
            -
                    pattern_with_fine_tune_id,
         
     | 
| 
       771 
     | 
    
         
            -
                    replacer_with_fine_tune_id,
         
     | 
| 
       772 
     | 
    
         
            -
                ) in patterns_with_fine_tune_id:
         
     | 
| 
       773 
     | 
    
         
            -
                    if re.search(pattern_with_fine_tune_id, new_code):
         
     | 
| 
       774 
     | 
    
         
            -
                        new_code = re.sub(
         
     | 
| 
       775 
     | 
    
         
            -
                            pattern_with_fine_tune_id, replacer_with_fine_tune_id, new_code
         
     | 
| 
       776 
     | 
    
         
            -
                        )
         
     | 
| 
       777 
     | 
    
         
            -
             
     | 
| 
       778 
     | 
    
         
            -
                if new_code == code:
         
     | 
| 
       779 
     | 
    
         
            -
                    output_str = (
         
     | 
| 
       780 
     | 
    
         
            -
                        f"[No function calls to replace with fine tuning id in artifact {name}]"
         
     | 
| 
       781 
     | 
    
         
            -
                    )
         
     | 
| 
       782 
     | 
    
         
            -
                    print(output_str)
         
     | 
| 
       783 
     | 
    
         
            -
                    return output_str
         
     | 
| 
       784 
     | 
    
         
            -
             
     | 
| 
       785 
     | 
    
         
            -
                artifacts[name] = new_code
         
     | 
| 
       786 
     | 
    
         
            -
             
     | 
| 
       787 
     | 
    
         
            -
                diff = get_diff_with_prompts(name, code, new_code)
         
     | 
| 
       788 
     | 
    
         
            -
                print(diff)
         
     | 
| 
       789 
     | 
    
         
            -
             
     | 
| 
       790 
     | 
    
         
            -
                display(
         
     | 
| 
       791 
     | 
    
         
            -
                    {
         
     | 
| 
       792 
     | 
    
         
            -
                        MimeType.APPLICATION_ARTIFACT: json.dumps(
         
     | 
| 
       793 
     | 
    
         
            -
                            {"name": name, "content": new_code, "action": "edit"}
         
     | 
| 
       794 
     | 
    
         
            -
                        )
         
     | 
| 
       795 
     | 
    
         
            -
                    },
         
     | 
| 
       796 
     | 
    
         
            -
                    raw=True,
         
     | 
| 
       797 
     | 
    
         
            -
                )
         
     | 
| 
       798 
     | 
    
         
            -
                return diff
         
     | 
| 
       799 
     | 
    
         
            -
             
     | 
| 
       800 
     | 
    
         
            -
             
     | 
| 
       801 
680 
     | 
    
         
             
            META_TOOL_DOCSTRING = get_tool_documentation(
         
     | 
| 
       802 
681 
     | 
    
         
             
                [
         
     | 
| 
       803 
682 
     | 
    
         
             
                    get_tool_descriptions,
         
     | 
| 
         @@ -807,8 +686,6 @@ META_TOOL_DOCSTRING = get_tool_documentation( 
     | 
|
| 
       807 
686 
     | 
    
         
             
                    generate_vision_code,
         
     | 
| 
       808 
687 
     | 
    
         
             
                    edit_vision_code,
         
     | 
| 
       809 
688 
     | 
    
         
             
                    view_media_artifact,
         
     | 
| 
       810 
     | 
    
         
            -
                    object_detection_fine_tuning,
         
     | 
| 
       811 
     | 
    
         
            -
                    use_object_detection_fine_tuning,
         
     | 
| 
       812 
689 
     | 
    
         
             
                    list_artifacts,
         
     | 
| 
       813 
690 
     | 
    
         
             
                ]
         
     | 
| 
       814 
691 
     | 
    
         
             
            )
         
     | 
    
        vision_agent/tools/tools.py
    CHANGED
    
    | 
         @@ -9,7 +9,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed 
     | 
|
| 
       9 
9 
     | 
    
         
             
            from importlib import resources
         
     | 
| 
       10 
10 
     | 
    
         
             
            from pathlib import Path
         
     | 
| 
       11 
11 
     | 
    
         
             
            from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
         
     | 
| 
       12 
     | 
    
         
            -
            from uuid import UUID
         
     | 
| 
       13 
12 
     | 
    
         | 
| 
       14 
13 
     | 
    
         
             
            import cv2
         
     | 
| 
       15 
14 
     | 
    
         
             
            import numpy as np
         
     | 
| 
         @@ -20,10 +19,7 @@ from PIL import Image, ImageDraw, ImageFont 
     | 
|
| 
       20 
19 
     | 
    
         
             
            from pillow_heif import register_heif_opener  # type: ignore
         
     | 
| 
       21 
20 
     | 
    
         
             
            from pytube import YouTube  # type: ignore
         
     | 
| 
       22 
21 
     | 
    
         | 
| 
       23 
     | 
    
         
            -
            from vision_agent.clients.landing_public_api import LandingPublicAPI
         
     | 
| 
       24 
22 
     | 
    
         
             
            from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
         
     | 
| 
       25 
     | 
    
         
            -
            from vision_agent.models import JobStatus
         
     | 
| 
       26 
     | 
    
         
            -
            from vision_agent.utils.exceptions import FineTuneModelIsNotReady
         
     | 
| 
       27 
23 
     | 
    
         
             
            from vision_agent.utils.execute import FileSerializer, MimeType
         
     | 
| 
       28 
24 
     | 
    
         
             
            from vision_agent.utils.image_utils import (
         
     | 
| 
       29 
25 
     | 
    
         
             
                b64_to_pil,
         
     | 
| 
         @@ -239,7 +235,7 @@ def od_sam2_video_tracking( 
     | 
|
| 
       239 
235 
     | 
    
         
             
                frames: List[np.ndarray],
         
     | 
| 
       240 
236 
     | 
    
         
             
                box_threshold: float = 0.30,
         
     | 
| 
       241 
237 
     | 
    
         
             
                chunk_length: Optional[int] = 50,
         
     | 
| 
       242 
     | 
    
         
            -
                 
     | 
| 
      
 238 
     | 
    
         
            +
                deployment_id: Optional[str] = None,
         
     | 
| 
       243 
239 
     | 
    
         
             
            ) -> Dict[str, Any]:
         
     | 
| 
       244 
240 
     | 
    
         
             
                chunk_length = 50 if chunk_length is None else chunk_length
         
     | 
| 
       245 
241 
     | 
    
         
             
                segment_size = chunk_length
         
     | 
| 
         @@ -262,7 +258,7 @@ def od_sam2_video_tracking( 
     | 
|
| 
       262 
258 
     | 
    
         
             
                    prompt: str,
         
     | 
| 
       263 
259 
     | 
    
         
             
                    segment_index: int,
         
     | 
| 
       264 
260 
     | 
    
         
             
                    frame_number: int,
         
     | 
| 
       265 
     | 
    
         
            -
                     
     | 
| 
      
 261 
     | 
    
         
            +
                    deployment_id: str,
         
     | 
| 
       266 
262 
     | 
    
         
             
                    segment_frames: list,
         
     | 
| 
       267 
263 
     | 
    
         
             
                ) -> tuple:
         
     | 
| 
       268 
264 
     | 
    
         
             
                    """
         
     | 
| 
         @@ -273,7 +269,7 @@ def od_sam2_video_tracking( 
     | 
|
| 
       273 
269 
     | 
    
         
             
                        prompt: The prompt for the object detection model.
         
     | 
| 
       274 
270 
     | 
    
         
             
                        segment_index: The index of the current segment.
         
     | 
| 
       275 
271 
     | 
    
         
             
                        frame_number: The number of the current frame.
         
     | 
| 
       276 
     | 
    
         
            -
                         
     | 
| 
      
 272 
     | 
    
         
            +
                        deployment_id: Optional The Model deployment ID.
         
     | 
| 
       277 
273 
     | 
    
         
             
                        segment_frames: List of frames for the current segment.
         
     | 
| 
       278 
274 
     | 
    
         | 
| 
       279 
275 
     | 
    
         
             
                    Returns:
         
     | 
| 
         @@ -293,7 +289,6 @@ def od_sam2_video_tracking( 
     | 
|
| 
       293 
289 
     | 
    
         
             
                            prompt=prompt,
         
     | 
| 
       294 
290 
     | 
    
         
             
                            image=segment_frames[frame_number],
         
     | 
| 
       295 
291 
     | 
    
         
             
                            box_threshold=box_threshold,
         
     | 
| 
       296 
     | 
    
         
            -
                            fine_tune_id=fine_tune_id,
         
     | 
| 
       297 
292 
     | 
    
         
             
                        )
         
     | 
| 
       298 
293 
     | 
    
         
             
                        function_name = "owlv2_object_detection"
         
     | 
| 
       299 
294 
     | 
    
         | 
| 
         @@ -301,7 +296,6 @@ def od_sam2_video_tracking( 
     | 
|
| 
       301 
296 
     | 
    
         
             
                        segment_results = florence2_object_detection(
         
     | 
| 
       302 
297 
     | 
    
         
             
                            prompt=prompt,
         
     | 
| 
       303 
298 
     | 
    
         
             
                            image=segment_frames[frame_number],
         
     | 
| 
       304 
     | 
    
         
            -
                            fine_tune_id=fine_tune_id,
         
     | 
| 
       305 
299 
     | 
    
         
             
                        )
         
     | 
| 
       306 
300 
     | 
    
         
             
                        function_name = "florence2_object_detection"
         
     | 
| 
       307 
301 
     | 
    
         | 
| 
         @@ -309,13 +303,12 @@ def od_sam2_video_tracking( 
     | 
|
| 
       309 
303 
     | 
    
         
             
                        segment_results = agentic_object_detection(
         
     | 
| 
       310 
304 
     | 
    
         
             
                            prompt=prompt,
         
     | 
| 
       311 
305 
     | 
    
         
             
                            image=segment_frames[frame_number],
         
     | 
| 
       312 
     | 
    
         
            -
                            fine_tune_id=fine_tune_id,
         
     | 
| 
       313 
306 
     | 
    
         
             
                        )
         
     | 
| 
       314 
307 
     | 
    
         
             
                        function_name = "agentic_object_detection"
         
     | 
| 
       315 
308 
     | 
    
         | 
| 
       316 
309 
     | 
    
         
             
                    elif od_model == ODModels.CUSTOM:
         
     | 
| 
       317 
310 
     | 
    
         
             
                        segment_results = custom_object_detection(
         
     | 
| 
       318 
     | 
    
         
            -
                            deployment_id= 
     | 
| 
      
 311 
     | 
    
         
            +
                            deployment_id=deployment_id,
         
     | 
| 
       319 
312 
     | 
    
         
             
                            image=segment_frames[frame_number],
         
     | 
| 
       320 
313 
     | 
    
         
             
                            box_threshold=box_threshold,
         
     | 
| 
       321 
314 
     | 
    
         
             
                        )
         
     | 
| 
         @@ -337,7 +330,7 @@ def od_sam2_video_tracking( 
     | 
|
| 
       337 
330 
     | 
    
         
             
                            segment_frames=segment,
         
     | 
| 
       338 
331 
     | 
    
         
             
                            od_model=od_model,
         
     | 
| 
       339 
332 
     | 
    
         
             
                            prompt=prompt,
         
     | 
| 
       340 
     | 
    
         
            -
                             
     | 
| 
      
 333 
     | 
    
         
            +
                            deployment_id=deployment_id,
         
     | 
| 
       341 
334 
     | 
    
         
             
                            chunk_length=chunk_length,
         
     | 
| 
       342 
335 
     | 
    
         
             
                            image_size=image_size,
         
     | 
| 
       343 
336 
     | 
    
         
             
                            segment_index=segment_index,
         
     | 
| 
         @@ -376,7 +369,6 @@ def _owlv2_object_detection( 
     | 
|
| 
       376 
369 
     | 
    
         
             
                box_threshold: float,
         
     | 
| 
       377 
370 
     | 
    
         
             
                image_size: Tuple[int, ...],
         
     | 
| 
       378 
371 
     | 
    
         
             
                image_bytes: Optional[bytes] = None,
         
     | 
| 
       379 
     | 
    
         
            -
                fine_tune_id: Optional[str] = None,
         
     | 
| 
       380 
372 
     | 
    
         
             
            ) -> Dict[str, Any]:
         
     | 
| 
       381 
373 
     | 
    
         
             
                if image_bytes is None:
         
     | 
| 
       382 
374 
     | 
    
         
             
                    image_bytes = numpy_to_bytes(image)
         
     | 
| 
         @@ -389,21 +381,6 @@ def _owlv2_object_detection( 
     | 
|
| 
       389 
381 
     | 
    
         
             
                }
         
     | 
| 
       390 
382 
     | 
    
         
             
                metadata = {"function_name": "owlv2_object_detection"}
         
     | 
| 
       391 
383 
     | 
    
         | 
| 
       392 
     | 
    
         
            -
                if fine_tune_id is not None:
         
     | 
| 
       393 
     | 
    
         
            -
                    landing_api = LandingPublicAPI()
         
     | 
| 
       394 
     | 
    
         
            -
                    status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
         
     | 
| 
       395 
     | 
    
         
            -
                    if status is not JobStatus.SUCCEEDED:
         
     | 
| 
       396 
     | 
    
         
            -
                        raise FineTuneModelIsNotReady(
         
     | 
| 
       397 
     | 
    
         
            -
                            f"Fine-tuned model {fine_tune_id} is not ready yet"
         
     | 
| 
       398 
     | 
    
         
            -
                        )
         
     | 
| 
       399 
     | 
    
         
            -
             
     | 
| 
       400 
     | 
    
         
            -
                    # we can only execute fine-tuned models with florence2
         
     | 
| 
       401 
     | 
    
         
            -
                    payload = {
         
     | 
| 
       402 
     | 
    
         
            -
                        "prompts": payload["prompts"],
         
     | 
| 
       403 
     | 
    
         
            -
                        "jobId": fine_tune_id,
         
     | 
| 
       404 
     | 
    
         
            -
                        "model": "florence2",
         
     | 
| 
       405 
     | 
    
         
            -
                    }
         
     | 
| 
       406 
     | 
    
         
            -
             
     | 
| 
       407 
384 
     | 
    
         
             
                detections = send_task_inference_request(
         
     | 
| 
       408 
385 
     | 
    
         
             
                    payload,
         
     | 
| 
       409 
386 
     | 
    
         
             
                    "text-to-object-detection",
         
     | 
| 
         @@ -440,7 +417,6 @@ def owlv2_object_detection( 
     | 
|
| 
       440 
417 
     | 
    
         
             
                prompt: str,
         
     | 
| 
       441 
418 
     | 
    
         
             
                image: np.ndarray,
         
     | 
| 
       442 
419 
     | 
    
         
             
                box_threshold: float = 0.10,
         
     | 
| 
       443 
     | 
    
         
            -
                fine_tune_id: Optional[str] = None,
         
     | 
| 
       444 
420 
     | 
    
         
             
            ) -> List[Dict[str, Any]]:
         
     | 
| 
       445 
421 
     | 
    
         
             
                """'owlv2_object_detection' is a tool that can detect and count multiple objects
         
     | 
| 
       446 
422 
     | 
    
         
             
                given a text prompt such as category names or referring expressions on images. The
         
     | 
| 
         @@ -452,8 +428,6 @@ def owlv2_object_detection( 
     | 
|
| 
       452 
428 
     | 
    
         
             
                    image (np.ndarray): The image to ground the prompt to.
         
     | 
| 
       453 
429 
     | 
    
         
             
                    box_threshold (float, optional): The threshold for the box detection. Defaults
         
     | 
| 
       454 
430 
     | 
    
         
             
                        to 0.10.
         
     | 
| 
       455 
     | 
    
         
            -
                    fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
         
     | 
| 
       456 
     | 
    
         
            -
                        fine-tuned model ID here to use it.
         
     | 
| 
       457 
431 
     | 
    
         | 
| 
       458 
432 
     | 
    
         
             
                Returns:
         
     | 
| 
       459 
433 
     | 
    
         
             
                    List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
         
     | 
| 
         @@ -475,9 +449,7 @@ def owlv2_object_detection( 
     | 
|
| 
       475 
449 
     | 
    
         
             
                if image_size[0] < 1 or image_size[1] < 1:
         
     | 
| 
       476 
450 
     | 
    
         
             
                    return []
         
     | 
| 
       477 
451 
     | 
    
         | 
| 
       478 
     | 
    
         
            -
                ret = _owlv2_object_detection(
         
     | 
| 
       479 
     | 
    
         
            -
                    prompt, image, box_threshold, image_size, fine_tune_id=fine_tune_id
         
     | 
| 
       480 
     | 
    
         
            -
                )
         
     | 
| 
      
 452 
     | 
    
         
            +
                ret = _owlv2_object_detection(prompt, image, box_threshold, image_size)
         
     | 
| 
       481 
453 
     | 
    
         | 
| 
       482 
454 
     | 
    
         
             
                _display_tool_trace(
         
     | 
| 
       483 
455 
     | 
    
         
             
                    owlv2_object_detection.__name__,
         
     | 
| 
         @@ -556,7 +528,6 @@ def owlv2_sam2_video_tracking( 
     | 
|
| 
       556 
528 
     | 
    
         
             
                frames: List[np.ndarray],
         
     | 
| 
       557 
529 
     | 
    
         
             
                box_threshold: float = 0.10,
         
     | 
| 
       558 
530 
     | 
    
         
             
                chunk_length: Optional[int] = 25,
         
     | 
| 
       559 
     | 
    
         
            -
                fine_tune_id: Optional[str] = None,
         
     | 
| 
       560 
531 
     | 
    
         
             
            ) -> List[List[Dict[str, Any]]]:
         
     | 
| 
       561 
532 
     | 
    
         
             
                """'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
         
     | 
| 
       562 
533 
     | 
    
         
             
                objects in a video given a text prompt such as category names or referring
         
     | 
| 
         @@ -571,8 +542,6 @@ def owlv2_sam2_video_tracking( 
     | 
|
| 
       571 
542 
     | 
    
         
             
                        to 0.10.
         
     | 
| 
       572 
543 
     | 
    
         
             
                    chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
         
     | 
| 
       573 
544 
     | 
    
         
             
                        new objects.
         
     | 
| 
       574 
     | 
    
         
            -
                    fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
         
     | 
| 
       575 
     | 
    
         
            -
                        fine-tuned model ID here to use it.
         
     | 
| 
       576 
545 
     | 
    
         | 
| 
       577 
546 
     | 
    
         
             
                Returns:
         
     | 
| 
       578 
547 
     | 
    
         
             
                    List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
         
     | 
| 
         @@ -609,7 +578,6 @@ def owlv2_sam2_video_tracking( 
     | 
|
| 
       609 
578 
     | 
    
         
             
                    frames=frames,
         
     | 
| 
       610 
579 
     | 
    
         
             
                    box_threshold=box_threshold,
         
     | 
| 
       611 
580 
     | 
    
         
             
                    chunk_length=chunk_length,
         
     | 
| 
       612 
     | 
    
         
            -
                    fine_tune_id=fine_tune_id,
         
     | 
| 
       613 
581 
     | 
    
         
             
                )
         
     | 
| 
       614 
582 
     | 
    
         
             
                _display_tool_trace(
         
     | 
| 
       615 
583 
     | 
    
         
             
                    owlv2_sam2_video_tracking.__name__,
         
     | 
| 
         @@ -624,7 +592,8 @@ def owlv2_sam2_video_tracking( 
     | 
|
| 
       624 
592 
     | 
    
         | 
| 
       625 
593 
     | 
    
         | 
| 
       626 
594 
     | 
    
         
             
            def florence2_object_detection(
         
     | 
| 
       627 
     | 
    
         
            -
                prompt: str, 
     | 
| 
      
 595 
     | 
    
         
            +
                prompt: str,
         
     | 
| 
      
 596 
     | 
    
         
            +
                image: np.ndarray,
         
     | 
| 
       628 
597 
     | 
    
         
             
            ) -> List[Dict[str, Any]]:
         
     | 
| 
       629 
598 
     | 
    
         
             
                """'florence2_object_detection' is a tool that can detect multiple objects given a
         
     | 
| 
       630 
599 
     | 
    
         
             
                text prompt which can be object names or caption. You can optionally separate the
         
     | 
| 
         @@ -635,8 +604,6 @@ def florence2_object_detection( 
     | 
|
| 
       635 
604 
     | 
    
         
             
                    prompt (str): The prompt to ground to the image. Use exclusive categories that
         
     | 
| 
       636 
605 
     | 
    
         
             
                        do not overlap such as 'person, car' and NOT 'person, athlete'.
         
     | 
| 
       637 
606 
     | 
    
         
             
                    image (np.ndarray): The image to used to detect objects
         
     | 
| 
       638 
     | 
    
         
            -
                    fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
         
     | 
| 
       639 
     | 
    
         
            -
                        fine-tuned model ID here to use it.
         
     | 
| 
       640 
607 
     | 
    
         | 
| 
       641 
608 
     | 
    
         
             
                Returns:
         
     | 
| 
       642 
609 
     | 
    
         
             
                    List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
         
     | 
| 
         @@ -653,6 +620,7 @@ def florence2_object_detection( 
     | 
|
| 
       653 
620 
     | 
    
         
             
                        {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
         
     | 
| 
       654 
621 
     | 
    
         
             
                    ]
         
     | 
| 
       655 
622 
     | 
    
         
             
                """
         
     | 
| 
      
 623 
     | 
    
         
            +
             
     | 
| 
       656 
624 
     | 
    
         
             
                image_size = image.shape[:2]
         
     | 
| 
       657 
625 
     | 
    
         
             
                if image_size[0] < 1 or image_size[1] < 1:
         
     | 
| 
       658 
626 
     | 
    
         
             
                    return []
         
     | 
| 
         @@ -665,16 +633,6 @@ def florence2_object_detection( 
     | 
|
| 
       665 
633 
     | 
    
         
             
                }
         
     | 
| 
       666 
634 
     | 
    
         
             
                metadata = {"function_name": "florence2_object_detection"}
         
     | 
| 
       667 
635 
     | 
    
         | 
| 
       668 
     | 
    
         
            -
                if fine_tune_id is not None:
         
     | 
| 
       669 
     | 
    
         
            -
                    landing_api = LandingPublicAPI()
         
     | 
| 
       670 
     | 
    
         
            -
                    status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
         
     | 
| 
       671 
     | 
    
         
            -
                    if status is not JobStatus.SUCCEEDED:
         
     | 
| 
       672 
     | 
    
         
            -
                        raise FineTuneModelIsNotReady(
         
     | 
| 
       673 
     | 
    
         
            -
                            f"Fine-tuned model {fine_tune_id} is not ready yet"
         
     | 
| 
       674 
     | 
    
         
            -
                        )
         
     | 
| 
       675 
     | 
    
         
            -
             
     | 
| 
       676 
     | 
    
         
            -
                    payload["jobId"] = fine_tune_id
         
     | 
| 
       677 
     | 
    
         
            -
             
     | 
| 
       678 
636 
     | 
    
         
             
                detections = send_task_inference_request(
         
     | 
| 
       679 
637 
     | 
    
         
             
                    payload,
         
     | 
| 
       680 
638 
     | 
    
         
             
                    "text-to-object-detection",
         
     | 
| 
         @@ -703,7 +661,8 @@ def florence2_object_detection( 
     | 
|
| 
       703 
661 
     | 
    
         | 
| 
       704 
662 
     | 
    
         | 
| 
       705 
663 
     | 
    
         
             
            def florence2_sam2_instance_segmentation(
         
     | 
| 
       706 
     | 
    
         
            -
                prompt: str, 
     | 
| 
      
 664 
     | 
    
         
            +
                prompt: str,
         
     | 
| 
      
 665 
     | 
    
         
            +
                image: np.ndarray,
         
     | 
| 
       707 
666 
     | 
    
         
             
            ) -> List[Dict[str, Any]]:
         
     | 
| 
       708 
667 
     | 
    
         
             
                """'florence2_sam2_instance_segmentation' is a tool that can segment multiple
         
     | 
| 
       709 
668 
     | 
    
         
             
                objects given a text prompt such as category names or referring expressions. The
         
     | 
| 
         @@ -715,8 +674,6 @@ def florence2_sam2_instance_segmentation( 
     | 
|
| 
       715 
674 
     | 
    
         
             
                    prompt (str): The prompt to ground to the image. Use exclusive categories that
         
     | 
| 
       716 
675 
     | 
    
         
             
                        do not overlap such as 'person, car' and NOT 'person, athlete'.
         
     | 
| 
       717 
676 
     | 
    
         
             
                    image (np.ndarray): The image to ground the prompt to.
         
     | 
| 
       718 
     | 
    
         
            -
                    fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
         
     | 
| 
       719 
     | 
    
         
            -
                        fine-tuned model ID here to use it.
         
     | 
| 
       720 
677 
     | 
    
         | 
| 
       721 
678 
     | 
    
         
             
                Returns:
         
     | 
| 
       722 
679 
     | 
    
         
             
                    List[Dict[str, Any]]: A list of dictionaries containing the score, label,
         
     | 
| 
         @@ -742,6 +699,7 @@ def florence2_sam2_instance_segmentation( 
     | 
|
| 
       742 
699 
     | 
    
         
             
                        },
         
     | 
| 
       743 
700 
     | 
    
         
             
                    ]
         
     | 
| 
       744 
701 
     | 
    
         
             
                """
         
     | 
| 
      
 702 
     | 
    
         
            +
             
     | 
| 
       745 
703 
     | 
    
         
             
                if image.shape[0] < 1 or image.shape[1] < 1:
         
     | 
| 
       746 
704 
     | 
    
         
             
                    return []
         
     | 
| 
       747 
705 
     | 
    
         | 
| 
         @@ -753,16 +711,6 @@ def florence2_sam2_instance_segmentation( 
     | 
|
| 
       753 
711 
     | 
    
         
             
                }
         
     | 
| 
       754 
712 
     | 
    
         
             
                metadata = {"function_name": "florence2_sam2_instance_segmentation"}
         
     | 
| 
       755 
713 
     | 
    
         | 
| 
       756 
     | 
    
         
            -
                if fine_tune_id is not None:
         
     | 
| 
       757 
     | 
    
         
            -
                    landing_api = LandingPublicAPI()
         
     | 
| 
       758 
     | 
    
         
            -
                    status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
         
     | 
| 
       759 
     | 
    
         
            -
                    if status is not JobStatus.SUCCEEDED:
         
     | 
| 
       760 
     | 
    
         
            -
                        raise FineTuneModelIsNotReady(
         
     | 
| 
       761 
     | 
    
         
            -
                            f"Fine-tuned model {fine_tune_id} is not ready yet"
         
     | 
| 
       762 
     | 
    
         
            -
                        )
         
     | 
| 
       763 
     | 
    
         
            -
             
     | 
| 
       764 
     | 
    
         
            -
                    payload["jobId"] = fine_tune_id
         
     | 
| 
       765 
     | 
    
         
            -
             
     | 
| 
       766 
714 
     | 
    
         
             
                detections = send_task_inference_request(
         
     | 
| 
       767 
715 
     | 
    
         
             
                    payload,
         
     | 
| 
       768 
716 
     | 
    
         
             
                    "text-to-instance-segmentation",
         
     | 
| 
         @@ -792,7 +740,6 @@ def florence2_sam2_video_tracking( 
     | 
|
| 
       792 
740 
     | 
    
         
             
                prompt: str,
         
     | 
| 
       793 
741 
     | 
    
         
             
                frames: List[np.ndarray],
         
     | 
| 
       794 
742 
     | 
    
         
             
                chunk_length: Optional[int] = 25,
         
     | 
| 
       795 
     | 
    
         
            -
                fine_tune_id: Optional[str] = None,
         
     | 
| 
       796 
743 
     | 
    
         
             
            ) -> List[List[Dict[str, Any]]]:
         
     | 
| 
       797 
744 
     | 
    
         
             
                """'florence2_sam2_video_tracking' is a tool that can track and segment multiple
         
     | 
| 
       798 
745 
     | 
    
         
             
                objects in a video given a text prompt such as category names or referring
         
     | 
| 
         @@ -806,8 +753,6 @@ def florence2_sam2_video_tracking( 
     | 
|
| 
       806 
753 
     | 
    
         
             
                    frames (List[np.ndarray]): The list of frames to ground the prompt to.
         
     | 
| 
       807 
754 
     | 
    
         
             
                    chunk_length (Optional[int]): The number of frames to re-run florence2 to find
         
     | 
| 
       808 
755 
     | 
    
         
             
                        new objects.
         
     | 
| 
       809 
     | 
    
         
            -
                    fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
         
     | 
| 
       810 
     | 
    
         
            -
                        fine-tuned model ID here to use it.
         
     | 
| 
       811 
756 
     | 
    
         | 
| 
       812 
757 
     | 
    
         
             
                Returns:
         
     | 
| 
       813 
758 
     | 
    
         
             
                    List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
         
     | 
| 
         @@ -837,6 +782,7 @@ def florence2_sam2_video_tracking( 
     | 
|
| 
       837 
782 
     | 
    
         
             
                        ...
         
     | 
| 
       838 
783 
     | 
    
         
             
                    ]
         
     | 
| 
       839 
784 
     | 
    
         
             
                """
         
     | 
| 
      
 785 
     | 
    
         
            +
             
     | 
| 
       840 
786 
     | 
    
         
             
                if len(frames) == 0 or not isinstance(frames, List):
         
     | 
| 
       841 
787 
     | 
    
         
             
                    raise ValueError("Must provide a list of numpy arrays for frames")
         
     | 
| 
       842 
788 
     | 
    
         | 
| 
         @@ -851,16 +797,6 @@ def florence2_sam2_video_tracking( 
     | 
|
| 
       851 
797 
     | 
    
         
             
                if chunk_length is not None:
         
     | 
| 
       852 
798 
     | 
    
         
             
                    payload["chunk_length_frames"] = chunk_length  # type: ignore
         
     | 
| 
       853 
799 
     | 
    
         | 
| 
       854 
     | 
    
         
            -
                if fine_tune_id is not None:
         
     | 
| 
       855 
     | 
    
         
            -
                    landing_api = LandingPublicAPI()
         
     | 
| 
       856 
     | 
    
         
            -
                    status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
         
     | 
| 
       857 
     | 
    
         
            -
                    if status is not JobStatus.SUCCEEDED:
         
     | 
| 
       858 
     | 
    
         
            -
                        raise FineTuneModelIsNotReady(
         
     | 
| 
       859 
     | 
    
         
            -
                            f"Fine-tuned model {fine_tune_id} is not ready yet"
         
     | 
| 
       860 
     | 
    
         
            -
                        )
         
     | 
| 
       861 
     | 
    
         
            -
             
     | 
| 
       862 
     | 
    
         
            -
                    payload["jobId"] = fine_tune_id
         
     | 
| 
       863 
     | 
    
         
            -
             
     | 
| 
       864 
800 
     | 
    
         
             
                detections = send_task_inference_request(
         
     | 
| 
       865 
801 
     | 
    
         
             
                    payload,
         
     | 
| 
       866 
802 
     | 
    
         
             
                    "text-to-instance-segmentation",
         
     | 
| 
         @@ -1397,7 +1333,7 @@ def custom_od_sam2_video_tracking( 
     | 
|
| 
       1397 
1333 
     | 
    
         
             
                    prompt="",
         
     | 
| 
       1398 
1334 
     | 
    
         
             
                    frames=frames,
         
     | 
| 
       1399 
1335 
     | 
    
         
             
                    chunk_length=chunk_length,
         
     | 
| 
       1400 
     | 
    
         
            -
                     
     | 
| 
      
 1336 
     | 
    
         
            +
                    deployment_id=deployment_id,
         
     | 
| 
       1401 
1337 
     | 
    
         
             
                )
         
     | 
| 
       1402 
1338 
     | 
    
         
             
                _display_tool_trace(
         
     | 
| 
       1403 
1339 
     | 
    
         
             
                    custom_od_sam2_video_tracking.__name__,
         
     | 
| 
         @@ -1416,7 +1352,6 @@ def _agentic_object_detection( 
     | 
|
| 
       1416 
1352 
     | 
    
         
             
                image: np.ndarray,
         
     | 
| 
       1417 
1353 
     | 
    
         
             
                image_size: Tuple[int, ...],
         
     | 
| 
       1418 
1354 
     | 
    
         
             
                image_bytes: Optional[bytes] = None,
         
     | 
| 
       1419 
     | 
    
         
            -
                fine_tune_id: Optional[str] = None,
         
     | 
| 
       1420 
1355 
     | 
    
         
             
            ) -> Dict[str, Any]:
         
     | 
| 
       1421 
1356 
     | 
    
         
             
                if image_bytes is None:
         
     | 
| 
       1422 
1357 
     | 
    
         
             
                    image_bytes = numpy_to_bytes(image)
         
     | 
| 
         @@ -1428,21 +1363,6 @@ def _agentic_object_detection( 
     | 
|
| 
       1428 
1363 
     | 
    
         
             
                }
         
     | 
| 
       1429 
1364 
     | 
    
         
             
                metadata = {"function_name": "agentic_object_detection"}
         
     | 
| 
       1430 
1365 
     | 
    
         | 
| 
       1431 
     | 
    
         
            -
                if fine_tune_id is not None:
         
     | 
| 
       1432 
     | 
    
         
            -
                    landing_api = LandingPublicAPI()
         
     | 
| 
       1433 
     | 
    
         
            -
                    status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
         
     | 
| 
       1434 
     | 
    
         
            -
                    if status is not JobStatus.SUCCEEDED:
         
     | 
| 
       1435 
     | 
    
         
            -
                        raise FineTuneModelIsNotReady(
         
     | 
| 
       1436 
     | 
    
         
            -
                            f"Fine-tuned model {fine_tune_id} is not ready yet"
         
     | 
| 
       1437 
     | 
    
         
            -
                        )
         
     | 
| 
       1438 
     | 
    
         
            -
             
     | 
| 
       1439 
     | 
    
         
            -
                    # we can only execute fine-tuned models with florence2
         
     | 
| 
       1440 
     | 
    
         
            -
                    payload = {
         
     | 
| 
       1441 
     | 
    
         
            -
                        "prompts": payload["prompts"],
         
     | 
| 
       1442 
     | 
    
         
            -
                        "jobId": fine_tune_id,
         
     | 
| 
       1443 
     | 
    
         
            -
                        "model": "florence2",
         
     | 
| 
       1444 
     | 
    
         
            -
                    }
         
     | 
| 
       1445 
     | 
    
         
            -
             
     | 
| 
       1446 
1366 
     | 
    
         
             
                detections = send_task_inference_request(
         
     | 
| 
       1447 
1367 
     | 
    
         
             
                    payload,
         
     | 
| 
       1448 
1368 
     | 
    
         
             
                    "text-to-object-detection",
         
     | 
| 
         @@ -1478,7 +1398,6 @@ def _agentic_object_detection( 
     | 
|
| 
       1478 
1398 
     | 
    
         
             
            def agentic_object_detection(
         
     | 
| 
       1479 
1399 
     | 
    
         
             
                prompt: str,
         
     | 
| 
       1480 
1400 
     | 
    
         
             
                image: np.ndarray,
         
     | 
| 
       1481 
     | 
    
         
            -
                fine_tune_id: Optional[str] = None,
         
     | 
| 
       1482 
1401 
     | 
    
         
             
            ) -> List[Dict[str, Any]]:
         
     | 
| 
       1483 
1402 
     | 
    
         
             
                """'agentic_object_detection' is a tool that can detect multiple objects given a
         
     | 
| 
       1484 
1403 
     | 
    
         
             
                text prompt such as object names or referring expressions on images. It's
         
     | 
| 
         @@ -1490,8 +1409,6 @@ def agentic_object_detection( 
     | 
|
| 
       1490 
1409 
     | 
    
         
             
                    prompt (str): The prompt to ground to the image, only supports a single prompt
         
     | 
| 
       1491 
1410 
     | 
    
         
             
                        with no commas or periods.
         
     | 
| 
       1492 
1411 
     | 
    
         
             
                    image (np.ndarray): The image to ground the prompt to.
         
     | 
| 
       1493 
     | 
    
         
            -
                    fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
         
     | 
| 
       1494 
     | 
    
         
            -
                        fine-tuned model ID here to use it.
         
     | 
| 
       1495 
1412 
     | 
    
         | 
| 
       1496 
1413 
     | 
    
         
             
                Returns:
         
     | 
| 
       1497 
1414 
     | 
    
         
             
                    List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
         
     | 
| 
         @@ -1513,9 +1430,7 @@ def agentic_object_detection( 
     | 
|
| 
       1513 
1430 
     | 
    
         
             
                if image_size[0] < 1 or image_size[1] < 1:
         
     | 
| 
       1514 
1431 
     | 
    
         
             
                    return []
         
     | 
| 
       1515 
1432 
     | 
    
         | 
| 
       1516 
     | 
    
         
            -
                ret = _agentic_object_detection(
         
     | 
| 
       1517 
     | 
    
         
            -
                    prompt, image, image_size, fine_tune_id=fine_tune_id
         
     | 
| 
       1518 
     | 
    
         
            -
                )
         
     | 
| 
      
 1433 
     | 
    
         
            +
                ret = _agentic_object_detection(prompt, image, image_size)
         
     | 
| 
       1519 
1434 
     | 
    
         | 
| 
       1520 
1435 
     | 
    
         
             
                _display_tool_trace(
         
     | 
| 
       1521 
1436 
     | 
    
         
             
                    agentic_object_detection.__name__,
         
     | 
| 
         @@ -1586,7 +1501,6 @@ def agentic_sam2_video_tracking( 
     | 
|
| 
       1586 
1501 
     | 
    
         
             
                prompt: str,
         
     | 
| 
       1587 
1502 
     | 
    
         
             
                frames: List[np.ndarray],
         
     | 
| 
       1588 
1503 
     | 
    
         
             
                chunk_length: Optional[int] = 25,
         
     | 
| 
       1589 
     | 
    
         
            -
                fine_tune_id: Optional[str] = None,
         
     | 
| 
       1590 
1504 
     | 
    
         
             
            ) -> List[List[Dict[str, Any]]]:
         
     | 
| 
       1591 
1505 
     | 
    
         
             
                """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
         
     | 
| 
       1592 
1506 
     | 
    
         
             
                objects in a video given a text prompt such as object names or referring
         
     | 
| 
         @@ -1601,8 +1515,6 @@ def agentic_sam2_video_tracking( 
     | 
|
| 
       1601 
1515 
     | 
    
         
             
                    frames (List[np.ndarray]): The list of frames to ground the prompt to.
         
     | 
| 
       1602 
1516 
     | 
    
         
             
                    chunk_length (Optional[int]): The number of frames to re-run agentic object detection to
         
     | 
| 
       1603 
1517 
     | 
    
         
             
                        to find new objects.
         
     | 
| 
       1604 
     | 
    
         
            -
                    fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
         
     | 
| 
       1605 
     | 
    
         
            -
                        fine-tuned model ID here to use it.
         
     | 
| 
       1606 
1518 
     | 
    
         | 
| 
       1607 
1519 
     | 
    
         
             
                Returns:
         
     | 
| 
       1608 
1520 
     | 
    
         
             
                    List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
         
     | 
| 
         @@ -1638,7 +1550,6 @@ def agentic_sam2_video_tracking( 
     | 
|
| 
       1638 
1550 
     | 
    
         
             
                    prompt=prompt,
         
     | 
| 
       1639 
1551 
     | 
    
         
             
                    frames=frames,
         
     | 
| 
       1640 
1552 
     | 
    
         
             
                    chunk_length=chunk_length,
         
     | 
| 
       1641 
     | 
    
         
            -
                    fine_tune_id=fine_tune_id,
         
     | 
| 
       1642 
1553 
     | 
    
         
             
                )
         
     | 
| 
       1643 
1554 
     | 
    
         
             
                _display_tool_trace(
         
     | 
| 
       1644 
1555 
     | 
    
         
             
                    agentic_sam2_video_tracking.__name__,
         
     | 
    
        vision_agent/utils/agent.py
    CHANGED
    
    | 
         @@ -159,11 +159,12 @@ def format_conversation(chat: List[AgentMessage]) -> str: 
     | 
|
| 
       159 
159 
     | 
    
         
             
                chat = copy.deepcopy(chat)
         
     | 
| 
       160 
160 
     | 
    
         
             
                prompt = ""
         
     | 
| 
       161 
161 
     | 
    
         
             
                for chat_i in chat:
         
     | 
| 
       162 
     | 
    
         
            -
                     
     | 
| 
       163 
     | 
    
         
            -
             
     | 
| 
       164 
     | 
    
         
            -
             
     | 
| 
       165 
     | 
    
         
            -
                        elif chat_i.role == "user":
         
     | 
| 
      
 162 
     | 
    
         
            +
                    # we want to print user messages, final code, final code observations or errors
         
     | 
| 
      
 163 
     | 
    
         
            +
                    if chat_i.role in ["user", "coder", "final_observation", "error_observation"]:
         
     | 
| 
      
 164 
     | 
    
         
            +
                        if chat_i.role == "user":
         
     | 
| 
       166 
165 
     | 
    
         
             
                            prompt += f"USER: {chat_i.content}\n\n"
         
     | 
| 
      
 166 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 167 
     | 
    
         
            +
                            prompt += f"OBSERVATION: {chat_i.content}\n\n"
         
     | 
| 
       167 
168 
     | 
    
         
             
                    elif chat_i.role == "conversation":
         
     | 
| 
       168 
169 
     | 
    
         
             
                        prompt += f"AGENT: {chat_i.content}\n\n"
         
     | 
| 
       169 
170 
     | 
    
         
             
                return prompt
         
     | 
    
        vision_agent/utils/exceptions.py
    CHANGED
    
    | 
         @@ -51,13 +51,6 @@ class RemoteSandboxClosedError(RemoteSandboxError): 
     | 
|
| 
       51 
51 
     | 
    
         
             
                is_retryable = True
         
     | 
| 
       52 
52 
     | 
    
         | 
| 
       53 
53 
     | 
    
         | 
| 
       54 
     | 
    
         
            -
            class FineTuneModelIsNotReady(Exception):
         
     | 
| 
       55 
     | 
    
         
            -
                """Exception raised when the fine-tune model is not ready.
         
     | 
| 
       56 
     | 
    
         
            -
                If this is raised, it's recommended to wait 5 seconds before trying to use
         
     | 
| 
       57 
     | 
    
         
            -
                the model again.
         
     | 
| 
       58 
     | 
    
         
            -
                """
         
     | 
| 
       59 
     | 
    
         
            -
             
     | 
| 
       60 
     | 
    
         
            -
             
     | 
| 
       61 
54 
     | 
    
         
             
            class FineTuneModelNotFound(Exception):
         
     | 
| 
       62 
55 
     | 
    
         
             
                """Exception raised when the fine-tune model is not found.
         
     | 
| 
       63 
56 
     | 
    
         
             
                If this is raised, it's recommended to try another model id.
         
     | 
| 
         @@ -54,7 +54,7 @@ def process_segment( 
     | 
|
| 
       54 
54 
     | 
    
         
             
                segment_frames: List[np.ndarray],
         
     | 
| 
       55 
55 
     | 
    
         
             
                od_model: ODModels,
         
     | 
| 
       56 
56 
     | 
    
         
             
                prompt: str,
         
     | 
| 
       57 
     | 
    
         
            -
                 
     | 
| 
      
 57 
     | 
    
         
            +
                deployment_id: Optional[str],
         
     | 
| 
       58 
58 
     | 
    
         
             
                chunk_length: Optional[int],
         
     | 
| 
       59 
59 
     | 
    
         
             
                image_size: Tuple[int, ...],
         
     | 
| 
       60 
60 
     | 
    
         
             
                segment_index: int,
         
     | 
| 
         @@ -67,7 +67,7 @@ def process_segment( 
     | 
|
| 
       67 
67 
     | 
    
         
             
                    segment_frames (List[np.ndarray]): Frames in the segment.
         
     | 
| 
       68 
68 
     | 
    
         
             
                    od_model (ODModels): Object detection model to use.
         
     | 
| 
       69 
69 
     | 
    
         
             
                    prompt (str): Prompt for the model.
         
     | 
| 
       70 
     | 
    
         
            -
                     
     | 
| 
      
 70 
     | 
    
         
            +
                    deployment_id (Optional[str]): The model deployment ID.
         
     | 
| 
       71 
71 
     | 
    
         
             
                    chunk_length (Optional[int]): Chunk length for processing.
         
     | 
| 
       72 
72 
     | 
    
         
             
                    image_size (Tuple[int, int]): Size of the images.
         
     | 
| 
       73 
73 
     | 
    
         
             
                    segment_index (int): Index of the segment.
         
     | 
| 
         @@ -90,7 +90,12 @@ def process_segment( 
     | 
|
| 
       90 
90 
     | 
    
         
             
                for idx in range(0, len(segment_frames), step):
         
     | 
| 
       91 
91 
     | 
    
         
             
                    frame_number = idx
         
     | 
| 
       92 
92 
     | 
    
         
             
                    segment_results[idx], function_name = object_detection_tool(
         
     | 
| 
       93 
     | 
    
         
            -
                         
     | 
| 
      
 93 
     | 
    
         
            +
                        deployment_id=deployment_id,
         
     | 
| 
      
 94 
     | 
    
         
            +
                        frame_number=frame_number,
         
     | 
| 
      
 95 
     | 
    
         
            +
                        od_model=od_model,
         
     | 
| 
      
 96 
     | 
    
         
            +
                        prompt=prompt,
         
     | 
| 
      
 97 
     | 
    
         
            +
                        segment_frames=segment_frames,
         
     | 
| 
      
 98 
     | 
    
         
            +
                        segment_index=segment_index,
         
     | 
| 
       94 
99 
     | 
    
         
             
                    )
         
     | 
| 
       95 
100 
     | 
    
         | 
| 
       96 
101 
     | 
    
         
             
                transformed_detections = transform_detections(
         
     | 
| 
         @@ -3,22 +3,21 @@ vision_agent/.sim_tools/embs.npy,sha256=pi7h3NHlrKncIGNR-oPn_XoTe2PzBb9-aFMi7qK0 
     | 
|
| 
       3 
3 
     | 
    
         
             
            vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
         
     | 
| 
       4 
4 
     | 
    
         
             
            vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
         
     | 
| 
       5 
5 
     | 
    
         
             
            vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
         
     | 
| 
       6 
     | 
    
         
            -
            vision_agent/agent/agent.py,sha256= 
     | 
| 
      
 6 
     | 
    
         
            +
            vision_agent/agent/agent.py,sha256=o1Zuhl6h2R7uVwvUur0Aj38kak8U08plfeFWPst_ErM,1576
         
     | 
| 
       7 
7 
     | 
    
         
             
            vision_agent/agent/vision_agent.py,sha256=4LqvwPTSsiuJEDwBbMx9Dg9ALJwNR6x1c63TZvOMm8A,23486
         
     | 
| 
       8 
8 
     | 
    
         
             
            vision_agent/agent/vision_agent_coder.py,sha256=Ry6AiyAj3hsSeYPu_5guMcTzf2E4SoebPzpHyJtSPbQ,27360
         
     | 
| 
       9 
9 
     | 
    
         
             
            vision_agent/agent/vision_agent_coder_prompts.py,sha256=D4RJxTWoxpl-WtYRvHNxaLSdWVHsdYb0jJIQ2ZCGU0A,12277
         
     | 
| 
       10 
10 
     | 
    
         
             
            vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=53b_DhQtffX5wxLuCbNQ83AJhB0P_3wEnuKr-v5bx-o,4866
         
     | 
| 
       11 
     | 
    
         
            -
            vision_agent/agent/vision_agent_coder_v2.py,sha256= 
     | 
| 
      
 11 
     | 
    
         
            +
            vision_agent/agent/vision_agent_coder_v2.py,sha256=I4gWrneFIqhX6W-MxiaNyPKGk5tRKgC8xryV-YdeSZU,17289
         
     | 
| 
       12 
12 
     | 
    
         
             
            vision_agent/agent/vision_agent_planner.py,sha256=rp_atRMDg35WFXNKOTkjUpGPrpSCsiMhcfZtqK-DIV4,18668
         
     | 
| 
       13 
13 
     | 
    
         
             
            vision_agent/agent/vision_agent_planner_prompts.py,sha256=rYRdJthc-sQN57VgCBKrF09Sd73BSxcBdjNe6C4WNZ8,6837
         
     | 
| 
       14 
14 
     | 
    
         
             
            vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=TiiF5BGnFVraFlQnDaeRU67927LvszvpcMUOgVgo0ps,35843
         
     | 
| 
       15 
     | 
    
         
            -
            vision_agent/agent/vision_agent_planner_v2.py,sha256= 
     | 
| 
      
 15 
     | 
    
         
            +
            vision_agent/agent/vision_agent_planner_v2.py,sha256=GOhaTsVCh02X09IKkC4k9z79lsmU4VgRW7WJLKjdG1k,21755
         
     | 
| 
       16 
16 
     | 
    
         
             
            vision_agent/agent/vision_agent_prompts.py,sha256=KaJwYPUP7_GvQsCPPs6Fdawmi3AQWmWajBUuzj7gTG4,13812
         
     | 
| 
       17 
     | 
    
         
            -
            vision_agent/agent/vision_agent_prompts_v2.py,sha256= 
     | 
| 
       18 
     | 
    
         
            -
            vision_agent/agent/vision_agent_v2.py,sha256= 
     | 
| 
      
 17 
     | 
    
         
            +
            vision_agent/agent/vision_agent_prompts_v2.py,sha256=jTfu_heNTBaHj1UNI0XIyyFDgDOjPTPP83vrS-g3A1U,2961
         
     | 
| 
      
 18 
     | 
    
         
            +
            vision_agent/agent/vision_agent_v2.py,sha256=QPAyDjnRRHUCD4Pw4TQYffWkucbn4WkEjYn8dBIWll4,11682
         
     | 
| 
       19 
19 
     | 
    
         
             
            vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         
     | 
| 
       20 
20 
     | 
    
         
             
            vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
         
     | 
| 
       21 
     | 
    
         
            -
            vision_agent/clients/landing_public_api.py,sha256=Vz9lldtNbaJRWzT7T8-uQrC-dMnt47LIsDrxHgoVdEw,1492
         
     | 
| 
       22 
21 
     | 
    
         
             
            vision_agent/configs/__init__.py,sha256=Iu75-w9_nlPmnB_qKA7nYaaaHf7xtTrDmK8N4v2WV34,27
         
     | 
| 
       23 
22 
     | 
    
         
             
            vision_agent/configs/anthropic_config.py,sha256=T1UuESgiY8913A6wA42P7-cg8FTk9-LkJpyywo7OnIQ,4298
         
     | 
| 
       24 
23 
     | 
    
         
             
            vision_agent/configs/anthropic_openai_config.py,sha256=rUz5zca4Pn5dTUwJXiJzRDYua5PWizApCKI3y0zOvhc,4699
         
     | 
| 
         @@ -28,28 +27,28 @@ vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF 
     | 
|
| 
       28 
27 
     | 
    
         
             
            vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
         
     | 
| 
       29 
28 
     | 
    
         
             
            vision_agent/lmm/__init__.py,sha256=4qX2lmGnKWHeKftXueEi9xj_ieK2nQh_ipHf72nKGFk,84
         
     | 
| 
       30 
29 
     | 
    
         
             
            vision_agent/lmm/lmm.py,sha256=XYp1frrqQ-6q-0y2IWwM8-EIH5UrFZ21SAhkcM32J9w,19355
         
     | 
| 
       31 
     | 
    
         
            -
            vision_agent/models/__init__.py,sha256= 
     | 
| 
       32 
     | 
    
         
            -
            vision_agent/models/agent_types.py,sha256= 
     | 
| 
      
 30 
     | 
    
         
            +
            vision_agent/models/__init__.py,sha256=eIP0pD5dYog8zUA7uuTmUxCF6SIutbLRLRE0cmuCJgQ,326
         
     | 
| 
      
 31 
     | 
    
         
            +
            vision_agent/models/agent_types.py,sha256=vBZ9-ns5lHDdFMO7ulCGGeZ6OwRo3gK4O3vN0814IWc,3064
         
     | 
| 
       33 
32 
     | 
    
         
             
            vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1cXmw,305
         
     | 
| 
       34 
33 
     | 
    
         
             
            vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
         
     | 
| 
       35 
34 
     | 
    
         
             
            vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
         
     | 
| 
       36 
35 
     | 
    
         
             
            vision_agent/sim/sim.py,sha256=VSU_1rYd4ifvF45xKWBEYugxdeeEQVpj0QL6rjx49i4,9801
         
     | 
| 
       37 
     | 
    
         
            -
            vision_agent/tools/__init__.py,sha256= 
     | 
| 
       38 
     | 
    
         
            -
            vision_agent/tools/meta_tools.py,sha256 
     | 
| 
      
 36 
     | 
    
         
            +
            vision_agent/tools/__init__.py,sha256=bYrOPuqrpwFA3TeY_pxRXVv61oJsxVWVgv1psJlBEcc,2391
         
     | 
| 
      
 37 
     | 
    
         
            +
            vision_agent/tools/meta_tools.py,sha256=DNRXHX9nZ1GBeqeLiq87sBshoe0aiZeYasETbG-9neI,24053
         
     | 
| 
       39 
38 
     | 
    
         
             
            vision_agent/tools/planner_tools.py,sha256=orBTdJQz2NKoLuX9WE6XixaYuG305xz0UBYvZOiuquQ,19474
         
     | 
| 
       40 
39 
     | 
    
         
             
            vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
         
     | 
| 
       41 
     | 
    
         
            -
            vision_agent/tools/tools.py,sha256= 
     | 
| 
      
 40 
     | 
    
         
            +
            vision_agent/tools/tools.py,sha256=uhvgPeAzhOV2vfBa216vq-JVItqgzIRKs1JMBezj2Es,107631
         
     | 
| 
       42 
41 
     | 
    
         
             
            vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
         
     | 
| 
       43 
     | 
    
         
            -
            vision_agent/utils/agent.py,sha256= 
     | 
| 
       44 
     | 
    
         
            -
            vision_agent/utils/exceptions.py,sha256= 
     | 
| 
      
 42 
     | 
    
         
            +
            vision_agent/utils/agent.py,sha256=8z4Ei0q397lVWUga8v9nQKuenGAsh2wfkAKQOB8CwpI,14701
         
     | 
| 
      
 43 
     | 
    
         
            +
            vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
         
     | 
| 
       45 
44 
     | 
    
         
             
            vision_agent/utils/execute.py,sha256=vOEP5Ys7S2lc0_7pOJbgk7OaWi85hrCNu9_8Bo3zk6I,29356
         
     | 
| 
       46 
45 
     | 
    
         
             
            vision_agent/utils/image_utils.py,sha256=bJM2mEvB6E__M9pxi74yQYzAiZ7mu3KE2ptyVrp5vzQ,12533
         
     | 
| 
       47 
46 
     | 
    
         
             
            vision_agent/utils/tools.py,sha256=USZL0MKsiJgqA8RFiYRTcj_Kn2FVYKLHK4wIk0gP1Ow,7694
         
     | 
| 
       48 
47 
     | 
    
         
             
            vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
         
     | 
| 
       49 
48 
     | 
    
         
             
            vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
         
     | 
| 
       50 
49 
     | 
    
         
             
            vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
         
     | 
| 
       51 
     | 
    
         
            -
            vision_agent/utils/video_tracking.py,sha256= 
     | 
| 
       52 
     | 
    
         
            -
            vision_agent-0.2. 
     | 
| 
       53 
     | 
    
         
            -
            vision_agent-0.2. 
     | 
| 
       54 
     | 
    
         
            -
            vision_agent-0.2. 
     | 
| 
       55 
     | 
    
         
            -
            vision_agent-0.2. 
     | 
| 
      
 50 
     | 
    
         
            +
            vision_agent/utils/video_tracking.py,sha256=eMIiWOG24bgXbqOy1DTtepO2gPo1ClW6Y0tdbEF_14k,12227
         
     | 
| 
      
 51 
     | 
    
         
            +
            vision_agent-0.2.242.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
         
     | 
| 
      
 52 
     | 
    
         
            +
            vision_agent-0.2.242.dist-info/METADATA,sha256=Lvr9OdngkgZJd-ifod6Wp8FuX0BnAmR6fZIelqAmjz8,5712
         
     | 
| 
      
 53 
     | 
    
         
            +
            vision_agent-0.2.242.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
         
     | 
| 
      
 54 
     | 
    
         
            +
            vision_agent-0.2.242.dist-info/RECORD,,
         
     | 
| 
         @@ -1,38 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            import os
         
     | 
| 
       2 
     | 
    
         
            -
            from typing import List
         
     | 
| 
       3 
     | 
    
         
            -
            from uuid import UUID
         
     | 
| 
       4 
     | 
    
         
            -
             
     | 
| 
       5 
     | 
    
         
            -
            from requests.exceptions import HTTPError
         
     | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
     | 
    
         
            -
            from vision_agent.clients.http import BaseHTTP
         
     | 
| 
       8 
     | 
    
         
            -
            from vision_agent.models import BboxInputBase64, JobStatus, PromptTask
         
     | 
| 
       9 
     | 
    
         
            -
            from vision_agent.utils.exceptions import FineTuneModelNotFound
         
     | 
| 
       10 
     | 
    
         
            -
            from vision_agent.utils.type_defs import LandingaiAPIKey
         
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
       13 
     | 
    
         
            -
            class LandingPublicAPI(BaseHTTP):
         
     | 
| 
       14 
     | 
    
         
            -
                def __init__(self) -> None:
         
     | 
| 
       15 
     | 
    
         
            -
                    landing_url = os.environ.get("LANDINGAI_URL", "https://api.landing.ai")
         
     | 
| 
       16 
     | 
    
         
            -
                    landing_api_key = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
         
     | 
| 
       17 
     | 
    
         
            -
                    headers = {"Content-Type": "application/json", "apikey": landing_api_key}
         
     | 
| 
       18 
     | 
    
         
            -
                    super().__init__(base_endpoint=landing_url, headers=headers)
         
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
                def launch_fine_tuning_job(
         
     | 
| 
       21 
     | 
    
         
            -
                    self, model_name: str, task: PromptTask, bboxes: List[BboxInputBase64]
         
     | 
| 
       22 
     | 
    
         
            -
                ) -> UUID:
         
     | 
| 
       23 
     | 
    
         
            -
                    url = "v1/agent/jobs/fine-tuning"
         
     | 
| 
       24 
     | 
    
         
            -
                    data = {
         
     | 
| 
       25 
     | 
    
         
            -
                        "model": {"name": model_name, "task": task.value},
         
     | 
| 
       26 
     | 
    
         
            -
                        "bboxes": [bbox.model_dump(by_alias=True) for bbox in bboxes],
         
     | 
| 
       27 
     | 
    
         
            -
                    }
         
     | 
| 
       28 
     | 
    
         
            -
                    response = self.post(url, payload=data)
         
     | 
| 
       29 
     | 
    
         
            -
                    return UUID(response["jobId"])
         
     | 
| 
       30 
     | 
    
         
            -
             
     | 
| 
       31 
     | 
    
         
            -
                def check_fine_tuning_job(self, job_id: UUID) -> JobStatus:
         
     | 
| 
       32 
     | 
    
         
            -
                    url = f"v1/agent/jobs/fine-tuning/{job_id}/status"
         
     | 
| 
       33 
     | 
    
         
            -
                    try:
         
     | 
| 
       34 
     | 
    
         
            -
                        get_job = self.get(url)
         
     | 
| 
       35 
     | 
    
         
            -
                    except HTTPError as err:
         
     | 
| 
       36 
     | 
    
         
            -
                        if err.response.status_code == 404:
         
     | 
| 
       37 
     | 
    
         
            -
                            raise FineTuneModelNotFound()
         
     | 
| 
       38 
     | 
    
         
            -
                    return JobStatus(get_job["status"])
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     |