PyPI - vision-agent - Versions diffs - 0.2.228__py3-none-any.whl → 0.2.230__py3-none-any.whl - Mend

vision-agent 0.2.228py3-none-any.whl → 0.2.230py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

vision_agent/.sim_tools/df.csv +10 -8
vision_agent/agent/agent_utils.py +10 -9
vision_agent/agent/vision_agent.py +3 -4
vision_agent/agent/vision_agent_coder_prompts.py +6 -6
vision_agent/agent/vision_agent_coder_v2.py +41 -26
vision_agent/agent/vision_agent_planner_prompts.py +6 -6
vision_agent/agent/vision_agent_planner_prompts_v2.py +16 -50
vision_agent/agent/vision_agent_planner_v2.py +10 -12
vision_agent/agent/vision_agent_prompts.py +11 -11
vision_agent/agent/vision_agent_prompts_v2.py +18 -3
vision_agent/agent/vision_agent_v2.py +29 -30
vision_agent/configs/__init__.py +1 -0
vision_agent/configs/anthropic_config.py +150 -0
vision_agent/configs/anthropic_openai_config.py +150 -0
vision_agent/configs/config.py +150 -0
vision_agent/configs/openai_config.py +160 -0
vision_agent/lmm/__init__.py +1 -1
vision_agent/lmm/lmm.py +63 -9
vision_agent/tools/planner_tools.py +60 -40
vision_agent/tools/tool_utils.py +1 -2
vision_agent/tools/tools.py +10 -8
vision_agent-0.2.230.dist-info/METADATA +156 -0
{vision_agent-0.2.228.dist-info → vision_agent-0.2.230.dist-info}/RECORD +25 -20
vision_agent-0.2.228.dist-info/METADATA +0 -562
{vision_agent-0.2.228.dist-info → vision_agent-0.2.230.dist-info}/LICENSE +0 -0
{vision_agent-0.2.228.dist-info → vision_agent-0.2.230.dist-info}/WHEEL +0 -0

vision_agent/.sim_tools/df.csv CHANGED Viewed

@@ -244,7 +244,8 @@ desc,doc,name
     1.0.
     Parameters:
-        prompt (str): The prompt to ground to the image.
+        prompt (str): The prompt to ground to the image. Use exclusive categories that
+            do not overlap such as 'person, car' and NOT 'person, athlete'.
         image (np.ndarray): The image to ground the prompt to.
         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
             fine-tuned model ID here to use it.
@@ -281,7 +282,8 @@ desc,doc,name
     is useful for tracking and counting without duplicating counts.
     Parameters:
-        prompt (str): The prompt to ground to the video.
+        prompt (str): The prompt to ground to the image. Use exclusive categories that
+            do not overlap such as 'person, car' and NOT 'person, athlete'.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         chunk_length (Optional[int]): The number of frames to re-run florence2 to find
             new objects.
@@ -317,14 +319,14 @@ desc,doc,name
         ]
     ",florence2_sam2_video_tracking
 "'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
-'florence2_object_detection' is a tool that can detect multiple
-    objects given a text prompt which can be object names or caption. You
-    can optionally separate the object names in the text with commas. It returns a list
-    of bounding boxes with normalized coordinates, label names and associated
-    confidence scores of 1.0.
+'florence2_object_detection' is a tool that can detect multiple objects given a
+    text prompt which can be object names or caption. You can optionally separate the
+    object names in the text with commas. It returns a list of bounding boxes with
+    normalized coordinates, label names and associated confidence scores of 1.0.
     Parameters:
-        prompt (str): The prompt to ground to the image.
+        prompt (str): The prompt to ground to the image. Use exclusive categories that
+            do not overlap such as 'person, car' and NOT 'person, athlete'.
         image (np.ndarray): The image to used to detect objects
         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
             fine-tuned model ID here to use it.

vision_agent/agent/agent_utils.py CHANGED Viewed

@@ -157,10 +157,11 @@ def format_conversation(chat: List[AgentMessage]) -> str:
     chat = copy.deepcopy(chat)
     prompt = ""
     for chat_i in chat:
-        if chat_i.role == "user":
-            prompt += f"USER: {chat_i.content}\n\n"
-        elif chat_i.role == "observation" or chat_i.role == "coder":
-            prompt += f"OBSERVATION: {chat_i.content}\n\n"
+        if chat_i.role == "user" or chat_i.role == "coder":
+            if "<final_code>" in chat_i.role:
+                prompt += f"OBSERVATION: {chat_i.content}\n\n"
+            elif chat_i.role == "user":
+                prompt += f"USER: {chat_i.content}\n\n"
         elif chat_i.role == "conversation":
             prompt += f"AGENT: {chat_i.content}\n\n"
     return prompt
@@ -332,26 +333,26 @@ def strip_function_calls(  # noqa: C901
         def __init__(self, exclusions: List[str]):
             # Store exclusions to skip removing certain function calls
             self.exclusions = exclusions
-            self.in_function_or_class = False
+            self.in_function_or_class: List[bool] = []
         def visit_FunctionDef(self, node: cst.FunctionDef) -> Optional[bool]:
-            self.in_function_or_class = True
+            self.in_function_or_class.append(True)
             return True
         def leave_FunctionDef(
             self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef
         ) -> cst.BaseStatement:
-            self.in_function_or_class = False
+            self.in_function_or_class.pop()
             return updated_node
         def visit_ClassDef(self, node: cst.ClassDef) -> Optional[bool]:
-            self.in_function_or_class = True
+            self.in_function_or_class.append(True)
             return True
         def leave_ClassDef(
             self, node: cst.ClassDef, updated_node: cst.ClassDef
         ) -> cst.BaseStatement:
-            self.in_function_or_class = False
+            self.in_function_or_class.pop()
             return updated_node
         def leave_Expr(

vision_agent/agent/vision_agent.py CHANGED Viewed

@@ -291,10 +291,9 @@ class VisionAgent(Agent):
             verbosity (int): The verbosity level of the agent.
             callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
                 function to send intermediate update messages.
-            code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
-                it can be one of: None, "local" or "e2b". If None, it will read from
-                the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
-                object is provided it will use that.
+            code_sandbox_runtime (Optional[str]): For string values it can be one of:
+                None, "local" or "e2b". If None, it will read from the environment
+                variable "CODE_SANDBOX_RUNTIME".
         """
         self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent

vision_agent/agent/vision_agent_coder_prompts.py CHANGED Viewed

@@ -44,22 +44,22 @@ Can you write a program to check if each person is wearing a helmet? First detec
 ## Subtasks
-This plan uses the owl_v2_image tool to detect both people and helmets in a single pass, which should be efficient and accurate. We can then compare the detections to determine if each person is wearing a helmet.
--Use owl_v2_image with prompt 'person, helmet' to detect both people and helmets in the image
+This plan uses the owlv2_object_detection tool to detect both people and helmets in a single pass, which should be efficient and accurate. We can then compare the detections to determine if each person is wearing a helmet.
+-Use owlv2_object_detection with prompt 'person, helmet' to detect both people and helmets in the image
 -Process the detections to match helmets with people based on bounding box proximity
 -Count people with and without helmets based on the matching results
 -Return a dictionary with the counts
 **Tool Tests and Outputs**:
-After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using owl_v2_image seems to be the most accurate in detecting both people and helmets. However, it needs some modifications to improve accuracy. We should increase the confidence threshold to 0.15 to filter out the lowest confidence box, and implement logic to associate helmets with people based on their bounding box positions. Plan 2 and Plan 3 seem less reliable given the tool outputs, as they either failed to distinguish between people with and without helmets or misclassified all workers as not wearing helmets.
+After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using owlv2_object_detection seems to be the most accurate in detecting both people and helmets. However, it needs some modifications to improve accuracy. We should increase the confidence threshold to 0.15 to filter out the lowest confidence box, and implement logic to associate helmets with people based on their bounding box positions. Plan 2 and Plan 3 seem less reliable given the tool outputs, as they either failed to distinguish between people with and without helmets or misclassified all workers as not wearing helmets.
 **Tool Output Thoughts**:
 ```python
 ...
 ```
 ----- stdout -----
-Plan 1 - owl_v2_image:
+Plan 1 - owlv2_object_detection:
 [{{'label': 'helmet', 'score': 0.15, 'bbox': [0.85, 0.41, 0.87, 0.45]}}, {{'label': 'helmet', 'score': 0.3, 'bbox': [0.8, 0.43, 0.81, 0.46]}}, {{'label': 'helmet', 'score': 0.31, 'bbox': [0.85, 0.45, 0.86, 0.46]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.84, 0.45, 0.88, 0.58]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.78, 0.43, 0.82, 0.57]}}, {{'label': 'helmet', 'score': 0.33, 'bbox': [0.3, 0.65, 0.32, 0.67]}}, {{'label': 'person', 'score': 0.29, 'bbox': [0.28, 0.65, 0.36, 0.84]}}, {{'label': 'helmet', 'score': 0.29, 'bbox': [0.13, 0.82, 0.15, 0.85]}}, {{'label': 'person', 'score': 0.3, 'bbox': [0.1, 0.82, 0.24, 1.0]}}]
@@ -67,12 +67,12 @@ Plan 1 - owl_v2_image:
 **Input Code Snippet**:
 ```python
-from vision_agent.tools import load_image, owl_v2_image
+from vision_agent.tools import load_image, owlv2_object_detection
 def check_helmets(image_path):
     image = load_image(image_path)
     # Detect people and helmets, filter out the lowest confidence helmet score of 0.15
-    detections = owl_v2_image("person, helmet", image, box_threshold=0.15)
+    detections = owlv2_object_detection("person, helmet", image, box_threshold=0.15)
     height, width = image.shape[:2]
     # Separate people and helmets

vision_agent/agent/vision_agent_coder_v2.py CHANGED Viewed

@@ -26,7 +26,8 @@ from vision_agent.agent.types import (
 )
 from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
 from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
-from vision_agent.lmm import LMM, AnthropicLMM
+from vision_agent.configs import Config
+from vision_agent.lmm import LMM
 from vision_agent.lmm.types import Message
 from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils.execute import (
@@ -36,6 +37,7 @@ from vision_agent.utils.execute import (
 )
 from vision_agent.utils.sim import Sim, get_tool_recommender
+CONFIG = Config()
 _CONSOLE = Console()
@@ -185,23 +187,17 @@ def debug_code(
     return code, test, debug_info
-def write_and_test_code(
-    coder: LMM,
+def test_code(
     tester: LMM,
     debugger: LMM,
     chat: List[AgentMessage],
     plan: str,
+    code: str,
     tool_docs: str,
     code_interpreter: CodeInterpreter,
     media_list: List[Union[str, Path]],
     verbose: bool,
 ) -> CodeContext:
-    code = write_code(
-        coder=coder,
-        chat=chat,
-        tool_docs=tool_docs,
-        plan=plan,
-    )
     try:
         code = strip_function_calls(code)
     except Exception:
@@ -257,6 +253,36 @@ def write_and_test_code(
     )
+def write_and_test_code(
+    coder: LMM,
+    tester: LMM,
+    debugger: LMM,
+    chat: List[AgentMessage],
+    plan: str,
+    tool_docs: str,
+    code_interpreter: CodeInterpreter,
+    media_list: List[Union[str, Path]],
+    verbose: bool,
+) -> CodeContext:
+    code = write_code(
+        coder=coder,
+        chat=chat,
+        tool_docs=tool_docs,
+        plan=plan,
+    )
+    return test_code(
+        tester,
+        debugger,
+        chat,
+        plan,
+        code,
+        tool_docs,
+        code_interpreter,
+        media_list,
+        verbose,
+    )
 class VisionAgentCoderV2(AgentCoder):
     """VisionAgentCoderV2 is an agent that will write vision code for you."""
@@ -300,21 +326,9 @@ class VisionAgentCoderV2(AgentCoder):
             )
         )
-        self.coder = (
-            coder
-            if coder is not None
-            else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
-        )
-        self.tester = (
-            tester
-            if tester is not None
-            else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
-        )
-        self.debugger = (
-            debugger
-            if debugger is not None
-            else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
-        )
+        self.coder = coder if coder is not None else CONFIG.create_coder()
+        self.tester = tester if tester is not None else CONFIG.create_tester()
+        self.debugger = debugger if debugger is not None else CONFIG.create_debugger()
         if tool_recommender is not None:
             if isinstance(tool_recommender, str):
                 self.tool_recommender = Sim.load(tool_recommender)
@@ -440,12 +454,13 @@ class VisionAgentCoderV2(AgentCoder):
         ) as code_interpreter:
             int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
             tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
-            code_context = write_and_test_code(
-                coder=self.coder,
+            code_context = test_code(
                 tester=self.tester,
                 debugger=self.debugger,
                 chat=int_chat,
                 plan=format_plan_v2(plan_context),
+                code=plan_context.code,
                 tool_docs=tool_docs,
                 code_interpreter=code_interpreter,
                 media_list=media_list,

vision_agent/agent/vision_agent_planner_prompts.py CHANGED Viewed

@@ -55,27 +55,27 @@ This is the documentation for the functions you have access to. You may call any
 --- EXAMPLE1 ---
 plan1:
 - Load the image from the provided file path 'image.jpg'.
-- Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
+- Use the 'owlv2_object_detection' tool with the prompt 'person' to detect and count the number of people in the image.
 plan2:
 - Load the image from the provided file path 'image.jpg'.
-- Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
+- Use the 'florence2_sam2_instance_segmentation' tool with the prompt 'person' to detect and count the number of people in the image.
 - Count the number of detected objects labeled as 'person'.
 plan3:
 - Load the image from the provided file path 'image.jpg'.
 - Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.
 ```python
-from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
+from vision_agent.tools import load_image, owlv2_object_detection, florence2_sam2_instance_segmentation, countgd_object_detection
 image = load_image("image.jpg")
-owl_v2_out = owl_v2_image("person", image)
+owl_v2_out = owlv2_object_detection("person", image)
-f2s2_out = florence2_sam2_image("person", image)
+f2s2_out = florence2_sam2_instance_segmentation("person", image)
 # strip out the masks from the output becuase they don't provide useful information when printed
 f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
 cgd_out = countgd_object_detection("person", image)
-final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_object_detection": cgd_out}}
+final_out = {{"owlv2_object_detection": owl_v2_out, "florence2_sam2_instance_segmentation": f2s2, "countgd_object_detection": cgd_out}}
 print(final_out)
 --- END EXAMPLE1 ---

vision_agent/agent/vision_agent_planner_prompts_v2.py CHANGED Viewed

@@ -130,7 +130,7 @@ In these aerial images, I can see approximately 5-6 pedestrians walking in vario
 [end of claude35_vqa_output]
 [get_tool_for_task output]
-After examining the image, I can see it's an aerial view of a busy urban intersection with multiple lanes of traffic. There are numerous cars visible, each likely containing at least one person (the driver). While it's difficult to count exact individuals, I can estimate based on the number of vehicles. The countgd_object_detection tool is the best choice for this task as it can detect and count multiple instances of an object given a text prompt. I will use this tool to count the number of pedestrians in the image.
+After examining the image, I can see it's an aerial view of a busy urban intersection with multiple lanes of traffic. There are numerous cars visible, each likely containing at least one person (the driver). While it's difficult to count exact individuals, I can estimate based on the number of vehicles. The countgd_object_detection tool is the best choice for this task as it can detect and count multiple instances of an object given a text prompt. I will use this tool with the prompt 'pedistrian' to count the number of pedestrians in the image.
 Tool Documentation:
 countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
@@ -162,7 +162,7 @@ countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float
 [end of get_tool_for_task output]
 <count>8</count>
-AGENT: <thinking>I need to now merge the boxes from all regions and count the unique pedestrians.</thinking>
+AGENT: <thinking>I need to now merge the boxes from all region and use the countgd_object_detection tool with the prompt 'pedestrian' as suggested by get_tool_for_task.</thinking>
 <execute_python>
 def translate_ofset(bbox, offset_x, offset_y):
     return (bbox[0] + offset_x, bbox[1] + offset_y, bbox[2] + offset_x, bbox[3] + offset_y)
@@ -271,7 +271,7 @@ get_tool_for_task('Identify and track the boxes in the video', frames[:5])
 OBSERVATION:
 [get_tool_for_task output]
-For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: florence2_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor.
+For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: florence2_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor and using the prompt 'box'.
 Tool Documentation:
 def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
@@ -317,7 +317,7 @@ def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_l
 [end of get_tool_for_task output]
 <count>8</count>
-AGENT: <thinking>Now that I have the tool for tracking I need to find a tool to identify if there is tape on each of the boxes.</thinking>
+AGENT: <thinking>Now that I have the tool and prompt for tracking I need to find a tool to identify if there is tape on each of the boxes.</thinking>
 <execute_python>
 track_predictions = florence2_sam2_video_tracking("box", frames)
@@ -338,7 +338,7 @@ get_tool_for_task('Identify if there is tape on the boxes', crops[:3])
 OBSERVATION:
 [get_tool_for_task output]
-owlv2_object_detection performed best as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
+owlv2_object_detection performed best with the prompt 'tape', as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
 'owlv2_object_detection' is a tool that can detect and count multiple objects given a
 text prompt such as category names or referring expressions on images. The categories
@@ -370,7 +370,7 @@ Example
 [end of get_tool_for_task output]
 <count>7</count>
-AGENT: <thinking>Now that I have the tools for tracking and identifying tape on the boxes, I will use them to accomplish the user's request.</thinking>
+AGENT: <thinking>Now that I have the tools for tracking and identifying 'tape' on the boxes, I will use them to accomplish the user's request.</thinking>
 <execute_python>
 obj_to_info = {{}}
 for frame, frame_predictions in zip(frames, track_predictions):
@@ -487,8 +487,9 @@ TEST_TOOLS = """
 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
 3. Your test case MUST run only on the given images which are {media}
 4. For video tracking, use chunk_length=1 and at least 3 frames to ensure the best results when evaluating the tool.
-5. Print this final dictionary.
-6. Output your code in the following format wrapped in <code> tags:
+5. Use mutually exclusive categories for prompts such as 'person, car' and not 'person, athlete' to avoid over counting.
+6. Print this final dictionary.
+7. Output your code in the following format wrapped in <code> tags:
 <code>
 # Your code here
 </code>
@@ -649,41 +650,6 @@ PICK_TOOL = """
 </json>
 """
-PICK_TOOL_V2 = """
-**Role**: You are an expert evaluator that can understand user requests and evaluate the output of different tools.
-**Task**: You are given the output of different tools for a user request along with the image. You must evaluate the output and determine the best tool for the user request.
-**User Request**:
-{user_request}
-**Tools**: This is the documentation of all the functions that were tested.
-{tool_docs}
-**Testing Code and Tool Output**:
-{context}
-**Previous Attempt**: This is the code and output of the previous attempt, if it is empty then there was no previous attempt.
-{previous_attempts}
-**Instructions**:
-1. Re-read the user request, plans, tool outputs and examine the image.
-2. Given the user request, try to solve the problem yourself.
-3. Pick which tool output best matches your solution first and the user request, then consider other factors like box size, etc. DO NOT worry about low confidence scores if the output is correct.
-4. DO NOT modify confidence thresholds unless the tool output is completely wrong.
-5. Remember for videos that in order to count objects a video some sort of tracking is needed, or else you will overcount the objects.
-6. Assign each tool a score from 0 to 10 based on how well it solves the user request. A score of 8+ means the tool output matches your solution and the tool is the best choice, 5-7 means the tool output is okay but needs some modifications, less than 5 means the tool output is bad and the tool should not be used. Return the the following JSON format inside <json> tags using the exact tool name as the key and the score as the value:
-<json>
-{{
-    "predicted_answer": str # the answer you would expect from the best plan
-    "thoughts": str # your thought process for choosing the best tool over other tools and any modifications you madeas well as the prompt you used with the tool.
-    "first tool": int # the score for the first tool
-    "second tool": int # the score for the second tool
-    ...
-}}
-</json>
-"""
 FINALIZE_PLAN = """
 **Role**: You are an expert AI model that can understand the user request and construct plans to accomplish it.
@@ -693,13 +659,13 @@ FINALIZE_PLAN = """
 {planning}
 **Instructions**:
-1. Read the chain of thoughts and python executions.
-2. Summarize the plan that the planning agent found.
-3. Include ALL relevant python code in your plan to accomplish the user request.
-4. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
-5. Do not include {excluded_tools} tools in your instructions.
-6. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_file` or `save_video`.
-7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
+1. Summarize the plan that the planning agent found.
+2. Write a single function that solves the problem based on what the planner found.
+3. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
+4. Do not include {excluded_tools} tools in your instructions.
+5. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_image` or `save_video`.
+6. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
+7. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
 8. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
 <json>

vision_agent/agent/vision_agent_planner_v2.py CHANGED Viewed

@@ -32,7 +32,8 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import (
     PICK_PLAN,
     PLAN,
 )
-from vision_agent.lmm import LMM, AnthropicLMM, Message
+from vision_agent.configs import Config
+from vision_agent.lmm import LMM, Message
 from vision_agent.tools.planner_tools import check_function_call, get_tool_documentation
 from vision_agent.utils.execute import (
     CodeInterpreter,
@@ -41,6 +42,7 @@ from vision_agent.utils.execute import (
 )
 logging.basicConfig(level=logging.INFO)
+CONFIG = Config()
 UTIL_DOCSTRING = T.get_tool_documentation(
     [
         T.load_image,
@@ -315,8 +317,8 @@ def maybe_run_code(
 def create_finalize_plan(
-    chat: List[AgentMessage],
     model: LMM,
+    chat: List[AgentMessage],
     verbose: bool = False,
 ) -> Tuple[List[AgentMessage], PlanContext]:
     # if we're in the middle of an interaction, don't finalize the plan
@@ -385,6 +387,7 @@ class VisionAgentPlannerV2(AgentPlanner):
     def __init__(
         self,
         planner: Optional[LMM] = None,
+        summarizer: Optional[LMM] = None,
         critic: Optional[LMM] = None,
         max_steps: int = 10,
         use_multi_trial_planning: bool = False,
@@ -414,16 +417,11 @@ class VisionAgentPlannerV2(AgentPlanner):
                 that will send back intermediate conversation messages.
         """
-        self.planner = (
-            planner
-            if planner is not None
-            else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
-        )
-        self.critic = (
-            critic
-            if critic is not None
-            else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
+        self.planner = planner if planner is not None else CONFIG.create_planner()
+        self.summarizer = (
+            summarizer if summarizer is not None else CONFIG.create_summarizer()
         )
+        self.critic = critic if critic is not None else CONFIG.create_critic()
         self.max_steps = max_steps
         self.use_multi_trial_planning = use_multi_trial_planning
         self.critique_steps = critique_steps
@@ -561,7 +559,7 @@ class VisionAgentPlannerV2(AgentPlanner):
                 context = InteractionContext(chat=int_chat)
             else:
                 updated_chat, context = create_finalize_plan(
-                    int_chat, self.planner, self.verbose
+                    self.summarizer, int_chat, self.verbose
                 )
                 int_chat.extend(updated_chat)
                 for chat_elt in updated_chat:

vision_agent/agent/vision_agent_prompts.py CHANGED Viewed

@@ -55,10 +55,10 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect
 OBSERVATION:
 [Artifact dog_detector.py (5 lines total)]
-0|from vision_agent.tools import load_image, owl_v2_image
+0|from vision_agent.tools import load_image, owlv2_object_detection
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
-3|    dogs = owl_v2_image("dog", image)
+3|    dogs = owlv2_object_detection("dog", image)
 4|    return dogs
 [End of artifact]
@@ -96,10 +96,10 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do
 OBSERVATION:
 [Artifact dog_detector.py (5 lines total)]
-0|from vision_agent.tools import load_image, owl_v2_image
+0|from vision_agent.tools import load_image, owlv2_object_detection
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
-3|    dogs = owl_v2_image("dog", image, threshold=0.24)
+3|    dogs = owlv2_object_detection("dog", image, threshold=0.24)
 4|    return dogs
 [End of artifact]
@@ -168,10 +168,10 @@ generate_vision_code(artifacts, 'code.py', 'Can you create a function to count w
 OBSERVATION:
 [Artifact code.py (17 lines total)]
-0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+0|from vision_agent.tools import load_image, florence2_object_detection, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str):
 2|    image = load_image(image_path)
-3|    detections = florence2_phrase_grounding("worker, helmet", image)
+3|    detections = florence2_object_detection("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
 5|    helmets = [d for d in detections if d['label'] == 'helmet']
 6|    count = 0
@@ -217,7 +217,7 @@ object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker',
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
-AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.</thinking>
+AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_object_detection call with the fine tuning id.</thinking>
 <execute_python>
 use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
 </execute_python>
@@ -228,11 +228,11 @@ OBSERVATION:
 ---
 +++
@@ -1,7 +1,7 @@
- from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+ from vision_agent.tools import load_image, florence2_object_detection, closest_box_distance, overlay_bounding_boxes, save_image
  def count_workers_with_helmets(image_path: str, output_path: str):
      image = load_image(image_path)
--    detections = florence2_phrase_grounding("worker, helmet", image)
-+    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+-    detections = florence2_object_detection("worker, helmet", image)
++    detections = florence2_object_detection("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
      workers = [d for d in detections if d['label'] == 'worker']
      helmets = [d for d in detections if d['label'] == 'helmet']
      count = 0
@@ -250,7 +250,7 @@ OBSERVATION:
 3
 AGENT: <thinking>Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
-<response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
+<response>I have updated the code using the fine tuned florence2_object_detection model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
 <let_user_respond>true</let_user_respond>
 """

vision_agent/agent/vision_agent_prompts_v2.py CHANGED Viewed

@@ -1,5 +1,5 @@
 CONVERSATION = """
-**Role**: You are a help agent that assists users with their vision-related questions.
+**Role**: You are a help agent that called VisionAent, built by LandingAI, that assists users write code to solve vision tasks.
 **Actions**:
 `generate_or_edit_vision_code` - This action will generate code for the user to solve a vision task. It will also edit vision code for the user, this is useful if the user wants to modify vision-related aspects of the code such as changing the vision model or the image pre-processing steps.
@@ -8,7 +8,7 @@ CONVERSATION = """
 **Exampels**:
 Here is an example of how you can interact with a user and Actions:
---- START EXAMPLES ---
+--- EXAMPLE1 ---
 USER: Can you help me write code to detect dogs in this image? Media name images/dogs.jpg
 AGENT: <response>Yes, I can help you with that. I will write the code to detect dogs in the image.</response><action>generate_or_edit_vision_code</action>
@@ -28,8 +28,23 @@ def test_detect_dogs():
 </final_test>
 AGENT: <response>Here is the code to detect dogs in the image.</response>
+--- END EXAMPLE1 ---
---- END EXAMPLES ---
+--- EXAMPLE2 ---
+USER: Who are you and what can you do?
+AGENT: <response>I am VisionAgent, an agent built by LandingAI, to help users write code to solve vision tasks. I can write code to do the following tasks:
+- Detecting objects in an image with bounding boxes
+- Detecting objects in an image with masks
+- Tracking object in a video
+- Classifying images
+- Understanding documents
+- Pose estimation
+- Visual question answering for both images and videos
+How can I help you?</response>
+--- END EXAMPLE2 ---
 **Conversation**:
 Here is the current conversation so far:

vision-agent 0.2.228__py3-none-any.whl → 0.2.230__py3-none-any.whl

vision-agent 0.2.228py3-none-any.whl → 0.2.230py3-none-any.whl