vision-agent 0.2.232__py3-none-any.whl → 0.2.233__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/agent_utils.py +1 -1
- vision_agent/agent/vision_agent_coder_v2.py +1 -1
- vision_agent/agent/vision_agent_planner_prompts_v2.py +126 -63
- vision_agent/agent/vision_agent_v2.py +6 -5
- {vision_agent-0.2.232.dist-info → vision_agent-0.2.233.dist-info}/METADATA +1 -1
- {vision_agent-0.2.232.dist-info → vision_agent-0.2.233.dist-info}/RECORD +8 -8
- {vision_agent-0.2.232.dist-info → vision_agent-0.2.233.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.232.dist-info → vision_agent-0.2.233.dist-info}/WHEEL +0 -0
| @@ -160,7 +160,7 @@ def format_conversation(chat: List[AgentMessage]) -> str: | |
| 160 160 | 
             
                prompt = ""
         | 
| 161 161 | 
             
                for chat_i in chat:
         | 
| 162 162 | 
             
                    if chat_i.role == "user" or chat_i.role == "coder":
         | 
| 163 | 
            -
                        if "<final_code>" in chat_i. | 
| 163 | 
            +
                        if "<final_code>" in chat_i.content:
         | 
| 164 164 | 
             
                            prompt += f"OBSERVATION: {chat_i.content}\n\n"
         | 
| 165 165 | 
             
                        elif chat_i.role == "user":
         | 
| 166 166 | 
             
                            prompt += f"USER: {chat_i.content}\n\n"
         | 
| @@ -443,7 +443,7 @@ class VisionAgentCoderV2(AgentCoder): | |
| 443 443 |  | 
| 444 444 | 
             
                    # we don't need the user_interaction response for generating code since it's
         | 
| 445 445 | 
             
                    # already in the plan context
         | 
| 446 | 
            -
                    while chat[-1].role != "user":
         | 
| 446 | 
            +
                    while len(chat) > 0 and chat[-1].role != "user":
         | 
| 447 447 | 
             
                        chat.pop()
         | 
| 448 448 |  | 
| 449 449 | 
             
                    if not chat:
         | 
| @@ -50,7 +50,7 @@ From this aerial view of a busy urban street, it's difficult to clearly see or c | |
| 50 50 | 
             
            [suggestion 0]
         | 
| 51 51 | 
             
            The image is very large and the items you need to detect are small.
         | 
| 52 52 |  | 
| 53 | 
            -
            Step 1: You should start by splitting the image into sections and runing the detection algorithm on each section:
         | 
| 53 | 
            +
            Step 1: You should start by splitting the image into overlapping sections and running the detection algorithm on each section:
         | 
| 54 54 |  | 
| 55 55 | 
             
            def subdivide_image(image):
         | 
| 56 56 | 
             
                height, width, _ = image.shape
         | 
| @@ -66,41 +66,96 @@ def subdivide_image(image): | |
| 66 66 |  | 
| 67 67 | 
             
            get_tool_for_task('<your prompt here>', subdivide_image(image))
         | 
| 68 68 |  | 
| 69 | 
            -
            Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions:
         | 
| 70 | 
            -
             | 
| 71 | 
            -
            def  | 
| 72 | 
            -
                 | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 75 | 
            -
                 | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
                 | 
| 80 | 
            -
             | 
| 81 | 
            -
             | 
| 82 | 
            -
                 | 
| 83 | 
            -
             | 
| 84 | 
            -
                 | 
| 85 | 
            -
                 | 
| 86 | 
            -
                 | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
                 | 
| 90 | 
            -
                 | 
| 91 | 
            -
             | 
| 92 | 
            -
             | 
| 93 | 
            -
             | 
| 94 | 
            -
             | 
| 95 | 
            -
             | 
| 96 | 
            -
             | 
| 69 | 
            +
            Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions. Be sure to translate the offset back to the original image:
         | 
| 70 | 
            +
             | 
| 71 | 
            +
            def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
         | 
| 72 | 
            +
                # Calculate intersection coordinates
         | 
| 73 | 
            +
                x1 = max(b1[0], b2[0])
         | 
| 74 | 
            +
                y1 = max(b1[1], b2[1])
         | 
| 75 | 
            +
                x2 = min(b1[2], b2[2])
         | 
| 76 | 
            +
                y2 = min(b1[3], b2[3])
         | 
| 77 | 
            +
             | 
| 78 | 
            +
                # Calculate intersection area
         | 
| 79 | 
            +
                if x2 < x1 or y2 < y1:
         | 
| 80 | 
            +
                    return False  # No overlap
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                intersection = (x2 - x1) * (y2 - y1)
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                # Calculate union area
         | 
| 85 | 
            +
                area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
         | 
| 86 | 
            +
                area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
         | 
| 87 | 
            +
                union = area1 + area2 - intersection
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                # Calculate IoU
         | 
| 90 | 
            +
                iou = intersection / union if union > 0 else 0
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                return iou >= iou_threshold
         | 
| 93 | 
            +
             | 
| 94 | 
            +
            def merge_bounding_box_list(detections):
         | 
| 95 | 
            +
                merged_detections = []
         | 
| 96 | 
            +
                for detection in detections:
         | 
| 97 | 
            +
                    matching_box = None
         | 
| 98 | 
            +
                    for i, other in enumerate(merged_detections):
         | 
| 99 | 
            +
                        if bounding_box_match(detection["bbox"], other["bbox"]):
         | 
| 100 | 
            +
                            matching_box = i
         | 
| 97 101 | 
             
                            break
         | 
| 98 | 
            -
                    if not overlap_found:
         | 
| 99 | 
            -
                      p
         | 
| 100 | 
            -
                      merged_bboxes.append(bbox)
         | 
| 101 | 
            -
                return merged_bboxes
         | 
| 102 102 |  | 
| 103 | 
            -
             | 
| 103 | 
            +
                    if matching_box is not None:
         | 
| 104 | 
            +
                        # Keep the box with higher confidence score
         | 
| 105 | 
            +
                        if detection["score"] > merged_detections[matching_box]["score"]:
         | 
| 106 | 
            +
                            merged_detections[matching_box] = detection
         | 
| 107 | 
            +
                    else:
         | 
| 108 | 
            +
                        merged_detections.append(detection)
         | 
| 109 | 
            +
             | 
| 110 | 
            +
            def sub_image_to_original(elt, sub_image_position, original_size):
         | 
| 111 | 
            +
                offset_x, offset_y = sub_image_position
         | 
| 112 | 
            +
                return {
         | 
| 113 | 
            +
                    "label": elt["label"],
         | 
| 114 | 
            +
                    "score": elt["score"],
         | 
| 115 | 
            +
                    "bbox": [
         | 
| 116 | 
            +
                        (elt["bbox"][0] + offset_x) / original_size[1],
         | 
| 117 | 
            +
                        (elt["bbox"][1] + offset_y) / original_size[0],
         | 
| 118 | 
            +
                        (elt["bbox"][2] + offset_x) / original_size[1],
         | 
| 119 | 
            +
                        (elt["bbox"][3] + offset_y) / original_size[0],
         | 
| 120 | 
            +
                    ],
         | 
| 121 | 
            +
                }
         | 
| 122 | 
            +
             | 
| 123 | 
            +
            def normalized_to_unnormalized(elt, image_size):
         | 
| 124 | 
            +
                return {
         | 
| 125 | 
            +
                    "label": elt["label"],
         | 
| 126 | 
            +
                    "score": elt["score"],
         | 
| 127 | 
            +
                    "bbox": [
         | 
| 128 | 
            +
                        elt["bbox"][0] * image_size[1],
         | 
| 129 | 
            +
                        elt["bbox"][1] * image_size[0],
         | 
| 130 | 
            +
                        elt["bbox"][2] * image_size[1],
         | 
| 131 | 
            +
                        elt["bbox"][3] * image_size[0],
         | 
| 132 | 
            +
                    ],
         | 
| 133 | 
            +
                }
         | 
| 134 | 
            +
             | 
| 135 | 
            +
            height, width, _ = image.shape
         | 
| 136 | 
            +
            mid_width = width // 2
         | 
| 137 | 
            +
            mid_height = height // 2
         | 
| 138 | 
            +
             | 
| 139 | 
            +
            detection_from_subdivided_images = []
         | 
| 140 | 
            +
            for i, sub_image in enumerate(subdivided_images):
         | 
| 141 | 
            +
                detections = <your detection function here>("pedestrian", sub_image)
         | 
| 142 | 
            +
                unnorm_detections = [
         | 
| 143 | 
            +
                    normalized_to_unnormalized(
         | 
| 144 | 
            +
                        detection, (sub_image.shape[0], sub_image.shape[1])
         | 
| 145 | 
            +
                    )
         | 
| 146 | 
            +
                    for detection in detections
         | 
| 147 | 
            +
                ]
         | 
| 148 | 
            +
                offset_x = i % 2 * (mid_width - int(mid_width * 0.1))
         | 
| 149 | 
            +
                offset_y = i // 2 * (mid_height - int(mid_height * 0.1))
         | 
| 150 | 
            +
                offset_detections = [
         | 
| 151 | 
            +
                    sub_image_to_original(
         | 
| 152 | 
            +
                        unnorm_detection, (offset_x, offset_y), (height, width)
         | 
| 153 | 
            +
                    )
         | 
| 154 | 
            +
                    for unnorm_detection in unnorm_detections
         | 
| 155 | 
            +
                ]
         | 
| 156 | 
            +
                detection_from_subdivided_images.extend(offset_detections)
         | 
| 157 | 
            +
             | 
| 158 | 
            +
            detections = merge_bounding_box_list(detection_from_subdivided_images)
         | 
| 104 159 | 
             
            [end of suggestion 0]
         | 
| 105 160 | 
             
            [end of suggestion]
         | 
| 106 161 | 
             
            <count>9</count>
         | 
| @@ -164,36 +219,44 @@ countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float | |
| 164 219 |  | 
| 165 220 | 
             
            AGENT: <thinking>I need to now merge the boxes from all region and use the countgd_object_detection tool with the prompt 'pedestrian' as suggested by get_tool_for_task.</thinking>
         | 
| 166 221 | 
             
            <execute_python>
         | 
| 167 | 
            -
            def  | 
| 168 | 
            -
                 | 
| 169 | 
            -
             | 
| 170 | 
            -
             | 
| 171 | 
            -
                 | 
| 172 | 
            -
             | 
| 173 | 
            -
             | 
| 174 | 
            -
             | 
| 175 | 
            -
                 | 
| 176 | 
            -
             | 
| 177 | 
            -
             | 
| 178 | 
            -
                 | 
| 179 | 
            -
             | 
| 180 | 
            -
                 | 
| 181 | 
            -
                 | 
| 182 | 
            -
                 | 
| 183 | 
            -
             | 
| 184 | 
            -
             | 
| 185 | 
            -
                 | 
| 186 | 
            -
                 | 
| 187 | 
            -
             | 
| 188 | 
            -
             | 
| 189 | 
            -
             | 
| 190 | 
            -
             | 
| 191 | 
            -
             | 
| 192 | 
            -
             | 
| 222 | 
            +
            def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
         | 
| 223 | 
            +
                # Calculate intersection coordinates
         | 
| 224 | 
            +
                x1 = max(b1[0], b2[0])
         | 
| 225 | 
            +
                y1 = max(b1[1], b2[1])
         | 
| 226 | 
            +
                x2 = min(b1[2], b2[2])
         | 
| 227 | 
            +
                y2 = min(b1[3], b2[3])
         | 
| 228 | 
            +
             | 
| 229 | 
            +
                # Calculate intersection area
         | 
| 230 | 
            +
                if x2 < x1 or y2 < y1:
         | 
| 231 | 
            +
                    return False  # No overlap
         | 
| 232 | 
            +
             | 
| 233 | 
            +
                intersection = (x2 - x1) * (y2 - y1)
         | 
| 234 | 
            +
             | 
| 235 | 
            +
                # Calculate union area
         | 
| 236 | 
            +
                area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
         | 
| 237 | 
            +
                area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
         | 
| 238 | 
            +
                union = area1 + area2 - intersection
         | 
| 239 | 
            +
             | 
| 240 | 
            +
                # Calculate IoU
         | 
| 241 | 
            +
                iou = intersection / union if union > 0 else 0
         | 
| 242 | 
            +
             | 
| 243 | 
            +
                return iou >= iou_threshold
         | 
| 244 | 
            +
             | 
| 245 | 
            +
            def merge_bounding_box_list(detections):
         | 
| 246 | 
            +
                merged_detections = []
         | 
| 247 | 
            +
                for detection in detections:
         | 
| 248 | 
            +
                    matching_box = None
         | 
| 249 | 
            +
                    for i, other in enumerate(merged_detections):
         | 
| 250 | 
            +
                        if bounding_box_match(detection["bbox"], other["bbox"]):
         | 
| 251 | 
            +
                            matching_box = i
         | 
| 193 252 | 
             
                            break
         | 
| 194 | 
            -
             | 
| 195 | 
            -
             | 
| 196 | 
            -
             | 
| 253 | 
            +
             | 
| 254 | 
            +
                    if matching_box is not None:
         | 
| 255 | 
            +
                        # Keep the box with higher confidence score
         | 
| 256 | 
            +
                        if detection["score"] > merged_detections[matching_box]["score"]:
         | 
| 257 | 
            +
                            merged_detections[matching_box] = detection
         | 
| 258 | 
            +
                    else:
         | 
| 259 | 
            +
                        merged_detections.append(detection)
         | 
| 197 260 |  | 
| 198 261 | 
             
            detections = []
         | 
| 199 262 | 
             
            for region in subdivide_image(image):
         | 
| @@ -27,7 +27,7 @@ CONFIG = Config() | |
| 27 27 |  | 
| 28 28 |  | 
| 29 29 | 
             
            def extract_conversation(
         | 
| 30 | 
            -
                chat: List[AgentMessage],
         | 
| 30 | 
            +
                chat: List[AgentMessage], include_conv: bool = False
         | 
| 31 31 | 
             
            ) -> Tuple[List[AgentMessage], Optional[str]]:
         | 
| 32 32 | 
             
                chat = copy.deepcopy(chat)
         | 
| 33 33 |  | 
| @@ -43,6 +43,8 @@ def extract_conversation( | |
| 43 43 | 
             
                    elif chat_i.role == "coder":
         | 
| 44 44 | 
             
                        if "<final_code>" in chat_i.content:
         | 
| 45 45 | 
             
                            extracted_chat.append(chat_i)
         | 
| 46 | 
            +
                    elif include_conv and chat_i.role == "conversation":
         | 
| 47 | 
            +
                        extracted_chat.append(chat_i)
         | 
| 46 48 |  | 
| 47 49 | 
             
                # only keep the last <final_code> and <final_test>
         | 
| 48 50 | 
             
                final_code = None
         | 
| @@ -64,10 +66,9 @@ def extract_conversation( | |
| 64 66 |  | 
| 65 67 |  | 
| 66 68 | 
             
            def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
         | 
| 67 | 
            -
                extracted_chat, _ = extract_conversation(chat)
         | 
| 68 | 
            -
                extracted_chat = extracted_chat[-10:]
         | 
| 69 | 
            +
                extracted_chat, _ = extract_conversation(chat, include_conv=True)
         | 
| 69 70 |  | 
| 70 | 
            -
                conv = format_conversation( | 
| 71 | 
            +
                conv = format_conversation(extracted_chat)
         | 
| 71 72 | 
             
                prompt = CONVERSATION.format(
         | 
| 72 73 | 
             
                    conversation=conv,
         | 
| 73 74 | 
             
                )
         | 
| @@ -263,7 +264,7 @@ class VisionAgentV2(Agent): | |
| 263 264 | 
             
                            # do not append updated_chat to return_chat because the observation
         | 
| 264 265 | 
             
                            # from running the action will have already been added via the callbacks
         | 
| 265 266 | 
             
                            obs_response_context = run_conversation(
         | 
| 266 | 
            -
                                self.agent, return_chat + updated_chat
         | 
| 267 | 
            +
                                self.agent, int_chat + return_chat + updated_chat
         | 
| 267 268 | 
             
                            )
         | 
| 268 269 | 
             
                            return_chat.append(
         | 
| 269 270 | 
             
                                AgentMessage(role="conversation", content=obs_response_context)
         | 
| @@ -4,20 +4,20 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57 | |
| 4 4 | 
             
            vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
         | 
| 5 5 | 
             
            vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
         | 
| 6 6 | 
             
            vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1565
         | 
| 7 | 
            -
            vision_agent/agent/agent_utils.py,sha256= | 
| 7 | 
            +
            vision_agent/agent/agent_utils.py,sha256=4RgG8SUEGuMFHkIt0jCFkRQF6G1PZp3Ub4LuVYKF7Ic,14092
         | 
| 8 8 | 
             
            vision_agent/agent/types.py,sha256=dIdxATH_PP76pD5Wfo0oofWt6iPQh0vpf48QbEQSzhs,2472
         | 
| 9 9 | 
             
            vision_agent/agent/vision_agent.py,sha256=fH9NOLk7twL1fPr9vLSqkaYhah-gfDWfTOVF2FfMyzI,23461
         | 
| 10 10 | 
             
            vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
         | 
| 11 11 | 
             
            vision_agent/agent/vision_agent_coder_prompts.py,sha256=_kkPLezUVnBXieNPlxMQab_6J6P7F-aa6ItF5NhZZsM,12281
         | 
| 12 12 | 
             
            vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=NUMWq-Lxq5JmmyWs3C5O_1Hm-zCbf9I_yPK5UtWGspE,4871
         | 
| 13 | 
            -
            vision_agent/agent/vision_agent_coder_v2.py,sha256= | 
| 13 | 
            +
            vision_agent/agent/vision_agent_coder_v2.py,sha256=yQYcO0s4BI9pWaAQQAVtkwWa3UF5w0iLKvwpeJ6iegM,17077
         | 
| 14 14 | 
             
            vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
         | 
| 15 15 | 
             
            vision_agent/agent/vision_agent_planner_prompts.py,sha256=rYRdJthc-sQN57VgCBKrF09Sd73BSxcBdjNe6C4WNZ8,6837
         | 
| 16 | 
            -
            vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256= | 
| 16 | 
            +
            vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=U88z1Y7CifFs7t53aUrl8qjWtBYs0f_F5vyg_0VYJko,35528
         | 
| 17 17 | 
             
            vision_agent/agent/vision_agent_planner_v2.py,sha256=NUyi57zxCmOO004_cJcCCDa4UgcKSWB1WCGuyOhhXQE,20602
         | 
| 18 18 | 
             
            vision_agent/agent/vision_agent_prompts.py,sha256=KaJwYPUP7_GvQsCPPs6Fdawmi3AQWmWajBUuzj7gTG4,13812
         | 
| 19 19 | 
             
            vision_agent/agent/vision_agent_prompts_v2.py,sha256=Wyxa15NOe75PefAfw3_RRwvgjg8YVqCrU7WvvWoYJpk,2733
         | 
| 20 | 
            -
            vision_agent/agent/vision_agent_v2.py,sha256= | 
| 20 | 
            +
            vision_agent/agent/vision_agent_v2.py,sha256=86_pPdkkMBk08TTFZ7zu9QG37Iz9uI8Nmt79wwm_EIA,11053
         | 
| 21 21 | 
             
            vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 22 22 | 
             
            vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
         | 
| 23 23 | 
             
            vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
         | 
| @@ -46,7 +46,7 @@ vision_agent/utils/sim.py,sha256=DYya76dYVtifFyXilMLxBzGgyfyeqhEwU4RJ4894lCI,979 | |
| 46 46 | 
             
            vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
         | 
| 47 47 | 
             
            vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
         | 
| 48 48 | 
             
            vision_agent/utils/video_tracking.py,sha256=wK5dOutqV2t2aeaxedstCBa7xy-NNQE0-QZqKu1QUds,9498
         | 
| 49 | 
            -
            vision_agent-0.2. | 
| 50 | 
            -
            vision_agent-0.2. | 
| 51 | 
            -
            vision_agent-0.2. | 
| 52 | 
            -
            vision_agent-0.2. | 
| 49 | 
            +
            vision_agent-0.2.233.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
         | 
| 50 | 
            +
            vision_agent-0.2.233.dist-info/METADATA,sha256=EoNuerRth0lHRC7TK2Xh7w6V__YtUJraKk9yN8AMx2U,5760
         | 
| 51 | 
            +
            vision_agent-0.2.233.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
         | 
| 52 | 
            +
            vision_agent-0.2.233.dist-info/RECORD,,
         | 
| 
            File without changes
         | 
| 
            File without changes
         |