vision-agent 0.2.232__py3-none-any.whl → 0.2.233__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/agent_utils.py +1 -1
- vision_agent/agent/vision_agent_coder_v2.py +1 -1
- vision_agent/agent/vision_agent_planner_prompts_v2.py +126 -63
- vision_agent/agent/vision_agent_v2.py +6 -5
- {vision_agent-0.2.232.dist-info → vision_agent-0.2.233.dist-info}/METADATA +1 -1
- {vision_agent-0.2.232.dist-info → vision_agent-0.2.233.dist-info}/RECORD +8 -8
- {vision_agent-0.2.232.dist-info → vision_agent-0.2.233.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.232.dist-info → vision_agent-0.2.233.dist-info}/WHEEL +0 -0
@@ -160,7 +160,7 @@ def format_conversation(chat: List[AgentMessage]) -> str:
|
|
160
160
|
prompt = ""
|
161
161
|
for chat_i in chat:
|
162
162
|
if chat_i.role == "user" or chat_i.role == "coder":
|
163
|
-
if "<final_code>" in chat_i.
|
163
|
+
if "<final_code>" in chat_i.content:
|
164
164
|
prompt += f"OBSERVATION: {chat_i.content}\n\n"
|
165
165
|
elif chat_i.role == "user":
|
166
166
|
prompt += f"USER: {chat_i.content}\n\n"
|
@@ -443,7 +443,7 @@ class VisionAgentCoderV2(AgentCoder):
|
|
443
443
|
|
444
444
|
# we don't need the user_interaction response for generating code since it's
|
445
445
|
# already in the plan context
|
446
|
-
while chat[-1].role != "user":
|
446
|
+
while len(chat) > 0 and chat[-1].role != "user":
|
447
447
|
chat.pop()
|
448
448
|
|
449
449
|
if not chat:
|
@@ -50,7 +50,7 @@ From this aerial view of a busy urban street, it's difficult to clearly see or c
|
|
50
50
|
[suggestion 0]
|
51
51
|
The image is very large and the items you need to detect are small.
|
52
52
|
|
53
|
-
Step 1: You should start by splitting the image into sections and runing the detection algorithm on each section:
|
53
|
+
Step 1: You should start by splitting the image into overlapping sections and running the detection algorithm on each section:
|
54
54
|
|
55
55
|
def subdivide_image(image):
|
56
56
|
height, width, _ = image.shape
|
@@ -66,41 +66,96 @@ def subdivide_image(image):
|
|
66
66
|
|
67
67
|
get_tool_for_task('<your prompt here>', subdivide_image(image))
|
68
68
|
|
69
|
-
Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions:
|
70
|
-
|
71
|
-
def
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
69
|
+
Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions, be sure to translate the offset back to the original image:
|
70
|
+
|
71
|
+
def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
|
72
|
+
# Calculate intersection coordinates
|
73
|
+
x1 = max(b1[0], b2[0])
|
74
|
+
y1 = max(b1[1], b2[1])
|
75
|
+
x2 = min(b1[2], b2[2])
|
76
|
+
y2 = min(b1[3], b2[3])
|
77
|
+
|
78
|
+
# Calculate intersection area
|
79
|
+
if x2 < x1 or y2 < y1:
|
80
|
+
return False # No overlap
|
81
|
+
|
82
|
+
intersection = (x2 - x1) * (y2 - y1)
|
83
|
+
|
84
|
+
# Calculate union area
|
85
|
+
area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
|
86
|
+
area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
|
87
|
+
union = area1 + area2 - intersection
|
88
|
+
|
89
|
+
# Calculate IoU
|
90
|
+
iou = intersection / union if union > 0 else 0
|
91
|
+
|
92
|
+
return iou >= iou_threshold
|
93
|
+
|
94
|
+
def merge_bounding_box_list(detections):
|
95
|
+
merged_detections = []
|
96
|
+
for detection in detections:
|
97
|
+
matching_box = None
|
98
|
+
for i, other in enumerate(merged_detections):
|
99
|
+
if bounding_box_match(detection["bbox"], other["bbox"]):
|
100
|
+
matching_box = i
|
97
101
|
break
|
98
|
-
if not overlap_found:
|
99
|
-
p
|
100
|
-
merged_bboxes.append(bbox)
|
101
|
-
return merged_bboxes
|
102
102
|
|
103
|
-
|
103
|
+
if matching_box is not None:
|
104
|
+
# Keep the box with higher confidence score
|
105
|
+
if detection["score"] > merged_detections[matching_box]["score"]:
|
106
|
+
merged_detections[matching_box] = detection
|
107
|
+
else:
|
108
|
+
merged_detections.append(detection)
|
109
|
+
|
110
|
+
def sub_image_to_original(elt, sub_image_position, original_size):
|
111
|
+
offset_x, offset_y = sub_image_position
|
112
|
+
return {
|
113
|
+
"label": elt["label"],
|
114
|
+
"score": elt["score"],
|
115
|
+
"bbox": [
|
116
|
+
(elt["bbox"][0] + offset_x) / original_size[1],
|
117
|
+
(elt["bbox"][1] + offset_y) / original_size[0],
|
118
|
+
(elt["bbox"][2] + offset_x) / original_size[1],
|
119
|
+
(elt["bbox"][3] + offset_y) / original_size[0],
|
120
|
+
],
|
121
|
+
}
|
122
|
+
|
123
|
+
def normalized_to_unnormalized(elt, image_size):
|
124
|
+
return {
|
125
|
+
"label": elt["label"],
|
126
|
+
"score": elt["score"],
|
127
|
+
"bbox": [
|
128
|
+
elt["bbox"][0] * image_size[1],
|
129
|
+
elt["bbox"][1] * image_size[0],
|
130
|
+
elt["bbox"][2] * image_size[1],
|
131
|
+
elt["bbox"][3] * image_size[0],
|
132
|
+
],
|
133
|
+
}
|
134
|
+
|
135
|
+
height, width, _ = image.shape
|
136
|
+
mid_width = width // 2
|
137
|
+
mid_height = height // 2
|
138
|
+
|
139
|
+
detection_from_subdivided_images = []
|
140
|
+
for i, sub_image in enumerate(subdivided_images):
|
141
|
+
detections = <your detection function here>("pedestrian", sub_image)
|
142
|
+
unnorm_detections = [
|
143
|
+
normalized_to_unnormalized(
|
144
|
+
detection, (sub_image.shape[0], sub_image.shape[1])
|
145
|
+
)
|
146
|
+
for detection in detections
|
147
|
+
]
|
148
|
+
offset_x = i % 2 * (mid_width - int(mid_width * 0.1))
|
149
|
+
offset_y = i // 2 * (mid_height - int(mid_height * 0.1))
|
150
|
+
offset_detections = [
|
151
|
+
sub_image_to_original(
|
152
|
+
unnorm_detection, (offset_x, offset_y), (height, width)
|
153
|
+
)
|
154
|
+
for unnorm_detection in unnorm_detections
|
155
|
+
]
|
156
|
+
detection_from_subdivided_images.extend(offset_detections)
|
157
|
+
|
158
|
+
detections = merge_bounding_box_list(detection_from_subdivided_images)
|
104
159
|
[end of suggestion 0]
|
105
160
|
[end of suggestion]
|
106
161
|
<count>9</count>
|
@@ -164,36 +219,44 @@ countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float
|
|
164
219
|
|
165
220
|
AGENT: <thinking>I need to now merge the boxes from all region and use the countgd_object_detection tool with the prompt 'pedestrian' as suggested by get_tool_for_task.</thinking>
|
166
221
|
<execute_python>
|
167
|
-
def
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
222
|
+
def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
|
223
|
+
# Calculate intersection coordinates
|
224
|
+
x1 = max(b1[0], b2[0])
|
225
|
+
y1 = max(b1[1], b2[1])
|
226
|
+
x2 = min(b1[2], b2[2])
|
227
|
+
y2 = min(b1[3], b2[3])
|
228
|
+
|
229
|
+
# Calculate intersection area
|
230
|
+
if x2 < x1 or y2 < y1:
|
231
|
+
return False # No overlap
|
232
|
+
|
233
|
+
intersection = (x2 - x1) * (y2 - y1)
|
234
|
+
|
235
|
+
# Calculate union area
|
236
|
+
area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
|
237
|
+
area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
|
238
|
+
union = area1 + area2 - intersection
|
239
|
+
|
240
|
+
# Calculate IoU
|
241
|
+
iou = intersection / union if union > 0 else 0
|
242
|
+
|
243
|
+
return iou >= iou_threshold
|
244
|
+
|
245
|
+
def merge_bounding_box_list(detections):
|
246
|
+
merged_detections = []
|
247
|
+
for detection in detections:
|
248
|
+
matching_box = None
|
249
|
+
for i, other in enumerate(merged_detections):
|
250
|
+
if bounding_box_match(detection["bbox"], other["bbox"]):
|
251
|
+
matching_box = i
|
193
252
|
break
|
194
|
-
|
195
|
-
|
196
|
-
|
253
|
+
|
254
|
+
if matching_box is not None:
|
255
|
+
# Keep the box with higher confidence score
|
256
|
+
if detection["score"] > merged_detections[matching_box]["score"]:
|
257
|
+
merged_detections[matching_box] = detection
|
258
|
+
else:
|
259
|
+
merged_detections.append(detection)
|
197
260
|
|
198
261
|
detections = []
|
199
262
|
for region in subdivide_image(image):
|
@@ -27,7 +27,7 @@ CONFIG = Config()
|
|
27
27
|
|
28
28
|
|
29
29
|
def extract_conversation(
|
30
|
-
chat: List[AgentMessage],
|
30
|
+
chat: List[AgentMessage], include_conv: bool = False
|
31
31
|
) -> Tuple[List[AgentMessage], Optional[str]]:
|
32
32
|
chat = copy.deepcopy(chat)
|
33
33
|
|
@@ -43,6 +43,8 @@ def extract_conversation(
|
|
43
43
|
elif chat_i.role == "coder":
|
44
44
|
if "<final_code>" in chat_i.content:
|
45
45
|
extracted_chat.append(chat_i)
|
46
|
+
elif include_conv and chat_i.role == "conversation":
|
47
|
+
extracted_chat.append(chat_i)
|
46
48
|
|
47
49
|
# only keep the last <final_code> and <final_test>
|
48
50
|
final_code = None
|
@@ -64,10 +66,9 @@ def extract_conversation(
|
|
64
66
|
|
65
67
|
|
66
68
|
def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
|
67
|
-
extracted_chat, _ = extract_conversation(chat)
|
68
|
-
extracted_chat = extracted_chat[-10:]
|
69
|
+
extracted_chat, _ = extract_conversation(chat, include_conv=True)
|
69
70
|
|
70
|
-
conv = format_conversation(
|
71
|
+
conv = format_conversation(extracted_chat)
|
71
72
|
prompt = CONVERSATION.format(
|
72
73
|
conversation=conv,
|
73
74
|
)
|
@@ -263,7 +264,7 @@ class VisionAgentV2(Agent):
|
|
263
264
|
# do not append updated_chat to return_chat because the observation
|
264
265
|
# from running the action will have already been added via the callbacks
|
265
266
|
obs_response_context = run_conversation(
|
266
|
-
self.agent, return_chat + updated_chat
|
267
|
+
self.agent, int_chat + return_chat + updated_chat
|
267
268
|
)
|
268
269
|
return_chat.append(
|
269
270
|
AgentMessage(role="conversation", content=obs_response_context)
|
@@ -4,20 +4,20 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
|
4
4
|
vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
|
5
5
|
vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
|
6
6
|
vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1565
|
7
|
-
vision_agent/agent/agent_utils.py,sha256=
|
7
|
+
vision_agent/agent/agent_utils.py,sha256=4RgG8SUEGuMFHkIt0jCFkRQF6G1PZp3Ub4LuVYKF7Ic,14092
|
8
8
|
vision_agent/agent/types.py,sha256=dIdxATH_PP76pD5Wfo0oofWt6iPQh0vpf48QbEQSzhs,2472
|
9
9
|
vision_agent/agent/vision_agent.py,sha256=fH9NOLk7twL1fPr9vLSqkaYhah-gfDWfTOVF2FfMyzI,23461
|
10
10
|
vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
|
11
11
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=_kkPLezUVnBXieNPlxMQab_6J6P7F-aa6ItF5NhZZsM,12281
|
12
12
|
vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=NUMWq-Lxq5JmmyWs3C5O_1Hm-zCbf9I_yPK5UtWGspE,4871
|
13
|
-
vision_agent/agent/vision_agent_coder_v2.py,sha256=
|
13
|
+
vision_agent/agent/vision_agent_coder_v2.py,sha256=yQYcO0s4BI9pWaAQQAVtkwWa3UF5w0iLKvwpeJ6iegM,17077
|
14
14
|
vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
|
15
15
|
vision_agent/agent/vision_agent_planner_prompts.py,sha256=rYRdJthc-sQN57VgCBKrF09Sd73BSxcBdjNe6C4WNZ8,6837
|
16
|
-
vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=
|
16
|
+
vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=U88z1Y7CifFs7t53aUrl8qjWtBYs0f_F5vyg_0VYJko,35528
|
17
17
|
vision_agent/agent/vision_agent_planner_v2.py,sha256=NUyi57zxCmOO004_cJcCCDa4UgcKSWB1WCGuyOhhXQE,20602
|
18
18
|
vision_agent/agent/vision_agent_prompts.py,sha256=KaJwYPUP7_GvQsCPPs6Fdawmi3AQWmWajBUuzj7gTG4,13812
|
19
19
|
vision_agent/agent/vision_agent_prompts_v2.py,sha256=Wyxa15NOe75PefAfw3_RRwvgjg8YVqCrU7WvvWoYJpk,2733
|
20
|
-
vision_agent/agent/vision_agent_v2.py,sha256=
|
20
|
+
vision_agent/agent/vision_agent_v2.py,sha256=86_pPdkkMBk08TTFZ7zu9QG37Iz9uI8Nmt79wwm_EIA,11053
|
21
21
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
|
23
23
|
vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
|
@@ -46,7 +46,7 @@ vision_agent/utils/sim.py,sha256=DYya76dYVtifFyXilMLxBzGgyfyeqhEwU4RJ4894lCI,979
|
|
46
46
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
47
47
|
vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
|
48
48
|
vision_agent/utils/video_tracking.py,sha256=wK5dOutqV2t2aeaxedstCBa7xy-NNQE0-QZqKu1QUds,9498
|
49
|
-
vision_agent-0.2.
|
50
|
-
vision_agent-0.2.
|
51
|
-
vision_agent-0.2.
|
52
|
-
vision_agent-0.2.
|
49
|
+
vision_agent-0.2.233.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
50
|
+
vision_agent-0.2.233.dist-info/METADATA,sha256=EoNuerRth0lHRC7TK2Xh7w6V__YtUJraKk9yN8AMx2U,5760
|
51
|
+
vision_agent-0.2.233.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
52
|
+
vision_agent-0.2.233.dist-info/RECORD,,
|
File without changes
|
File without changes
|