vision-agent 0.2.233__py3-none-any.whl → 0.2.235__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent_planner_prompts_v2.py +1 -1
- vision_agent/agent/vision_agent_planner_v2.py +2 -3
- vision_agent/agent/vision_agent_v2.py +1 -2
- vision_agent/tools/tool_utils.py +14 -9
- vision_agent/tools/tools.py +58 -21
- vision_agent/utils/video_tracking.py +59 -58
- {vision_agent-0.2.233.dist-info → vision_agent-0.2.235.dist-info}/METADATA +1 -1
- {vision_agent-0.2.233.dist-info → vision_agent-0.2.235.dist-info}/RECORD +10 -10
- {vision_agent-0.2.233.dist-info → vision_agent-0.2.235.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.233.dist-info → vision_agent-0.2.235.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent_planner_prompts_v2.py CHANGED

@@ -20,7 +20,7 @@ PLAN = """
 3. Only output <finalize_plan> when you are done planning and want to end the planning process. DO NOT output <finalize_plan> with <execute_python> tags, only after OBSERVATION's.
 4. Only load/save files from {media_list} unless you specifically saved the file previously.
 5. Ensure you always call `suggestion` initially and `get_tool_for_task` to get the right tool for the subtask.
-6. Calling `plt.imshow` or `save_image` will display the image to you
+6. Calling `plt.imshow` or `save_image` will display the image to you so you can check your results. If you see an image after <execute_python> it's generated from your code.
 7. DO NOT hard code the answer into your code, it should be dynamic and work for any similar request.
 8. DO NOT over index on claude35_vqa, if tool output is close to claude35_vqa's output you do not need to improve the tool.
 9. You can only respond in the following format with a single <thinking>, <execute_python> or <finalize_plan> tag:
vision_agent/agent/vision_agent_planner_v2.py CHANGED

@@ -97,8 +97,7 @@ def run_planning(
     media_list: List[Union[str, Path]],
     model: LMM,
 ) -> str:
-
-    planning = get_planning(chat[-10:])
+    planning = get_planning(chat)
     prompt = PLAN.format(
         tool_desc=PLANNING_TOOLS_DOCSTRING,
         examples=f"{EXAMPLE_PLAN1}\n{EXAMPLE_PLAN2}",
@@ -372,7 +371,7 @@ def replace_interaction_with_obs(chat: List[AgentMessage]) -> List[AgentMessage]
             function_name = response["function_name"]
             tool_doc = get_tool_documentation(function_name)
             if "box_threshold" in response:
-                tool_doc = f"Use the following function with box_threshold={response['box_threshold']}
+                tool_doc = f"Use the following function with box_threshold={response['box_threshold']}. This tool and its parameters were chosen by the user so do not change them in your planning.\n\n{tool_doc}."
             new_chat.append(AgentMessage(role="observation", content=tool_doc))
         except (json.JSONDecodeError, KeyError):
             raise ValueError(f"Invalid JSON in interaction response: {chat_i}")
vision_agent/agent/vision_agent_v2.py CHANGED

@@ -91,8 +91,6 @@ def maybe_run_action(
     code_interpreter: Optional[CodeInterpreter] = None,
 ) -> Optional[List[AgentMessage]]:
     extracted_chat, final_code = extract_conversation(chat)
-    # only keep last 5 messages to keep context recent and not overwhelm LLM
-    extracted_chat = extracted_chat[-5:]
     if action == "generate_or_edit_vision_code":
         # there's an issue here because coder.generate_code will send it's code_context
         # to the outside user via it's update_callback, but we don't necessarily have
@@ -125,6 +123,7 @@ def maybe_run_action(
             ],
             code="",
         )
+
         context = coder.generate_code_from_plan(
             extracted_chat, plan_context, code_interpreter=code_interpreter
         )
vision_agent/tools/tool_utils.py CHANGED

@@ -270,17 +270,22 @@ def add_bboxes_from_masks(
 ) -> List[List[Dict[str, Any]]]:
     for frame_preds in all_preds:
         for preds in frame_preds:
-
+            mask = preds["mask"]
+            if mask.sum() == 0:
                 preds["bbox"] = []
             else:
-
-
-
-
-
-
-                ]
-
+                # Get indices where mask is True using axis operations
+                rows = np.any(mask, axis=1)
+                cols = np.any(mask, axis=0)
+
+                # Find boundaries using argmax/argmin
+                y_min = np.argmax(rows)
+                y_max = len(rows) - np.argmax(rows[::-1])
+                x_min = np.argmax(cols)
+                x_max = len(cols) - np.argmax(cols[::-1])
+
+                bbox = [float(x_min), float(y_min), float(x_max), float(y_max)]
+                bbox = normalize_bbox(bbox, mask.shape)
             preds["bbox"] = bbox

     return all_preds
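The new bounding-box computation collapses the mask along each axis instead of materializing all true-pixel indices. A minimal standalone sketch of the same argmax trick, assuming a boolean numpy mask (the helper name and sample values are illustrative; the real code additionally normalizes the result via normalize_bbox):

import numpy as np
from typing import List

def mask_to_bbox(mask: np.ndarray) -> List[float]:
    # Empty masks get no box, mirroring the `mask.sum() == 0` branch above.
    if mask.sum() == 0:
        return []
    rows = np.any(mask, axis=1)  # which rows contain any mask pixel
    cols = np.any(mask, axis=0)  # which columns contain any mask pixel
    y_min = int(np.argmax(rows))                    # first True row
    y_max = len(rows) - int(np.argmax(rows[::-1]))  # one past the last True row
    x_min = int(np.argmax(cols))
    x_max = len(cols) - int(np.argmax(cols[::-1]))
    return [float(x_min), float(y_min), float(x_max), float(y_max)]

mask = np.zeros((8, 8), dtype=bool)
mask[2:5, 3:7] = True
print(mask_to_bbox(mask))  # [3.0, 2.0, 7.0, 5.0]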
vision_agent/tools/tools.py CHANGED

@@ -234,16 +234,24 @@ def od_sam2_video_tracking(
     od_model: ODModels,
     prompt: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] =
+    chunk_length: Optional[int] = 50,
     fine_tune_id: Optional[str] = None,
 ) -> Dict[str, Any]:
-
-
+    chunk_length = 50 if chunk_length is None else chunk_length
+    segment_size = chunk_length
+    # Number of overlapping frames between segments
+    overlap = 1
+    # chunk_length needs to be segment_size + 1 or else on the last segment it will
+    # run the OD model again and merging will not work
+    chunk_length = chunk_length + 1
+
+    if len(frames) == 0 or not isinstance(frames, List):
+        return {"files": [], "return_data": [], "display_data": []}

     image_size = frames[0].shape[:2]

     # Split frames into segments with overlap
-    segments = split_frames_into_segments(frames,
+    segments = split_frames_into_segments(frames, segment_size, overlap)

     def _apply_object_detection( # inner method to avoid circular importing issues.
         od_model: ODModels,
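split_frames_into_segments itself is not part of this diff; a rough sketch of the chunking it is called with, assuming each segment keeps `overlap` frames shared with its successor so track IDs can be matched across the boundary (this implementation is an assumption, not the package's code):

from typing import List
import numpy as np

def split_frames_into_segments(
    frames: List[np.ndarray], segment_size: int, overlap: int
) -> List[List[np.ndarray]]:
    # Consecutive segments share `overlap` frames at their boundary.
    segments = []
    start = 0
    while start < len(frames):
        segments.append(frames[start : start + segment_size + overlap])
        start += segment_size
    return segments

frames = [np.zeros((4, 4)) for _ in range(120)]
segments = split_frames_into_segments(frames, segment_size=50, overlap=1)
print([len(s) for s in segments])  # [51, 51, 20]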
@@ -538,7 +546,7 @@ def owlv2_sam2_instance_segmentation(
 def owlv2_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] =
+    chunk_length: Optional[int] = 25,
     fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
@@ -771,7 +779,7 @@ def florence2_sam2_instance_segmentation(
 def florence2_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] =
+    chunk_length: Optional[int] = 25,
     fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can track and segment multiple
@@ -1110,7 +1118,7 @@ def countgd_sam2_instance_segmentation(
 def countgd_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] =
+    chunk_length: Optional[int] = 25,
 ) -> List[List[Dict[str, Any]]]:
     """'countgd_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as category names or referring
@@ -1322,7 +1330,7 @@ def custom_object_detection(
 def custom_od_sam2_video_tracking(
     deployment_id: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] =
+    chunk_length: Optional[int] = 25,
 ) -> List[List[Dict[str, Any]]]:
     """'custom_od_sam2_video_tracking' is a tool that can segment multiple objects given a
     custom model with predefined category names.
@@ -2366,7 +2374,7 @@ def agentic_sam2_instance_segmentation(
 def agentic_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] =
+    chunk_length: Optional[int] = 25,
     fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
@@ -2791,7 +2799,15 @@ def overlay_bounding_boxes(
         "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
     )

-
+    use_tracking_label = False
+    if all([":" in label for label in labels]):
+        unique_labels = set([label.split(":")[1].strip() for label in labels])
+        use_tracking_label = True
+        colors = {
+            label: COLORS[i % len(COLORS)] for i, label in enumerate(unique_labels)
+        }
+    else:
+        colors = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}

     frame_out = []
     for i, frame in enumerate(medias_int):
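When every label has the "<id>: <name>" shape produced by the tracking post-processing (see the post_process hunk below), colors are keyed on the class name so all instances of one class share a color. A distilled, runnable version, with a stand-in three-color palette for the package's COLORS list:

COLORS = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]  # stand-in palette

labels = ["1: person", "2: person", "1: car"]
if all(":" in label for label in labels):
    # Tracking labels: color per class name, not per instance.
    unique_labels = set(label.split(":")[1].strip() for label in labels)
    colors = {name: COLORS[i % len(COLORS)] for i, name in enumerate(unique_labels)}
else:
    colors = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}

print(colors["person"], colors["car"])  # one color per class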
@@ -2802,7 +2818,7 @@ def overlay_bounding_boxes(

     # if more than 50 boxes use small boxes to indicate objects else use regular boxes
     if len(bboxes) > 50:
-        pil_image = _plot_counting(pil_image, bboxes,
+        pil_image = _plot_counting(pil_image, bboxes, colors, use_tracking_label)
     else:
         width, height = pil_image.size
         fontsize = max(12, int(min(width, height) / 40))
@@ -2817,18 +2833,20 @@ def overlay_bounding_boxes(
         )

         for elt in bboxes:
+            if use_tracking_label:
+                color = colors[elt["label"].split(":")[1].strip()]
+            else:
+                color = colors[elt["label"]]
             label = elt["label"]
             box = elt["bbox"]
             scores = elt["score"]

             # denormalize the box if it is normalized
             box = denormalize_bbox(box, (height, width))
-            draw.rectangle(box, outline=color
+            draw.rectangle(box, outline=color, width=4)
             text = f"{label}: {scores:.2f}"
             text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
-            draw.rectangle(
-                (box[0], box[1], text_box[2], text_box[3]), fill=color[label]
-            )
+            draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color)
             draw.text((box[0], box[1]), text, fill="black", font=font)

         frame_out.append(np.array(pil_image))
@@ -2911,7 +2929,16 @@ def overlay_segmentation_masks(
     for mask_i in masks_int:
        for mask_j in mask_i:
            labels.add(mask_j["label"])
-
+
+    use_tracking_label = False
+    if all([":" in label for label in labels]):
+        use_tracking_label = True
+        unique_labels = set([label.split(":")[1].strip() for label in labels])
+        colors = {
+            label: COLORS[i % len(COLORS)] for i, label in enumerate(unique_labels)
+        }
+    else:
+        colors = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}

     width, height = Image.fromarray(medias_int[0]).size
     fontsize = max(12, int(min(width, height) / 40))
@@ -2925,12 +2952,16 @@ def overlay_segmentation_masks(
         pil_image = Image.fromarray(frame.astype(np.uint8)).convert("RGBA")
         for elt in masks_int[i]:
             mask = elt["mask"]
+            if use_tracking_label:
+                color = colors[elt["label"].split(":")[1].strip()]
+            else:
+                color = colors[elt["label"]]
             label = elt["label"]
             tracking_lbl = elt.get(secondary_label_key, None)

             # Create semi-transparent mask overlay
             np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
-            np_mask[mask > 0, :] = color
+            np_mask[mask > 0, :] = color + (255 * 0.7,)
             mask_img = Image.fromarray(np_mask.astype(np.uint8))
             pil_image = Image.alpha_composite(pil_image, mask_img)

@@ -2942,7 +2973,7 @@ def overlay_segmentation_masks(
             border_mask = np.zeros(
                 (pil_image.size[1], pil_image.size[0], 4), dtype=np.uint8
             )
-            cv2.drawContours(border_mask, contours, -1, color
+            cv2.drawContours(border_mask, contours, -1, color + (255,), 8)
             border_img = Image.fromarray(border_mask)
             pil_image = Image.alpha_composite(pil_image, border_img)

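The overlay itself is standard PIL alpha compositing: write the class color plus a roughly 70% alpha into an RGBA layer wherever the mask is set, then composite that layer over the frame. A self-contained sketch with illustrative values:

import numpy as np
from PIL import Image

frame = np.full((64, 64, 3), 200, dtype=np.uint8)   # plain gray frame
mask = np.zeros((64, 64), dtype=np.uint8)
mask[16:48, 16:48] = 1                               # square "detection"
color = (255, 0, 0)

pil_image = Image.fromarray(frame).convert("RGBA")
np_mask = np.zeros((64, 64, 4))
np_mask[mask > 0, :] = color + (255 * 0.7,)          # RGB plus ~70% alpha
mask_img = Image.fromarray(np_mask.astype(np.uint8))
out = Image.alpha_composite(pil_image, mask_img)
out = out.convert("RGB")                             # drop alpha before saving as e.g. JPEG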
@@ -2957,7 +2988,7 @@ def overlay_segmentation_masks(
             )
             if x != 0 and y != 0:
                 text_box = draw.textbbox((x, y), text=text, font=font)
-                draw.rectangle((x, y, text_box[2], text_box[3]), fill=color
+                draw.rectangle((x, y, text_box[2], text_box[3]), fill=color)
                 draw.text((x, y), text, fill="black", font=font)
         frame_out.append(np.array(pil_image))
     return_frame = frame_out[0] if len(frame_out) == 1 else frame_out
@@ -3014,6 +3045,7 @@ def _plot_counting(
     image: Image.Image,
     bboxes: List[Dict[str, Any]],
     colors: Dict[str, Tuple[int, int, int]],
+    use_tracking_label: bool = False,
 ) -> Image.Image:
     width, height = image.size
     fontsize = max(12, int(min(width, height) / 40))
@@ -3023,7 +3055,12 @@ def _plot_counting(
         fontsize,
     )
     for i, elt in enumerate(bboxes, 1):
-
+        if use_tracking_label:
+            label = elt["label"].split(":")[0]
+            color = colors[elt["label"].split(":")[1].strip()]
+        else:
+            label = f"{i}"
+            color = colors[elt["label"]]
         box = elt["bbox"]

         # denormalize the box if it is normalized
@@ -3044,7 +3081,7 @@ def _plot_counting(
         text_y1 = cy + text_height / 2

         # Draw the rectangle encapsulating the text
-        draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=
+        draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=color)

         # Draw the text at the center of the bounding box
         draw.text(
vision_agent/utils/video_tracking.py CHANGED

@@ -3,10 +3,10 @@ from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple

 import numpy as np
+from scipy.optimize import linear_sum_assignment  # type: ignore

 from vision_agent.tools.tool_utils import (
     add_bboxes_from_masks,
-    nms,
     send_task_inference_request,
 )
 from vision_agent.utils.image_utils import denormalize_bbox, rle_decode_array
@@ -171,63 +171,45 @@ def _calculate_mask_iou(mask1: np.ndarray, mask2: np.ndarray) -> float:
 def _match_by_iou(
     first_param: List[Dict],
     second_param: List[Dict],
-
-
-
-
-
-
-
-    for
-
-
-    for existing_item in first_param:
+    max_id: int,
+    iou_threshold: float = 0.05,
+) -> Tuple[Dict[int, int], int]:
+    max_first_id = max((item["id"] for item in first_param), default=0)
+    max_second_id = max((item["id"] for item in second_param), default=0)
+
+    cost_matrix = np.ones((max_first_id + 1, max_second_id + 1))
+    for first_item in first_param:
+        for second_item in second_param:
             iou = _calculate_mask_iou(
-
+                first_item["decoded_mask"], second_item["decoded_mask"]
             )
-
-            matched_id = existing_item["id"]
-            matched_new_item_indices.add(new_index)
-            id_mapping[new_item["id"]] = matched_id
-            break
-
-        if matched_id:
-            new_item["id"] = matched_id
-        else:
-            max_id += 1
-            id_mapping[new_item["id"]] = max_id
-            new_item["id"] = max_id
-
-    unmatched_items = [
-        item for i, item in enumerate(second_param) if i not in matched_new_item_indices
-    ]
-    combined_list = first_param + unmatched_items
-
-    return combined_list, id_mapping
+            cost_matrix[first_item["id"], second_item["id"]] = 1 - iou

+    row_ind, col_ind = linear_sum_assignment(cost_matrix)
+    id_mapping = {second_id: first_id for first_id, second_id in zip(row_ind, col_ind)}
+    first_id_to_label = {item["id"]: item["label"] for item in first_param}

-
-    for
-
-
-
+    cleaned_mapping = {}
+    for elt in second_param:
+        second_id = elt["id"]
+        # if the id is not in the mapping, give it a new id
+        if second_id not in id_mapping:
+            max_id += 1
+            cleaned_mapping[second_id] = max_id
+        else:
+            first_id = id_mapping[second_id]
+            iou = 1 - cost_matrix[first_id, second_id]
+            # only map if the iou is above the threshold and the labels match
+            if iou > iou_threshold and first_id_to_label[first_id] == elt["label"]:
+                cleaned_mapping[second_id] = first_id
             else:
-
-
-            id_mapping[detection["id"]] = detection["id"]
+                max_id += 1
+                cleaned_mapping[second_id] = max_id

+    return cleaned_mapping, max_id

-def _convert_to_2d(detections_per_segment: List[Any]) -> List[Any]:
-    result = []
-    for i, segment in enumerate(detections_per_segment):
-        if i == 0:
-            result.extend(segment)
-        else:
-            result.extend(segment[1:])
-    return result

-
-def merge_segments(detections_per_segment: List[Any]) -> List[Any]:
+def merge_segments(detections_per_segment: List[Any], overlap: int = 1) -> List[Any]:
     """
     Merges detections from all segments into a unified result.

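The rewrite replaces the old greedy first-match loop with a globally optimal assignment: each previous-segment/next-segment track pair gets cost 1 - IoU, and scipy.optimize.linear_sum_assignment picks the pairing with minimum total cost. A toy illustration:

import numpy as np
from scipy.optimize import linear_sum_assignment

# cost[i, j] = 1 - IoU between track i in the previous segment
# and track j in the next segment (values made up for the example)
cost_matrix = np.array(
    [
        [0.1, 0.9, 0.8],  # previous track 0 overlaps next track 0 strongly
        [0.9, 0.2, 0.7],  # previous track 1 overlaps next track 1
    ]
)
row_ind, col_ind = linear_sum_assignment(cost_matrix)
id_mapping = {int(j): int(i) for i, j in zip(row_ind, col_ind)}
print(id_mapping)  # {0: 0, 1: 1}; next track 2 is unmatched and gets a fresh id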
@@ -242,16 +224,20 @@ def merge_segments(detections_per_segment: List[Any]) -> List[Any]:
         for item in detection:
             item["decoded_mask"] = rle_decode_array(item["mask"])

+    merged_result = detections_per_segment[0]
+    max_id = max((item["id"] for item in merged_result[-1]), default=0)
     for segment_idx in range(len(detections_per_segment) - 1):
-
+        id_mapping, max_id = _match_by_iou(
             detections_per_segment[segment_idx][-1],
             detections_per_segment[segment_idx + 1][0],
+            max_id,
         )
-
-
-
+        for frame in detections_per_segment[segment_idx + 1][overlap:]:
+            for detection in frame:
+                detection["id"] = id_mapping[detection["id"]]
+        merged_result.extend(detections_per_segment[segment_idx + 1][overlap:])

-    return merged_result
+    return merged_result  # type: ignore


 def post_process(
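_calculate_mask_iou is unchanged and not shown in this diff; for reference, a sketch of the standard mask-IoU definition it presumably implements:

import numpy as np

def mask_iou(mask1: np.ndarray, mask2: np.ndarray) -> float:
    # intersection-over-union of two binary masks
    intersection = np.logical_and(mask1, mask2).sum()
    union = np.logical_or(mask1, mask2).sum()
    return float(intersection / union) if union > 0 else 0.0

a = np.zeros((4, 4), dtype=bool); a[:2, :] = True   # top two rows
b = np.zeros((4, 4), dtype=bool); b[1:3, :] = True  # middle two rows
print(mask_iou(a, b))  # 4 / 12 ≈ 0.333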
@@ -269,10 +255,26 @@ def post_process(
         Dict[str, Any]: Post-processed data including return_data and display_data.
     """
     return_data = []
-
+    label_remapping = {}
+    for _, frame in enumerate(merged_detections):
         return_frame_data = []
         for detection in frame:
-            label =
+            label = detection["label"]
+            id = detection["id"]
+
+            # Remap label IDs so for each label the IDs restart at 1. This makes it
+            # easier to count the number of instances per label.
+            if label not in label_remapping:
+                label_remapping[label] = {"max": 1, "remap": {id: 1}}
+            elif label in label_remapping and id not in label_remapping[label]["remap"]:  # type: ignore
+                max_id = label_remapping[label]["max"]
+                max_id += 1  # type: ignore
+                label_remapping[label]["remap"][id] = max_id  # type: ignore
+                label_remapping[label]["max"] = max_id
+
+            new_id = label_remapping[label]["remap"][id]  # type: ignore
+
+            label = f"{new_id}: {detection['label']}"
             return_frame_data.append(
                 {
                     "label": label,
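The remapping distilled: raw track IDs restart at 1 within each label, so the display label doubles as a per-class instance counter. A runnable miniature on made-up detections:

detections = [
    {"id": 7, "label": "person"},
    {"id": 9, "label": "person"},
    {"id": 8, "label": "car"},
    {"id": 7, "label": "person"},  # same track as the first detection
]

label_remapping: dict = {}
for det in detections:
    label, id_ = det["label"], det["id"]
    if label not in label_remapping:
        label_remapping[label] = {"max": 1, "remap": {id_: 1}}
    elif id_ not in label_remapping[label]["remap"]:
        label_remapping[label]["max"] += 1
        label_remapping[label]["remap"][id_] = label_remapping[label]["max"]
    det["display"] = f"{label_remapping[label]['remap'][id_]}: {label}"

print([d["display"] for d in detections])
# ['1: person', '2: person', '1: car', '1: person']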
@@ -285,7 +287,6 @@ def post_process(
         return_data.append(return_frame_data)

     return_data = add_bboxes_from_masks(return_data)
-    return_data = nms(return_data, iou_threshold=0.95)

     # We save the RLE for display purposes, re-calculting RLE can get very expensive.
     # Deleted here because we are returning the numpy masks instead
{vision_agent-0.2.233.dist-info → vision_agent-0.2.235.dist-info}/RECORD CHANGED

@@ -13,11 +13,11 @@ vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=NUMWq-Lxq5JmmyWs3C5O_
 vision_agent/agent/vision_agent_coder_v2.py,sha256=yQYcO0s4BI9pWaAQQAVtkwWa3UF5w0iLKvwpeJ6iegM,17077
 vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
 vision_agent/agent/vision_agent_planner_prompts.py,sha256=rYRdJthc-sQN57VgCBKrF09Sd73BSxcBdjNe6C4WNZ8,6837
-vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=
-vision_agent/agent/vision_agent_planner_v2.py,sha256=
+vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=JLiFZvwQFLYukgj4l-SzxSIjmJHTEkL2HSZbkjU529w,35591
+vision_agent/agent/vision_agent_planner_v2.py,sha256=wISmdTN-W1pjgZg3_aneGowI3lRQRHTSbpyeTJ79O5A,20645
 vision_agent/agent/vision_agent_prompts.py,sha256=KaJwYPUP7_GvQsCPPs6Fdawmi3AQWmWajBUuzj7gTG4,13812
 vision_agent/agent/vision_agent_prompts_v2.py,sha256=Wyxa15NOe75PefAfw3_RRwvgjg8YVqCrU7WvvWoYJpk,2733
-vision_agent/agent/vision_agent_v2.py,sha256=
+vision_agent/agent/vision_agent_v2.py,sha256=Q96YsLovCUOd6m2Cg7EGCiHshDq65vxHsfVj7IToyls,10936
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -35,8 +35,8 @@ vision_agent/tools/__init__.py,sha256=zopUrANPx7p0NGy6BxmEaYhDrj8DX8w7BLfgmCbz-m
 vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
 vision_agent/tools/planner_tools.py,sha256=8pJZCGGOGIqGiV2or52BjyRP6eDlporuQ2hXCIHfLTQ,15382
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tool_utils.py,sha256=l4oWkgPd_s8QzXqqbrLwPgcfnhsJaPpdMKikOcwRaoQ,10396
+vision_agent/tools/tools.py,sha256=MBeFVYyCx-QQLCFb0Cn4m6SgmT6-6HxUOmCKAARrv6s,108547
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -45,8 +45,8 @@ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16N
 vision_agent/utils/sim.py,sha256=DYya76dYVtifFyXilMLxBzGgyfyeqhEwU4RJ4894lCI,9796
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
-vision_agent/utils/video_tracking.py,sha256=
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent/utils/video_tracking.py,sha256=PXZYB0ZJM97WU52XbucyoXX2GW9-gNpSHrPF30_Lq1Q,10263
+vision_agent-0.2.235.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.235.dist-info/METADATA,sha256=bAPsRs8veydvltpYRLYxAOB_bhASv9zKkE9TjkPVm2Q,5760
+vision_agent-0.2.235.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.235.dist-info/RECORD,,
{vision_agent-0.2.233.dist-info → vision_agent-0.2.235.dist-info}/LICENSE: file without changes
{vision_agent-0.2.233.dist-info → vision_agent-0.2.235.dist-info}/WHEEL: file without changes