vision-agent 0.2.226__py3-none-any.whl → 0.2.227__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +140 -0
- vision_agent/utils/video_tracking.py +1 -0
- {vision_agent-0.2.226.dist-info → vision_agent-0.2.227.dist-info}/METADATA +1 -1
- {vision_agent-0.2.226.dist-info → vision_agent-0.2.227.dist-info}/RECORD +7 -7
- {vision_agent-0.2.226.dist-info → vision_agent-0.2.227.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.226.dist-info → vision_agent-0.2.227.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -290,6 +290,13 @@ def od_sam2_video_tracking(
|
|
290
290
|
)
|
291
291
|
function_name = "florence2_object_detection"
|
292
292
|
|
293
|
+
elif od_model == ODModels.CUSTOM:
|
294
|
+
segment_results = custom_object_detection(
|
295
|
+
deployment_id=fine_tune_id,
|
296
|
+
image=segment_frames[frame_number],
|
297
|
+
)
|
298
|
+
function_name = "custom_object_detection"
|
299
|
+
|
293
300
|
else:
|
294
301
|
raise NotImplementedError(
|
295
302
|
f"Object detection model '{od_model}' is not implemented."
|
@@ -1217,6 +1224,139 @@ def countgd_visual_prompt_object_detection(
|
|
1217
1224
|
return bboxes_formatted
|
1218
1225
|
|
1219
1226
|
|
1227
|
+
def custom_object_detection(
|
1228
|
+
deployment_id: str,
|
1229
|
+
image: np.ndarray,
|
1230
|
+
box_threshold: float = 0.1,
|
1231
|
+
) -> List[Dict[str, Any]]:
|
1232
|
+
"""'custom_object_detection' is a tool that can detect instances of an
|
1233
|
+
object given a deployment_id of a previously finetuned object detection model.
|
1234
|
+
It is particularly useful when trying to detect objects that are not well detected by generalist models.
|
1235
|
+
It returns a list of bounding boxes with normalized
|
1236
|
+
coordinates, label names and associated confidence scores.
|
1237
|
+
|
1238
|
+
Parameters:
|
1239
|
+
deployment_id (str): The id of the finetuned model.
|
1240
|
+
image (np.ndarray): The image that contains instances of the object.
|
1241
|
+
box_threshold (float, optional): The threshold for detection. Defaults
|
1242
|
+
to 0.1.
|
1243
|
+
|
1244
|
+
Returns:
|
1245
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
1246
|
+
bounding box of the detected objects with normalized coordinates between 0
|
1247
|
+
and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
|
1248
|
+
top-left and xmax and ymax are the coordinates of the bottom-right of the
|
1249
|
+
bounding box.
|
1250
|
+
|
1251
|
+
Example
|
1252
|
+
-------
|
1253
|
+
>>> custom_object_detection("abcd1234-5678efg", image)
|
1254
|
+
[
|
1255
|
+
{'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
1256
|
+
{'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5]},
|
1257
|
+
{'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52]},
|
1258
|
+
{'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
|
1259
|
+
]
|
1260
|
+
"""
|
1261
|
+
image_size = image.shape[:2]
|
1262
|
+
if image_size[0] < 1 or image_size[1] < 1:
|
1263
|
+
return []
|
1264
|
+
|
1265
|
+
files = [("image", numpy_to_bytes(image))]
|
1266
|
+
payload = {
|
1267
|
+
"deployment_id": deployment_id,
|
1268
|
+
"confidence": box_threshold,
|
1269
|
+
}
|
1270
|
+
detections: List[List[Dict[str, Any]]] = send_inference_request(
|
1271
|
+
payload, "custom-object-detection", files=files, v2=True
|
1272
|
+
)
|
1273
|
+
|
1274
|
+
bboxes = detections[0]
|
1275
|
+
bboxes_formatted = [
|
1276
|
+
{
|
1277
|
+
"label": bbox["label"],
|
1278
|
+
"bbox": normalize_bbox(bbox["bounding_box"], image_size),
|
1279
|
+
"score": bbox["score"],
|
1280
|
+
}
|
1281
|
+
for bbox in bboxes
|
1282
|
+
]
|
1283
|
+
display_data = [
|
1284
|
+
{
|
1285
|
+
"label": bbox["label"],
|
1286
|
+
"bbox": bbox["bounding_box"],
|
1287
|
+
"score": bbox["score"],
|
1288
|
+
}
|
1289
|
+
for bbox in bboxes
|
1290
|
+
]
|
1291
|
+
|
1292
|
+
_display_tool_trace(
|
1293
|
+
custom_object_detection.__name__,
|
1294
|
+
payload,
|
1295
|
+
display_data,
|
1296
|
+
files,
|
1297
|
+
)
|
1298
|
+
return bboxes_formatted
|
1299
|
+
|
1300
|
+
|
1301
|
+
def custom_od_sam2_video_tracking(
|
1302
|
+
deployment_id: str,
|
1303
|
+
frames: List[np.ndarray],
|
1304
|
+
chunk_length: Optional[int] = 10,
|
1305
|
+
) -> List[List[Dict[str, Any]]]:
|
1306
|
+
"""'custom_od_sam2_video_tracking' is a tool that can segment multiple objects given a
|
1307
|
+
custom model with predefined category names.
|
1308
|
+
It returns a list of bounding boxes, label names,
|
1309
|
+
mask file names and associated probability scores.
|
1310
|
+
|
1311
|
+
Parameters:
|
1312
|
+
deployment_id (str): The id of the deployed custom model.
|
1313
|
+
image (np.ndarray): The image to ground the prompt to.
|
1314
|
+
chunk_length (Optional[int]): The number of frames to re-run florence2 to find
|
1315
|
+
new objects.
|
1316
|
+
|
1317
|
+
Returns:
|
1318
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label,
|
1319
|
+
bounding box, and mask of the detected objects with normalized coordinates
|
1320
|
+
(xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
|
1321
|
+
and xmax and ymax are the coordinates of the bottom-right of the bounding box.
|
1322
|
+
The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
|
1323
|
+
the background.
|
1324
|
+
|
1325
|
+
Example
|
1326
|
+
-------
|
1327
|
+
>>> custom_od_sam2_video_tracking("abcd1234-5678efg", frames)
|
1328
|
+
[
|
1329
|
+
[
|
1330
|
+
{
|
1331
|
+
'label': '0: dinosaur',
|
1332
|
+
'bbox': [0.1, 0.11, 0.35, 0.4],
|
1333
|
+
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
1334
|
+
[0, 0, 0, ..., 0, 0, 0],
|
1335
|
+
...,
|
1336
|
+
[0, 0, 0, ..., 0, 0, 0],
|
1337
|
+
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
1338
|
+
},
|
1339
|
+
],
|
1340
|
+
...
|
1341
|
+
]
|
1342
|
+
"""
|
1343
|
+
|
1344
|
+
ret = od_sam2_video_tracking(
|
1345
|
+
ODModels.CUSTOM,
|
1346
|
+
prompt="",
|
1347
|
+
frames=frames,
|
1348
|
+
chunk_length=chunk_length,
|
1349
|
+
fine_tune_id=deployment_id,
|
1350
|
+
)
|
1351
|
+
_display_tool_trace(
|
1352
|
+
custom_od_sam2_video_tracking.__name__,
|
1353
|
+
{},
|
1354
|
+
ret["display_data"],
|
1355
|
+
ret["files"],
|
1356
|
+
)
|
1357
|
+
return ret["return_data"] # type: ignore
|
1358
|
+
|
1359
|
+
|
1220
1360
|
def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
|
1221
1361
|
"""'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
|
1222
1362
|
images including regular images or images of documents or presentations. It can be
|
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
26
26
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
27
27
|
vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
|
28
28
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
29
|
-
vision_agent/tools/__init__.py,sha256=
|
29
|
+
vision_agent/tools/__init__.py,sha256=fcucnAzr5Hue9xSqpBgA7RcRJP2CgAgQJ31p_R5lg-I,2794
|
30
30
|
vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
|
31
31
|
vision_agent/tools/planner_tools.py,sha256=qQvPuCif-KbFi7KsXKkTCfpgEQEJJ6oq6WB3gOuG2Xg,13686
|
32
32
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
33
33
|
vision_agent/tools/tool_utils.py,sha256=q9cqXO2AvigUdO1krjnOy8o0goYhgS6eILl6-F5Kxyk,10211
|
34
|
-
vision_agent/tools/tools.py,sha256=
|
34
|
+
vision_agent/tools/tools.py,sha256=36f0qAhQfA5lDhYv5BKpHfHgBVEBgOD-XNVHG5K4HLY,96619
|
35
35
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
36
36
|
vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
|
37
37
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -40,8 +40,8 @@ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16N
|
|
40
40
|
vision_agent/utils/sim.py,sha256=qr-6UWAxxGwtwIAKZjZCY_pu9VwBI_TTB8bfrGsaABg,9282
|
41
41
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
42
42
|
vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
|
43
|
-
vision_agent/utils/video_tracking.py,sha256=
|
44
|
-
vision_agent-0.2.
|
45
|
-
vision_agent-0.2.
|
46
|
-
vision_agent-0.2.
|
47
|
-
vision_agent-0.2.
|
43
|
+
vision_agent/utils/video_tracking.py,sha256=7ZiFBqQRTid5ytPmkrAGQUiVMr-twzib8Ha2hN3JsR0,9474
|
44
|
+
vision_agent-0.2.227.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
45
|
+
vision_agent-0.2.227.dist-info/METADATA,sha256=qFefkLzCo7G98LyhIPqYzPOUv5nyvOK84DJvUWmeqcc,20039
|
46
|
+
vision_agent-0.2.227.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
47
|
+
vision_agent-0.2.227.dist-info/RECORD,,
|
File without changes
|
File without changes
|