vision-agent 0.2.226__py3-none-any.whl → 0.2.227__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +140 -0
- vision_agent/utils/video_tracking.py +1 -0
- {vision_agent-0.2.226.dist-info → vision_agent-0.2.227.dist-info}/METADATA +1 -1
- {vision_agent-0.2.226.dist-info → vision_agent-0.2.227.dist-info}/RECORD +7 -7
- {vision_agent-0.2.226.dist-info → vision_agent-0.2.227.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.226.dist-info → vision_agent-0.2.227.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -290,6 +290,13 @@ def od_sam2_video_tracking(
|
|
290
290
|
)
|
291
291
|
function_name = "florence2_object_detection"
|
292
292
|
|
293
|
+
elif od_model == ODModels.CUSTOM:
|
294
|
+
segment_results = custom_object_detection(
|
295
|
+
deployment_id=fine_tune_id,
|
296
|
+
image=segment_frames[frame_number],
|
297
|
+
)
|
298
|
+
function_name = "custom_object_detection"
|
299
|
+
|
293
300
|
else:
|
294
301
|
raise NotImplementedError(
|
295
302
|
f"Object detection model '{od_model}' is not implemented."
|
@@ -1217,6 +1224,139 @@ def countgd_visual_prompt_object_detection(
|
|
1217
1224
|
return bboxes_formatted
|
1218
1225
|
|
1219
1226
|
|
1227
|
+
def custom_object_detection(
|
1228
|
+
deployment_id: str,
|
1229
|
+
image: np.ndarray,
|
1230
|
+
box_threshold: float = 0.1,
|
1231
|
+
) -> List[Dict[str, Any]]:
|
1232
|
+
"""'custom_object_detection' is a tool that can detect instances of an
|
1233
|
+
object given a deployment_id of a previously finetuned object detection model.
|
1234
|
+
It is particularly useful when trying to detect objects that are not well detected by generalist models.
|
1235
|
+
It returns a list of bounding boxes with normalized
|
1236
|
+
coordinates, label names and associated confidence scores.
|
1237
|
+
|
1238
|
+
Parameters:
|
1239
|
+
deployment_id (str): The id of the finetuned model.
|
1240
|
+
image (np.ndarray): The image that contains instances of the object.
|
1241
|
+
box_threshold (float, optional): The threshold for detection. Defaults
|
1242
|
+
to 0.1.
|
1243
|
+
|
1244
|
+
Returns:
|
1245
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
1246
|
+
bounding box of the detected objects with normalized coordinates between 0
|
1247
|
+
and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
|
1248
|
+
top-left and xmax and ymax are the coordinates of the bottom-right of the
|
1249
|
+
bounding box.
|
1250
|
+
|
1251
|
+
Example
|
1252
|
+
-------
|
1253
|
+
>>> custom_object_detection("abcd1234-5678efg", image)
|
1254
|
+
[
|
1255
|
+
{'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
1256
|
+
{'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5]},
|
1257
|
+
{'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52]},
|
1258
|
+
{'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
|
1259
|
+
]
|
1260
|
+
"""
|
1261
|
+
image_size = image.shape[:2]
|
1262
|
+
if image_size[0] < 1 or image_size[1] < 1:
|
1263
|
+
return []
|
1264
|
+
|
1265
|
+
files = [("image", numpy_to_bytes(image))]
|
1266
|
+
payload = {
|
1267
|
+
"deployment_id": deployment_id,
|
1268
|
+
"confidence": box_threshold,
|
1269
|
+
}
|
1270
|
+
detections: List[List[Dict[str, Any]]] = send_inference_request(
|
1271
|
+
payload, "custom-object-detection", files=files, v2=True
|
1272
|
+
)
|
1273
|
+
|
1274
|
+
bboxes = detections[0]
|
1275
|
+
bboxes_formatted = [
|
1276
|
+
{
|
1277
|
+
"label": bbox["label"],
|
1278
|
+
"bbox": normalize_bbox(bbox["bounding_box"], image_size),
|
1279
|
+
"score": bbox["score"],
|
1280
|
+
}
|
1281
|
+
for bbox in bboxes
|
1282
|
+
]
|
1283
|
+
display_data = [
|
1284
|
+
{
|
1285
|
+
"label": bbox["label"],
|
1286
|
+
"bbox": bbox["bounding_box"],
|
1287
|
+
"score": bbox["score"],
|
1288
|
+
}
|
1289
|
+
for bbox in bboxes
|
1290
|
+
]
|
1291
|
+
|
1292
|
+
_display_tool_trace(
|
1293
|
+
custom_object_detection.__name__,
|
1294
|
+
payload,
|
1295
|
+
display_data,
|
1296
|
+
files,
|
1297
|
+
)
|
1298
|
+
return bboxes_formatted
|
1299
|
+
|
1300
|
+
|
1301
|
+
def custom_od_sam2_video_tracking(
|
1302
|
+
deployment_id: str,
|
1303
|
+
frames: List[np.ndarray],
|
1304
|
+
chunk_length: Optional[int] = 10,
|
1305
|
+
) -> List[List[Dict[str, Any]]]:
|
1306
|
+
"""'custom_od_sam2_video_tracking' is a tool that can segment multiple objects given a
|
1307
|
+
custom model with predefined category names.
|
1308
|
+
It returns a list of bounding boxes, label names,
|
1309
|
+
masks and associated probability scores.
|
1310
|
+
|
1311
|
+
Parameters:
|
1312
|
+
deployment_id (str): The id of the deployed custom model.
|
1313
|
+
frames (List[np.ndarray]): The list of frames to detect and track objects in.
|
1314
|
+
chunk_length (Optional[int]): The number of frames to re-run the custom model to find
|
1315
|
+
new objects.
|
1316
|
+
|
1317
|
+
Returns:
|
1318
|
+
List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the score, label,
|
1319
|
+
bounding box, and mask of the detected objects with normalized coordinates
|
1320
|
+
(xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
|
1321
|
+
and xmax and ymax are the coordinates of the bottom-right of the bounding box.
|
1322
|
+
The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates
|
1323
|
+
the background.
|
1324
|
+
|
1325
|
+
Example
|
1326
|
+
-------
|
1327
|
+
>>> custom_od_sam2_video_tracking("abcd1234-5678efg", frames)
|
1328
|
+
[
|
1329
|
+
[
|
1330
|
+
{
|
1331
|
+
'label': '0: dinosaur',
|
1332
|
+
'bbox': [0.1, 0.11, 0.35, 0.4],
|
1333
|
+
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
1334
|
+
[0, 0, 0, ..., 0, 0, 0],
|
1335
|
+
...,
|
1336
|
+
[0, 0, 0, ..., 0, 0, 0],
|
1337
|
+
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
1338
|
+
},
|
1339
|
+
],
|
1340
|
+
...
|
1341
|
+
]
|
1342
|
+
"""
|
1343
|
+
|
1344
|
+
ret = od_sam2_video_tracking(
|
1345
|
+
ODModels.CUSTOM,
|
1346
|
+
prompt="",
|
1347
|
+
frames=frames,
|
1348
|
+
chunk_length=chunk_length,
|
1349
|
+
fine_tune_id=deployment_id,
|
1350
|
+
)
|
1351
|
+
_display_tool_trace(
|
1352
|
+
custom_od_sam2_video_tracking.__name__,
|
1353
|
+
{},
|
1354
|
+
ret["display_data"],
|
1355
|
+
ret["files"],
|
1356
|
+
)
|
1357
|
+
return ret["return_data"] # type: ignore
|
1358
|
+
|
1359
|
+
|
1220
1360
|
def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
|
1221
1361
|
"""'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
|
1222
1362
|
images including regular images or images of documents or presentations. It can be
|
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
26
26
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
27
27
|
vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
|
28
28
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
29
|
-
vision_agent/tools/__init__.py,sha256=
|
29
|
+
vision_agent/tools/__init__.py,sha256=fcucnAzr5Hue9xSqpBgA7RcRJP2CgAgQJ31p_R5lg-I,2794
|
30
30
|
vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
|
31
31
|
vision_agent/tools/planner_tools.py,sha256=qQvPuCif-KbFi7KsXKkTCfpgEQEJJ6oq6WB3gOuG2Xg,13686
|
32
32
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
33
33
|
vision_agent/tools/tool_utils.py,sha256=q9cqXO2AvigUdO1krjnOy8o0goYhgS6eILl6-F5Kxyk,10211
|
34
|
-
vision_agent/tools/tools.py,sha256=
|
34
|
+
vision_agent/tools/tools.py,sha256=36f0qAhQfA5lDhYv5BKpHfHgBVEBgOD-XNVHG5K4HLY,96619
|
35
35
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
36
36
|
vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
|
37
37
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -40,8 +40,8 @@ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16N
|
|
40
40
|
vision_agent/utils/sim.py,sha256=qr-6UWAxxGwtwIAKZjZCY_pu9VwBI_TTB8bfrGsaABg,9282
|
41
41
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
42
42
|
vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
|
43
|
-
vision_agent/utils/video_tracking.py,sha256=
|
44
|
-
vision_agent-0.2.
|
45
|
-
vision_agent-0.2.
|
46
|
-
vision_agent-0.2.
|
47
|
-
vision_agent-0.2.
|
43
|
+
vision_agent/utils/video_tracking.py,sha256=7ZiFBqQRTid5ytPmkrAGQUiVMr-twzib8Ha2hN3JsR0,9474
|
44
|
+
vision_agent-0.2.227.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
45
|
+
vision_agent-0.2.227.dist-info/METADATA,sha256=qFefkLzCo7G98LyhIPqYzPOUv5nyvOK84DJvUWmeqcc,20039
|
46
|
+
vision_agent-0.2.227.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
47
|
+
vision_agent-0.2.227.dist-info/RECORD,,
|
File without changes
|
File without changes
|