vision-agent 0.2.226__py3-none-any.whl → 0.2.227__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -63,6 +63,7 @@ from .tools import (
63
63
  video_temporal_localization,
64
64
  vit_image_classification,
65
65
  vit_nsfw_classification,
66
+ custom_object_detection,
66
67
  )
67
68
 
68
69
  __new_tools__ = [
@@ -290,6 +290,13 @@ def od_sam2_video_tracking(
290
290
  )
291
291
  function_name = "florence2_object_detection"
292
292
 
293
+ elif od_model == ODModels.CUSTOM:
294
+ segment_results = custom_object_detection(
295
+ deployment_id=fine_tune_id,
296
+ image=segment_frames[frame_number],
297
+ )
298
+ function_name = "custom_object_detection"
299
+
293
300
  else:
294
301
  raise NotImplementedError(
295
302
  f"Object detection model '{od_model}' is not implemented."
@@ -1217,6 +1224,139 @@ def countgd_visual_prompt_object_detection(
1217
1224
  return bboxes_formatted
1218
1225
 
1219
1226
 
1227
def custom_object_detection(
    deployment_id: str,
    image: np.ndarray,
    box_threshold: float = 0.1,
) -> List[Dict[str, Any]]:
    """'custom_object_detection' is a tool that detects instances of an object using
    a previously finetuned object detection model, selected by its deployment_id.
    It is particularly useful for objects that generalist models do not detect well.
    It returns a list of bounding boxes with normalized coordinates, label names
    and associated confidence scores.

    Parameters:
        deployment_id (str): The id of the finetuned model.
        image (np.ndarray): The image that contains instances of the object.
        box_threshold (float, optional): The threshold for detection. Defaults
            to 0.1.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
            bounding box of the detected objects with normalized coordinates between 0
            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
            top-left and xmax and ymax are the coordinates of the bottom-right of the
            bounding box. An empty list is returned when the image has no rows or no
            columns.

    Example
    -------
        >>> custom_object_detection("abcd1234-5678efg", image)
        [
            {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
            {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5]},
            {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52]},
            {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
        ]
    """
    image_size = image.shape[:2]
    # Nothing to detect on a degenerate image; skip the remote call entirely.
    if not (image_size[0] >= 1 and image_size[1] >= 1):
        return []

    request_files = [("image", numpy_to_bytes(image))]
    request_payload = {"deployment_id": deployment_id, "confidence": box_threshold}
    response: List[List[Dict[str, Any]]] = send_inference_request(
        request_payload, "custom-object-detection", files=request_files, v2=True
    )

    # Only the first entry of the response is used (a single image was sent).
    raw_boxes = response[0]

    results: List[Dict[str, Any]] = []
    trace_rows: List[Dict[str, Any]] = []
    for det in raw_boxes:
        # Returned boxes are normalized against the image size ...
        results.append(
            {
                "label": det["label"],
                "bbox": normalize_bbox(det["bounding_box"], image_size),
                "score": det["score"],
            }
        )
        # ... while the tool trace keeps the box exactly as the service sent it.
        trace_rows.append(
            {
                "label": det["label"],
                "bbox": det["bounding_box"],
                "score": det["score"],
            }
        )

    _display_tool_trace(
        custom_object_detection.__name__,
        request_payload,
        trace_rows,
        request_files,
    )
    return results
1299
+
1300
+
1301
def custom_od_sam2_video_tracking(
    deployment_id: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
) -> List[List[Dict[str, Any]]]:
    """'custom_od_sam2_video_tracking' is a tool that can segment multiple objects in
    a sequence of video frames given a custom model with predefined category names.
    It returns, for the given frames, a list of lists of detected objects with their
    label names, bounding boxes and masks.

    Parameters:
        deployment_id (str): The id of the deployed custom model.
        frames (List[np.ndarray]): The list of video frames to detect and segment
            objects in.
        chunk_length (Optional[int]): The number of frames after which the custom
            object detection model is re-run to find new objects. Defaults to 10.

    Returns:
        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
            label, bounding box, and mask of the detected objects with normalized
            coordinates (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
            of the top-left and xmax and ymax are the coordinates of the bottom-right
            of the bounding box. The mask is binary 2D numpy array where 1 indicates
            the object and 0 indicates the background.

    Example
    -------
        >>> custom_od_sam2_video_tracking("abcd1234-5678efg", frames)
        [
            [
                {
                    'label': '0: dinosaur',
                    'bbox': [0.1, 0.11, 0.35, 0.4],
                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0],
                        ...,
                        [0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                },
            ],
            ...
        ]
    """
    # NOTE(review): the previous docstring also promised a 'score' per object and
    # "mask file names"; the example shows neither. Confirm against what
    # od_sam2_video_tracking actually puts in "return_data" before documenting them.
    # NOTE(review): the outer list is presumably one entry per input frame - confirm.

    # The custom model has its categories baked in, so no text prompt is needed;
    # the deployment id is forwarded through the generic fine_tune_id argument.
    ret = od_sam2_video_tracking(
        ODModels.CUSTOM,
        prompt="",
        frames=frames,
        chunk_length=chunk_length,
        fine_tune_id=deployment_id,
    )
    _display_tool_trace(
        custom_od_sam2_video_tracking.__name__,
        {},
        ret["display_data"],
        ret["files"],
    )
    return ret["return_data"]  # type: ignore
1358
+
1359
+
1220
1360
  def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
1221
1361
  """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
1222
1362
  images including regular images or images of documents or presentations. It can be
@@ -17,6 +17,7 @@ class ODModels(str, Enum):
17
17
  COUNTGD = "countgd"
18
18
  FLORENCE2 = "florence2"
19
19
  OWLV2 = "owlv2"
20
+ CUSTOM = "custom"
20
21
 
21
22
 
22
23
  def split_frames_into_segments(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.226
3
+ Version: 0.2.227
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
26
26
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
27
27
  vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
28
28
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
29
- vision_agent/tools/__init__.py,sha256=15O7eQVn0bitmzUO5OxKdA618PoiLt6Z02gmKsSNMFM,2765
29
+ vision_agent/tools/__init__.py,sha256=fcucnAzr5Hue9xSqpBgA7RcRJP2CgAgQJ31p_R5lg-I,2794
30
30
  vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
31
31
  vision_agent/tools/planner_tools.py,sha256=qQvPuCif-KbFi7KsXKkTCfpgEQEJJ6oq6WB3gOuG2Xg,13686
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
33
  vision_agent/tools/tool_utils.py,sha256=q9cqXO2AvigUdO1krjnOy8o0goYhgS6eILl6-F5Kxyk,10211
34
- vision_agent/tools/tools.py,sha256=zqoo4ml9ZS99kOeOIN6Zplq7pxOwBrVZKKFUVIzsjfw,91712
34
+ vision_agent/tools/tools.py,sha256=36f0qAhQfA5lDhYv5BKpHfHgBVEBgOD-XNVHG5K4HLY,96619
35
35
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
36
36
  vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
37
37
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,8 +40,8 @@ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16N
40
40
  vision_agent/utils/sim.py,sha256=qr-6UWAxxGwtwIAKZjZCY_pu9VwBI_TTB8bfrGsaABg,9282
41
41
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
42
42
  vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
43
- vision_agent/utils/video_tracking.py,sha256=EeOiSY8gjvvneuAnv-BO7yOyMBF_-1Irk_lLLOt3bDM,9452
44
- vision_agent-0.2.226.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
45
- vision_agent-0.2.226.dist-info/METADATA,sha256=_7jZokNbQLK6Ups2psyRKbPDjUIzU3daxCpfrHZ6gSU,20039
46
- vision_agent-0.2.226.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
47
- vision_agent-0.2.226.dist-info/RECORD,,
43
+ vision_agent/utils/video_tracking.py,sha256=7ZiFBqQRTid5ytPmkrAGQUiVMr-twzib8Ha2hN3JsR0,9474
44
+ vision_agent-0.2.227.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
45
+ vision_agent-0.2.227.dist-info/METADATA,sha256=qFefkLzCo7G98LyhIPqYzPOUv5nyvOK84DJvUWmeqcc,20039
46
+ vision_agent-0.2.227.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
47
+ vision_agent-0.2.227.dist-info/RECORD,,