vision-agent 0.2.226__py3-none-any.whl → 0.2.227__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -63,6 +63,7 @@ from .tools import (
63
63
  video_temporal_localization,
64
64
  vit_image_classification,
65
65
  vit_nsfw_classification,
66
+ custom_object_detection,
66
67
  )
67
68
 
68
69
  __new_tools__ = [
@@ -290,6 +290,13 @@ def od_sam2_video_tracking(
290
290
  )
291
291
  function_name = "florence2_object_detection"
292
292
 
293
+ elif od_model == ODModels.CUSTOM:
294
+ segment_results = custom_object_detection(
295
+ deployment_id=fine_tune_id,
296
+ image=segment_frames[frame_number],
297
+ )
298
+ function_name = "custom_object_detection"
299
+
293
300
  else:
294
301
  raise NotImplementedError(
295
302
  f"Object detection model '{od_model}' is not implemented."
@@ -1217,6 +1224,139 @@ def countgd_visual_prompt_object_detection(
1217
1224
  return bboxes_formatted
1218
1225
 
1219
1226
 
1227
def custom_object_detection(
    deployment_id: str,
    image: np.ndarray,
    box_threshold: float = 0.1,
) -> List[Dict[str, Any]]:
    """'custom_object_detection' is a tool that can detect instances of an
    object using a previously finetuned object detection model, identified by
    its deployment_id. It is particularly useful for objects that generalist
    models do not detect well. It returns a list of bounding boxes with
    normalized coordinates, label names and associated confidence scores.

    Parameters:
        deployment_id (str): The id of the finetuned model.
        image (np.ndarray): The image that contains instances of the object.
        box_threshold (float, optional): The threshold for detection. Defaults
            to 0.1.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
            bounding box of the detected objects with normalized coordinates between 0
            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
            top-left and xmax and ymax are the coordinates of the bottom-right of the
            bounding box. An image whose height or width is smaller than 1 yields
            an empty list.

    Example
    -------
        >>> custom_object_detection("abcd1234-5678efg", image)
        [
            {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
            {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5]},
            {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52]},
            {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
        ]
    """
    height_width = image.shape[:2]
    # Nothing to detect on a degenerate image; skip the inference request.
    if not (height_width[0] >= 1 and height_width[1] >= 1):
        return []

    request_files = [("image", numpy_to_bytes(image))]
    request_payload = {"deployment_id": deployment_id, "confidence": box_threshold}
    response: List[List[Dict[str, Any]]] = send_inference_request(
        request_payload, "custom-object-detection", files=request_files, v2=True
    )

    # Only the first entry of the response is used (a single image was sent).
    normalized_detections: List[Dict[str, Any]] = []
    traced_detections: List[Dict[str, Any]] = []
    for detection in response[0]:
        # Returned to the caller: box normalized against the image size.
        normalized_detections.append(
            {
                "label": detection["label"],
                "bbox": normalize_bbox(detection["bounding_box"], height_width),
                "score": detection["score"],
            }
        )
        # Recorded in the tool trace: box exactly as found in the response.
        traced_detections.append(
            {
                "label": detection["label"],
                "bbox": detection["bounding_box"],
                "score": detection["score"],
            }
        )

    _display_tool_trace(
        custom_object_detection.__name__,
        request_payload,
        traced_detections,
        request_files,
    )
    return normalized_detections
1299
+
1300
+
1301
def custom_od_sam2_video_tracking(
    deployment_id: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
) -> List[List[Dict[str, Any]]]:
    """'custom_od_sam2_video_tracking' is a tool that can segment multiple objects
    across video frames given a custom model with predefined category names.
    It returns a nested list of bounding boxes, label names and masks for the
    detected objects.

    Parameters:
        deployment_id (str): The id of the deployed custom model.
        frames (List[np.ndarray]): The list of frames to detect and segment
            objects in.
        chunk_length (Optional[int]): The number of frames after which the custom
            model is re-run to find new objects. Defaults to 10.

    Returns:
        List[List[Dict[str, Any]]]: A list of lists of dictionaries (one inner list
            per frame) containing the label, bounding box, and mask of the detected
            objects with normalized coordinates (xmin, ymin, xmax, ymax). xmin and
            ymin are the coordinates of the top-left and xmax and ymax are the
            coordinates of the bottom-right of the bounding box. The mask is a
            binary 2D numpy array where 1 indicates the object and 0 indicates
            the background.

    Example
    -------
        >>> custom_od_sam2_video_tracking("abcd1234-5678efg", frames)
        [
            [
                {
                    'label': '0: dinosaur',
                    'bbox': [0.1, 0.11, 0.35, 0.4],
                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0],
                        ...,
                        [0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                },
            ],
            ...
        ]
    """

    # The prompt is left empty: the custom model is selected by deployment id,
    # which is forwarded as `fine_tune_id` (presumably its categories are fixed
    # by the deployment, so no text prompt is needed).
    ret = od_sam2_video_tracking(
        ODModels.CUSTOM,
        prompt="",
        frames=frames,
        chunk_length=chunk_length,
        fine_tune_id=deployment_id,
    )
    # An empty payload is recorded in the trace for this wrapper.
    _display_tool_trace(
        custom_od_sam2_video_tracking.__name__,
        {},
        ret["display_data"],
        ret["files"],
    )
    return ret["return_data"]  # type: ignore
1358
+
1359
+
1220
1360
  def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
1221
1361
  """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
1222
1362
  images including regular images or images of documents or presentations. It can be
@@ -17,6 +17,7 @@ class ODModels(str, Enum):
17
17
  COUNTGD = "countgd"
18
18
  FLORENCE2 = "florence2"
19
19
  OWLV2 = "owlv2"
20
+ CUSTOM = "custom"
20
21
 
21
22
 
22
23
  def split_frames_into_segments(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.226
3
+ Version: 0.2.227
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
26
26
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
27
27
  vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
28
28
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
29
- vision_agent/tools/__init__.py,sha256=15O7eQVn0bitmzUO5OxKdA618PoiLt6Z02gmKsSNMFM,2765
29
+ vision_agent/tools/__init__.py,sha256=fcucnAzr5Hue9xSqpBgA7RcRJP2CgAgQJ31p_R5lg-I,2794
30
30
  vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
31
31
  vision_agent/tools/planner_tools.py,sha256=qQvPuCif-KbFi7KsXKkTCfpgEQEJJ6oq6WB3gOuG2Xg,13686
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
33
  vision_agent/tools/tool_utils.py,sha256=q9cqXO2AvigUdO1krjnOy8o0goYhgS6eILl6-F5Kxyk,10211
34
- vision_agent/tools/tools.py,sha256=zqoo4ml9ZS99kOeOIN6Zplq7pxOwBrVZKKFUVIzsjfw,91712
34
+ vision_agent/tools/tools.py,sha256=36f0qAhQfA5lDhYv5BKpHfHgBVEBgOD-XNVHG5K4HLY,96619
35
35
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
36
36
  vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
37
37
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,8 +40,8 @@ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16N
40
40
  vision_agent/utils/sim.py,sha256=qr-6UWAxxGwtwIAKZjZCY_pu9VwBI_TTB8bfrGsaABg,9282
41
41
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
42
42
  vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
43
- vision_agent/utils/video_tracking.py,sha256=EeOiSY8gjvvneuAnv-BO7yOyMBF_-1Irk_lLLOt3bDM,9452
44
- vision_agent-0.2.226.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
45
- vision_agent-0.2.226.dist-info/METADATA,sha256=_7jZokNbQLK6Ups2psyRKbPDjUIzU3daxCpfrHZ6gSU,20039
46
- vision_agent-0.2.226.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
47
- vision_agent-0.2.226.dist-info/RECORD,,
43
+ vision_agent/utils/video_tracking.py,sha256=7ZiFBqQRTid5ytPmkrAGQUiVMr-twzib8Ha2hN3JsR0,9474
44
+ vision_agent-0.2.227.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
45
+ vision_agent-0.2.227.dist-info/METADATA,sha256=qFefkLzCo7G98LyhIPqYzPOUv5nyvOK84DJvUWmeqcc,20039
46
+ vision_agent-0.2.227.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
47
+ vision_agent-0.2.227.dist-info/RECORD,,