vision-agent 0.2.242__py3-none-any.whl → 0.2.244__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
  desc,doc,name
- "'owlv2_object_detection' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions on images. The categories in text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores.","owlv2_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+ "'owlv2_object_detection' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions on images. The categories in text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores.","owlv2_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1) -> List[Dict[str, Any]]:
  'owlv2_object_detection' is a tool that can detect and count multiple objects
  given a text prompt such as category names or referring expressions on images. The
  categories in text prompt are separated by commas. It returns a list of bounding
@@ -10,8 +10,6 @@ desc,doc,name
  image (np.ndarray): The image to ground the prompt to.
  box_threshold (float, optional): The threshold for the box detection. Defaults
  to 0.10.
- fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
- fine-tuned model ID here to use it.
 
  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
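With `fine_tune_id` dropped, a call against the 0.2.244 signature looks like this minimal sketch (the file name is hypothetical; `load_image` is exported from the same `vision_agent.tools` module):

    import vision_agent.tools as T

    image = T.load_image("workers.jpg")  # hypothetical local image
    # categories are comma-separated inside a single prompt string
    dets = T.owlv2_object_detection("person, hard hat", image, box_threshold=0.1)
    # each det: {'score': ..., 'label': ..., 'bbox': [xmin, ymin, xmax, ymax]} with normalized coords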
@@ -65,7 +63,7 @@ desc,doc,name
  },
  ]
  ",owlv2_sam2_instance_segmentation
- "'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.1, chunk_length: Optional[int] = 25, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+ "'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.1, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
  'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
  objects in a video given a text prompt such as category names or referring
  expressions. The categories in the text prompt are separated by commas. It returns
@@ -79,8 +77,6 @@ desc,doc,name
  to 0.10.
  chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
  new objects.
- fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
- fine-tuned model ID here to use it.
 
  Returns:
  List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
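The video-tracking variant drops the same parameter; a hedged sketch of the updated call, assuming `frames` is a list of RGB numpy arrays decoded elsewhere:

    import numpy as np
    import vision_agent.tools as T

    frames = [np.zeros((480, 640, 3), dtype=np.uint8)]  # placeholder; real frames come from a video decoder
    track = T.owlv2_sam2_video_tracking("person", frames, box_threshold=0.1, chunk_length=25)
    per_frame_counts = [len(objs) for objs in track]  # one inner list per frame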
@@ -240,7 +236,7 @@ desc,doc,name
  {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
  ]
  ",florence2_ocr
- "'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+ "'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray) -> List[Dict[str, Any]]:
  'florence2_object_detection' is a tool that can detect multiple objects given a
  text prompt which can be object names or caption. You can optionally separate the
  object names in the text with commas. It returns a list of bounding boxes with
@@ -250,8 +246,6 @@ desc,doc,name
  prompt (str): The prompt to ground to the image. Use exclusive categories that
  do not overlap such as 'person, car' and NOT 'person, athlete'.
  image (np.ndarray): The image to used to detect objects
- fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
- fine-tuned model ID here to use it.
 
  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -268,7 +262,7 @@ desc,doc,name
  {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
  ]
  ",florence2_object_detection
- "'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+ "'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray) -> List[Dict[str, Any]]:
  'florence2_sam2_instance_segmentation' is a tool that can segment multiple
  objects given a text prompt such as category names or referring expressions. The
  categories in the text prompt are separated by commas. It returns a list of
@@ -279,8 +273,6 @@ desc,doc,name
  prompt (str): The prompt to ground to the image. Use exclusive categories that
  do not overlap such as 'person, car' and NOT 'person, athlete'.
  image (np.ndarray): The image to ground the prompt to.
- fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
- fine-tuned model ID here to use it.
 
  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the score, label,
@@ -306,7 +298,7 @@ desc,doc,name
  },
  ]
  ",florence2_sam2_instance_segmentation
- "'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 25, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+ "'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
  'florence2_sam2_video_tracking' is a tool that can track and segment multiple
  objects in a video given a text prompt such as category names or referring
  expressions. The categories in the text prompt are separated by commas. It returns
@@ -319,8 +311,6 @@ desc,doc,name
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
  chunk_length (Optional[int]): The number of frames to re-run florence2 to find
  new objects.
- fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
- fine-tuned model ID here to use it.
 
  Returns:
  List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
@@ -484,16 +474,17 @@ desc,doc,name
  >>> activity_recognition('Did a goal happened?', frames)
  [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
  ",activity_recognition
- 'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intesities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
- 'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
+ 'depth_anything_v2' is a tool that runs depth anything v2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
+ 'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
  depth image from a given RGB image. The returned depth image is monochrome and
- represents depth values as pixel intesities with pixel values ranging from 0 to 255.
+ represents depth values as pixel intensities with pixel values ranging from 0 to 255.
 
  Parameters:
  image (np.ndarray): The image to used to generate depth image
 
  Returns:
- np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255.
+ np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255
+ where high values represent closer objects and low values further.
 
  Example
  -------
Binary file
@@ -23,7 +23,8 @@ from .tools import (
      countgd_object_detection,
      countgd_sam2_instance_segmentation,
      countgd_sam2_video_tracking,
-     countgd_visual_prompt_object_detection,
+     countgd_sam2_visual_instance_segmentation,
+     countgd_visual_object_detection,
      custom_object_detection,
      depth_anything_v2,
      detr_segmentation,
@@ -41,6 +42,9 @@ from .tools import (
      get_tools_df,
      get_tools_docstring,
      get_utilties_docstring,
+     glee_object_detection,
+     glee_sam2_instance_segmentation,
+     glee_sam2_video_tracking,
      load_image,
      minimum_distance,
      ocr,
@@ -53,6 +57,8 @@ from .tools import (
      owlv2_sam2_video_tracking,
      qwen2_vl_images_vqa,
      qwen2_vl_video_vqa,
+     qwen25_vl_images_vqa,
+     qwen25_vl_video_vqa,
      sam2,
      save_image,
      save_json,
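Taken together, `vision_agent.tools` now exports the renamed countgd visual tools plus the new GLEE and Qwen2.5-VL entries; for orientation, a direct import of just the additions:

    from vision_agent.tools import (
        countgd_sam2_visual_instance_segmentation,
        countgd_visual_object_detection,
        glee_object_detection,
        glee_sam2_instance_segmentation,
        glee_sam2_video_tracking,
        qwen25_vl_images_vqa,
        qwen25_vl_video_vqa,
    )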
@@ -313,6 +313,13 @@ def od_sam2_video_tracking(
                  box_threshold=box_threshold,
              )
              function_name = "custom_object_detection"
+         elif od_model == ODModels.GLEE:
+             segment_results = glee_object_detection(
+                 prompt=prompt,
+                 image=segment_frames[frame_number],
+                 box_threshold=box_threshold,
+             )
+             function_name = "glee_object_detection"
 
          else:
              raise NotImplementedError(
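The new branch lets the shared video-tracking helper route per-chunk detection through GLEE. A sketch of the dispatch; the call shape matches `glee_sam2_video_tracking` later in this diff, but the import paths for the internal helper and enum are assumptions:

    import numpy as np
    from vision_agent.tools.tools import ODModels, od_sam2_video_tracking  # assumed internal paths

    frames = [np.zeros((480, 640, 3), dtype=np.uint8)]  # placeholder frames
    ret = od_sam2_video_tracking(
        ODModels.GLEE, prompt="delivery truck", frames=frames,
        box_threshold=0.23, chunk_length=25,
    )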
@@ -1128,25 +1135,71 @@ def countgd_sam2_video_tracking(
      return ret["return_data"]  # type: ignore
 
 
- # Custom Models
+ def _countgd_visual_object_detection(
+     visual_prompts: List[List[float]],
+     image: np.ndarray,
+     box_threshold: float = 0.23,
+ ) -> Dict[str, Any]:
+     image_size = image.shape[:2]
+
+     buffer_bytes = numpy_to_bytes(image)
+     files = [("image", buffer_bytes)]
+     visual_prompts = [
+         denormalize_bbox(bbox, image.shape[:2]) for bbox in visual_prompts
+     ]
+     payload = {
+         "visual_prompts": json.dumps(visual_prompts),
+         "model": "countgd",
+         "confidence": box_threshold,
+     }
+     metadata = {"function_name": "countgd_visual_object_detection"}
+
+     detections = send_task_inference_request(
+         payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
+     )
+
+     # get the first frame
+     bboxes = detections[0]
+     bboxes_formatted = [
+         {
+             "label": bbox["label"],
+             "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+             "score": round(bbox["score"], 2),
+         }
+         for bbox in bboxes
+     ]
+     display_data = [
+         {
+             "label": bbox["label"],
+             "bbox": bbox["bounding_box"],
+             "score": bbox["score"],
+         }
+         for bbox in bboxes
+     ]
+     return {
+         "files": files,
+         "return_data": bboxes_formatted,
+         "display_data": display_data,
+     }
 
 
- def countgd_visual_prompt_object_detection(
+ def countgd_visual_object_detection(
      visual_prompts: List[List[float]],
      image: np.ndarray,
      box_threshold: float = 0.23,
  ) -> List[Dict[str, Any]]:
-     """'countgd_visual_prompt_object_detection' is a tool that can precisely count
-     multiple instances of an object given few visual example prompts. It returns a list
-     of bounding boxes with normalized coordinates, label names and associated
-     confidence scores.
+     """'countgd_visual_object_detection' is a tool that can detect multiple instances
+     of an object given a visual prompt. It is particularly useful when trying to detect
+     and count a large number of objects. You can optionally separate object names in
+     the prompt with commas. It returns a list of bounding boxes with normalized
+     coordinates, label names and associated confidence scores.
 
      Parameters:
          visual_prompts (List[List[float]]): Bounding boxes of the object in format
-             [xmin, ymin, xmax, ymax]. Upto 3 bounding boxes can be provided. image
-             (np.ndarray): The image that contains multiple instances of the object.
-             box_threshold (float, optional): The threshold for detection. Defaults to
-             0.23.
+             [xmin, ymin, xmax, ymax] with normalized coordinates. Up to 3 bounding
+             boxes can be provided.
+         image (np.ndarray): The image that contains multiple instances of the object.
+         box_threshold (float, optional): The threshold for detection. Defaults to 0.23.
 
      Returns:
          List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
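A usage sketch for the renamed public tool, with visual prompts given in the normalized form the updated docstring now requires (the file name is hypothetical):

    import vision_agent.tools as T

    image = T.load_image("shelf.jpg")  # hypothetical image with many similar objects
    examples = [[0.1, 0.1, 0.4, 0.42], [0.2, 0.3, 0.25, 0.35]]  # up to 3 normalized example boxes
    boxes = T.countgd_visual_object_detection(examples, image, box_threshold=0.23)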
@@ -1172,43 +1225,76 @@ def countgd_visual_prompt_object_detection(
      if image_size[0] < 1 or image_size[1] < 1:
          return []
 
-     buffer_bytes = numpy_to_bytes(image)
-     files = [("image", buffer_bytes)]
-     visual_prompts = [
-         denormalize_bbox(bbox, image.shape[:2]) for bbox in visual_prompts
-     ]
-     payload = {"visual_prompts": json.dumps(visual_prompts), "model": "countgd"}
-     metadata = {"function_name": "countgd_visual_prompt_object_detection"}
+     od_ret = _countgd_visual_object_detection(visual_prompts, image, box_threshold)
 
-     detections = send_task_inference_request(
-         payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
+     _display_tool_trace(
+         countgd_visual_object_detection.__name__,
+         {},
+         od_ret["display_data"],
+         od_ret["files"],
      )
 
-     # get the first frame
-     bboxes_per_frame = detections[0]
-     bboxes_formatted = [
-         {
-             "label": bbox["label"],
-             "bbox": normalize_bbox(bbox["bounding_box"], image_size),
-             "score": round(bbox["score"], 2),
-         }
-         for bbox in bboxes_per_frame
-     ]
-     _display_tool_trace(
-         countgd_visual_prompt_object_detection.__name__,
-         payload,
+     return od_ret["return_data"]  # type: ignore
+
+
+ def countgd_sam2_visual_instance_segmentation(
+     visual_prompts: List[List[float]],
+     image: np.ndarray,
+     box_threshold: float = 0.23,
+ ) -> List[Dict[str, Any]]:
+     """'countgd_sam2_visual_instance_segmentation' is a tool that can precisely count
+     multiple instances of an object given few visual example prompts. It returns a list
+     of bounding boxes, label names, masks and associated probability scores.
+
+     Parameters:
+         visual_prompts (List[List[float]]): Bounding boxes of the object in format
+             [xmin, ymin, xmax, ymax] with normalized coordinates. Up to 3 bounding
+             boxes can be provided.
+         image (np.ndarray): The image that contains multiple instances of the object.
+         box_threshold (float, optional): The threshold for detection. Defaults to 0.23.
+
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+             bounding box, and mask of the detected objects with normalized coordinates
+             (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+             and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+             The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+             the background.
+
+     Example
+     -------
+         >>> countgd_sam2_visual_instance_segmentation(
+             visual_prompts=[[0.1, 0.1, 0.4, 0.42], [0.2, 0.3, 0.25, 0.35]],
+             image=image
+         )
          [
              {
-                 "label": e["label"],
-                 "score": e["score"],
-                 "bbox": denormalize_bbox(e["bbox"], image_size),
-             }
-             for e in bboxes_formatted
-         ],
-         files,
+                 'score': 0.49,
+                 'label': 'object',
+                 'bbox': [0.1, 0.11, 0.35, 0.4],
+                 'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                     [0, 0, 0, ..., 0, 0, 0],
+                     ...,
+                     [0, 0, 0, ..., 0, 0, 0],
+                     [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+             },
+         ]
+     """
+
+     od_ret = _countgd_visual_object_detection(visual_prompts, image, box_threshold)
+     seg_ret = _sam2(
+         image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
      )
+     _display_tool_trace(
+         countgd_sam2_visual_instance_segmentation.__name__,
+         {},
+         seg_ret["display_data"],
+         seg_ret["files"],
+     )
+     return seg_ret["return_data"]  # type: ignore
 
-     return bboxes_formatted
+
+ # Custom Models
 
 
  def custom_object_detection(
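The new SAM2 variant chains the same visual-prompt detector into `_sam2` for masks; a hedged sketch of how a caller might use the returned binary masks (file name hypothetical):

    import numpy as np
    import vision_agent.tools as T

    image = T.load_image("shelf.jpg")  # hypothetical input
    instances = T.countgd_sam2_visual_instance_segmentation([[0.1, 0.1, 0.4, 0.42]], image)
    # masks are binary 2D arrays; summing gives per-instance pixel areas
    areas = [int(np.sum(inst["mask"])) for inst in instances]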
@@ -1401,9 +1487,9 @@ def agentic_object_detection(
  ) -> List[Dict[str, Any]]:
      """'agentic_object_detection' is a tool that can detect multiple objects given a
      text prompt such as object names or referring expressions on images. It's
-     particularly good at detecting specific objects given detailed descriptive prompts.
-     It returns a list of bounding boxes with normalized coordinates, label names and
-     associated probability scores.
+     particularly good at detecting specific objects given detailed descriptive prompts
+     but runs slower. It returns a list of bounding boxes with normalized coordinates,
+     label names and associated probability scores.
 
      Parameters:
          prompt (str): The prompt to ground to the image, only supports a single prompt
@@ -1447,8 +1533,8 @@ def agentic_sam2_instance_segmentation(
      """'agentic_sam2_instance_segmentation' is a tool that can detect multiple
      instances given a text prompt such as object names or referring expressions on
      images. It's particularly good at detecting specific objects given detailed
-     descriptive prompts. It returns a list of bounding boxes with normalized
-     coordinates, label names, masks and associated probability scores.
+     descriptive prompts but runs slower. It returns a list of bounding boxes with
+     normalized coordinates, label names, masks and associated probability scores.
 
      Parameters:
          prompt (str): The object that needs to be counted, only supports a single
@@ -1505,9 +1591,9 @@ def agentic_sam2_video_tracking(
      """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
      objects in a video given a text prompt such as object names or referring
      expressions. It's particularly good at detecting specific objects given detailed
-     descriptive prompts and returns a list of bounding boxes, label names, masks and
-     associated probability scores and is useful for tracking and counting without
-     duplicating counts.
+     descriptive prompts but runs slower, and returns a list of bounding boxes, label
+     names, masks and associated probability scores and is useful for tracking and
+     counting without duplicating counts.
 
      Parameters:
          prompt (str): The prompt to ground to the image, only supports a single prompt
@@ -1560,6 +1646,305 @@ def agentic_sam2_video_tracking(
      return ret["return_data"]  # type: ignore
 
 
+ # GLEE Tools
+
+
+ def _glee_object_detection(
+     prompt: str,
+     image: np.ndarray,
+     box_threshold: float,
+     image_size: Tuple[int, ...],
+     image_bytes: Optional[bytes] = None,
+ ) -> Dict[str, Any]:
+     if image_bytes is None:
+         image_bytes = numpy_to_bytes(image)
+
+     files = [("image", image_bytes)]
+     payload = {
+         "prompts": [s.strip() for s in prompt.split(",")],
+         "confidence": box_threshold,
+         "model": "glee",
+     }
+     metadata = {"function_name": "glee"}
+     detections = send_task_inference_request(
+         payload,
+         "text-to-object-detection",
+         files=files,
+         metadata=metadata,
+     )
+     # get the first frame
+     bboxes = detections[0]
+     bboxes_formatted = [
+         {
+             "label": bbox["label"],
+             "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+             "score": round(bbox["score"], 2),
+         }
+         for bbox in bboxes
+     ]
+     display_data = [
+         {
+             "label": bbox["label"],
+             "bbox": bbox["bounding_box"],
+             "score": round(bbox["score"], 2),
+         }
+         for bbox in bboxes
+     ]
+     return {
+         "files": files,
+         "return_data": bboxes_formatted,
+         "display_data": display_data,
+     }
+
+
+ def glee_object_detection(
+     prompt: str,
+     image: np.ndarray,
+     box_threshold: float = 0.23,
+ ) -> List[Dict[str, Any]]:
+     """'glee_object_detection' is a tool that can detect multiple objects given a
+     text prompt such as object names or referring expressions on images. It's
+     particularly good at detecting specific objects given detailed descriptive prompts.
+     It returns a list of bounding boxes with normalized coordinates, label names and
+     associated probability scores.
+
+     Parameters:
+         prompt (str): The prompt to ground to the image, only supports a single prompt
+             with no commas or periods.
+         image (np.ndarray): The image to ground the prompt to.
+
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+             bounding box of the detected objects with normalized coordinates between 0
+             and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+             top-left and xmax and ymax are the coordinates of the bottom-right of the
+             bounding box.
+
+     Example
+     -------
+         >>> glee_object_detection("person holding a box", image)
+         [
+             {'score': 0.99, 'label': 'person holding a box', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+             {'score': 0.98, 'label': 'person holding a box', 'bbox': [0.2, 0.21, 0.45, 0.5},
+         ]
+     """
+
+     od_ret = _glee_object_detection(prompt, image, box_threshold, image.shape[:2])
+     _display_tool_trace(
+         glee_object_detection.__name__,
+         {"prompts": prompt, "confidence": box_threshold},
+         od_ret["display_data"],
+         od_ret["files"],
+     )
+     return od_ret["return_data"]  # type: ignore
+
+
+ def glee_sam2_instance_segmentation(
+     prompt: str, image: np.ndarray, box_threshold: float = 0.23
+ ) -> List[Dict[str, Any]]:
+     """'glee_sam2_instance_segmentation' is a tool that can detect multiple
+     instances given a text prompt such as object names or referring expressions on
+     images. It's particularly good at detecting specific objects given detailed
+     descriptive prompts. It returns a list of bounding boxes with normalized
+     coordinates, label names, masks and associated probability scores.
+
+     Parameters:
+         prompt (str): The object that needs to be counted, only supports a single
+             prompt with no commas or periods.
+         image (np.ndarray): The image that contains multiple instances of the object.
+
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+             bounding box, and mask of the detected objects with normalized coordinates
+             (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+             and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+             The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+             the background.
+
+     Example
+     -------
+         >>> glee_sam2_instance_segmentation("a large blue flower", image)
+         [
+             {
+                 'score': 0.49,
+                 'label': 'a large blue flower',
+                 'bbox': [0.1, 0.11, 0.35, 0.4],
+                 'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                     [0, 0, 0, ..., 0, 0, 0],
+                     ...,
+                     [0, 0, 0, ..., 0, 0, 0],
+                     [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+             },
+         ]
+     """
+     od_ret = _glee_object_detection(prompt, image, box_threshold, image.shape[:2])
+     seg_ret = _sam2(
+         image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
+     )
+
+     _display_tool_trace(
+         glee_sam2_instance_segmentation.__name__,
+         {
+             "prompts": prompt,
+             "confidence": box_threshold,
+         },
+         seg_ret["display_data"],
+         seg_ret["files"],
+     )
+
+     return seg_ret["return_data"]  # type: ignore
+
+
+ def glee_sam2_video_tracking(
+     prompt: str,
+     frames: List[np.ndarray],
+     box_threshold: float = 0.23,
+     chunk_length: Optional[int] = 25,
+ ) -> List[List[Dict[str, Any]]]:
+     """'glee_sam2_video_tracking' is a tool that can track and segment multiple
+     objects in a video given a text prompt such as object names or referring
+     expressions. It's particularly good at detecting specific objects given detailed
+     descriptive prompts and returns a list of bounding boxes, label names, masks and
+     associated probability scores and is useful for tracking and counting without
+     duplicating counts.
+
+     Parameters:
+         prompt (str): The prompt to ground to the image, only supports a single prompt
+             with no commas or periods.
+         frames (List[np.ndarray]): The list of frames to ground the prompt to.
+         chunk_length (Optional[int]): The number of frames to re-run agentic object detection to
+             to find new objects.
+
+     Returns:
+         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+             label, segmentation mask and bounding boxes. The outer list represents each
+             frame and the inner list is the entities per frame. The detected objects
+             have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+             and ymin are the coordinates of the top-left and xmax and ymax are the
+             coordinates of the bottom-right of the bounding box. The mask is binary 2D
+             numpy array where 1 indicates the object and 0 indicates the background.
+             The label names are prefixed with their ID represent the total count.
+
+     Example
+     -------
+         >>> glee_sam2_video_tracking("a runner with yellow shoes", frames)
+         [
+             [
+                 {
+                     'label': '0: a runner with yellow shoes',
+                     'bbox': [0.1, 0.11, 0.35, 0.4],
+                     'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                         [0, 0, 0, ..., 0, 0, 0],
+                         ...,
+                         [0, 0, 0, ..., 0, 0, 0],
+                         [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                 },
+             ],
+             ...
+         ]
+     """
+     ret = od_sam2_video_tracking(
+         ODModels.GLEE,
+         prompt=prompt,
+         frames=frames,
+         box_threshold=box_threshold,
+         chunk_length=chunk_length,
+     )
+     _display_tool_trace(
+         glee_sam2_video_tracking.__name__,
+         {"prompt": prompt, "chunk_length": chunk_length},
+         ret["display_data"],
+         ret["files"],
+     )
+     return ret["return_data"]  # type: ignore
+
+
+ # Qwen2 and 2.5 VL Tool
+
+
+ def qwen25_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
+     """'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary
+     images including regular images or images of documents or presentations. It can be
+     very useful for document QA or OCR text extraction. It returns text as an answer to
+     the question.
+
+     Parameters:
+         prompt (str): The question about the document image
+         images (List[np.ndarray]): The reference images used for the question
+
+     Returns:
+         str: A string which is the answer to the given prompt.
+
+     Example
+     -------
+         >>> qwen25_vl_images_vqa('Give a summary of the document', images)
+         'The document talks about the history of the United States of America and its...'
+     """
+     if isinstance(images, np.ndarray):
+         images = [images]
+
+     for image in images:
+         if image.shape[0] < 1 or image.shape[1] < 1:
+             raise ValueError(f"Image is empty, image shape: {image.shape}")
+
+     files = [("images", numpy_to_bytes(image)) for image in images]
+     payload = {
+         "prompt": prompt,
+         "model": "qwen25vl",
+         "function_name": "qwen25_vl_images_vqa",
+     }
+     data: Dict[str, Any] = send_inference_request(
+         payload, "image-to-text", files=files, v2=True
+     )
+     _display_tool_trace(
+         qwen25_vl_images_vqa.__name__,
+         payload,
+         cast(str, data),
+         files,
+     )
+     return cast(str, data)
+
+
+ def qwen25_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+     """'qwen25_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
+     including regular videos or videos of documents or presentations. It returns text
+     as an answer to the question.
+
+     Parameters:
+         prompt (str): The question about the video
+         frames (List[np.ndarray]): The reference frames used for the question
+
+     Returns:
+         str: A string which is the answer to the given prompt.
+
+     Example
+     -------
+         >>> qwen25_vl_video_vqa('Which football player made the goal?', frames)
+         'Lionel Messi'
+     """
+
+     if len(frames) == 0 or not isinstance(frames, list):
+         raise ValueError("Must provide a list of numpy arrays for frames")
+
+     buffer_bytes = frames_to_bytes(frames)
+     files = [("video", buffer_bytes)]
+     payload = {
+         "prompt": prompt,
+         "model": "qwen25vl",
+         "function_name": "qwen25_vl_video_vqa",
+     }
+     data: Dict[str, Any] = send_inference_request(
+         payload, "image-to-text", files=files, v2=True
+     )
+     _display_tool_trace(
+         qwen25_vl_video_vqa.__name__,
+         payload,
+         cast(str, data),
+         files,
+     )
+     return cast(str, data)
+
+
  def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
      """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
      images including regular images or images of documents or presentations. It can be
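The GLEE tools carry doctest-style examples above, so here is only a brief sketch of the two new Qwen2.5-VL entry points (file name hypothetical; `frames` assumed to be decoded RGB arrays):

    import numpy as np
    import vision_agent.tools as T

    page = T.load_image("invoice.png")  # hypothetical document image
    total = T.qwen25_vl_images_vqa("What is the invoice total?", [page])

    frames = [np.zeros((480, 640, 3), dtype=np.uint8)]  # placeholder frames
    summary = T.qwen25_vl_video_vqa("Summarize what happens in this clip.", frames)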
@@ -1882,6 +2267,58 @@ def _sample(frames: List[np.ndarray], sample_size: int) -> List[np.ndarray]:
      return sampled_frames
 
 
+ def _lmm_activity_recognition(
+     lmm: LMM,
+     segment: List[np.ndarray],
+     prompt: str,
+ ) -> List[float]:
+     frames = _sample(segment, 10)
+     media = []
+     for frame in frames:
+         buffer = io.BytesIO()
+         image_pil = Image.fromarray(frame)
+         if image_pil.size[0] > 768:
+             image_pil.thumbnail((768, 768))
+         image_pil.save(buffer, format="PNG")
+         image_bytes = buffer.getvalue()
+         image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+         media.append(image_b64)
+
+     response = cast(str, lmm.generate(prompt, media))
+     if "yes" in response.lower():
+         return [1.0] * len(segment)
+     return [0.0] * len(segment)
+
+
+ def _qwenvl_activity_recognition(
+     segment: List[np.ndarray], prompt: str, model_name: str = "qwen2vl"
+ ) -> List[float]:
+     payload: Dict[str, Any] = {
+         "prompt": prompt,
+         "model": model_name,
+         "function_name": f"{model_name}_vl_video_vqa",
+     }
+     segment_buffer_bytes = [("video", frames_to_bytes(segment))]
+     response = send_inference_request(
+         payload, "image-to-text", files=segment_buffer_bytes, v2=True
+     )
+     if "yes" in response.lower():
+         return [1.0] * len(segment)
+     return [0.0] * len(segment)
+
+
+ def _qwen2vl_activity_recognition(
+     segment: List[np.ndarray], prompt: str
+ ) -> List[float]:
+     return _qwenvl_activity_recognition(segment, prompt, model_name="qwen2vl")
+
+
+ def _qwen25vl_activity_recognition(
+     segment: List[np.ndarray], prompt: str
+ ) -> List[float]:
+     return _qwenvl_activity_recognition(segment, prompt, model_name="qwen25vl")
+
+
  def activity_recognition(
      prompt: str,
      frames: List[np.ndarray],
@@ -1921,53 +2358,26 @@ def activity_recognition(
          f"{prompt} Please respond with a 'yes' or 'no' based on the frames provided."
      )
 
-     def _lmm_activity_recognition(
-         lmm: LMM,
-         segment: List[np.ndarray],
-     ) -> List[float]:
-         frames = _sample(segment, 10)
-         media = []
-         for frame in frames:
-             buffer = io.BytesIO()
-             image_pil = Image.fromarray(frame)
-             if image_pil.size[0] > 768:
-                 image_pil.thumbnail((768, 768))
-             image_pil.save(buffer, format="PNG")
-             image_bytes = buffer.getvalue()
-             image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
-             media.append(image_b64)
-
-         response = cast(str, lmm.generate(prompt, media))
-         if "yes" in response.lower():
-             return [1.0] * len(segment)
-         return [0.0] * len(segment)
-
-     def _qwen2vl_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-         payload: Dict[str, Any] = {
-             "prompt": prompt,
-             "model": "qwen2vl",
-             "function_name": "qwen2_vl_video_vqa",
-         }
-         segment_buffer_bytes = [("video", frames_to_bytes(segment))]
-         response = send_inference_request(
-             payload, "image-to-text", files=segment_buffer_bytes, v2=True
-         )
-         if "yes" in response.lower():
-             return [1.0] * len(segment)
-         return [0.0] * len(segment)
-
      if model == "claude-35":
 
          def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-             return _lmm_activity_recognition(AnthropicLMM(), segment)
+             return _lmm_activity_recognition(AnthropicLMM(), segment, prompt)
 
      elif model == "gpt-4o":
 
          def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-             return _lmm_activity_recognition(OpenAILMM(), segment)
+             return _lmm_activity_recognition(OpenAILMM(), segment, prompt)
 
      elif model == "qwen2vl":
-         _apply_activity_recognition = _qwen2vl_activity_recognition
+
+         def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
+             return _qwen2vl_activity_recognition(segment, prompt)
+
+     elif model == "qwen25vl":
+
+         def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
+             return _qwen25vl_activity_recognition(segment, prompt)
+
      else:
          raise ValueError(f"Invalid model: {model}")
 
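With the helpers hoisted to module level and the prompt passed explicitly, the dispatch gains a "qwen25vl" option; a sketch of selecting it from the public function (positional order of prompt and frames matches the docstring example; the `model` keyword name is the one matched in the branches above):

    import numpy as np
    import vision_agent.tools as T

    frames = [np.zeros((480, 640, 3), dtype=np.uint8)]  # placeholder frames
    flags = T.activity_recognition("Did a goal happen?", frames, model="qwen25vl")
    # per-frame 1.0/0.0 flags, e.g. [0.0, 0.0, 1.0, 1.0, 0.0]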
@@ -2135,15 +2545,16 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
 
 
  def depth_anything_v2(image: np.ndarray) -> np.ndarray:
-     """'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
+     """'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
      depth image from a given RGB image. The returned depth image is monochrome and
-     represents depth values as pixel intesities with pixel values ranging from 0 to 255.
+     represents depth values as pixel intensities with pixel values ranging from 0 to 255.
 
      Parameters:
          image (np.ndarray): The image to used to generate depth image
 
      Returns:
-         np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255.
+         np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255
+             where high values represent closer objects and low values further.
 
      Example
      -------
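The clarified docstring pins down the depth convention (high values = near); a sketch of reading off the closest point under that convention (file name hypothetical):

    import numpy as np
    import vision_agent.tools as T

    image = T.load_image("room.jpg")  # hypothetical RGB input
    depth = T.depth_anything_v2(image)  # grayscale map, values 0-255
    y, x = np.unravel_index(np.argmax(depth), depth.shape)  # brightest pixel = nearest point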
@@ -15,6 +15,7 @@ class ODModels(str, Enum):
      FLORENCE2 = "florence2"
      OWLV2 = "owlv2"
      AGENTIC = "agentic"
+     GLEE = "glee"
      CUSTOM = "custom"
 
 
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.242
+ Version: 0.2.244
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -1,5 +1,5 @@
- vision_agent/.sim_tools/df.csv,sha256=3tuSr8bjF1pzjEpRJi7LLJssU_2A25SDCWvs4VZKkg4,41479
- vision_agent/.sim_tools/embs.npy,sha256=pi7h3NHlrKncIGNR-oPn_XoTe2PzBb9-aFMi7qK0tEw,245888
+ vision_agent/.sim_tools/df.csv,sha256=mIr1iubLDqGsL3K3ab6bmh6PtLvmOpvnaIX28lxdV6c,40706
+ vision_agent/.sim_tools/embs.npy,sha256=pZZMFMg0rkIAOpMOjN7gjD58hPK07c2ylfQ9YST8xFA,245888
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
  vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
  vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
@@ -33,11 +33,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
  vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
  vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
  vision_agent/sim/sim.py,sha256=VSU_1rYd4ifvF45xKWBEYugxdeeEQVpj0QL6rjx49i4,9801
- vision_agent/tools/__init__.py,sha256=bYrOPuqrpwFA3TeY_pxRXVv61oJsxVWVgv1psJlBEcc,2391
+ vision_agent/tools/__init__.py,sha256=H7FWx0OXGVIjrSOTpNH-YwE4LBuOfThZTG-SHFpo_Z8,2576
  vision_agent/tools/meta_tools.py,sha256=DNRXHX9nZ1GBeqeLiq87sBshoe0aiZeYasETbG-9neI,24053
  vision_agent/tools/planner_tools.py,sha256=orBTdJQz2NKoLuX9WE6XixaYuG305xz0UBYvZOiuquQ,19474
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
- vision_agent/tools/tools.py,sha256=uhvgPeAzhOV2vfBa216vq-JVItqgzIRKs1JMBezj2Es,107631
+ vision_agent/tools/tools.py,sha256=-jBrykNYPinRpDXnBsnzlSgJ_hbZClzCp3pkzWjTUxs,122098
  vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
  vision_agent/utils/agent.py,sha256=8z4Ei0q397lVWUga8v9nQKuenGAsh2wfkAKQOB8CwpI,14701
  vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -47,8 +47,8 @@ vision_agent/utils/tools.py,sha256=USZL0MKsiJgqA8RFiYRTcj_Kn2FVYKLHK4wIk0gP1Ow,7
  vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
- vision_agent/utils/video_tracking.py,sha256=eMIiWOG24bgXbqOy1DTtepO2gPo1ClW6Y0tdbEF_14k,12227
- vision_agent-0.2.242.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.242.dist-info/METADATA,sha256=Lvr9OdngkgZJd-ifod6Wp8FuX0BnAmR6fZIelqAmjz8,5712
- vision_agent-0.2.242.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.242.dist-info/RECORD,,
+ vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
+ vision_agent-0.2.244.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.244.dist-info/METADATA,sha256=RLXic4HHajyOObt4rHuJIPDX2rx2fyFeyIAGFsIpeNQ,5712
+ vision_agent-0.2.244.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.244.dist-info/RECORD,,