vision-agent 0.2.243__tar.gz → 0.2.244__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.2.243 → vision_agent-0.2.244}/PKG-INFO +1 -1
- {vision_agent-0.2.243 → vision_agent-0.2.244}/pyproject.toml +1 -1
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/.sim_tools/df.csv +10 -19
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/tools/__init__.py +7 -1
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/tools/tools.py +500 -89
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/utils/video_tracking.py +1 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/LICENSE +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/README.md +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/README.md +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/vision_agent_planner.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/agent/vision_agent_v2.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/configs/__init__.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/configs/anthropic_config.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/configs/anthropic_openai_config.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/configs/config.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/configs/openai_config.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/models/__init__.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/models/agent_types.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/models/lmm_types.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/models/tools_types.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/sim/__init__.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/sim/sim.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/tools/planner_tools.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/utils/agent.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/utils/tools.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/utils/tools_doc.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/.sim_tools/df.csv
@@ -1,5 +1,5 @@
 desc,doc,name
-"'owlv2_object_detection' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions on images. The categories in text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores.","owlv2_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1
+"'owlv2_object_detection' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions on images. The categories in text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores.","owlv2_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1) -> List[Dict[str, Any]]:
 'owlv2_object_detection' is a tool that can detect and count multiple objects
 given a text prompt such as category names or referring expressions on images. The
 categories in text prompt are separated by commas. It returns a list of bounding
@@ -10,8 +10,6 @@ desc,doc,name
 image (np.ndarray): The image to ground the prompt to.
 box_threshold (float, optional): The threshold for the box detection. Defaults
 to 0.10.
-fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-fine-tuned model ID here to use it.

 Returns:
 List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -65,7 +63,7 @@ desc,doc,name
 },
 ]
 ",owlv2_sam2_instance_segmentation
-"'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.1, chunk_length: Optional[int] = 25
+"'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.1, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
 'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
 objects in a video given a text prompt such as category names or referring
 expressions. The categories in the text prompt are separated by commas. It returns
@@ -79,8 +77,6 @@ desc,doc,name
 to 0.10.
 chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
 new objects.
-fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-fine-tuned model ID here to use it.

 Returns:
 List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
@@ -240,7 +236,7 @@ desc,doc,name
 {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
 ]
 ",florence2_ocr
-"'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray
+"'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray) -> List[Dict[str, Any]]:
 'florence2_object_detection' is a tool that can detect multiple objects given a
 text prompt which can be object names or caption. You can optionally separate the
 object names in the text with commas. It returns a list of bounding boxes with
@@ -250,8 +246,6 @@ desc,doc,name
 prompt (str): The prompt to ground to the image. Use exclusive categories that
 do not overlap such as 'person, car' and NOT 'person, athlete'.
 image (np.ndarray): The image to used to detect objects
-fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-fine-tuned model ID here to use it.

 Returns:
 List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -268,7 +262,7 @@ desc,doc,name
 {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
 ]
 ",florence2_object_detection
-"'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray
+"'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray) -> List[Dict[str, Any]]:
 'florence2_sam2_instance_segmentation' is a tool that can segment multiple
 objects given a text prompt such as category names or referring expressions. The
 categories in the text prompt are separated by commas. It returns a list of
@@ -279,8 +273,6 @@ desc,doc,name
 prompt (str): The prompt to ground to the image. Use exclusive categories that
 do not overlap such as 'person, car' and NOT 'person, athlete'.
 image (np.ndarray): The image to ground the prompt to.
-fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-fine-tuned model ID here to use it.

 Returns:
 List[Dict[str, Any]]: A list of dictionaries containing the score, label,
@@ -306,7 +298,7 @@ desc,doc,name
 },
 ]
 ",florence2_sam2_instance_segmentation
-"'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 25
+"'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
 'florence2_sam2_video_tracking' is a tool that can track and segment multiple
 objects in a video given a text prompt such as category names or referring
 expressions. The categories in the text prompt are separated by commas. It returns
@@ -319,8 +311,6 @@ desc,doc,name
 frames (List[np.ndarray]): The list of frames to ground the prompt to.
 chunk_length (Optional[int]): The number of frames to re-run florence2 to find
 new objects.
-fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-fine-tuned model ID here to use it.

 Returns:
 List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
@@ -484,16 +474,17 @@ desc,doc,name
 >>> activity_recognition('Did a goal happened?', frames)
 [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
 ",activity_recognition
-'depth_anything_v2' is a tool that runs
-'depth_anything_v2' is a tool that runs
+'depth_anything_v2' is a tool that runs depth anything v2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
+'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
 depth image from a given RGB image. The returned depth image is monochrome and
-represents depth values as pixel
+represents depth values as pixel intensities with pixel values ranging from 0 to 255.

 Parameters:
 image (np.ndarray): The image to used to generate depth image

 Returns:
-np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255
+np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255
+where high values represent closer objects and low values further.

 Example
 -------
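The df.csv changes above complete the previously truncated tool signatures (adding their return types) and drop the `fine_tune_id` parameter from the OWLv2 and Florence-2 tool descriptions. As a rough illustration, a call against the updated `owlv2_object_detection` signature could look like the sketch below; the image file name and prompt are hypothetical and not part of the diff.

```python
# Minimal sketch of calling owlv2_object_detection per the updated signature above.
# "people.jpg" and the prompt are hypothetical examples.
import numpy as np
from vision_agent.tools import load_image, owlv2_object_detection

image: np.ndarray = load_image("people.jpg")
detections = owlv2_object_detection(
    prompt="person, car",   # comma-separated categories
    image=image,
    box_threshold=0.1,      # default per the docstring
)
for det in detections:
    # bbox is normalized (xmin, ymin, xmax, ymax)
    print(det["label"], det["score"], det["bbox"])
```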
{vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/.sim_tools/embs.npy: Binary file (contents not shown)
{vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/tools/__init__.py
@@ -23,7 +23,8 @@ from .tools import (
 countgd_object_detection,
 countgd_sam2_instance_segmentation,
 countgd_sam2_video_tracking,
-
+countgd_sam2_visual_instance_segmentation,
+countgd_visual_object_detection,
 custom_object_detection,
 depth_anything_v2,
 detr_segmentation,
@@ -41,6 +42,9 @@ from .tools import (
 get_tools_df,
 get_tools_docstring,
 get_utilties_docstring,
+glee_object_detection,
+glee_sam2_instance_segmentation,
+glee_sam2_video_tracking,
 load_image,
 minimum_distance,
 ocr,
@@ -53,6 +57,8 @@ from .tools import (
 owlv2_sam2_video_tracking,
 qwen2_vl_images_vqa,
 qwen2_vl_video_vqa,
+qwen25_vl_images_vqa,
+qwen25_vl_video_vqa,
 sam2,
 save_image,
 save_json,
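With these `__init__.py` changes, the new tools are re-exported at the package level. A quick sketch of the imports this enables (the names are exactly those added in the diff above):

```python
# Names taken verbatim from the vision_agent/tools/__init__.py additions in 0.2.244.
from vision_agent.tools import (
    countgd_sam2_visual_instance_segmentation,
    countgd_visual_object_detection,
    glee_object_detection,
    glee_sam2_instance_segmentation,
    glee_sam2_video_tracking,
    qwen25_vl_images_vqa,
    qwen25_vl_video_vqa,
)
```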
{vision_agent-0.2.243 → vision_agent-0.2.244}/vision_agent/tools/tools.py
@@ -313,6 +313,13 @@ def od_sam2_video_tracking(
 box_threshold=box_threshold,
 )
 function_name = "custom_object_detection"
+elif od_model == ODModels.GLEE:
+segment_results = glee_object_detection(
+prompt=prompt,
+image=segment_frames[frame_number],
+box_threshold=box_threshold,
+)
+function_name = "glee_object_detection"

 else:
 raise NotImplementedError(
@@ -1128,25 +1135,71 @@ def countgd_sam2_video_tracking(
 return ret["return_data"] # type: ignore


-
+def _countgd_visual_object_detection(
+visual_prompts: List[List[float]],
+image: np.ndarray,
+box_threshold: float = 0.23,
+) -> Dict[str, Any]:
+image_size = image.shape[:2]
+
+buffer_bytes = numpy_to_bytes(image)
+files = [("image", buffer_bytes)]
+visual_prompts = [
+denormalize_bbox(bbox, image.shape[:2]) for bbox in visual_prompts
+]
+payload = {
+"visual_prompts": json.dumps(visual_prompts),
+"model": "countgd",
+"confidence": box_threshold,
+}
+metadata = {"function_name": "countgd_visual_object_detection"}
+
+detections = send_task_inference_request(
+payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
+)
+
+# get the first frame
+bboxes = detections[0]
+bboxes_formatted = [
+{
+"label": bbox["label"],
+"bbox": normalize_bbox(bbox["bounding_box"], image_size),
+"score": round(bbox["score"], 2),
+}
+for bbox in bboxes
+]
+display_data = [
+{
+"label": bbox["label"],
+"bbox": bbox["bounding_box"],
+"score": bbox["score"],
+}
+for bbox in bboxes
+]
+return {
+"files": files,
+"return_data": bboxes_formatted,
+"display_data": display_data,
+}


-def
+def countgd_visual_object_detection(
 visual_prompts: List[List[float]],
 image: np.ndarray,
 box_threshold: float = 0.23,
 ) -> List[Dict[str, Any]]:
-"""'
-
-of
-
+"""'countgd_visual_object_detection' is a tool that can detect multiple instances
+of an object given a visual prompt. It is particularly useful when trying to detect
+and count a large number of objects. You can optionally separate object names in
+the prompt with commas. It returns a list of bounding boxes with normalized
+coordinates, label names and associated confidence scores.

 Parameters:
 visual_prompts (List[List[float]]): Bounding boxes of the object in format
-[xmin, ymin, xmax, ymax]
-
-
-
+[xmin, ymin, xmax, ymax] with normalized coordinates. Up to 3 bounding
+boxes can be provided.
+image (np.ndarray): The image that contains multiple instances of the object.
+box_threshold (float, optional): The threshold for detection. Defaults to 0.23.

 Returns:
 List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -1172,43 +1225,76 @@ def countgd_visual_prompt_object_detection(
 if image_size[0] < 1 or image_size[1] < 1:
 return []

-
-files = [("image", buffer_bytes)]
-visual_prompts = [
-denormalize_bbox(bbox, image.shape[:2]) for bbox in visual_prompts
-]
-payload = {"visual_prompts": json.dumps(visual_prompts), "model": "countgd"}
-metadata = {"function_name": "countgd_visual_prompt_object_detection"}
+od_ret = _countgd_visual_object_detection(visual_prompts, image, box_threshold)

-
-
+_display_tool_trace(
+countgd_visual_object_detection.__name__,
+{},
+od_ret["display_data"],
+od_ret["files"],
 )

-#
-
-
-
-
-
-
-
-
-
-
-
-
+return od_ret["return_data"] # type: ignore
+
+
+def countgd_sam2_visual_instance_segmentation(
+visual_prompts: List[List[float]],
+image: np.ndarray,
+box_threshold: float = 0.23,
+) -> List[Dict[str, Any]]:
+"""'countgd_sam2_visual_instance_segmentation' is a tool that can precisely count
+multiple instances of an object given few visual example prompts. It returns a list
+of bounding boxes, label names, masks and associated probability scores.
+
+Parameters:
+visual_prompts (List[List[float]]): Bounding boxes of the object in format
+[xmin, ymin, xmax, ymax] with normalized coordinates. Up to 3 bounding
+boxes can be provided.
+image (np.ndarray): The image that contains multiple instances of the object.
+box_threshold (float, optional): The threshold for detection. Defaults to 0.23.
+
+Returns:
+List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+bounding box, and mask of the detected objects with normalized coordinates
+(xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+the background.
+
+Example
+-------
+>>> countgd_sam2_visual_instance_segmentation(
+visual_prompts=[[0.1, 0.1, 0.4, 0.42], [0.2, 0.3, 0.25, 0.35]],
+image=image
+)
 [
 {
-
-
-
-
-
-
-
+'score': 0.49,
+'label': 'object',
+'bbox': [0.1, 0.11, 0.35, 0.4],
+'mask': array([[0, 0, 0, ..., 0, 0, 0],
+[0, 0, 0, ..., 0, 0, 0],
+...,
+[0, 0, 0, ..., 0, 0, 0],
+[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+},
+]
+"""
+
+od_ret = _countgd_visual_object_detection(visual_prompts, image, box_threshold)
+seg_ret = _sam2(
+image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
 )
+_display_tool_trace(
+countgd_sam2_visual_instance_segmentation.__name__,
+{},
+seg_ret["display_data"],
+seg_ret["files"],
+)
+return seg_ret["return_data"] # type: ignore

-
+
+# Custom Models


 def custom_object_detection(
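Based on the docstrings added above, the two visual-prompt CountGD tools take up to three normalized example boxes and return detections (and, for the SAM2 variant, masks). A hedged usage sketch follows; the image file name and the example boxes are illustrative, not taken from the diff.

```python
# Sketch only: "shelf.jpg" and the example boxes are illustrative placeholders.
import numpy as np
from vision_agent.tools import (
    countgd_sam2_visual_instance_segmentation,
    countgd_visual_object_detection,
    load_image,
)

image: np.ndarray = load_image("shelf.jpg")  # an image with many repeated objects
# Up to 3 normalized [xmin, ymin, xmax, ymax] boxes marking example instances.
examples = [[0.1, 0.1, 0.4, 0.42], [0.2, 0.3, 0.25, 0.35]]

boxes = countgd_visual_object_detection(visual_prompts=examples, image=image)
instances = countgd_sam2_visual_instance_segmentation(visual_prompts=examples, image=image)
print(f"{len(boxes)} detections, {len(instances)} with masks")
```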
@@ -1401,9 +1487,9 @@ def agentic_object_detection(
 ) -> List[Dict[str, Any]]:
 """'agentic_object_detection' is a tool that can detect multiple objects given a
 text prompt such as object names or referring expressions on images. It's
-particularly good at detecting specific objects given detailed descriptive prompts
-It returns a list of bounding boxes with normalized coordinates,
-associated probability scores.
+particularly good at detecting specific objects given detailed descriptive prompts
+but runs slower. It returns a list of bounding boxes with normalized coordinates,
+label names and associated probability scores.

 Parameters:
 prompt (str): The prompt to ground to the image, only supports a single prompt
@@ -1447,8 +1533,8 @@ def agentic_sam2_instance_segmentation(
 """'agentic_sam2_instance_segmentation' is a tool that can detect multiple
 instances given a text prompt such as object names or referring expressions on
 images. It's particularly good at detecting specific objects given detailed
-descriptive prompts. It returns a list of bounding boxes with
-coordinates, label names, masks and associated probability scores.
+descriptive prompts but runs slower. It returns a list of bounding boxes with
+normalized coordinates, label names, masks and associated probability scores.

 Parameters:
 prompt (str): The object that needs to be counted, only supports a single
@@ -1505,9 +1591,9 @@ def agentic_sam2_video_tracking(
 """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
 objects in a video given a text prompt such as object names or referring
 expressions. It's particularly good at detecting specific objects given detailed
-descriptive prompts and returns a list of bounding boxes, label
-associated probability scores and is useful for tracking and
-duplicating counts.
+descriptive prompts but runs slower, and returns a list of bounding boxes, label
+names, masks and associated probability scores and is useful for tracking and
+counting without duplicating counts.

 Parameters:
 prompt (str): The prompt to ground to the image, only supports a single prompt
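The agentic docstring updates above spell out the trade-off: these detectors handle detailed referring expressions well but run slower. A hedged sketch of a single-prompt call follows, assuming `agentic_object_detection` is re-exported from `vision_agent.tools` as in earlier releases; the prompt and image are illustrative.

```python
# Illustrative only: one detailed prompt with no commas or periods, per the docstring.
from vision_agent.tools import agentic_object_detection, load_image

image = load_image("street.jpg")  # hypothetical input image
detections = agentic_object_detection("person wearing a red backpack", image)
print([(d["label"], d["score"]) for d in detections])
```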
@@ -1560,6 +1646,305 @@ def agentic_sam2_video_tracking(
 return ret["return_data"] # type: ignore


+# GLEE Tools
+
+
+def _glee_object_detection(
+prompt: str,
+image: np.ndarray,
+box_threshold: float,
+image_size: Tuple[int, ...],
+image_bytes: Optional[bytes] = None,
+) -> Dict[str, Any]:
+if image_bytes is None:
+image_bytes = numpy_to_bytes(image)
+
+files = [("image", image_bytes)]
+payload = {
+"prompts": [s.strip() for s in prompt.split(",")],
+"confidence": box_threshold,
+"model": "glee",
+}
+metadata = {"function_name": "glee"}
+detections = send_task_inference_request(
+payload,
+"text-to-object-detection",
+files=files,
+metadata=metadata,
+)
+# get the first frame
+bboxes = detections[0]
+bboxes_formatted = [
+{
+"label": bbox["label"],
+"bbox": normalize_bbox(bbox["bounding_box"], image_size),
+"score": round(bbox["score"], 2),
+}
+for bbox in bboxes
+]
+display_data = [
+{
+"label": bbox["label"],
+"bbox": bbox["bounding_box"],
+"score": round(bbox["score"], 2),
+}
+for bbox in bboxes
+]
+return {
+"files": files,
+"return_data": bboxes_formatted,
+"display_data": display_data,
+}
+
+
+def glee_object_detection(
+prompt: str,
+image: np.ndarray,
+box_threshold: float = 0.23,
+) -> List[Dict[str, Any]]:
+"""'glee_object_detection' is a tool that can detect multiple objects given a
+text prompt such as object names or referring expressions on images. It's
+particularly good at detecting specific objects given detailed descriptive prompts.
+It returns a list of bounding boxes with normalized coordinates, label names and
+associated probability scores.
+
+Parameters:
+prompt (str): The prompt to ground to the image, only supports a single prompt
+with no commas or periods.
+image (np.ndarray): The image to ground the prompt to.
+
+Returns:
+List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+bounding box of the detected objects with normalized coordinates between 0
+and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+top-left and xmax and ymax are the coordinates of the bottom-right of the
+bounding box.
+
+Example
+-------
+>>> glee_object_detection("person holding a box", image)
+[
+{'score': 0.99, 'label': 'person holding a box', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+{'score': 0.98, 'label': 'person holding a box', 'bbox': [0.2, 0.21, 0.45, 0.5},
+]
+"""
+
+od_ret = _glee_object_detection(prompt, image, box_threshold, image.shape[:2])
+_display_tool_trace(
+glee_object_detection.__name__,
+{"prompts": prompt, "confidence": box_threshold},
+od_ret["display_data"],
+od_ret["files"],
+)
+return od_ret["return_data"] # type: ignore
+
+
+def glee_sam2_instance_segmentation(
+prompt: str, image: np.ndarray, box_threshold: float = 0.23
+) -> List[Dict[str, Any]]:
+"""'glee_sam2_instance_segmentation' is a tool that can detect multiple
+instances given a text prompt such as object names or referring expressions on
+images. It's particularly good at detecting specific objects given detailed
+descriptive prompts. It returns a list of bounding boxes with normalized
+coordinates, label names, masks and associated probability scores.
+
+Parameters:
+prompt (str): The object that needs to be counted, only supports a single
+prompt with no commas or periods.
+image (np.ndarray): The image that contains multiple instances of the object.
+
+Returns:
+List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+bounding box, and mask of the detected objects with normalized coordinates
+(xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+the background.
+
+Example
+-------
+>>> glee_sam2_instance_segmentation("a large blue flower", image)
+[
+{
+'score': 0.49,
+'label': 'a large blue flower',
+'bbox': [0.1, 0.11, 0.35, 0.4],
+'mask': array([[0, 0, 0, ..., 0, 0, 0],
+[0, 0, 0, ..., 0, 0, 0],
+...,
+[0, 0, 0, ..., 0, 0, 0],
+[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+},
+]
+"""
+od_ret = _glee_object_detection(prompt, image, box_threshold, image.shape[:2])
+seg_ret = _sam2(
+image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
+)
+
+_display_tool_trace(
+glee_sam2_instance_segmentation.__name__,
+{
+"prompts": prompt,
+"confidence": box_threshold,
+},
+seg_ret["display_data"],
+seg_ret["files"],
+)
+
+return seg_ret["return_data"] # type: ignore
+
+
+def glee_sam2_video_tracking(
+prompt: str,
+frames: List[np.ndarray],
+box_threshold: float = 0.23,
+chunk_length: Optional[int] = 25,
+) -> List[List[Dict[str, Any]]]:
+"""'glee_sam2_video_tracking' is a tool that can track and segment multiple
+objects in a video given a text prompt such as object names or referring
+expressions. It's particularly good at detecting specific objects given detailed
+descriptive prompts and returns a list of bounding boxes, label names, masks and
+associated probability scores and is useful for tracking and counting without
+duplicating counts.
+
+Parameters:
+prompt (str): The prompt to ground to the image, only supports a single prompt
+with no commas or periods.
+frames (List[np.ndarray]): The list of frames to ground the prompt to.
+chunk_length (Optional[int]): The number of frames to re-run agentic object detection to
+to find new objects.
+
+Returns:
+List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+label, segmentation mask and bounding boxes. The outer list represents each
+frame and the inner list is the entities per frame. The detected objects
+have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+and ymin are the coordinates of the top-left and xmax and ymax are the
+coordinates of the bottom-right of the bounding box. The mask is binary 2D
+numpy array where 1 indicates the object and 0 indicates the background.
+The label names are prefixed with their ID represent the total count.
+
+Example
+-------
+>>> glee_sam2_video_tracking("a runner with yellow shoes", frames)
+[
+[
+{
+'label': '0: a runner with yellow shoes',
+'bbox': [0.1, 0.11, 0.35, 0.4],
+'mask': array([[0, 0, 0, ..., 0, 0, 0],
+[0, 0, 0, ..., 0, 0, 0],
+...,
+[0, 0, 0, ..., 0, 0, 0],
+[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+},
+],
+...
+]
+"""
+ret = od_sam2_video_tracking(
+ODModels.GLEE,
+prompt=prompt,
+frames=frames,
+box_threshold=box_threshold,
+chunk_length=chunk_length,
+)
+_display_tool_trace(
+glee_sam2_video_tracking.__name__,
+{"prompt": prompt, "chunk_length": chunk_length},
+ret["display_data"],
+ret["files"],
+)
+return ret["return_data"] # type: ignore
+
+
+# Qwen2 and 2.5 VL Tool
+
+
+def qwen25_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
+"""'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary
+images including regular images or images of documents or presentations. It can be
+very useful for document QA or OCR text extraction. It returns text as an answer to
+the question.
+
+Parameters:
+prompt (str): The question about the document image
+images (List[np.ndarray]): The reference images used for the question
+
+Returns:
+str: A string which is the answer to the given prompt.
+
+Example
+-------
+>>> qwen25_vl_images_vqa('Give a summary of the document', images)
+'The document talks about the history of the United States of America and its...'
+"""
+if isinstance(images, np.ndarray):
+images = [images]
+
+for image in images:
+if image.shape[0] < 1 or image.shape[1] < 1:
+raise ValueError(f"Image is empty, image shape: {image.shape}")
+
+files = [("images", numpy_to_bytes(image)) for image in images]
+payload = {
+"prompt": prompt,
+"model": "qwen25vl",
+"function_name": "qwen25_vl_images_vqa",
+}
+data: Dict[str, Any] = send_inference_request(
+payload, "image-to-text", files=files, v2=True
+)
+_display_tool_trace(
+qwen25_vl_images_vqa.__name__,
+payload,
+cast(str, data),
+files,
+)
+return cast(str, data)
+
+
+def qwen25_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+"""'qwen25_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
+including regular videos or videos of documents or presentations. It returns text
+as an answer to the question.
+
+Parameters:
+prompt (str): The question about the video
+frames (List[np.ndarray]): The reference frames used for the question
+
+Returns:
+str: A string which is the answer to the given prompt.
+
+Example
+-------
+>>> qwen25_vl_video_vqa('Which football player made the goal?', frames)
+'Lionel Messi'
+"""
+
+if len(frames) == 0 or not isinstance(frames, list):
+raise ValueError("Must provide a list of numpy arrays for frames")
+
+buffer_bytes = frames_to_bytes(frames)
+files = [("video", buffer_bytes)]
+payload = {
+"prompt": prompt,
+"model": "qwen25vl",
+"function_name": "qwen25_vl_video_vqa",
+}
+data: Dict[str, Any] = send_inference_request(
+payload, "image-to-text", files=files, v2=True
+)
+_display_tool_trace(
+qwen25_vl_video_vqa.__name__,
+payload,
+cast(str, data),
+files,
+)
+return cast(str, data)
+
+
 def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
 """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
 images including regular images or images of documents or presentations. It can be
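The GLEE tools above route through `od_sam2_video_tracking` with `ODModels.GLEE`, and the Qwen2.5-VL tools mirror the existing Qwen2-VL VQA API. A hedged sketch combining the two on a clip follows; it assumes `frames` is already a list of RGB `np.ndarray` video frames.

```python
# Sketch under the assumption that `frames` is a list of RGB numpy arrays.
from typing import List
import numpy as np
from vision_agent.tools import glee_sam2_video_tracking, qwen25_vl_video_vqa

def describe_runner(frames: List[np.ndarray]) -> str:
    # One detailed prompt, no commas or periods, per the glee_sam2_video_tracking docstring.
    tracks = glee_sam2_video_tracking("a runner with yellow shoes", frames)
    frames_with_target = sum(1 for frame_dets in tracks if frame_dets)
    # Free-form question about the same clip via the new Qwen2.5-VL VQA tool.
    answer = qwen25_vl_video_vqa("What is the runner doing?", frames)
    return f"target visible in {frames_with_target}/{len(frames)} frames; model answer: {answer}"
```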
@@ -1882,6 +2267,58 @@ def _sample(frames: List[np.ndarray], sample_size: int) -> List[np.ndarray]:
 return sampled_frames


+def _lmm_activity_recognition(
+lmm: LMM,
+segment: List[np.ndarray],
+prompt: str,
+) -> List[float]:
+frames = _sample(segment, 10)
+media = []
+for frame in frames:
+buffer = io.BytesIO()
+image_pil = Image.fromarray(frame)
+if image_pil.size[0] > 768:
+image_pil.thumbnail((768, 768))
+image_pil.save(buffer, format="PNG")
+image_bytes = buffer.getvalue()
+image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+media.append(image_b64)
+
+response = cast(str, lmm.generate(prompt, media))
+if "yes" in response.lower():
+return [1.0] * len(segment)
+return [0.0] * len(segment)
+
+
+def _qwenvl_activity_recognition(
+segment: List[np.ndarray], prompt: str, model_name: str = "qwen2vl"
+) -> List[float]:
+payload: Dict[str, Any] = {
+"prompt": prompt,
+"model": model_name,
+"function_name": f"{model_name}_vl_video_vqa",
+}
+segment_buffer_bytes = [("video", frames_to_bytes(segment))]
+response = send_inference_request(
+payload, "image-to-text", files=segment_buffer_bytes, v2=True
+)
+if "yes" in response.lower():
+return [1.0] * len(segment)
+return [0.0] * len(segment)
+
+
+def _qwen2vl_activity_recognition(
+segment: List[np.ndarray], prompt: str
+) -> List[float]:
+return _qwenvl_activity_recognition(segment, prompt, model_name="qwen2vl")
+
+
+def _qwen25vl_activity_recognition(
+segment: List[np.ndarray], prompt: str
+) -> List[float]:
+return _qwenvl_activity_recognition(segment, prompt, model_name="qwen25vl")
+
+
 def activity_recognition(
 prompt: str,
 frames: List[np.ndarray],
@@ -1921,53 +2358,26 @@ def activity_recognition(
 f"{prompt} Please respond with a 'yes' or 'no' based on the frames provided."
 )

-def _lmm_activity_recognition(
-lmm: LMM,
-segment: List[np.ndarray],
-) -> List[float]:
-frames = _sample(segment, 10)
-media = []
-for frame in frames:
-buffer = io.BytesIO()
-image_pil = Image.fromarray(frame)
-if image_pil.size[0] > 768:
-image_pil.thumbnail((768, 768))
-image_pil.save(buffer, format="PNG")
-image_bytes = buffer.getvalue()
-image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
-media.append(image_b64)
-
-response = cast(str, lmm.generate(prompt, media))
-if "yes" in response.lower():
-return [1.0] * len(segment)
-return [0.0] * len(segment)
-
-def _qwen2vl_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-payload: Dict[str, Any] = {
-"prompt": prompt,
-"model": "qwen2vl",
-"function_name": "qwen2_vl_video_vqa",
-}
-segment_buffer_bytes = [("video", frames_to_bytes(segment))]
-response = send_inference_request(
-payload, "image-to-text", files=segment_buffer_bytes, v2=True
-)
-if "yes" in response.lower():
-return [1.0] * len(segment)
-return [0.0] * len(segment)
-
 if model == "claude-35":

 def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-return _lmm_activity_recognition(AnthropicLMM(), segment)
+return _lmm_activity_recognition(AnthropicLMM(), segment, prompt)

 elif model == "gpt-4o":

 def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-return _lmm_activity_recognition(OpenAILMM(), segment)
+return _lmm_activity_recognition(OpenAILMM(), segment, prompt)

 elif model == "qwen2vl":
-
+
+def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
+return _qwen2vl_activity_recognition(segment, prompt)
+
+elif model == "qwen25vl":
+
+def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
+return _qwen25vl_activity_recognition(segment, prompt)
+
 else:
 raise ValueError(f"Invalid model: {model}")

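With the refactor above, `activity_recognition` routes the "qwen2vl" and new "qwen25vl" backends through the shared `_qwenvl_activity_recognition` helper, and the LMM path now receives the prompt explicitly. A hedged sketch of calling the public function with the new backend (placeholder frames stand in for a real video):

```python
# Sketch: per the dispatch above, "qwen25vl" is now a valid model choice alongside
# "claude-35", "gpt-4o" and "qwen2vl". The frames here are placeholders.
import numpy as np
from vision_agent.tools import activity_recognition

frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(16)]  # placeholder clip
scores = activity_recognition(
    "Did a goal happen?",
    frames,
    model="qwen25vl",  # keyword name assumed from the `model` checks in the diff
)
# One score per frame: 1.0 for segments the model answered "yes" on, else 0.0.
print(f"{sum(scores):.0f} of {len(scores)} frames flagged")
```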
@@ -2135,15 +2545,16 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:


 def depth_anything_v2(image: np.ndarray) -> np.ndarray:
-"""'depth_anything_v2' is a tool that runs
+"""'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
 depth image from a given RGB image. The returned depth image is monochrome and
-represents depth values as pixel
+represents depth values as pixel intensities with pixel values ranging from 0 to 255.

 Parameters:
 image (np.ndarray): The image to used to generate depth image

 Returns:
-np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255
+np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255
+where high values represent closer objects and low values further.

 Example
 -------
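The clarified docstring pins down the convention: the output is a 0-255 grayscale depth map where higher values mean closer objects. A small hedged sketch using that convention; the image file name is hypothetical.

```python
# Sketch: locate the closest point using the "higher value = closer" convention
# documented above. "room.jpg" is a hypothetical input image.
import numpy as np
from vision_agent.tools import depth_anything_v2, load_image

image = load_image("room.jpg")
depth = depth_anything_v2(image)  # grayscale np.ndarray, values in [0, 255]
row, col = np.unravel_index(int(np.argmax(depth)), depth.shape)
print(f"closest point near pixel ({row}, {col}), depth value {depth[row, col]}")
```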
All remaining files listed above with +0 -0 are unchanged between vision_agent-0.2.243 and vision_agent-0.2.244.