vision-agent 0.2.236__py3-none-any.whl → 0.2.237__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +57 -80
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/agent.py +2 -2
- vision_agent/agent/vision_agent.py +3 -2
- vision_agent/agent/vision_agent_coder.py +13 -19
- vision_agent/agent/vision_agent_coder_v2.py +17 -17
- vision_agent/agent/vision_agent_planner.py +16 -21
- vision_agent/agent/vision_agent_planner_prompts_v2.py +19 -20
- vision_agent/agent/vision_agent_planner_v2.py +29 -15
- vision_agent/agent/vision_agent_v2.py +12 -12
- vision_agent/clients/landing_public_api.py +1 -1
- vision_agent/configs/config.py +17 -3
- vision_agent/lmm/__init__.py +0 -1
- vision_agent/lmm/lmm.py +4 -3
- vision_agent/models/__init__.py +11 -0
- vision_agent/{lmm/types.py → models/lmm_types.py} +4 -1
- vision_agent/sim/__init__.py +8 -0
- vision_agent/{utils → sim}/sim.py +3 -3
- vision_agent/tools/__init__.py +10 -23
- vision_agent/tools/meta_tools.py +4 -5
- vision_agent/tools/planner_tools.py +127 -37
- vision_agent/tools/tools.py +388 -302
- vision_agent/utils/__init__.py +0 -1
- vision_agent/{agent/agent_utils.py → utils/agent.py} +11 -2
- vision_agent/utils/image_utils.py +18 -7
- vision_agent/{tools/tool_utils.py → utils/tools.py} +1 -93
- vision_agent/utils/tools_doc.py +87 -0
- vision_agent/utils/video.py +15 -0
- vision_agent/utils/video_tracking.py +38 -5
- {vision_agent-0.2.236.dist-info → vision_agent-0.2.237.dist-info}/METADATA +2 -2
- vision_agent-0.2.237.dist-info/RECORD +55 -0
- vision_agent-0.2.236.dist-info/RECORD +0 -52
- /vision_agent/{agent/types.py → models/agent_types.py} +0 -0
- /vision_agent/{tools → models}/tools_types.py +0 -0
- {vision_agent-0.2.236.dist-info → vision_agent-0.2.237.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.236.dist-info → vision_agent-0.2.237.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -8,11 +8,12 @@ from base64 import b64encode
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from importlib import resources
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 from uuid import UUID

 import cv2
 import numpy as np
+import pandas as pd
 import requests
 from IPython.display import display
 from PIL import Image, ImageDraw, ImageFont
@@ -20,21 +21,8 @@ from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore

 from vision_agent.clients.landing_public_api import LandingPublicAPI
-from vision_agent.lmm.lmm import AnthropicLMM
-from vision_agent.tools.tool_utils import (
-    ToolCallTrace,
-    add_bboxes_from_masks,
-    get_tool_descriptions,
-    get_tool_documentation,
-    get_tools_df,
-    get_tools_info,
-    nms,
-    send_inference_request,
-    send_task_inference_request,
-    should_report_tool_traces,
-    single_nms,
-)
-from vision_agent.tools.tools_types import JobStatus
+from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
+from vision_agent.models import JobStatus
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
@@ -48,6 +36,21 @@ from vision_agent.utils.image_utils import (
     rle_decode,
     rle_decode_array,
 )
+from vision_agent.utils.tools import (
+    ToolCallTrace,
+    add_bboxes_from_masks,
+    nms,
+    send_inference_request,
+    send_task_inference_request,
+    should_report_tool_traces,
+    single_nms,
+)
+from vision_agent.utils.tools_doc import get_tool_descriptions as _get_tool_descriptions
+from vision_agent.utils.tools_doc import (
+    get_tool_documentation as _get_tool_documentation,
+)
+from vision_agent.utils.tools_doc import get_tools_df as _get_tools_df
+from vision_agent.utils.tools_doc import get_tools_info as _get_tools_info
 from vision_agent.utils.video import (
     extract_frames_from_video,
     frames_to_bytes,
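For downstream code that imported these helpers from their old locations, the relocations above imply updates roughly like the following sketch. Which names each new module re-exports is assumed from the import block shown in this diff, not verified against the package:

    # 0.2.236 (old paths, removed in this release):
    # from vision_agent.tools.tool_utils import send_inference_request, nms
    # from vision_agent.tools.tools_types import JobStatus

    # 0.2.237 (new paths, per the imports above):
    from vision_agent.utils.tools import send_inference_request, nms
    from vision_agent.models import JobStatus
    from vision_agent.utils.tools_doc import get_tools_df  # doc helpers now live in tools_doc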
@@ -234,6 +237,7 @@ def od_sam2_video_tracking(
     od_model: ODModels,
     prompt: str,
     frames: List[np.ndarray],
+    box_threshold: float = 0.30,
     chunk_length: Optional[int] = 50,
     fine_tune_id: Optional[str] = None,
 ) -> Dict[str, Any]:
@@ -278,7 +282,9 @@ def od_sam2_video_tracking(

         if od_model == ODModels.COUNTGD:
             segment_results = countgd_object_detection(
-                prompt=prompt,
+                prompt=prompt,
+                image=segment_frames[frame_number],
+                box_threshold=box_threshold,
             )
             function_name = "countgd_object_detection"

@@ -286,6 +292,7 @@ def od_sam2_video_tracking(
             segment_results = owlv2_object_detection(
                 prompt=prompt,
                 image=segment_frames[frame_number],
+                box_threshold=box_threshold,
                 fine_tune_id=fine_tune_id,
             )
             function_name = "owlv2_object_detection"
@@ -310,6 +317,7 @@ def od_sam2_video_tracking(
             segment_results = custom_object_detection(
                 deployment_id=fine_tune_id,
                 image=segment_frames[frame_number],
+                box_threshold=box_threshold,
             )
             function_name = "custom_object_detection"

@@ -546,6 +554,7 @@ def owlv2_sam2_instance_segmentation(
 def owlv2_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
+    box_threshold: float = 0.10,
     chunk_length: Optional[int] = 25,
     fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
@@ -558,6 +567,8 @@ def owlv2_sam2_video_tracking(
     Parameters:
         prompt (str): The prompt to ground to the image.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.10.
         chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
             new objects.
         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
@@ -596,6 +607,7 @@ def owlv2_sam2_video_tracking(
         ODModels.OWLV2,
         prompt=prompt,
         frames=frames,
+        box_threshold=box_threshold,
         chunk_length=chunk_length,
         fine_tune_id=fine_tune_id,
     )
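Callers can now tune detection sensitivity when tracking. A minimal usage sketch of the new parameter, assuming vision_agent.tools re-exports this function and using placeholder frames in place of a real video:

    import numpy as np
    from vision_agent.tools import owlv2_sam2_video_tracking

    # Placeholder frames; in practice these come from a decoded video.
    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(50)]

    # box_threshold (new in 0.2.237) is forwarded to owlv2_object_detection for each
    # chunk; lower values keep lower-confidence boxes. 0.10 is the default shown above.
    tracks = owlv2_sam2_video_tracking("person", frames, box_threshold=0.10, chunk_length=25)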
@@ -1118,6 +1130,7 @@ def countgd_sam2_instance_segmentation(
 def countgd_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
+    box_threshold: float = 0.23,
     chunk_length: Optional[int] = 25,
 ) -> List[List[Dict[str, Any]]]:
     """'countgd_sam2_video_tracking' is a tool that can track and segment multiple
@@ -1129,6 +1142,8 @@ def countgd_sam2_video_tracking(
     Parameters:
         prompt (str): The prompt to ground to the image.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        box_threshold (float, optional): The threshold for detection. Defaults
+            to 0.23.
         chunk_length (Optional[int]): The number of frames to re-run countgd to find
             new objects.

@@ -1162,7 +1177,11 @@ def countgd_sam2_video_tracking(
     """

     ret = od_sam2_video_tracking(
-        ODModels.COUNTGD,
+        ODModels.COUNTGD,
+        prompt=prompt,
+        frames=frames,
+        box_threshold=box_threshold,
+        chunk_length=chunk_length,
     )
     _display_tool_trace(
         countgd_sam2_video_tracking.__name__,
@@ -1173,6 +1192,9 @@ def countgd_sam2_video_tracking(
     return ret["return_data"]  # type: ignore


+# Custom Models
+
+
 def countgd_visual_prompt_object_detection(
     visual_prompts: List[List[float]],
     image: np.ndarray,
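The same knob is exposed on the CountGD tracker, with per-model defaults (0.23 here, 0.10 for the OWLv2 wrapper, 0.30 at the od_sam2_video_tracking level). A hedged usage sketch with placeholder frames:

    import numpy as np
    from vision_agent.tools import countgd_sam2_video_tracking

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(50)]  # placeholder frames

    # Raising box_threshold above the 0.23 default trades recall for precision.
    tracks = countgd_sam2_video_tracking("bottle", frames, box_threshold=0.4, chunk_length=25)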
@@ -1386,6 +1408,247 @@ def custom_od_sam2_video_tracking(
     return ret["return_data"]  # type: ignore


+# Agentic OD Tools
+
+
+def _agentic_object_detection(
+    prompt: str,
+    image: np.ndarray,
+    image_size: Tuple[int, ...],
+    image_bytes: Optional[bytes] = None,
+    fine_tune_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    if image_bytes is None:
+        image_bytes = numpy_to_bytes(image)
+
+    files = [("image", image_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "model": "agentic",
+    }
+    metadata = {"function_name": "agentic_object_detection"}
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        # we can only execute fine-tuned models with florence2
+        payload = {
+            "prompts": payload["prompts"],
+            "jobId": fine_tune_id,
+            "model": "florence2",
+        }
+
+    detections = send_task_inference_request(
+        payload,
+        "text-to-object-detection",
+        files=files,
+        metadata=metadata,
+    )
+
+    # get the first frame
+    bboxes = detections[0]
+    bboxes_formatted = [
+        {
+            "label": bbox["label"],
+            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    display_data = [
+        {
+            "label": bbox["label"],
+            "bbox": bbox["bounding_box"],
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    return {
+        "files": files,
+        "return_data": bboxes_formatted,
+        "display_data": display_data,
+    }
+
+
+def agentic_object_detection(
+    prompt: str,
+    image: np.ndarray,
+    fine_tune_id: Optional[str] = None,
+) -> List[Dict[str, Any]]:
+    """'agentic_object_detection' is a tool that can detect multiple objects given a
+    text prompt such as object names or referring expressions on images. It's
+    particularly good at detecting specific objects given detailed descriptive prompts.
+    It returns a list of bounding boxes with normalized coordinates, label names and
+    associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image, only supports a single prompt
+            with no commas or periods.
+        image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
+
+    Example
+    -------
+        >>> agentic_object_detection("a red car", image)
+        [
+            {'score': 0.99, 'label': 'a red car', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 0.98, 'label': 'a red car', 'bbox': [0.2, 0.21, 0.45, 0.5},
+        ]
+    """
+
+    image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
+
+    ret = _agentic_object_detection(
+        prompt, image, image_size, fine_tune_id=fine_tune_id
+    )
+
+    _display_tool_trace(
+        agentic_object_detection.__name__,
+        {"prompts": prompt},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
+def agentic_sam2_instance_segmentation(
+    prompt: str, image: np.ndarray
+) -> List[Dict[str, Any]]:
+    """'agentic_sam2_instance_segmentation' is a tool that can detect multiple
+    instances given a text prompt such as object names or referring expressions on
+    images. It's particularly good at detecting specific objects given detailed
+    descriptive prompts. It returns a list of bounding boxes with normalized
+    coordinates, label names, masks and associated probability scores.
+
+    Parameters:
+        prompt (str): The object that needs to be counted, only supports a single
+            prompt with no commas or periods.
+        image (np.ndarray): The image that contains multiple instances of the object.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+        >>> agentic_sam2_instance_segmentation("a large blue flower", image)
+        [
+            {
+                'score': 0.49,
+                'label': 'a large blue flower',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
+    """

+    od_ret = _agentic_object_detection(prompt, image, image.shape[:2])
+    seg_ret = _sam2(
+        image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
+    )
+
+    _display_tool_trace(
+        agentic_sam2_instance_segmentation.__name__,
+        {
+            "prompts": prompt,
+        },
+        seg_ret["display_data"],
+        seg_ret["files"],
+    )
+
+    return seg_ret["return_data"]  # type: ignore
+
+
+def agentic_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 25,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+    """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as object names or referring
+    expressions. It's particularly good at detecting specific objects given detailed
+    descriptive prompts and returns a list of bounding boxes, label names, masks and
+    associated probability scores and is useful for tracking and counting without
+    duplicating counts.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image, only supports a single prompt
+            with no commas or periods.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run agentic object detection to
+            to find new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+            label, segmentation mask and bounding boxes. The outer list represents each
+            frame and the inner list is the entities per frame. The detected objects
+            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+            and ymin are the coordinates of the top-left and xmax and ymax are the
+            coordinates of the bottom-right of the bounding box. The mask is binary 2D
+            numpy array where 1 indicates the object and 0 indicates the background.
+            The label names are prefixed with their ID represent the total count.
+
+    Example
+    -------
+        >>> agentic_sam2_video_tracking("a runner with yellow shoes", frames)
+        [
+            [
+                {
+                    'label': '0: a runner with yellow shoes',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    """
+
+    ret = od_sam2_video_tracking(
+        ODModels.AGENTIC,
+        prompt=prompt,
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=fine_tune_id,
+    )
+    _display_tool_trace(
+        agentic_sam2_video_tracking.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
 def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
     """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
     images including regular images or images of documents or presentations. It can be
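The relocated agentic detectors follow the same return conventions as the other OD tools. A hedged usage sketch, with a placeholder image and a single descriptive prompt (no commas or periods, as the docstrings require); the vision_agent.tools re-exports are assumed:

    import numpy as np
    from vision_agent.tools import (
        agentic_object_detection,
        agentic_sam2_instance_segmentation,
    )

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder; use a real RGB frame

    dets = agentic_object_detection("a red car parked next to a hydrant", image)
    confident = [d for d in dets if d["score"] > 0.5]  # bboxes are normalized (xmin, ymin, xmax, ymax)

    segs = agentic_sam2_instance_segmentation("a large blue flower", image)
    areas = [int(s["mask"].sum()) for s in segs]  # masks are binary 2D uint8 arrays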
@@ -1696,85 +1959,127 @@ Answer the question directly using only the information from the document, do no
     return llm_output


-def
+def _sample(frames: List[np.ndarray], sample_size: int) -> List[np.ndarray]:
+    sample_indices = np.linspace(0, len(frames) - 1, sample_size, dtype=int)
+    sampled_frames = []
+
+    for i, frame in enumerate(frames):
+        if i in sample_indices:
+            sampled_frames.append(frame)
+            if len(sampled_frames) >= sample_size:
+                break
+    return sampled_frames
+
+
+def activity_recognition(
     prompt: str,
     frames: List[np.ndarray],
     model: str = "qwen2vl",
-    chunk_length_frames: int =
+    chunk_length_frames: int = 10,
 ) -> List[float]:
-    """'
-
-
-    but does not track objects across frames.
-    It returns a list of floats with a value of 1.0 if the objects are found in a given
-    chunk_length_frames of the video.
+    """'activity_recognition' is a tool that can recognize activities in a video given a
+    text prompt. It can be used to identify where specific activities or actions
+    happen in a video and returns a list of 0s and 1s to indicate the activity.

     Parameters:
-        prompt (str): The
+        prompt (str): The event you want to identify, should be phrased as a question,
+            for example, "Did a goal happen?".
         frames (List[np.ndarray]): The reference frames used for the question
         model (str): The model to use for the inference. Valid values are
-            '
+            'claude-35', 'gpt-4o', 'qwen2vl'.
         chunk_length_frames (int): length of each chunk in frames

     Returns:
-        List[float]: A list of floats with a value of 1.0 if the
-
+        List[float]: A list of floats with a value of 1.0 if the activity is detected in
+            the chunk_length_frames of the video.

     Example
     -------
-        >>>
+        >>> activity_recognition('Did a goal happened?', frames)
         [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
     """

     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
-    payload: Dict[str, Any] = {
-        "prompt": prompt,
-        "model": model,
-        "function_name": "video_temporal_localization",
-    }
-    payload["chunk_length_frames"] = chunk_length_frames

-    segments = split_frames_into_segments(
+    segments = split_frames_into_segments(
+        frames, segment_size=chunk_length_frames, overlap=0
+    )
+
+    prompt = (
+        f"{prompt} Please respond with a 'yes' or 'no' based on the frames provided."
+    )

-    def
+    def _lmm_activity_recognition(
+        lmm: LMM,
         segment: List[np.ndarray],
     ) -> List[float]:
+        frames = _sample(segment, 10)
+        media = []
+        for frame in frames:
+            buffer = io.BytesIO()
+            image_pil = Image.fromarray(frame)
+            if image_pil.size[0] > 768:
+                image_pil.thumbnail((768, 768))
+            image_pil.save(buffer, format="PNG")
+            image_bytes = buffer.getvalue()
+            image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+            media.append(image_b64)
+
+        response = cast(str, lmm.generate(prompt, media))
+        if "yes" in response.lower():
+            return [1.0] * len(segment)
+        return [0.0] * len(segment)
+
+    def _qwen2vl_activity_recognition(segment: List[np.ndarray]) -> List[float]:
+        payload: Dict[str, Any] = {
+            "prompt": prompt,
+            "model": "qwen2vl",
+            "function_name": "qwen2_vl_video_vqa",
+        }
         segment_buffer_bytes = [("video", frames_to_bytes(segment))]
-
-        payload, "
+        response = send_inference_request(
+            payload, "image-to-text", files=segment_buffer_bytes, v2=True
         )
-
+        if "yes" in response.lower():
+            return [1.0] * len(segment)
+        return [0.0] * len(segment)
+
+    if model == "claude-35":
+
+        def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
+            return _lmm_activity_recognition(AnthropicLMM(), segment)
+
+    elif model == "gpt-4o":

-
-
-        full_data.extend([value] * chunk_length_frames)
+        def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
+            return _lmm_activity_recognition(OpenAILMM(), segment)

-
+    elif model == "qwen2vl":
+        _apply_activity_recognition = _qwen2vl_activity_recognition
+    else:
+        raise ValueError(f"Invalid model: {model}")

     with ThreadPoolExecutor() as executor:
         futures = {
-            executor.submit(
+            executor.submit(_apply_activity_recognition, segment): segment_index
             for segment_index, segment in enumerate(segments)
         }

-
+        return_value_tuples = []
         for future in as_completed(futures):
             segment_index = futures[future]
-
-
-
-            x[1] for x in sorted(localization_per_segment, key=lambda x: x[0])  # type: ignore
-        ]
-        localizations = cast(List[float], [e for o in localization_per_segment for e in o])
+            return_value_tuples.append((segment_index, future.result()))
+    return_values = [x[1] for x in sorted(return_value_tuples, key=lambda x: x[0])]
+    return_values_flattened = cast(List[float], [e for o in return_values for e in o])

     _display_tool_trace(
-
-
-
+        activity_recognition.__name__,
+        {"prompt": prompt, "model": model},
+        return_values,
         files,
     )
-    return
+    return return_values_flattened


 def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
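activity_recognition replaces video_temporal_localization in this release; it expects the prompt phrased as a yes/no question and returns one float per frame. A hedged usage sketch; the return shape of extract_frames_from_video as (frame, timestamp) pairs is an assumption based on the import list above:

    from vision_agent.tools import activity_recognition, extract_frames_from_video

    frames = [f for f, _ in extract_frames_from_video("match.mp4", fps=1)]

    flags = activity_recognition("Did a goal happen?", frames, model="qwen2vl", chunk_length_frames=10)
    goal_frame_indices = [i for i, v in enumerate(flags) if v == 1.0]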
@@ -2200,242 +2505,6 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
     return response


-# Agentic OD Tools
-
-
-def _agentic_object_detection(
-    prompt: str,
-    image: np.ndarray,
-    image_size: Tuple[int, ...],
-    image_bytes: Optional[bytes] = None,
-    fine_tune_id: Optional[str] = None,
-) -> Dict[str, Any]:
-    if image_bytes is None:
-        image_bytes = numpy_to_bytes(image)
-
-    files = [("image", image_bytes)]
-    payload = {
-        "prompts": [s.strip() for s in prompt.split(",")],
-        "model": "agentic",
-    }
-    metadata = {"function_name": "agentic_object_detection"}
-
-    if fine_tune_id is not None:
-        landing_api = LandingPublicAPI()
-        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
-        if status is not JobStatus.SUCCEEDED:
-            raise FineTuneModelIsNotReady(
-                f"Fine-tuned model {fine_tune_id} is not ready yet"
-            )
-
-        # we can only execute fine-tuned models with florence2
-        payload = {
-            "prompts": payload["prompts"],
-            "jobId": fine_tune_id,
-            "model": "florence2",
-        }
-
-    detections = send_task_inference_request(
-        payload,
-        "text-to-object-detection",
-        files=files,
-        metadata=metadata,
-    )
-
-    # get the first frame
-    bboxes = detections[0]
-    bboxes_formatted = [
-        {
-            "label": bbox["label"],
-            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
-            "score": bbox["score"],
-        }
-        for bbox in bboxes
-    ]
-    display_data = [
-        {
-            "label": bbox["label"],
-            "bbox": bbox["bounding_box"],
-            "score": bbox["score"],
-        }
-        for bbox in bboxes
-    ]
-    return {
-        "files": files,
-        "return_data": bboxes_formatted,
-        "display_data": display_data,
-    }
-
-
-def agentic_object_detection(
-    prompt: str,
-    image: np.ndarray,
-    fine_tune_id: Optional[str] = None,
-) -> List[Dict[str, Any]]:
-    """'agentic_object_detection' is a tool that can detect and count multiple objects
-    given a text prompt such as category names or referring expressions on images. The
-    categories in text prompt are separated by commas. It returns a list of bounding
-    boxes with normalized coordinates, label names and associated probability scores.
-
-    Parameters:
-        prompt (str): The prompt to ground to the image.
-        image (np.ndarray): The image to ground the prompt to.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-            bounding box of the detected objects with normalized coordinates between 0
-            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
-            top-left and xmax and ymax are the coordinates of the bottom-right of the
-            bounding box.
-
-    Example
-    -------
-        >>> agentic_object_detection("car", image)
-        [
-            {'score': 0.99, 'label': 'car', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-            {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
-        ]
-    """
-
-    image_size = image.shape[:2]
-    if image_size[0] < 1 or image_size[1] < 1:
-        return []
-
-    ret = _agentic_object_detection(
-        prompt, image, image_size, fine_tune_id=fine_tune_id
-    )
-
-    _display_tool_trace(
-        agentic_object_detection.__name__,
-        {"prompts": prompt},
-        ret["display_data"],
-        ret["files"],
-    )
-    return ret["return_data"]  # type: ignore
-
-
-def agentic_sam2_instance_segmentation(
-    prompt: str, image: np.ndarray
-) -> List[Dict[str, Any]]:
-    """'agentic_sam2_instance_segmentation' is a tool that can detect and count multiple
-    instances of objects given a text prompt such as category names or referring
-    expressions on images. The categories in text prompt are separated by commas. It
-    returns a list of bounding boxes with normalized coordinates, label names, masks
-    and associated probability scores.
-
-    Parameters:
-        prompt (str): The object that needs to be counted.
-        image (np.ndarray): The image that contains multiple instances of the object.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-            bounding box, and mask of the detected objects with normalized coordinates
-            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-            the background.
-
-    Example
-    -------
-        >>> agentic_sam2_instance_segmentation("flower", image)
-        [
-            {
-                'score': 0.49,
-                'label': 'flower',
-                'bbox': [0.1, 0.11, 0.35, 0.4],
-                'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0],
-                    ...,
-                    [0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-            },
-        ]
-    """
-
-    od_ret = _agentic_object_detection(prompt, image, image.shape[:2])
-    seg_ret = _sam2(
-        image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
-    )
-
-    _display_tool_trace(
-        agentic_sam2_instance_segmentation.__name__,
-        {
-            "prompts": prompt,
-        },
-        seg_ret["display_data"],
-        seg_ret["files"],
-    )
-
-    return seg_ret["return_data"]  # type: ignore
-
-
-def agentic_sam2_video_tracking(
-    prompt: str,
-    frames: List[np.ndarray],
-    chunk_length: Optional[int] = 25,
-    fine_tune_id: Optional[str] = None,
-) -> List[List[Dict[str, Any]]]:
-    """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
-    objects in a video given a text prompt such as category names or referring
-    expressions. The categories in the text prompt are separated by commas. It returns
-    a list of bounding boxes, label names, masks and associated probability scores and
-    is useful for tracking and counting without duplicating counts.
-
-    Parameters:
-        prompt (str): The prompt to ground to the image.
-        frames (List[np.ndarray]): The list of frames to ground the prompt to.
-        chunk_length (Optional[int]): The number of frames to re-run agentic object detection to
-            to find new objects.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
-
-    Returns:
-        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
-            label, segmentation mask and bounding boxes. The outer list represents each
-            frame and the inner list is the entities per frame. The detected objects
-            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
-            and ymin are the coordinates of the top-left and xmax and ymax are the
-            coordinates of the bottom-right of the bounding box. The mask is binary 2D
-            numpy array where 1 indicates the object and 0 indicates the background.
-            The label names are prefixed with their ID represent the total count.
-
-    Example
-    -------
-        >>> agentic_sam2_video_tracking("dinosaur", frames)
-        [
-            [
-                {
-                    'label': '0: dinosaur',
-                    'bbox': [0.1, 0.11, 0.35, 0.4],
-                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                        [0, 0, 0, ..., 0, 0, 0],
-                        ...,
-                        [0, 0, 0, ..., 0, 0, 0],
-                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-                },
-            ],
-            ...
-        ]
-    """
-
-    ret = od_sam2_video_tracking(
-        ODModels.AGENTIC,
-        prompt=prompt,
-        frames=frames,
-        chunk_length=chunk_length,
-        fine_tune_id=fine_tune_id,
-    )
-    _display_tool_trace(
-        agentic_sam2_video_tracking.__name__,
-        {},
-        ret["display_data"],
-        ret["files"],
-    )
-    return ret["return_data"]  # type: ignore
-
-
 def minimum_distance(
     det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
 ) -> float:
@@ -3103,19 +3172,19 @@ FUNCTION_TOOLS = [
     countgd_sam2_instance_segmentation,
     countgd_sam2_video_tracking,
     florence2_ocr,
+    florence2_object_detection,
     florence2_sam2_instance_segmentation,
     florence2_sam2_video_tracking,
-    florence2_object_detection,
     claude35_text_extraction,
     document_extraction,
     document_qa,
     ocr,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
+    activity_recognition,
     depth_anything_v2,
     generate_pose_image,
     vit_nsfw_classification,
-    video_temporal_localization,
     flux_image_inpainting,
     siglip_classification,
     minimum_distance,
@@ -3129,13 +3198,30 @@ UTIL_TOOLS = [
     save_video,
     overlay_bounding_boxes,
     overlay_segmentation_masks,
-    overlay_heat_map,
 ]

 TOOLS = FUNCTION_TOOLS + UTIL_TOOLS

-
-
-
-
-
+
+def get_tools() -> List[Callable]:
+    return TOOLS  # type: ignore
+
+
+def get_tools_info() -> Dict[str, str]:
+    return _get_tools_info(FUNCTION_TOOLS)  # type: ignore
+
+
+def get_tools_df() -> pd.DataFrame:
+    return _get_tools_df(TOOLS)  # type: ignore
+
+
+def get_tools_descriptions() -> str:
+    return _get_tool_descriptions(TOOLS)  # type: ignore
+
+
+def get_tools_docstring() -> str:
+    return _get_tool_documentation(TOOLS)  # type: ignore
+
+
+def get_utilties_docstring() -> str:
+    return _get_tool_documentation(UTIL_TOOLS)  # type: ignore