vision-agent 0.2.236__py3-none-any.whl → 0.2.237__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. vision_agent/.sim_tools/df.csv +57 -80
  2. vision_agent/.sim_tools/embs.npy +0 -0
  3. vision_agent/agent/agent.py +2 -2
  4. vision_agent/agent/vision_agent.py +3 -2
  5. vision_agent/agent/vision_agent_coder.py +13 -19
  6. vision_agent/agent/vision_agent_coder_v2.py +17 -17
  7. vision_agent/agent/vision_agent_planner.py +16 -21
  8. vision_agent/agent/vision_agent_planner_prompts_v2.py +19 -20
  9. vision_agent/agent/vision_agent_planner_v2.py +29 -15
  10. vision_agent/agent/vision_agent_v2.py +12 -12
  11. vision_agent/clients/landing_public_api.py +1 -1
  12. vision_agent/configs/config.py +17 -3
  13. vision_agent/lmm/__init__.py +0 -1
  14. vision_agent/lmm/lmm.py +4 -3
  15. vision_agent/models/__init__.py +11 -0
  16. vision_agent/{lmm/types.py → models/lmm_types.py} +4 -1
  17. vision_agent/sim/__init__.py +8 -0
  18. vision_agent/{utils → sim}/sim.py +3 -3
  19. vision_agent/tools/__init__.py +10 -23
  20. vision_agent/tools/meta_tools.py +4 -5
  21. vision_agent/tools/planner_tools.py +127 -37
  22. vision_agent/tools/tools.py +388 -302
  23. vision_agent/utils/__init__.py +0 -1
  24. vision_agent/{agent/agent_utils.py → utils/agent.py} +11 -2
  25. vision_agent/utils/image_utils.py +18 -7
  26. vision_agent/{tools/tool_utils.py → utils/tools.py} +1 -93
  27. vision_agent/utils/tools_doc.py +87 -0
  28. vision_agent/utils/video.py +15 -0
  29. vision_agent/utils/video_tracking.py +38 -5
  30. {vision_agent-0.2.236.dist-info → vision_agent-0.2.237.dist-info}/METADATA +2 -2
  31. vision_agent-0.2.237.dist-info/RECORD +55 -0
  32. vision_agent-0.2.236.dist-info/RECORD +0 -52
  33. /vision_agent/{agent/types.py → models/agent_types.py} +0 -0
  34. /vision_agent/{tools → models}/tools_types.py +0 -0
  35. {vision_agent-0.2.236.dist-info → vision_agent-0.2.237.dist-info}/LICENSE +0 -0
  36. {vision_agent-0.2.236.dist-info → vision_agent-0.2.237.dist-info}/WHEEL +0 -0
@@ -8,11 +8,12 @@ from base64 import b64encode
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from importlib import resources
  from pathlib import Path
- from typing import Any, Dict, List, Optional, Tuple, Union, cast
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
  from uuid import UUID

  import cv2
  import numpy as np
+ import pandas as pd
  import requests
  from IPython.display import display
  from PIL import Image, ImageDraw, ImageFont
@@ -20,21 +21,8 @@ from pillow_heif import register_heif_opener # type: ignore
  from pytube import YouTube # type: ignore

  from vision_agent.clients.landing_public_api import LandingPublicAPI
- from vision_agent.lmm.lmm import AnthropicLMM
- from vision_agent.tools.tool_utils import (
- ToolCallTrace,
- add_bboxes_from_masks,
- get_tool_descriptions,
- get_tool_documentation,
- get_tools_df,
- get_tools_info,
- nms,
- send_inference_request,
- send_task_inference_request,
- should_report_tool_traces,
- single_nms,
- )
- from vision_agent.tools.tools_types import JobStatus
+ from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
+ from vision_agent.models import JobStatus
  from vision_agent.utils.exceptions import FineTuneModelIsNotReady
  from vision_agent.utils.execute import FileSerializer, MimeType
  from vision_agent.utils.image_utils import (
@@ -48,6 +36,21 @@ from vision_agent.utils.image_utils import (
  rle_decode,
  rle_decode_array,
  )
+ from vision_agent.utils.tools import (
+ ToolCallTrace,
+ add_bboxes_from_masks,
+ nms,
+ send_inference_request,
+ send_task_inference_request,
+ should_report_tool_traces,
+ single_nms,
+ )
+ from vision_agent.utils.tools_doc import get_tool_descriptions as _get_tool_descriptions
+ from vision_agent.utils.tools_doc import (
+ get_tool_documentation as _get_tool_documentation,
+ )
+ from vision_agent.utils.tools_doc import get_tools_df as _get_tools_df
+ from vision_agent.utils.tools_doc import get_tools_info as _get_tools_info
  from vision_agent.utils.video import (
  extract_frames_from_video,
  frames_to_bytes,
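
For downstream users, the module reorganization listed in the files-changed table moves several shared helpers. A minimal migration sketch, based only on the import paths shown in this hunk and assuming the new modules re-export the same names:

# 0.2.236
# from vision_agent.lmm.lmm import AnthropicLMM
# from vision_agent.tools.tool_utils import send_inference_request, nms
# from vision_agent.tools.tools_types import JobStatus

# 0.2.237
from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
from vision_agent.models import JobStatus
from vision_agent.utils.tools import send_inference_request, nms
from vision_agent.utils.tools_doc import get_tool_descriptions, get_tools_df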
@@ -234,6 +237,7 @@ def od_sam2_video_tracking(
  od_model: ODModels,
  prompt: str,
  frames: List[np.ndarray],
+ box_threshold: float = 0.30,
  chunk_length: Optional[int] = 50,
  fine_tune_id: Optional[str] = None,
  ) -> Dict[str, Any]:
@@ -278,7 +282,9 @@ def od_sam2_video_tracking(

  if od_model == ODModels.COUNTGD:
  segment_results = countgd_object_detection(
- prompt=prompt, image=segment_frames[frame_number]
+ prompt=prompt,
+ image=segment_frames[frame_number],
+ box_threshold=box_threshold,
  )
  function_name = "countgd_object_detection"

@@ -286,6 +292,7 @@ def od_sam2_video_tracking(
  segment_results = owlv2_object_detection(
  prompt=prompt,
  image=segment_frames[frame_number],
+ box_threshold=box_threshold,
  fine_tune_id=fine_tune_id,
  )
  function_name = "owlv2_object_detection"
@@ -310,6 +317,7 @@ def od_sam2_video_tracking(
  segment_results = custom_object_detection(
  deployment_id=fine_tune_id,
  image=segment_frames[frame_number],
+ box_threshold=box_threshold,
  )
  function_name = "custom_object_detection"

@@ -546,6 +554,7 @@ def owlv2_sam2_instance_segmentation(
  def owlv2_sam2_video_tracking(
  prompt: str,
  frames: List[np.ndarray],
+ box_threshold: float = 0.10,
  chunk_length: Optional[int] = 25,
  fine_tune_id: Optional[str] = None,
  ) -> List[List[Dict[str, Any]]]:
@@ -558,6 +567,8 @@ def owlv2_sam2_video_tracking(
  Parameters:
  prompt (str): The prompt to ground to the image.
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
+ box_threshold (float, optional): The threshold for the box detection. Defaults
+ to 0.10.
  chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
  new objects.
  fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
@@ -596,6 +607,7 @@ def owlv2_sam2_video_tracking(
  ODModels.OWLV2,
  prompt=prompt,
  frames=frames,
+ box_threshold=box_threshold,
  chunk_length=chunk_length,
  fine_tune_id=fine_tune_id,
  )
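
A hedged usage sketch of the new box_threshold knob on the SAM2 video-tracking tools; the frame-decoding step is an assumption (extract_frames_from_video is taken to yield (frame, timestamp) pairs), and only functions defined in this module are imported:

from vision_agent.tools.tools import owlv2_sam2_video_tracking
from vision_agent.utils.video import extract_frames_from_video

# Assumed to return (frame, timestamp) pairs; keep only the numpy frames.
frames = [frame for frame, _ in extract_frames_from_video("video.mp4")]
# Default box_threshold is 0.10 here; raising it drops low-confidence boxes.
tracks = owlv2_sam2_video_tracking("person", frames, box_threshold=0.25, chunk_length=25)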
@@ -1118,6 +1130,7 @@ def countgd_sam2_instance_segmentation(
  def countgd_sam2_video_tracking(
  prompt: str,
  frames: List[np.ndarray],
+ box_threshold: float = 0.23,
  chunk_length: Optional[int] = 25,
  ) -> List[List[Dict[str, Any]]]:
  """'countgd_sam2_video_tracking' is a tool that can track and segment multiple
@@ -1129,6 +1142,8 @@ def countgd_sam2_video_tracking(
  Parameters:
  prompt (str): The prompt to ground to the image.
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
+ box_threshold (float, optional): The threshold for detection. Defaults
+ to 0.23.
  chunk_length (Optional[int]): The number of frames to re-run countgd to find
  new objects.

@@ -1162,7 +1177,11 @@ def countgd_sam2_video_tracking(
  """

  ret = od_sam2_video_tracking(
- ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
+ ODModels.COUNTGD,
+ prompt=prompt,
+ frames=frames,
+ box_threshold=box_threshold,
+ chunk_length=chunk_length,
  )
  _display_tool_trace(
  countgd_sam2_video_tracking.__name__,
@@ -1173,6 +1192,9 @@ def countgd_sam2_video_tracking(
  return ret["return_data"] # type: ignore


+ # Custom Models
+
+
  def countgd_visual_prompt_object_detection(
  visual_prompts: List[List[float]],
  image: np.ndarray,
@@ -1386,6 +1408,247 @@ def custom_od_sam2_video_tracking(
  return ret["return_data"] # type: ignore


+ # Agentic OD Tools
+
+
+ def _agentic_object_detection(
+ prompt: str,
+ image: np.ndarray,
+ image_size: Tuple[int, ...],
+ image_bytes: Optional[bytes] = None,
+ fine_tune_id: Optional[str] = None,
+ ) -> Dict[str, Any]:
+ if image_bytes is None:
+ image_bytes = numpy_to_bytes(image)
+
+ files = [("image", image_bytes)]
+ payload = {
+ "prompts": [s.strip() for s in prompt.split(",")],
+ "model": "agentic",
+ }
+ metadata = {"function_name": "agentic_object_detection"}
+
+ if fine_tune_id is not None:
+ landing_api = LandingPublicAPI()
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+ if status is not JobStatus.SUCCEEDED:
+ raise FineTuneModelIsNotReady(
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
+ )
+
+ # we can only execute fine-tuned models with florence2
+ payload = {
+ "prompts": payload["prompts"],
+ "jobId": fine_tune_id,
+ "model": "florence2",
+ }
+
+ detections = send_task_inference_request(
+ payload,
+ "text-to-object-detection",
+ files=files,
+ metadata=metadata,
+ )
+
+ # get the first frame
+ bboxes = detections[0]
+ bboxes_formatted = [
+ {
+ "label": bbox["label"],
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+ "score": bbox["score"],
+ }
+ for bbox in bboxes
+ ]
+ display_data = [
+ {
+ "label": bbox["label"],
+ "bbox": bbox["bounding_box"],
+ "score": bbox["score"],
+ }
+ for bbox in bboxes
+ ]
+ return {
+ "files": files,
+ "return_data": bboxes_formatted,
+ "display_data": display_data,
+ }
+
+
+ def agentic_object_detection(
+ prompt: str,
+ image: np.ndarray,
+ fine_tune_id: Optional[str] = None,
+ ) -> List[Dict[str, Any]]:
+ """'agentic_object_detection' is a tool that can detect multiple objects given a
+ text prompt such as object names or referring expressions on images. It's
+ particularly good at detecting specific objects given detailed descriptive prompts.
+ It returns a list of bounding boxes with normalized coordinates, label names and
+ associated probability scores.
+
+ Parameters:
+ prompt (str): The prompt to ground to the image, only supports a single prompt
+ with no commas or periods.
+ image (np.ndarray): The image to ground the prompt to.
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+ fine-tuned model ID here to use it.
+
+ Returns:
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+ bounding box of the detected objects with normalized coordinates between 0
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
+ bounding box.
+
+ Example
+ -------
+ >>> agentic_object_detection("a red car", image)
+ [
+ {'score': 0.99, 'label': 'a red car', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 0.98, 'label': 'a red car', 'bbox': [0.2, 0.21, 0.45, 0.5},
+ ]
+ """
+
+ image_size = image.shape[:2]
+ if image_size[0] < 1 or image_size[1] < 1:
+ return []
+
+ ret = _agentic_object_detection(
+ prompt, image, image_size, fine_tune_id=fine_tune_id
+ )
+
+ _display_tool_trace(
+ agentic_object_detection.__name__,
+ {"prompts": prompt},
+ ret["display_data"],
+ ret["files"],
+ )
+ return ret["return_data"] # type: ignore
+
+
+ def agentic_sam2_instance_segmentation(
+ prompt: str, image: np.ndarray
+ ) -> List[Dict[str, Any]]:
+ """'agentic_sam2_instance_segmentation' is a tool that can detect multiple
+ instances given a text prompt such as object names or referring expressions on
+ images. It's particularly good at detecting specific objects given detailed
+ descriptive prompts. It returns a list of bounding boxes with normalized
+ coordinates, label names, masks and associated probability scores.
+
+ Parameters:
+ prompt (str): The object that needs to be counted, only supports a single
+ prompt with no commas or periods.
+ image (np.ndarray): The image that contains multiple instances of the object.
+
+ Returns:
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+ bounding box, and mask of the detected objects with normalized coordinates
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+ the background.
+
+ Example
+ -------
+ >>> agentic_sam2_instance_segmentation("a large blue flower", image)
+ [
+ {
+ 'score': 0.49,
+ 'label': 'a large blue flower',
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
+ [0, 0, 0, ..., 0, 0, 0],
+ ...,
+ [0, 0, 0, ..., 0, 0, 0],
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+ },
+ ]
+ """
+
+ od_ret = _agentic_object_detection(prompt, image, image.shape[:2])
+ seg_ret = _sam2(
+ image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
+ )
+
+ _display_tool_trace(
+ agentic_sam2_instance_segmentation.__name__,
+ {
+ "prompts": prompt,
+ },
+ seg_ret["display_data"],
+ seg_ret["files"],
+ )
+
+ return seg_ret["return_data"] # type: ignore
+
+
+ def agentic_sam2_video_tracking(
+ prompt: str,
+ frames: List[np.ndarray],
+ chunk_length: Optional[int] = 25,
+ fine_tune_id: Optional[str] = None,
+ ) -> List[List[Dict[str, Any]]]:
+ """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
+ objects in a video given a text prompt such as object names or referring
+ expressions. It's particularly good at detecting specific objects given detailed
+ descriptive prompts and returns a list of bounding boxes, label names, masks and
+ associated probability scores and is useful for tracking and counting without
+ duplicating counts.
+
+ Parameters:
+ prompt (str): The prompt to ground to the image, only supports a single prompt
+ with no commas or periods.
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
+ chunk_length (Optional[int]): The number of frames to re-run agentic object detection to
+ to find new objects.
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+ fine-tuned model ID here to use it.
+
+ Returns:
+ List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+ label, segmentation mask and bounding boxes. The outer list represents each
+ frame and the inner list is the entities per frame. The detected objects
+ have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+ and ymin are the coordinates of the top-left and xmax and ymax are the
+ coordinates of the bottom-right of the bounding box. The mask is binary 2D
+ numpy array where 1 indicates the object and 0 indicates the background.
+ The label names are prefixed with their ID represent the total count.
+
+ Example
+ -------
+ >>> agentic_sam2_video_tracking("a runner with yellow shoes", frames)
+ [
+ [
+ {
+ 'label': '0: a runner with yellow shoes',
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
+ [0, 0, 0, ..., 0, 0, 0],
+ ...,
+ [0, 0, 0, ..., 0, 0, 0],
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+ },
+ ],
+ ...
+ ]
+ """
+
+ ret = od_sam2_video_tracking(
+ ODModels.AGENTIC,
+ prompt=prompt,
+ frames=frames,
+ chunk_length=chunk_length,
+ fine_tune_id=fine_tune_id,
+ )
+ _display_tool_trace(
+ agentic_sam2_video_tracking.__name__,
+ {},
+ ret["display_data"],
+ ret["files"],
+ )
+ return ret["return_data"] # type: ignore
+
+
  def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
  """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
  images including regular images or images of documents or presentations. It can be
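
For orientation, a minimal sketch of calling the relocated agentic tools on a single image. The cv2 read and BGR-to-RGB conversion are assumptions about how the caller obtains an np.ndarray; the functions themselves are imported from the module shown in this diff:

import cv2
from vision_agent.tools.tools import (
    agentic_object_detection,
    agentic_sam2_instance_segmentation,
)

# Load an image as an RGB numpy array (assumed input format).
image = cv2.cvtColor(cv2.imread("street.jpg"), cv2.COLOR_BGR2RGB)
# Single referring expression only, no commas or periods, per the docstrings above.
boxes = agentic_object_detection("the red car closest to the camera", image)
masks = agentic_sam2_instance_segmentation("the red car closest to the camera", image)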
@@ -1696,85 +1959,127 @@ Answer the question directly using only the information from the document, do no
  return llm_output


- def video_temporal_localization(
+ def _sample(frames: List[np.ndarray], sample_size: int) -> List[np.ndarray]:
+ sample_indices = np.linspace(0, len(frames) - 1, sample_size, dtype=int)
+ sampled_frames = []
+
+ for i, frame in enumerate(frames):
+ if i in sample_indices:
+ sampled_frames.append(frame)
+ if len(sampled_frames) >= sample_size:
+ break
+ return sampled_frames
+
+
+ def activity_recognition(
  prompt: str,
  frames: List[np.ndarray],
  model: str = "qwen2vl",
- chunk_length_frames: int = 2,
+ chunk_length_frames: int = 10,
  ) -> List[float]:
- """'video_temporal_localization' will run qwen2vl on each chunk_length_frames
- value selected for the video. It can detect multiple objects independently per
- chunk_length_frames given a text prompt such as a referring expression
- but does not track objects across frames.
- It returns a list of floats with a value of 1.0 if the objects are found in a given
- chunk_length_frames of the video.
+ """'activity_recognition' is a tool that can recognize activities in a video given a
+ text prompt. It can be used to identify where specific activities or actions
+ happen in a video and returns a list of 0s and 1s to indicate the activity.

  Parameters:
- prompt (str): The question about the video
+ prompt (str): The event you want to identify, should be phrased as a question,
+ for example, "Did a goal happen?".
  frames (List[np.ndarray]): The reference frames used for the question
  model (str): The model to use for the inference. Valid values are
- 'qwen2vl', 'gpt4o'.
+ 'claude-35', 'gpt-4o', 'qwen2vl'.
  chunk_length_frames (int): length of each chunk in frames

  Returns:
- List[float]: A list of floats with a value of 1.0 if the objects to be found
- are present in the chunk_length_frames of the video.
+ List[float]: A list of floats with a value of 1.0 if the activity is detected in
+ the chunk_length_frames of the video.

  Example
  -------
- >>> video_temporal_localization('Did a goal happened?', frames)
+ >>> activity_recognition('Did a goal happened?', frames)
  [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
  """

  buffer_bytes = frames_to_bytes(frames)
  files = [("video", buffer_bytes)]
- payload: Dict[str, Any] = {
- "prompt": prompt,
- "model": model,
- "function_name": "video_temporal_localization",
- }
- payload["chunk_length_frames"] = chunk_length_frames

- segments = split_frames_into_segments(frames, segment_size=50, overlap=0)
+ segments = split_frames_into_segments(
+ frames, segment_size=chunk_length_frames, overlap=0
+ )
+
+ prompt = (
+ f"{prompt} Please respond with a 'yes' or 'no' based on the frames provided."
+ )

- def _apply_temporal_localization(
+ def _lmm_activity_recognition(
+ lmm: LMM,
  segment: List[np.ndarray],
  ) -> List[float]:
+ frames = _sample(segment, 10)
+ media = []
+ for frame in frames:
+ buffer = io.BytesIO()
+ image_pil = Image.fromarray(frame)
+ if image_pil.size[0] > 768:
+ image_pil.thumbnail((768, 768))
+ image_pil.save(buffer, format="PNG")
+ image_bytes = buffer.getvalue()
+ image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+ media.append(image_b64)
+
+ response = cast(str, lmm.generate(prompt, media))
+ if "yes" in response.lower():
+ return [1.0] * len(segment)
+ return [0.0] * len(segment)
+
+ def _qwen2vl_activity_recognition(segment: List[np.ndarray]) -> List[float]:
+ payload: Dict[str, Any] = {
+ "prompt": prompt,
+ "model": "qwen2vl",
+ "function_name": "qwen2_vl_video_vqa",
+ }
  segment_buffer_bytes = [("video", frames_to_bytes(segment))]
- data = send_inference_request(
- payload, "video-temporal-localization", files=segment_buffer_bytes, v2=True
+ response = send_inference_request(
+ payload, "image-to-text", files=segment_buffer_bytes, v2=True
  )
- chunked_data = [cast(float, value) for value in data]
+ if "yes" in response.lower():
+ return [1.0] * len(segment)
+ return [0.0] * len(segment)
+
+ if model == "claude-35":
+
+ def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
+ return _lmm_activity_recognition(AnthropicLMM(), segment)
+
+ elif model == "gpt-4o":

- full_data = []
- for value in chunked_data:
- full_data.extend([value] * chunk_length_frames)
+ def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
+ return _lmm_activity_recognition(OpenAILMM(), segment)

- return full_data[: len(segment)]
+ elif model == "qwen2vl":
+ _apply_activity_recognition = _qwen2vl_activity_recognition
+ else:
+ raise ValueError(f"Invalid model: {model}")

  with ThreadPoolExecutor() as executor:
  futures = {
- executor.submit(_apply_temporal_localization, segment): segment_index
+ executor.submit(_apply_activity_recognition, segment): segment_index
  for segment_index, segment in enumerate(segments)
  }

- localization_per_segment = []
+ return_value_tuples = []
  for future in as_completed(futures):
  segment_index = futures[future]
- localization_per_segment.append((segment_index, future.result()))
-
- localization_per_segment = [
- x[1] for x in sorted(localization_per_segment, key=lambda x: x[0]) # type: ignore
- ]
- localizations = cast(List[float], [e for o in localization_per_segment for e in o])
+ return_value_tuples.append((segment_index, future.result()))
+ return_values = [x[1] for x in sorted(return_value_tuples, key=lambda x: x[0])]
+ return_values_flattened = cast(List[float], [e for o in return_values for e in o])

  _display_tool_trace(
- video_temporal_localization.__name__,
- payload,
- localization_per_segment,
+ activity_recognition.__name__,
+ {"prompt": prompt, "model": model},
+ return_values,
  files,
  )
- return localizations
+ return return_values_flattened


  def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
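
A hedged usage sketch of the new activity_recognition tool, which replaces video_temporal_localization in this release. The placeholder frames are an assumption; in practice they would come from a decoded video as a List[np.ndarray]:

import numpy as np
from vision_agent.tools.tools import activity_recognition

# Placeholder frames for illustration; substitute real RGB video frames.
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(30)]
# One float per frame: 1.0 where the queried event is detected, 0.0 elsewhere.
flags = activity_recognition(
    "Did a goal happen?",   # phrased as a question, per the docstring
    frames,
    model="claude-35",      # or "gpt-4o" / "qwen2vl"
    chunk_length_frames=10,
)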
@@ -2200,242 +2505,6 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
  return response


- # Agentic OD Tools
-
-
- def _agentic_object_detection(
- prompt: str,
- image: np.ndarray,
- image_size: Tuple[int, ...],
- image_bytes: Optional[bytes] = None,
- fine_tune_id: Optional[str] = None,
- ) -> Dict[str, Any]:
- if image_bytes is None:
- image_bytes = numpy_to_bytes(image)
-
- files = [("image", image_bytes)]
- payload = {
- "prompts": [s.strip() for s in prompt.split(",")],
- "model": "agentic",
- }
- metadata = {"function_name": "agentic_object_detection"}
-
- if fine_tune_id is not None:
- landing_api = LandingPublicAPI()
- status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
- if status is not JobStatus.SUCCEEDED:
- raise FineTuneModelIsNotReady(
- f"Fine-tuned model {fine_tune_id} is not ready yet"
- )
-
- # we can only execute fine-tuned models with florence2
- payload = {
- "prompts": payload["prompts"],
- "jobId": fine_tune_id,
- "model": "florence2",
- }
-
- detections = send_task_inference_request(
- payload,
- "text-to-object-detection",
- files=files,
- metadata=metadata,
- )
-
- # get the first frame
- bboxes = detections[0]
- bboxes_formatted = [
- {
- "label": bbox["label"],
- "bbox": normalize_bbox(bbox["bounding_box"], image_size),
- "score": bbox["score"],
- }
- for bbox in bboxes
- ]
- display_data = [
- {
- "label": bbox["label"],
- "bbox": bbox["bounding_box"],
- "score": bbox["score"],
- }
- for bbox in bboxes
- ]
- return {
- "files": files,
- "return_data": bboxes_formatted,
- "display_data": display_data,
- }
-
-
- def agentic_object_detection(
- prompt: str,
- image: np.ndarray,
- fine_tune_id: Optional[str] = None,
- ) -> List[Dict[str, Any]]:
- """'agentic_object_detection' is a tool that can detect and count multiple objects
- given a text prompt such as category names or referring expressions on images. The
- categories in text prompt are separated by commas. It returns a list of bounding
- boxes with normalized coordinates, label names and associated probability scores.
-
- Parameters:
- prompt (str): The prompt to ground to the image.
- image (np.ndarray): The image to ground the prompt to.
- fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
- fine-tuned model ID here to use it.
-
- Returns:
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
- bounding box of the detected objects with normalized coordinates between 0
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
- top-left and xmax and ymax are the coordinates of the bottom-right of the
- bounding box.
-
- Example
- -------
- >>> agentic_object_detection("car", image)
- [
- {'score': 0.99, 'label': 'car', 'bbox': [0.1, 0.11, 0.35, 0.4]},
- {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
- ]
- """
-
- image_size = image.shape[:2]
- if image_size[0] < 1 or image_size[1] < 1:
- return []
-
- ret = _agentic_object_detection(
- prompt, image, image_size, fine_tune_id=fine_tune_id
- )
-
- _display_tool_trace(
- agentic_object_detection.__name__,
- {"prompts": prompt},
- ret["display_data"],
- ret["files"],
- )
- return ret["return_data"] # type: ignore
-
-
- def agentic_sam2_instance_segmentation(
- prompt: str, image: np.ndarray
- ) -> List[Dict[str, Any]]:
- """'agentic_sam2_instance_segmentation' is a tool that can detect and count multiple
- instances of objects given a text prompt such as category names or referring
- expressions on images. The categories in text prompt are separated by commas. It
- returns a list of bounding boxes with normalized coordinates, label names, masks
- and associated probability scores.
-
- Parameters:
- prompt (str): The object that needs to be counted.
- image (np.ndarray): The image that contains multiple instances of the object.
-
- Returns:
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
- bounding box, and mask of the detected objects with normalized coordinates
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
- the background.
-
- Example
- -------
- >>> agentic_sam2_instance_segmentation("flower", image)
- [
- {
- 'score': 0.49,
- 'label': 'flower',
- 'bbox': [0.1, 0.11, 0.35, 0.4],
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
- [0, 0, 0, ..., 0, 0, 0],
- ...,
- [0, 0, 0, ..., 0, 0, 0],
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
- },
- ]
- """
-
- od_ret = _agentic_object_detection(prompt, image, image.shape[:2])
- seg_ret = _sam2(
- image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
- )
-
- _display_tool_trace(
- agentic_sam2_instance_segmentation.__name__,
- {
- "prompts": prompt,
- },
- seg_ret["display_data"],
- seg_ret["files"],
- )
-
- return seg_ret["return_data"] # type: ignore
-
-
- def agentic_sam2_video_tracking(
- prompt: str,
- frames: List[np.ndarray],
- chunk_length: Optional[int] = 25,
- fine_tune_id: Optional[str] = None,
- ) -> List[List[Dict[str, Any]]]:
- """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
- objects in a video given a text prompt such as category names or referring
- expressions. The categories in the text prompt are separated by commas. It returns
- a list of bounding boxes, label names, masks and associated probability scores and
- is useful for tracking and counting without duplicating counts.
-
- Parameters:
- prompt (str): The prompt to ground to the image.
- frames (List[np.ndarray]): The list of frames to ground the prompt to.
- chunk_length (Optional[int]): The number of frames to re-run agentic object detection to
- to find new objects.
- fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
- fine-tuned model ID here to use it.
-
- Returns:
- List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
- label, segmentation mask and bounding boxes. The outer list represents each
- frame and the inner list is the entities per frame. The detected objects
- have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
- and ymin are the coordinates of the top-left and xmax and ymax are the
- coordinates of the bottom-right of the bounding box. The mask is binary 2D
- numpy array where 1 indicates the object and 0 indicates the background.
- The label names are prefixed with their ID represent the total count.
-
- Example
- -------
- >>> agentic_sam2_video_tracking("dinosaur", frames)
- [
- [
- {
- 'label': '0: dinosaur',
- 'bbox': [0.1, 0.11, 0.35, 0.4],
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
- [0, 0, 0, ..., 0, 0, 0],
- ...,
- [0, 0, 0, ..., 0, 0, 0],
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
- },
- ],
- ...
- ]
- """
-
- ret = od_sam2_video_tracking(
- ODModels.AGENTIC,
- prompt=prompt,
- frames=frames,
- chunk_length=chunk_length,
- fine_tune_id=fine_tune_id,
- )
- _display_tool_trace(
- agentic_sam2_video_tracking.__name__,
- {},
- ret["display_data"],
- ret["files"],
- )
- return ret["return_data"] # type: ignore
-
-
  def minimum_distance(
  det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
  ) -> float:
@@ -3103,19 +3172,19 @@ FUNCTION_TOOLS = [
  countgd_sam2_instance_segmentation,
  countgd_sam2_video_tracking,
  florence2_ocr,
+ florence2_object_detection,
  florence2_sam2_instance_segmentation,
  florence2_sam2_video_tracking,
- florence2_object_detection,
  claude35_text_extraction,
  document_extraction,
  document_qa,
  ocr,
  qwen2_vl_images_vqa,
  qwen2_vl_video_vqa,
+ activity_recognition,
  depth_anything_v2,
  generate_pose_image,
  vit_nsfw_classification,
- video_temporal_localization,
  flux_image_inpainting,
  siglip_classification,
  minimum_distance,
@@ -3129,13 +3198,30 @@ UTIL_TOOLS = [
  save_video,
  overlay_bounding_boxes,
  overlay_segmentation_masks,
- overlay_heat_map,
  ]

  TOOLS = FUNCTION_TOOLS + UTIL_TOOLS

- TOOLS_DF = get_tools_df(TOOLS) # type: ignore
- TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore
- TOOL_DOCSTRING = get_tool_documentation(TOOLS) # type: ignore
- TOOLS_INFO = get_tools_info(FUNCTION_TOOLS) # type: ignore
- UTILITIES_DOCSTRING = get_tool_documentation(UTIL_TOOLS) # type: ignore
+
+ def get_tools() -> List[Callable]:
+ return TOOLS # type: ignore
+
+
+ def get_tools_info() -> Dict[str, str]:
+ return _get_tools_info(FUNCTION_TOOLS) # type: ignore
+
+
+ def get_tools_df() -> pd.DataFrame:
+ return _get_tools_df(TOOLS) # type: ignore
+
+
+ def get_tools_descriptions() -> str:
+ return _get_tool_descriptions(TOOLS) # type: ignore
+
+
+ def get_tools_docstring() -> str:
+ return _get_tool_documentation(TOOLS) # type: ignore
+
+
+ def get_utilties_docstring() -> str:
+ return _get_tool_documentation(UTIL_TOOLS) # type: ignore
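
A hedged sketch of how code that previously read the module-level constants (TOOLS_DF, TOOL_DESCRIPTIONS, and so on, removed above) might switch to the new accessor functions; whether these are also re-exported from the vision_agent.tools package is an assumption, so the module path shown here is the file in this diff:

# 0.2.236: constants computed eagerly at import time.
# from vision_agent.tools.tools import TOOLS_DF, TOOL_DESCRIPTIONS

# 0.2.237: call the accessors when the data is actually needed.
from vision_agent.tools.tools import get_tools_df, get_tools_descriptions

tools_df = get_tools_df()                 # pandas DataFrame describing every tool
descriptions = get_tools_descriptions()   # single string of tool descriptions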