vision-agent 0.2.10__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- vision_agent/agent/__init__.py +2 -0
- vision_agent/agent/agent_coder.py +196 -0
- vision_agent/agent/agent_coder_prompts.py +135 -0
- vision_agent/agent/vision_agent.py +46 -30
- vision_agent/agent/vision_agent_prompts.py +3 -3
- vision_agent/agent/vision_agent_v2.py +396 -0
- vision_agent/agent/vision_agent_v2_prompt.py +185 -0
- vision_agent/llm/llm.py +12 -4
- vision_agent/tools/__init__.py +3 -1
- vision_agent/tools/tool_utils.py +30 -0
- vision_agent/tools/tools.py +157 -79
- vision_agent/tools/tools_v2.py +442 -0
- vision_agent/utils/__init__.py +3 -0
- vision_agent/utils/execute.py +104 -0
- vision_agent/utils/sim.py +85 -0
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/METADATA +7 -3
- vision_agent-0.2.22.dist-info/RECORD +34 -0
- vision_agent-0.2.10.dist-info/RECORD +0 -25
- /vision_agent/{image_utils.py → utils/image_utils.py} +0 -0
- /vision_agent/{type_defs.py → utils/type_defs.py} +0 -0
- /vision_agent/{tools → utils}/video.py +0 -0
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -9,8 +9,12 @@ import numpy as np
 import requests
 from PIL import Image
 from PIL.Image import Image as ImageType
+from scipy.spatial import distance  # type: ignore
 
-from vision_agent.image_utils import (
+from vision_agent.lmm import OpenAILMM
+from vision_agent.tools.tool_utils import _send_inference_request
+from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.image_utils import (
     b64_to_pil,
     convert_to_b64,
     denormalize_bbox,
@@ -18,13 +22,8 @@ from vision_agent.image_utils import (
     normalize_bbox,
     rle_decode,
 )
-from vision_agent.lmm import OpenAILMM
-from vision_agent.tools.video import extract_frames_from_video
-from vision_agent.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
-_LND_API_KEY = LandingaiAPIKey().api_key
-_LND_API_URL = "https://api.dev.landing.ai/v1/agent"
 
 
 class Tool(ABC):
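These two import hunks track the package reorganization visible in the file list above: image_utils.py, type_defs.py, and video.py now live under vision_agent/utils, and the inference plumbing (including the LandingAI API key and URL globals removed here) moves into the new vision_agent/tools/tool_utils.py. A minimal sketch of the new import surface, assuming only the moves shown in this diff:

    # 0.2.22 import paths (0.2.10 equivalents noted in comments)
    from vision_agent.tools.tool_utils import _send_inference_request  # was defined inline in tools.py
    from vision_agent.utils import extract_frames_from_video  # was vision_agent.tools.video
    from vision_agent.utils.image_utils import convert_to_b64, denormalize_bbox  # was vision_agent.image_utils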
@@ -175,15 +174,15 @@ class GroundingDINO(Tool):
     """
 
     name = "grounding_dino_"
-    description = "'grounding_dino_' is a tool that can detect and count objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
+    description = "'grounding_dino_' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
         "optional_parameters": [
-            {"name": "box_threshold", "type": "float"},
-            {"name": "iou_threshold", "type": "float"},
+            {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
+            {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
         ],
         "examples": [
             {
@@ -210,7 +209,7 @@ class GroundingDINO(Tool):
                     "prompt": "red shirt. green shirt",
                     "image": "shirts.jpg",
                     "box_threshold": 0.20,
-                    "iou_threshold": 0.
+                    "iou_threshold": 0.20,
                 },
             },
         ],
@@ -222,7 +221,7 @@ class GroundingDINO(Tool):
         prompt: str,
         image: Union[str, Path, ImageType],
         box_threshold: float = 0.20,
-        iou_threshold: float = 0.
+        iou_threshold: float = 0.20,
     ) -> Dict:
         """Invoke the Grounding DINO model.
 
@@ -250,7 +249,7 @@ class GroundingDINO(Tool):
             data["scores"] = [round(score, 2) for score in data["scores"]]
         if "labels" in data:
             data["labels"] = list(data["labels"])
-        data["
+        data["image_size"] = image_size
         return data
 
 
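Net effect for GroundingDINO: the default iou_threshold becomes 0.20, the usage metadata now advertises valid ranges for both thresholds (box_threshold 0.1-0.5, iou_threshold 0.01-0.99), and every response carries the source image size. A doctest-style sketch built from the usage example above; the actual boxes and scores depend on the hosted model:

    >>> import vision_agent as va
    >>> grounding_dino = va.tools.GroundingDINO()
    >>> out = grounding_dino(prompt="red shirt. green shirt", image="shirts.jpg", box_threshold=0.20, iou_threshold=0.20)
    >>> out["image_size"]  # newly attached alongside the bboxes, labels and scores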
@@ -278,15 +277,15 @@ class GroundingSAM(Tool):
     """
 
     name = "grounding_sam_"
-    description = "'grounding_sam_' is a tool that can detect and segment objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
+    description = "'grounding_sam_' is a tool that can detect and segment multiple objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
         "optional_parameters": [
-            {"name": "box_threshold", "type": "float"},
-            {"name": "iou_threshold", "type": "float"},
+            {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
+            {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
         ],
         "examples": [
             {
@@ -313,7 +312,7 @@ class GroundingSAM(Tool):
                     "prompt": "red shirt, green shirt",
                     "image": "shirts.jpg",
                     "box_threshold": 0.20,
-                    "iou_threshold": 0.
+                    "iou_threshold": 0.20,
                 },
             },
         ],
@@ -325,7 +324,7 @@ class GroundingSAM(Tool):
         prompt: str,
         image: Union[str, ImageType],
         box_threshold: float = 0.2,
-        iou_threshold: float = 0.
+        iou_threshold: float = 0.2,
     ) -> Dict:
         """Invoke the Grounding SAM model.
 
@@ -354,6 +353,7 @@ class GroundingSAM(Tool):
             rle_decode(mask_rle=mask, shape=data["mask_shape"])
             for mask in data["masks"]
         ]
+        data["image_size"] = image_size
         data.pop("mask_shape", None)
         return data
 
@@ -423,7 +423,6 @@ class DINOv(Tool):
         request_data = {
             "prompt": prompt,
             "image": image_b64,
-            "tool": "dinov",
         }
         data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
         if "bboxes" in data:
@@ -436,6 +435,8 @@ class DINOv(Tool):
             for mask in data["masks"]
         ]
         data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
+        mask_shape = data.pop("mask_shape", None)
+        data["image_size"] = (mask_shape[0], mask_shape[1]) if mask_shape else None
         return data
 
 
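The same bookkeeping lands in all three grounding tools: GroundingDINO and GroundingSAM attach image_size directly, while DINOv recovers it from the popped mask_shape and falls back to None when the response carries no masks. Judging by the (mask_shape[0], mask_shape[1]) construction here and the unpacking in BboxStats below, the tuple reads as (height, width), though that ordering is inferred rather than stated in the diff:

    >>> out = va.tools.GroundingSAM()(prompt="red shirt, green shirt", image="shirts.jpg")
    >>> out["image_size"]  # (height, width); consumed by the pixel-space tools added below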
@@ -544,7 +545,7 @@ class VisualPromptCounting(Tool):
     -------
     >>> import vision_agent as va
     >>> prompt_count = va.tools.VisualPromptCounting()
-    >>> prompt_count(image="image1.jpg", prompt="0.1, 0.1, 0.4, 0.42")
+    >>> prompt_count(image="image1.jpg", prompt={"bbox": [0.1, 0.1, 0.4, 0.42]})
     {'count': 23}
     """
@@ -554,52 +555,60 @@ class VisualPromptCounting(Tool):
     usage = {
         "required_parameters": [
             {"name": "image", "type": "str"},
-            {"name": "prompt", "type": "str"},
+            {"name": "prompt", "type": "Dict[str, List[float]"},
         ],
         "examples": [
             {
                 "scenario": "Here is an example of a lid '0.1, 0.1, 0.14, 0.2', Can you count the items in the image ? Image name: lids.jpg",
-                "parameters": {
+                "parameters": {
+                    "image": "lids.jpg",
+                    "prompt": {"bbox": [0.1, 0.1, 0.14, 0.2]},
+                },
             },
             {
-                "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
-                "parameters": {
+                "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg, reference_data: {'bbox': [0.1, 0.1, 0.2, 0.25]}",
+                "parameters": {
+                    "image": "tray.jpg",
+                    "prompt": {"bbox": [0.1, 0.1, 0.2, 0.25]},
+                },
             },
             {
-                "scenario": "Can you count this item based on an example, reference_data: '
+                "scenario": "Can you count this item based on an example, reference_data: {'bbox': [100, 115, 200, 200]} ? Image name: shirts.jpg",
                 "parameters": {
                     "image": "shirts.jpg",
-                    "prompt": "
+                    "prompt": {"bbox": [100, 115, 200, 200]},
                 },
             },
             {
-                "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg",
+                "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg, reference_data: {'bbox': [0.1, 0.1, 0.6, 0.65]}",
                 "parameters": {
                     "image": "shoes.jpg",
-                    "prompt": "0.1, 0.1, 0.6, 0.65
+                    "prompt": {"bbox": [0.1, 0.1, 0.6, 0.65]},
                 },
             },
         ],
     }
 
-
-
+    def __call__(
+        self, image: Union[str, ImageType], prompt: Dict[str, List[float]]
+    ) -> Dict:
         """Invoke the few shot counting model.
 
         Parameters:
             image: the input image.
+            prompt: the visual prompt which is a bounding box describing the object.
 
         Returns:
             A dictionary containing the key 'count' and the count as value. E.g. {count: 12}
         """
         image_size = get_image_size(image)
-        bbox = [
-
+        bbox = prompt["bbox"]
+        bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
         image_b64 = convert_to_b64(image)
 
         data = {
             "image": image_b64,
-            "prompt":
+            "prompt": bbox_str,
             "tool": "few_shot_counting",
         }
         resp_data = _send_inference_request(data, "tools")
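VisualPromptCounting's visual prompt is now a structured {'bbox': [...]} dict rather than a comma-separated string; internally the normalized box is denormalized against the image size and re-serialized just before the request is sent. The updated docstring example, restated as a doctest sketch:

    >>> import vision_agent as va
    >>> prompt_count = va.tools.VisualPromptCounting()
    >>> prompt_count(image="image1.jpg", prompt={"bbox": [0.1, 0.1, 0.4, 0.42]})
    {'count': 23}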
@@ -783,33 +792,49 @@ class Crop(Tool):
         return {"image": tmp.name}
 
 
-class 
-    r"""
+class BboxStats(Tool):
+    r"""BboxStats returns the height, width and area of the bounding box in pixels to 2 decimal places."""
 
-    name = "
-    description = "'
+    name = "bbox_stats_"
+    description = "'bbox_stats_' returns the height, width and area of the given bounding box in pixels to 2 decimal places."
     usage = {
-        "required_parameters": [
+        "required_parameters": [
+            {"name": "bboxes", "type": "List[int]"},
+            {"name": "image_size", "type": "Tuple[int]"},
+        ],
         "examples": [
             {
-                "scenario": "
-                "parameters": {
-
+                "scenario": "Calculate the width and height of the bounding box [0.2, 0.21, 0.34, 0.42]",
+                "parameters": {
+                    "bboxes": [[0.2, 0.21, 0.34, 0.42]],
+                    "image_size": (500, 1200),
+                },
+            },
+            {
+                "scenario": "Calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
+                "parameters": {
+                    "bboxes": [[0.2, 0.21, 0.34, 0.42]],
+                    "image_size": (640, 480),
+                },
+            },
         ],
     }
 
-    def __call__(
+    def __call__(
+        self, bboxes: List[List[int]], image_size: Tuple[int, int]
+    ) -> List[Dict]:
         areas = []
-
-
-
-
-
-
-
-
-
-
+        height, width = image_size
+        for bbox in bboxes:
+            x1, y1, x2, y2 = bbox
+            areas.append(
+                {
+                    "width": round((x2 - x1) * width, 2),
+                    "height": round((y2 - y1) * height, 2),
+                    "area": round((x2 - x1) * (y2 - y1) * width * height, 2),
+                }
+            )
+
         return areas
 
 
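BboxStats unpacks image_size as (height, width) and scales the normalized corners to pixels. For the first usage example, the box [0.2, 0.21, 0.34, 0.42] in a (500, 1200) image gives width (0.34 - 0.2) * 1200 = 168.0, height (0.42 - 0.21) * 500 = 105.0, and area 168.0 * 105.0 = 17640.0:

    >>> import vision_agent as va
    >>> bbox_stats = va.tools.BboxStats()
    >>> bbox_stats(bboxes=[[0.2, 0.21, 0.34, 0.42]], image_size=(500, 1200))
    [{'width': 168.0, 'height': 105.0, 'area': 17640.0}]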
@@ -878,7 +903,7 @@ class SegIoU(Tool):
         ],
         "examples": [
             {
-                "scenario": "
+                "scenario": "Calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg",
                 "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
             }
         ],
@@ -947,6 +972,46 @@ class BboxContains(Tool):
     }
 
 
+class ObjectDistance(Tool):
+    name = "object_distance_"
+    description = "'object_distance_' calculates the distance between two objects in an image. It returns the minimum distance between the two objects."
+    usage = {
+        "required_parameters": [
+            {"name": "object1", "type": "Dict[str, Any]"},
+            {"name": "object2", "type": "Dict[str, Any]"},
+        ],
+        "examples": [
+            {
+                "scenario": "Calculate the distance between these two objects {bboxes: [0.2, 0.21, 0.34, 0.42], masks: 'mask_file1.png'}, {bboxes: [0.3, 0.31, 0.44, 0.52], masks: 'mask_file2.png'}",
+                "parameters": {
+                    "object1": {
+                        "bboxes": [0.2, 0.21, 0.34, 0.42],
+                        "scores": 0.54,
+                        "masks": "mask_file1.png",
+                    },
+                    "object2": {
+                        "bboxes": [0.3, 0.31, 0.44, 0.52],
+                        "scores": 0.66,
+                        "masks": "mask_file2.png",
+                    },
+                },
+            }
+        ],
+    }
+
+    def __call__(self, object1: Dict[str, Any], object2: Dict[str, Any]) -> float:
+        if "masks" in object1 and "masks" in object2:
+            mask1 = object1["masks"]
+            mask2 = object2["masks"]
+            return MaskDistance()(mask1, mask2)
+        elif "bboxes" in object1 and "bboxes" in object2:
+            bbox1 = object1["bboxes"]
+            bbox2 = object2["bboxes"]
+            return BoxDistance()(bbox1, bbox2)
+        else:
+            raise ValueError("Either of the objects should have masks or bboxes")
+
+
 class BoxDistance(Tool):
     name = "box_distance_"
     description = "'box_distance_' calculates distance between two bounding boxes. It returns the minumum distance between the given bounding boxes"
@@ -957,7 +1022,7 @@ class BoxDistance(Tool):
         ],
         "examples": [
             {
-                "scenario": "Calculate the distance between
+                "scenario": "Calculate the distance between these two bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
                 "parameters": {
                     "bbox1": [0.2, 0.21, 0.34, 0.42],
                     "bbox2": [0.3, 0.31, 0.44, 0.52],
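ObjectDistance is a thin dispatcher over the two distance tools: when both detection dicts carry a masks entry it defers to the MaskDistance tool added in the next hunk, otherwise it falls back to BoxDistance on the bboxes, and it raises a ValueError when neither key is present on both objects. A sketch using the objects from the usage example (the mask files are assumed to exist on disk):

    >>> import vision_agent as va
    >>> obj1 = {"bboxes": [0.2, 0.21, 0.34, 0.42], "scores": 0.54, "masks": "mask_file1.png"}
    >>> obj2 = {"bboxes": [0.3, 0.31, 0.44, 0.52], "scores": 0.66, "masks": "mask_file2.png"}
    >>> va.tools.ObjectDistance()(obj1, obj2)  # both carry masks, so MaskDistance is used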
@@ -976,26 +1041,57 @@ class BoxDistance(Tool):
         return cast(float, round(np.sqrt(horizontal_dist**2 + vertical_dist**2), 2))
 
 
+class MaskDistance(Tool):
+    name = "mask_distance_"
+    description = "'mask_distance_' calculates distance between two masks. It is helpful in checking proximity of two objects. It returns the minumum distance between the given masks"
+    usage = {
+        "required_parameters": [
+            {"name": "mask1", "type": "str"},
+            {"name": "mask2", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Calculate the distance between the segmentation masks for mask_file1.jpg and mask_file2.jpg",
+                "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
+            }
+        ],
+    }
+
+    def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float:
+        pil_mask1 = Image.open(str(mask1))
+        pil_mask2 = Image.open(str(mask2))
+        np_mask1 = np.clip(np.array(pil_mask1), 0, 1)
+        np_mask2 = np.clip(np.array(pil_mask2), 0, 1)
+
+        mask1_points = np.transpose(np.nonzero(np_mask1))
+        mask2_points = np.transpose(np.nonzero(np_mask2))
+        dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean")
+        return cast(float, np.round(np.min(dist_matrix), 2))
+
+
 class ExtractFrames(Tool):
     r"""Extract frames from a video."""
 
     name = "extract_frames_"
-    description = "'extract_frames_' extracts frames from a video, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
+    description = "'extract_frames_' extracts frames from a video every 2 seconds, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
     usage = {
         "required_parameters": [{"name": "video_uri", "type": "str"}],
+        "optional_parameters": [{"name": "frames_every", "type": "float"}],
         "examples": [
             {
                 "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
                 "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
             },
             {
-                "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
-                "parameters": {"video_uri": "tests/data/test.mp4"},
+                "scenario": "Can you extract the images from this video file at every 2 seconds ? Video path: tests/data/test.mp4",
+                "parameters": {"video_uri": "tests/data/test.mp4", "frames_every": 2},
             },
         ],
     }
 
-    def __call__(
+    def __call__(
+        self, video_uri: str, frames_every: float = 2
+    ) -> List[Tuple[str, float]]:
         """Extract frames from a video.
 
 
@@ -1005,7 +1101,7 @@ class ExtractFrames(Tool):
         Returns:
             a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
         """
-        frames = extract_frames_from_video(video_uri)
+        frames = extract_frames_from_video(video_uri, fps=round(1 / frames_every, 2))
         result = []
         _LOGGER.info(
             f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
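The new frames_every parameter is translated into a sampling rate with fps=round(1 / frames_every, 2), so the default of one frame every 2 seconds becomes fps=0.5. A sketch using the local-path usage example:

    >>> import vision_agent as va
    >>> frames = va.tools.ExtractFrames()(video_uri="tests/data/test.mp4", frames_every=2)
    >>> frames[0]  # (path_to_frame, timestamp_in_seconds), sorted by timestamp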
@@ -1108,12 +1204,11 @@ TOOLS = {
         AgentDINOv,
         ExtractFrames,
         Crop,
-
+        BboxStats,
         SegArea,
-
-        SegIoU,
+        ObjectDistance,
         BboxContains,
-
+        SegIoU,
         OCR,
         Calculator,
     ]
@@ -1145,20 +1240,3 @@ def register_tool(tool: Type[Tool]) -> Type[Tool]:
         "class": tool,
     }
     return tool
-
-
-def _send_inference_request(
-    payload: Dict[str, Any], endpoint_name: str
-) -> Dict[str, Any]:
-    res = requests.post(
-        f"{_LND_API_URL}/model/{endpoint_name}",
-        headers={
-            "Content-Type": "application/json",
-            "apikey": _LND_API_KEY,
-        },
-        json=payload,
-    )
-    if res.status_code != 200:
-        _LOGGER.error(f"Request failed: {res.text}")
-        raise ValueError(f"Request failed: {res.text}")
-    return res.json()["data"]  # type: ignore