vision-agent 0.2.15__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +1 -1
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/tools.py +58 -36
- {vision_agent-0.2.15.dist-info → vision_agent-0.2.16.dist-info}/METADATA +1 -1
- {vision_agent-0.2.15.dist-info → vision_agent-0.2.16.dist-info}/RECORD +7 -7
- {vision_agent-0.2.15.dist-info → vision_agent-0.2.16.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.15.dist-info → vision_agent-0.2.16.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py CHANGED
```diff
@@ -308,7 +308,7 @@ def _handle_extract_frames(
         # any following processing
         for video_file_output in tool_result["call_results"]:
             # When the video tool is run with wrong parameters, exit the loop
-            if len(video_file_output) < 2:
+            if not isinstance(video_file_output, tuple) or len(video_file_output) < 2:
                 break
             for frame, _ in video_file_output:
                 image = frame
```
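The new `isinstance` guard matters because a failed video-tool call can surface as something other than a tuple of `(frame, timestamp)` pairs; a bare length check lets, say, an error string through, and the unpacking loop then crashes. A minimal sketch of the guard's effect (the `first_frames` helper and the sample values are made up for illustration):

```python
def first_frames(call_results):
    """Collect frame paths, skipping malformed video-tool results."""
    images = []
    for video_file_output in call_results:
        # When the video tool is run with wrong parameters, exit the loop.
        # An error string has len >= 2, so the old length check alone is not enough.
        if not isinstance(video_file_output, tuple) or len(video_file_output) < 2:
            break
        for frame, _ in video_file_output:
            images.append(frame)
    return images

good = (("frame_0.png", 0.0), ("frame_1.png", 2.0))  # tuple of (frame, timestamp) pairs
bad = "error: wrong video parameters"                # a failure result
print(first_frames([good]))  # ['frame_0.png', 'frame_1.png']
print(first_frames([bad]))   # []
```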
vision_agent/tools/__init__.py CHANGED
vision_agent/tools/tools.py CHANGED
```diff
@@ -174,15 +174,15 @@ class GroundingDINO(Tool):
     """

     name = "grounding_dino_"
-    description = "'grounding_dino_' is a tool that can detect and count objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
+    description = "'grounding_dino_' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
         "optional_parameters": [
-            {"name": "box_threshold", "type": "float"},
-            {"name": "iou_threshold", "type": "float"},
+            {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
+            {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
         ],
         "examples": [
             {
@@ -209,7 +209,7 @@ class GroundingDINO(Tool):
                     "prompt": "red shirt. green shirt",
                     "image": "shirts.jpg",
                     "box_threshold": 0.20,
-                    "iou_threshold": 0.
+                    "iou_threshold": 0.20,
                 },
             },
         ],
@@ -221,7 +221,7 @@ class GroundingDINO(Tool):
         prompt: str,
         image: Union[str, Path, ImageType],
         box_threshold: float = 0.20,
-        iou_threshold: float = 0.
+        iou_threshold: float = 0.20,
     ) -> Dict:
         """Invoke the Grounding DINO model.

@@ -249,7 +249,7 @@ class GroundingDINO(Tool):
         data["scores"] = [round(score, 2) for score in data["scores"]]
         if "labels" in data:
             data["labels"] = list(data["labels"])
-        data["
+        data["image_size"] = image_size
         return data

```
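Taken together, these hunks give `iou_threshold` a concrete default of 0.20, advertise value ranges for both thresholds, and attach the source image size to the returned dict. A hedged invocation sketch, assuming the `__call__` interface shown in the diff (the image path is made up):

```python
grounding_dino = GroundingDINO()
data = grounding_dino(
    prompt="red shirt. green shirt",
    image="shirts.jpg",      # made-up local path
    box_threshold=0.20,      # advertised range: 0.1-0.5
    iou_threshold=0.20,      # new default; advertised range: 0.01-0.99
)
# data now includes "image_size" alongside "bboxes", "labels" and "scores",
# so downstream tools (e.g. bbox_stats_ below) can map normalized boxes to pixels.
```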
```diff
@@ -277,15 +277,15 @@ class GroundingSAM(Tool):
     """

     name = "grounding_sam_"
-    description = "'grounding_sam_' is a tool that can detect and segment objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
+    description = "'grounding_sam_' is a tool that can detect and segment multiple objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
         "optional_parameters": [
-            {"name": "box_threshold", "type": "float"},
-            {"name": "iou_threshold", "type": "float"},
+            {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
+            {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
         ],
         "examples": [
             {
@@ -312,7 +312,7 @@ class GroundingSAM(Tool):
                     "prompt": "red shirt, green shirt",
                     "image": "shirts.jpg",
                     "box_threshold": 0.20,
-                    "iou_threshold": 0.
+                    "iou_threshold": 0.20,
                 },
             },
         ],
@@ -324,7 +324,7 @@ class GroundingSAM(Tool):
         prompt: str,
         image: Union[str, ImageType],
         box_threshold: float = 0.2,
-        iou_threshold: float = 0.
+        iou_threshold: float = 0.2,
     ) -> Dict:
         """Invoke the Grounding SAM model.

@@ -353,6 +353,7 @@ class GroundingSAM(Tool):
             rle_decode(mask_rle=mask, shape=data["mask_shape"])
             for mask in data["masks"]
         ]
+        data["image_size"] = image_size
         data.pop("mask_shape", None)
         return data
```
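`GroundingSAM` decodes run-length-encoded masks via `rle_decode` before `mask_shape` is dropped, and now records the image size in the same way as `GroundingDINO`. For context, a plausible reconstruction of such a decoder, assuming the common `"<start> <length> ..."` string encoding in column-major order (the package's actual helper may differ):

```python
import numpy as np

def rle_decode(mask_rle: str, shape: tuple) -> np.ndarray:
    """Decode an RLE string into a binary (height, width) mask."""
    tokens = mask_rle.split()
    starts = np.asarray(tokens[0::2], dtype=int) - 1  # RLE starts are 1-indexed
    lengths = np.asarray(tokens[1::2], dtype=int)
    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for start, length in zip(starts, lengths):
        flat[start : start + length] = 1
    return flat.reshape(shape, order="F")  # column-major, Kaggle-style RLE

mask = rle_decode("2 3 10 2", (4, 4))  # sets pixels 2-4 and 10-11 of the flat mask
```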
```diff
@@ -434,6 +435,8 @@ class DINOv(Tool):
             for mask in data["masks"]
         ]
         data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
+        mask_shape = data.pop("mask_shape", None)
+        data["image_size"] = (mask_shape[0], mask_shape[1]) if mask_shape else None
         return data

```
```diff
@@ -789,33 +792,49 @@ class Crop(Tool):
         return {"image": tmp.name}


-class
-    r"""
+class BboxStats(Tool):
+    r"""BboxStats returns the height, width and area of the bounding box in pixels to 2 decimal places."""

-    name = "
-    description = "'
+    name = "bbox_stats_"
+    description = "'bbox_stats_' returns the height, width and area of the given bounding box in pixels to 2 decimal places."
     usage = {
-        "required_parameters": [
+        "required_parameters": [
+            {"name": "bboxes", "type": "List[int]"},
+            {"name": "image_size", "type": "Tuple[int]"},
+        ],
         "examples": [
             {
-                "scenario": "
-                "parameters": {
-
+                "scenario": "Calculate the width and height of the bounding box [0.2, 0.21, 0.34, 0.42]",
+                "parameters": {
+                    "bboxes": [[0.2, 0.21, 0.34, 0.42]],
+                    "image_size": (500, 1200),
+                },
+            },
+            {
+                "scenario": "Calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
+                "parameters": {
+                    "bboxes": [[0.2, 0.21, 0.34, 0.42]],
+                    "image_size": (640, 480),
+                },
+            },
         ],
     }

-    def __call__(
+    def __call__(
+        self, bboxes: List[List[int]], image_size: Tuple[int, int]
+    ) -> List[Dict]:
         areas = []
-
-
-
-
-
-
-
-
-
-
+        height, width = image_size
+        for bbox in bboxes:
+            x1, y1, x2, y2 = bbox
+            areas.append(
+                {
+                    "width": round((x2 - x1) * width, 2),
+                    "height": round((y2 - y1) * height, 2),
+                    "area": round((x2 - x1) * (y2 - y1) * width * height, 2),
+                }
+            )
+
         return areas

```
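Since `bboxes` hold normalized `[x1, y1, x2, y2]` coordinates and `image_size` is `(height, width)`, the pixel stats come out of simple scaling. A usage sketch with the first example's parameters (the class is called directly here for illustration):

```python
stats = BboxStats()(bboxes=[[0.2, 0.21, 0.34, 0.42]], image_size=(500, 1200))
print(stats)  # [{'width': 168.0, 'height': 105.0, 'area': 17640.0}]
# width  = (0.34 - 0.20) * 1200 = 168.0 px
# height = (0.42 - 0.21) * 500  = 105.0 px
# area   = 168.0 * 105.0        = 17640.0 px^2
```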
```diff
@@ -1054,22 +1073,25 @@ class ExtractFrames(Tool):
     r"""Extract frames from a video."""

     name = "extract_frames_"
-    description = "'extract_frames_' extracts frames from a video, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
+    description = "'extract_frames_' extracts frames from a video every 2 seconds, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
     usage = {
         "required_parameters": [{"name": "video_uri", "type": "str"}],
+        "optional_parameters": [{"name": "frames_every", "type": "float"}],
         "examples": [
             {
                 "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
                 "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
             },
             {
-                "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
-                "parameters": {"video_uri": "tests/data/test.mp4"},
+                "scenario": "Can you extract the images from this video file at every 2 seconds ? Video path: tests/data/test.mp4",
+                "parameters": {"video_uri": "tests/data/test.mp4", "frames_every": 2},
             },
         ],
     }

-    def __call__(
+    def __call__(
+        self, video_uri: str, frames_every: float = 2
+    ) -> List[Tuple[str, float]]:
         """Extract frames from a video.

@@ -1079,7 +1101,7 @@ class ExtractFrames(Tool):
         Returns:
             a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
         """
-        frames = extract_frames_from_video(video_uri)
+        frames = extract_frames_from_video(video_uri, fps=round(1 / frames_every, 2))
         result = []
         _LOGGER.info(
             f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
```
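The new `frames_every` parameter is converted to a sampling rate before it reaches `extract_frames_from_video`, so an interval in seconds becomes frames per second:

```python
frames_every = 2                  # seconds between sampled frames (the new default)
fps = round(1 / frames_every, 2)  # 0.5 frames per second
# frames_every=0.5 would instead give fps=2.0 (two frames each second)
```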
```diff
@@ -1182,7 +1204,7 @@ TOOLS = {
         AgentDINOv,
         ExtractFrames,
         Crop,
-
+        BboxStats,
         SegArea,
         ObjectDistance,
         BboxContains,
```
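Registering `BboxStats` in `TOOLS` is what exposes the new tool to the agent. A sketch of how such a registry can map the listed classes to their advertised metadata, assuming each class carries the `name`/`description`/`usage` attributes shown above (the package's actual construction may differ):

```python
tool_classes = [ExtractFrames, Crop, BboxStats, SegArea]  # excerpt of the list
registry = {
    i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c}
    for i, c in enumerate(tool_classes)
}
```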
{vision_agent-0.2.15.dist-info → vision_agent-0.2.16.dist-info}/RECORD CHANGED
```diff
@@ -7,7 +7,7 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
 vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=ywOowbuwNSapVwl02ePZP_EzW1FlZULoCV59LR5nFww,27028
 vision_agent/agent/vision_agent_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
 vision_agent/agent/vision_agent_v2.py,sha256=CDgGBSoa2LoMS0b4JhyDkoS3PJJNmCCPfxIGUc4RfQg,9658
 vision_agent/agent/vision_agent_v2_prompt.py,sha256=-90Hlbtqb5Fp7OVjGabpTdgr-yCr8AYKIfiMRfoL4SY,5141
@@ -17,10 +17,10 @@ vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,
 vision_agent/llm/llm.py,sha256=qWDBpJolGLWNwDjpEXu1NrjlJbo7Fj9efJYkSfVn6oE,5784
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
 vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=p5SM0YhThSVO_jRF9O-OjH2fYDPv-iMjexDX9xPPb7M,452
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=moR7X4hkLKQzC56axdojo_OcIuVOv45bKcHPUVZrPvk,753
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=sVxN7SpDkz_XTc_SKwkoRF4EwaMTuHvTsCHwtR942Fc,47373
 vision_agent/tools/tools_v2.py,sha256=1Y_ZbYJyuo2eZZkq7jY3YfuKWC82C-GFCZMLYH-I5ew,13800
 vision_agent/utils/__init__.py,sha256=AKXf1QVOpO6MnqU8RSaFLQ_4us4DcKf8ibgEbhuHjvI,95
 vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
@@ -28,7 +28,7 @@ vision_agent/utils/image_utils.py,sha256=1dggPBhW8_hUXDItCRLa23h-hdBwS50cjL4v1hs
 vision_agent/utils/sim.py,sha256=FaD16kKL1-JR2aSCmznF9KkJux9u3_Nr9tF4smBeoK0,2327
 vision_agent/utils/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
 vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.16.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.16.dist-info/METADATA,sha256=gbDID2drbfeDyy0jHQYDQZN81zRet90-bAVQKTSVdC4,9121
+vision_agent-0.2.16.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.16.dist-info/RECORD,,
```
{vision_agent-0.2.15.dist-info → vision_agent-0.2.16.dist-info}/LICENSE: file without changes
{vision_agent-0.2.15.dist-info → vision_agent-0.2.16.dist-info}/WHEEL: file without changes