vision-agent 0.2.10__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,8 +9,12 @@ import numpy as np
  import requests
  from PIL import Image
  from PIL.Image import Image as ImageType
+ from scipy.spatial import distance  # type: ignore

- from vision_agent.image_utils import (
+ from vision_agent.lmm import OpenAILMM
+ from vision_agent.tools.tool_utils import _send_inference_request
+ from vision_agent.utils import extract_frames_from_video
+ from vision_agent.utils.image_utils import (
      b64_to_pil,
      convert_to_b64,
      denormalize_bbox,
@@ -18,13 +22,8 @@ from vision_agent.image_utils import (
      normalize_bbox,
      rle_decode,
  )
- from vision_agent.lmm import OpenAILMM
- from vision_agent.tools.video import extract_frames_from_video
- from vision_agent.type_defs import LandingaiAPIKey

  _LOGGER = logging.getLogger(__name__)
- _LND_API_KEY = LandingaiAPIKey().api_key
- _LND_API_URL = "https://api.dev.landing.ai/v1/agent"


  class Tool(ABC):
@@ -175,15 +174,15 @@ class GroundingDINO(Tool):
      """

      name = "grounding_dino_"
-     description = "'grounding_dino_' is a tool that can detect and count objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
+     description = "'grounding_dino_' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
      usage = {
          "required_parameters": [
              {"name": "prompt", "type": "str"},
              {"name": "image", "type": "str"},
          ],
          "optional_parameters": [
-             {"name": "box_threshold", "type": "float"},
-             {"name": "iou_threshold", "type": "float"},
+             {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
+             {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
          ],
          "examples": [
              {
@@ -210,7 +209,7 @@ class GroundingDINO(Tool):
                      "prompt": "red shirt. green shirt",
                      "image": "shirts.jpg",
                      "box_threshold": 0.20,
-                     "iou_threshold": 0.75,
+                     "iou_threshold": 0.20,
                  },
              },
          ],
@@ -222,7 +221,7 @@ class GroundingDINO(Tool):
          prompt: str,
          image: Union[str, Path, ImageType],
          box_threshold: float = 0.20,
-         iou_threshold: float = 0.75,
+         iou_threshold: float = 0.20,
      ) -> Dict:
          """Invoke the Grounding DINO model.

@@ -250,7 +249,7 @@ class GroundingDINO(Tool):
          data["scores"] = [round(score, 2) for score in data["scores"]]
          if "labels" in data:
              data["labels"] = list(data["labels"])
-         data["size"] = (image_size[1], image_size[0])
+         data["image_size"] = image_size
          return data


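In 0.2.22 the GroundingDINO tool advertises value ranges for both thresholds, lowers the `iou_threshold` default from 0.75 to 0.20, and reports the image dimensions under `"image_size"` instead of the reversed `"size"` tuple. A minimal usage sketch, assuming the tool class is exposed under `va.tools` the same way the package's own doctests show; the parameter names and output keys are the ones visible in the hunks above:

```python
import vision_agent as va

grounding_dino = va.tools.GroundingDINO()
detections = grounding_dino(
    prompt="red shirt. green shirt",
    image="shirts.jpg",
    box_threshold=0.20,  # documented range is now 0.1 to 0.5
    iou_threshold=0.20,  # default lowered from 0.75 to 0.20
)

# 0.2.22 exposes the tuple returned by get_image_size() directly under
# "image_size"; 0.2.10 stored the reversed tuple under "size".
print(detections["labels"], detections["scores"], detections["image_size"])
```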
@@ -278,15 +277,15 @@ class GroundingSAM(Tool):
      """

      name = "grounding_sam_"
-     description = "'grounding_sam_' is a tool that can detect and segment objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
+     description = "'grounding_sam_' is a tool that can detect and segment multiple objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
      usage = {
          "required_parameters": [
              {"name": "prompt", "type": "str"},
              {"name": "image", "type": "str"},
          ],
          "optional_parameters": [
-             {"name": "box_threshold", "type": "float"},
-             {"name": "iou_threshold", "type": "float"},
+             {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
+             {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
          ],
          "examples": [
              {
@@ -313,7 +312,7 @@ class GroundingSAM(Tool):
                      "prompt": "red shirt, green shirt",
                      "image": "shirts.jpg",
                      "box_threshold": 0.20,
-                     "iou_threshold": 0.75,
+                     "iou_threshold": 0.20,
                  },
              },
          ],
@@ -325,7 +324,7 @@ class GroundingSAM(Tool):
          prompt: str,
          image: Union[str, ImageType],
          box_threshold: float = 0.2,
-         iou_threshold: float = 0.75,
+         iou_threshold: float = 0.2,
      ) -> Dict:
          """Invoke the Grounding SAM model.

@@ -354,6 +353,7 @@ class GroundingSAM(Tool):
                  rle_decode(mask_rle=mask, shape=data["mask_shape"])
                  for mask in data["masks"]
              ]
+         data["image_size"] = image_size
          data.pop("mask_shape", None)
          return data

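GroundingSAM gains the same `"image_size"` output key, which makes it easy to map the returned boxes back to pixel coordinates. A sketch of that round trip, under the assumption that the returned bboxes are normalized (the module imports `normalize_bbox` and `denormalize_bbox` from `vision_agent.utils.image_utils` for exactly this conversion):

```python
import vision_agent as va
from vision_agent.utils.image_utils import denormalize_bbox

grounding_sam = va.tools.GroundingSAM()
result = grounding_sam(prompt="shirt", image="shirts.jpg")

# Convert the normalized boxes back to pixel coordinates using the new key.
pixel_bboxes = [
    denormalize_bbox(bbox, result["image_size"]) for bbox in result["bboxes"]
]
```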
@@ -423,7 +423,6 @@ class DINOv(Tool):
          request_data = {
              "prompt": prompt,
              "image": image_b64,
-             "tool": "dinov",
          }
          data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
          if "bboxes" in data:
@@ -436,6 +435,8 @@ class DINOv(Tool):
                  for mask in data["masks"]
              ]
          data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
+         mask_shape = data.pop("mask_shape", None)
+         data["image_size"] = (mask_shape[0], mask_shape[1]) if mask_shape else None
          return data


@@ -544,7 +545,7 @@ class VisualPromptCounting(Tool):
      -------
      >>> import vision_agent as va
      >>> prompt_count = va.tools.VisualPromptCounting()
-     >>> prompt_count(image="image1.jpg", prompt="0.1, 0.1, 0.4, 0.42")
+     >>> prompt_count(image="image1.jpg", prompt={"bbox": [0.1, 0.1, 0.4, 0.42]})
      {'count': 23}
      """

@@ -554,52 +555,60 @@ class VisualPromptCounting(Tool):
      usage = {
          "required_parameters": [
              {"name": "image", "type": "str"},
-             {"name": "prompt", "type": "str"},
+             {"name": "prompt", "type": "Dict[str, List[float]"},
          ],
          "examples": [
              {
                  "scenario": "Here is an example of a lid '0.1, 0.1, 0.14, 0.2', Can you count the items in the image ? Image name: lids.jpg",
-                 "parameters": {"image": "lids.jpg", "prompt": "0.1, 0.1, 0.14, 0.2"},
+                 "parameters": {
+                     "image": "lids.jpg",
+                     "prompt": {"bbox": [0.1, 0.1, 0.14, 0.2]},
+                 },
              },
              {
-                 "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
-                 "parameters": {"image": "tray.jpg", "prompt": "0.1, 0.1, 0.2, 0.25"},
+                 "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg, reference_data: {'bbox': [0.1, 0.1, 0.2, 0.25]}",
+                 "parameters": {
+                     "image": "tray.jpg",
+                     "prompt": {"bbox": [0.1, 0.1, 0.2, 0.25]},
+                 },
              },
              {
-                 "scenario": "Can you count this item based on an example, reference_data: '0.1, 0.15, 0.2, 0.2' ? Image name: shirts.jpg",
+                 "scenario": "Can you count this item based on an example, reference_data: {'bbox': [100, 115, 200, 200]} ? Image name: shirts.jpg",
                  "parameters": {
                      "image": "shirts.jpg",
-                     "prompt": "0.1, 0.15, 0.2, 0.2",
+                     "prompt": {"bbox": [100, 115, 200, 200]},
                  },
              },
              {
-                 "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg",
+                 "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg, reference_data: {'bbox': [0.1, 0.1, 0.6, 0.65]}",
                  "parameters": {
                      "image": "shoes.jpg",
-                     "prompt": "0.1, 0.1, 0.6, 0.65",
+                     "prompt": {"bbox": [0.1, 0.1, 0.6, 0.65]},
                  },
              },
          ],
      }

-     # TODO: Add support for input multiple images, which aligns with the output type.
-     def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
+     def __call__(
+         self, image: Union[str, ImageType], prompt: Dict[str, List[float]]
+     ) -> Dict:
          """Invoke the few shot counting model.

          Parameters:
              image: the input image.
+             prompt: the visual prompt which is a bounding box describing the object.

          Returns:
              A dictionary containing the key 'count' and the count as value. E.g. {count: 12}
          """
          image_size = get_image_size(image)
-         bbox = [float(x) for x in prompt.split(",")]
-         prompt = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
+         bbox = prompt["bbox"]
+         bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
          image_b64 = convert_to_b64(image)

          data = {
              "image": image_b64,
-             "prompt": prompt,
+             "prompt": bbox_str,
              "tool": "few_shot_counting",
          }
          resp_data = _send_inference_request(data, "tools")
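VisualPromptCounting's prompt changes from a comma-separated string of normalized coordinates to a dictionary with a `"bbox"` entry; `__call__` now reads `prompt["bbox"]` and builds the comma-separated string itself before calling the inference endpoint. A before/after sketch that mirrors the updated doctest:

```python
import vision_agent as va

prompt_count = va.tools.VisualPromptCounting()

# 0.2.10 style (no longer matches the signature):
# prompt_count(image="image1.jpg", prompt="0.1, 0.1, 0.4, 0.42")

# 0.2.22 style:
result = prompt_count(image="image1.jpg", prompt={"bbox": [0.1, 0.1, 0.4, 0.42]})
print(result)  # e.g. {'count': 23}
```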
@@ -783,33 +792,49 @@ class Crop(Tool):
          return {"image": tmp.name}


- class BboxArea(Tool):
-     r"""BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places."""
+ class BboxStats(Tool):
+     r"""BboxStats returns the height, width and area of the bounding box in pixels to 2 decimal places."""

-     name = "bbox_area_"
-     description = "'bbox_area_' returns the area of the given bounding box in pixels normalized to 2 decimal places."
+     name = "bbox_stats_"
+     description = "'bbox_stats_' returns the height, width and area of the given bounding box in pixels to 2 decimal places."
      usage = {
-         "required_parameters": [{"name": "bboxes", "type": "List[int]"}],
+         "required_parameters": [
+             {"name": "bboxes", "type": "List[int]"},
+             {"name": "image_size", "type": "Tuple[int]"},
+         ],
          "examples": [
              {
-                 "scenario": "If you want to calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
-                 "parameters": {"bboxes": [0.2, 0.21, 0.34, 0.42]},
-             }
+                 "scenario": "Calculate the width and height of the bounding box [0.2, 0.21, 0.34, 0.42]",
+                 "parameters": {
+                     "bboxes": [[0.2, 0.21, 0.34, 0.42]],
+                     "image_size": (500, 1200),
+                 },
+             },
+             {
+                 "scenario": "Calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
+                 "parameters": {
+                     "bboxes": [[0.2, 0.21, 0.34, 0.42]],
+                     "image_size": (640, 480),
+                 },
+             },
          ],
      }

-     def __call__(self, bboxes: List[Dict]) -> List[Dict]:
+     def __call__(
+         self, bboxes: List[List[int]], image_size: Tuple[int, int]
+     ) -> List[Dict]:
          areas = []
-         for elt in bboxes:
-             height, width = elt["size"]
-             for label, bbox in zip(elt["labels"], elt["bboxes"]):
-                 x1, y1, x2, y2 = bbox
-                 areas.append(
-                     {
-                         "area": round((x2 - x1) * (y2 - y1) * width * height, 2),
-                         "label": label,
-                     }
-                 )
+         height, width = image_size
+         for bbox in bboxes:
+             x1, y1, x2, y2 = bbox
+             areas.append(
+                 {
+                     "width": round((x2 - x1) * width, 2),
+                     "height": round((y2 - y1) * height, 2),
+                     "area": round((x2 - x1) * (y2 - y1) * width * height, 2),
+                 }
+             )
+
          return areas


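BboxStats replaces BboxArea and takes the image size explicitly instead of reading it from each detection dict. A worked sketch of the arithmetic in the new `__call__` (boxes stay normalized; `image_size` is unpacked as `(height, width)`), again assuming the class is reachable via `va.tools`:

```python
import vision_agent as va

bbox_stats = va.tools.BboxStats()
stats = bbox_stats(
    bboxes=[[0.2, 0.21, 0.34, 0.42]],
    image_size=(500, 1200),  # height=500, width=1200
)

# width  = (0.34 - 0.20) * 1200 = 168.0 px
# height = (0.42 - 0.21) * 500  = 105.0 px
# area   = 168.0 * 105.0        = 17640.0 px^2
print(stats)  # [{'width': 168.0, 'height': 105.0, 'area': 17640.0}]
```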
@@ -878,7 +903,7 @@ class SegIoU(Tool):
          ],
          "examples": [
              {
-                 "scenario": "If you want to calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg",
+                 "scenario": "Calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg",
                  "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
              }
          ],
@@ -947,6 +972,46 @@ class BboxContains(Tool):
      }


+ class ObjectDistance(Tool):
+     name = "object_distance_"
+     description = "'object_distance_' calculates the distance between two objects in an image. It returns the minimum distance between the two objects."
+     usage = {
+         "required_parameters": [
+             {"name": "object1", "type": "Dict[str, Any]"},
+             {"name": "object2", "type": "Dict[str, Any]"},
+         ],
+         "examples": [
+             {
+                 "scenario": "Calculate the distance between these two objects {bboxes: [0.2, 0.21, 0.34, 0.42], masks: 'mask_file1.png'}, {bboxes: [0.3, 0.31, 0.44, 0.52], masks: 'mask_file2.png'}",
+                 "parameters": {
+                     "object1": {
+                         "bboxes": [0.2, 0.21, 0.34, 0.42],
+                         "scores": 0.54,
+                         "masks": "mask_file1.png",
+                     },
+                     "object2": {
+                         "bboxes": [0.3, 0.31, 0.44, 0.52],
+                         "scores": 0.66,
+                         "masks": "mask_file2.png",
+                     },
+                 },
+             }
+         ],
+     }
+
+     def __call__(self, object1: Dict[str, Any], object2: Dict[str, Any]) -> float:
+         if "masks" in object1 and "masks" in object2:
+             mask1 = object1["masks"]
+             mask2 = object2["masks"]
+             return MaskDistance()(mask1, mask2)
+         elif "bboxes" in object1 and "bboxes" in object2:
+             bbox1 = object1["bboxes"]
+             bbox2 = object2["bboxes"]
+             return BoxDistance()(bbox1, bbox2)
+         else:
+             raise ValueError("Either of the objects should have masks or bboxes")
+
+
  class BoxDistance(Tool):
      name = "box_distance_"
      description = "'box_distance_' calculates distance between two bounding boxes. It returns the minumum distance between the given bounding boxes"
@@ -957,7 +1022,7 @@ class BoxDistance(Tool):
          ],
          "examples": [
              {
-                 "scenario": "Calculate the distance between the bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
+                 "scenario": "Calculate the distance between these two bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
                  "parameters": {
                      "bbox1": [0.2, 0.21, 0.34, 0.42],
                      "bbox2": [0.3, 0.31, 0.44, 0.52],
@@ -976,26 +1041,57 @@ class BoxDistance(Tool):
          return cast(float, round(np.sqrt(horizontal_dist**2 + vertical_dist**2), 2))


+ class MaskDistance(Tool):
+     name = "mask_distance_"
+     description = "'mask_distance_' calculates distance between two masks. It is helpful in checking proximity of two objects. It returns the minumum distance between the given masks"
+     usage = {
+         "required_parameters": [
+             {"name": "mask1", "type": "str"},
+             {"name": "mask2", "type": "str"},
+         ],
+         "examples": [
+             {
+                 "scenario": "Calculate the distance between the segmentation masks for mask_file1.jpg and mask_file2.jpg",
+                 "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
+             }
+         ],
+     }
+
+     def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float:
+         pil_mask1 = Image.open(str(mask1))
+         pil_mask2 = Image.open(str(mask2))
+         np_mask1 = np.clip(np.array(pil_mask1), 0, 1)
+         np_mask2 = np.clip(np.array(pil_mask2), 0, 1)
+
+         mask1_points = np.transpose(np.nonzero(np_mask1))
+         mask2_points = np.transpose(np.nonzero(np_mask2))
+         dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean")
+         return cast(float, np.round(np.min(dist_matrix), 2))
+
+
  class ExtractFrames(Tool):
      r"""Extract frames from a video."""

      name = "extract_frames_"
-     description = "'extract_frames_' extracts frames from a video, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
+     description = "'extract_frames_' extracts frames from a video every 2 seconds, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
      usage = {
          "required_parameters": [{"name": "video_uri", "type": "str"}],
+         "optional_parameters": [{"name": "frames_every", "type": "float"}],
          "examples": [
              {
                  "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
                  "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
              },
              {
-                 "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
-                 "parameters": {"video_uri": "tests/data/test.mp4"},
+                 "scenario": "Can you extract the images from this video file at every 2 seconds ? Video path: tests/data/test.mp4",
+                 "parameters": {"video_uri": "tests/data/test.mp4", "frames_every": 2},
              },
          ],
      }

-     def __call__(self, video_uri: str) -> List[Tuple[str, float]]:
+     def __call__(
+         self, video_uri: str, frames_every: float = 2
+     ) -> List[Tuple[str, float]]:
          """Extract frames from a video.


@@ -1005,7 +1101,7 @@ class ExtractFrames(Tool):
          Returns:
              a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
          """
-         frames = extract_frames_from_video(video_uri)
+         frames = extract_frames_from_video(video_uri, fps=round(1 / frames_every, 2))
          result = []
          _LOGGER.info(
              f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
@@ -1108,12 +1204,11 @@ TOOLS = {
          AgentDINOv,
          ExtractFrames,
          Crop,
-         BboxArea,
+         BboxStats,
          SegArea,
-         BboxIoU,
-         SegIoU,
+         ObjectDistance,
          BboxContains,
-         BoxDistance,
+         SegIoU,
          OCR,
          Calculator,
      ]
@@ -1145,20 +1240,3 @@ def register_tool(tool: Type[Tool]) -> Type[Tool]:
          "class": tool,
      }
      return tool
-
-
- def _send_inference_request(
-     payload: Dict[str, Any], endpoint_name: str
- ) -> Dict[str, Any]:
-     res = requests.post(
-         f"{_LND_API_URL}/model/{endpoint_name}",
-         headers={
-             "Content-Type": "application/json",
-             "apikey": _LND_API_KEY,
-         },
-         json=payload,
-     )
-     if res.status_code != 200:
-         _LOGGER.error(f"Request failed: {res.text}")
-         raise ValueError(f"Request failed: {res.text}")
-     return res.json()["data"]  # type: ignore