vision-agent 0.2.15__py3-none-any.whl → 0.2.17__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -308,7 +308,7 @@ def _handle_extract_frames(
308
308
  # any following processing
309
309
  for video_file_output in tool_result["call_results"]:
310
310
  # When the video tool is run with wrong parameters, exit the loop
311
- if len(video_file_output) < 2:
311
+ if not isinstance(video_file_output, tuple) or len(video_file_output) < 2:
312
312
  break
313
313
  for frame, _ in video_file_output:
314
314
  image = frame
@@ -3,7 +3,7 @@ from .tools import ( # Counter,
3
3
  CLIP,
4
4
  OCR,
5
5
  TOOLS,
6
- BboxArea,
6
+ BboxStats,
7
7
  BboxIoU,
8
8
  BoxDistance,
9
9
  Crop,
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import os
2
3
  from typing import Any, Dict
3
4
 
4
5
  import requests
@@ -13,11 +14,14 @@ _LND_API_URL = "https://api.dev.landing.ai/v1/agent"
13
14
  def _send_inference_request(
14
15
  payload: Dict[str, Any], endpoint_name: str
15
16
  ) -> Dict[str, Any]:
17
+ # runtime_tag is used to differentiate different internal callers
18
+ runtime_tag = os.environ.get("RUNTIME_TAG", "")
16
19
  res = requests.post(
17
20
  f"{_LND_API_URL}/model/{endpoint_name}",
18
21
  headers={
19
22
  "Content-Type": "application/json",
20
23
  "apikey": _LND_API_KEY,
24
+ "runtime-tag": runtime_tag,
21
25
  },
22
26
  json=payload,
23
27
  )
@@ -174,15 +174,15 @@ class GroundingDINO(Tool):
174
174
  """
175
175
 
176
176
  name = "grounding_dino_"
177
- description = "'grounding_dino_' is a tool that can detect and count objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
177
+ description = "'grounding_dino_' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
178
178
  usage = {
179
179
  "required_parameters": [
180
180
  {"name": "prompt", "type": "str"},
181
181
  {"name": "image", "type": "str"},
182
182
  ],
183
183
  "optional_parameters": [
184
- {"name": "box_threshold", "type": "float"},
185
- {"name": "iou_threshold", "type": "float"},
184
+ {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
185
+ {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
186
186
  ],
187
187
  "examples": [
188
188
  {
@@ -209,7 +209,7 @@ class GroundingDINO(Tool):
209
209
  "prompt": "red shirt. green shirt",
210
210
  "image": "shirts.jpg",
211
211
  "box_threshold": 0.20,
212
- "iou_threshold": 0.75,
212
+ "iou_threshold": 0.20,
213
213
  },
214
214
  },
215
215
  ],
@@ -221,7 +221,7 @@ class GroundingDINO(Tool):
221
221
  prompt: str,
222
222
  image: Union[str, Path, ImageType],
223
223
  box_threshold: float = 0.20,
224
- iou_threshold: float = 0.75,
224
+ iou_threshold: float = 0.20,
225
225
  ) -> Dict:
226
226
  """Invoke the Grounding DINO model.
227
227
 
@@ -249,7 +249,7 @@ class GroundingDINO(Tool):
249
249
  data["scores"] = [round(score, 2) for score in data["scores"]]
250
250
  if "labels" in data:
251
251
  data["labels"] = list(data["labels"])
252
- data["size"] = (image_size[1], image_size[0])
252
+ data["image_size"] = image_size
253
253
  return data
254
254
 
255
255
 
@@ -277,15 +277,15 @@ class GroundingSAM(Tool):
277
277
  """
278
278
 
279
279
  name = "grounding_sam_"
280
- description = "'grounding_sam_' is a tool that can detect and segment objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
280
+ description = "'grounding_sam_' is a tool that can detect and segment multiple objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
281
281
  usage = {
282
282
  "required_parameters": [
283
283
  {"name": "prompt", "type": "str"},
284
284
  {"name": "image", "type": "str"},
285
285
  ],
286
286
  "optional_parameters": [
287
- {"name": "box_threshold", "type": "float"},
288
- {"name": "iou_threshold", "type": "float"},
287
+ {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
288
+ {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
289
289
  ],
290
290
  "examples": [
291
291
  {
@@ -312,7 +312,7 @@ class GroundingSAM(Tool):
312
312
  "prompt": "red shirt, green shirt",
313
313
  "image": "shirts.jpg",
314
314
  "box_threshold": 0.20,
315
- "iou_threshold": 0.75,
315
+ "iou_threshold": 0.20,
316
316
  },
317
317
  },
318
318
  ],
@@ -324,7 +324,7 @@ class GroundingSAM(Tool):
324
324
  prompt: str,
325
325
  image: Union[str, ImageType],
326
326
  box_threshold: float = 0.2,
327
- iou_threshold: float = 0.75,
327
+ iou_threshold: float = 0.2,
328
328
  ) -> Dict:
329
329
  """Invoke the Grounding SAM model.
330
330
 
@@ -353,6 +353,7 @@ class GroundingSAM(Tool):
353
353
  rle_decode(mask_rle=mask, shape=data["mask_shape"])
354
354
  for mask in data["masks"]
355
355
  ]
356
+ data["image_size"] = image_size
356
357
  data.pop("mask_shape", None)
357
358
  return data
358
359
 
@@ -434,6 +435,8 @@ class DINOv(Tool):
434
435
  for mask in data["masks"]
435
436
  ]
436
437
  data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
438
+ mask_shape = data.pop("mask_shape", None)
439
+ data["image_size"] = (mask_shape[0], mask_shape[1]) if mask_shape else None
437
440
  return data
438
441
 
439
442
 
@@ -789,33 +792,49 @@ class Crop(Tool):
789
792
  return {"image": tmp.name}
790
793
 
791
794
 
792
- class BboxArea(Tool):
793
- r"""BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places."""
795
+ class BboxStats(Tool):
796
+ r"""BboxStats returns the height, width and area of the bounding box in pixels to 2 decimal places."""
794
797
 
795
- name = "bbox_area_"
796
- description = "'bbox_area_' returns the area of the given bounding box in pixels normalized to 2 decimal places."
798
+ name = "bbox_stats_"
799
+ description = "'bbox_stats_' returns the height, width and area of the given bounding box in pixels to 2 decimal places."
797
800
  usage = {
798
- "required_parameters": [{"name": "bboxes", "type": "List[int]"}],
801
+ "required_parameters": [
802
+ {"name": "bboxes", "type": "List[int]"},
803
+ {"name": "image_size", "type": "Tuple[int]"},
804
+ ],
799
805
  "examples": [
800
806
  {
801
- "scenario": "If you want to calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
802
- "parameters": {"bboxes": [0.2, 0.21, 0.34, 0.42]},
803
- }
807
+ "scenario": "Calculate the width and height of the bounding box [0.2, 0.21, 0.34, 0.42]",
808
+ "parameters": {
809
+ "bboxes": [[0.2, 0.21, 0.34, 0.42]],
810
+ "image_size": (500, 1200),
811
+ },
812
+ },
813
+ {
814
+ "scenario": "Calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
815
+ "parameters": {
816
+ "bboxes": [[0.2, 0.21, 0.34, 0.42]],
817
+ "image_size": (640, 480),
818
+ },
819
+ },
804
820
  ],
805
821
  }
806
822
 
807
- def __call__(self, bboxes: List[Dict]) -> List[Dict]:
823
+ def __call__(
824
+ self, bboxes: List[List[int]], image_size: Tuple[int, int]
825
+ ) -> List[Dict]:
808
826
  areas = []
809
- for elt in bboxes:
810
- height, width = elt["size"]
811
- for label, bbox in zip(elt["labels"], elt["bboxes"]):
812
- x1, y1, x2, y2 = bbox
813
- areas.append(
814
- {
815
- "area": round((x2 - x1) * (y2 - y1) * width * height, 2),
816
- "label": label,
817
- }
818
- )
827
+ height, width = image_size
828
+ for bbox in bboxes:
829
+ x1, y1, x2, y2 = bbox
830
+ areas.append(
831
+ {
832
+ "width": round((x2 - x1) * width, 2),
833
+ "height": round((y2 - y1) * height, 2),
834
+ "area": round((x2 - x1) * (y2 - y1) * width * height, 2),
835
+ }
836
+ )
837
+
819
838
  return areas
820
839
 
821
840
 
@@ -1054,22 +1073,25 @@ class ExtractFrames(Tool):
1054
1073
  r"""Extract frames from a video."""
1055
1074
 
1056
1075
  name = "extract_frames_"
1057
- description = "'extract_frames_' extracts frames from a video, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
1076
+ description = "'extract_frames_' extracts frames from a video every 2 seconds, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
1058
1077
  usage = {
1059
1078
  "required_parameters": [{"name": "video_uri", "type": "str"}],
1079
+ "optional_parameters": [{"name": "frames_every", "type": "float"}],
1060
1080
  "examples": [
1061
1081
  {
1062
1082
  "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
1063
1083
  "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
1064
1084
  },
1065
1085
  {
1066
- "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
1067
- "parameters": {"video_uri": "tests/data/test.mp4"},
1086
+ "scenario": "Can you extract the images from this video file at every 2 seconds ? Video path: tests/data/test.mp4",
1087
+ "parameters": {"video_uri": "tests/data/test.mp4", "frames_every": 2},
1068
1088
  },
1069
1089
  ],
1070
1090
  }
1071
1091
 
1072
- def __call__(self, video_uri: str) -> List[Tuple[str, float]]:
1092
+ def __call__(
1093
+ self, video_uri: str, frames_every: float = 2
1094
+ ) -> List[Tuple[str, float]]:
1073
1095
  """Extract frames from a video.
1074
1096
 
1075
1097
 
@@ -1079,7 +1101,7 @@ class ExtractFrames(Tool):
1079
1101
  Returns:
1080
1102
  a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
1081
1103
  """
1082
- frames = extract_frames_from_video(video_uri)
1104
+ frames = extract_frames_from_video(video_uri, fps=round(1 / frames_every, 2))
1083
1105
  result = []
1084
1106
  _LOGGER.info(
1085
1107
  f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
@@ -1182,7 +1204,7 @@ TOOLS = {
1182
1204
  AgentDINOv,
1183
1205
  ExtractFrames,
1184
1206
  Crop,
1185
- BboxArea,
1207
+ BboxStats,
1186
1208
  SegArea,
1187
1209
  ObjectDistance,
1188
1210
  BboxContains,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.15
3
+ Version: 0.2.17
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -7,7 +7,7 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
7
7
  vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
8
8
  vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
9
9
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
10
- vision_agent/agent/vision_agent.py,sha256=4-GjEX8ZmLhvLebqNRRTSSu1kSaFYVR_wFsrjXgKdYI,26984
10
+ vision_agent/agent/vision_agent.py,sha256=ywOowbuwNSapVwl02ePZP_EzW1FlZULoCV59LR5nFww,27028
11
11
  vision_agent/agent/vision_agent_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
12
12
  vision_agent/agent/vision_agent_v2.py,sha256=CDgGBSoa2LoMS0b4JhyDkoS3PJJNmCCPfxIGUc4RfQg,9658
13
13
  vision_agent/agent/vision_agent_v2_prompt.py,sha256=-90Hlbtqb5Fp7OVjGabpTdgr-yCr8AYKIfiMRfoL4SY,5141
@@ -17,10 +17,10 @@ vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,
17
17
  vision_agent/llm/llm.py,sha256=qWDBpJolGLWNwDjpEXu1NrjlJbo7Fj9efJYkSfVn6oE,5784
18
18
  vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
19
19
  vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
20
- vision_agent/tools/__init__.py,sha256=WiEjXzXyaBq7IQMKOMbFAK3FKvLNfzZ3dd7CPN-d7B8,451
20
+ vision_agent/tools/__init__.py,sha256=p5SM0YhThSVO_jRF9O-OjH2fYDPv-iMjexDX9xPPb7M,452
21
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
22
- vision_agent/tools/tool_utils.py,sha256=moR7X4hkLKQzC56axdojo_OcIuVOv45bKcHPUVZrPvk,753
23
- vision_agent/tools/tools.py,sha256=WrNu_L5n2cEpe7e1oy8S1o3dy4JJ4AUxTHcjAdX64_g,46398
22
+ vision_agent/tools/tool_utils.py,sha256=1m24PE4Psb96Q51NFx3w3XctTgSyRSWCX7YG6YcJy9E,925
23
+ vision_agent/tools/tools.py,sha256=sVxN7SpDkz_XTc_SKwkoRF4EwaMTuHvTsCHwtR942Fc,47373
24
24
  vision_agent/tools/tools_v2.py,sha256=1Y_ZbYJyuo2eZZkq7jY3YfuKWC82C-GFCZMLYH-I5ew,13800
25
25
  vision_agent/utils/__init__.py,sha256=AKXf1QVOpO6MnqU8RSaFLQ_4us4DcKf8ibgEbhuHjvI,95
26
26
  vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
@@ -28,7 +28,7 @@ vision_agent/utils/image_utils.py,sha256=1dggPBhW8_hUXDItCRLa23h-hdBwS50cjL4v1hs
28
28
  vision_agent/utils/sim.py,sha256=FaD16kKL1-JR2aSCmznF9KkJux9u3_Nr9tF4smBeoK0,2327
29
29
  vision_agent/utils/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
30
30
  vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
31
- vision_agent-0.2.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
32
- vision_agent-0.2.15.dist-info/METADATA,sha256=qK9rIVOI_IL0dcUcIqtgoRCxuk5GZuQ5HHSrdsuVLKs,9121
33
- vision_agent-0.2.15.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
34
- vision_agent-0.2.15.dist-info/RECORD,,
31
+ vision_agent-0.2.17.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
32
+ vision_agent-0.2.17.dist-info/METADATA,sha256=NGE3lE7Aaa4uLZtdShMzowT8bINPKn2saBMihFSWVAA,9121
33
+ vision_agent-0.2.17.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
34
+ vision_agent-0.2.17.dist-info/RECORD,,