vision-agent 0.2.127__py3-none-any.whl → 0.2.129__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -70,30 +70,64 @@ This is the documentation for the functions you have access to. You may call any
70
70
  2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
71
71
  3. Your test case MUST run only on the given images which are {media}
72
72
  4. Print this final dictionary.
73
+ 5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
73
74
 
74
75
  **Example**:
76
+ --- EXAMPLE1 ---
75
77
  plan1:
76
78
  - Load the image from the provided file path 'image.jpg'.
77
- - Use the 'owl_v2' tool with the prompt 'person' to detect and count the number of people in the image.
79
+ - Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
78
80
  plan2:
79
81
  - Load the image from the provided file path 'image.jpg'.
80
- - Use the 'grounding_sam' tool with the prompt 'person' to detect and count the number of people in the image.
82
+ - Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
81
83
  - Count the number of detected objects labeled as 'person'.
82
84
  plan3:
83
85
  - Load the image from the provided file path 'image.jpg'.
84
86
  - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
85
87
 
86
88
  ```python
87
- from vision_agent.tools import load_image, owl_v2, grounding_sam, countgd_counting
89
+ from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
88
90
  image = load_image("image.jpg")
89
- owl_v2_out = owl_v2("person", image)
91
+ owl_v2_out = owl_v2_image("person", image)
90
92
 
91
- gsam_out = grounding_sam("person", image)
92
- gsam_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in gsam_out]
93
+ f2s2_out = florence2_sam2_image("person", image)
94
+ # strip out the masks from the output because they don't provide useful information when printed
95
+ f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
93
96
 
94
97
  cgd_out = countgd_counting(image)
95
98
 
96
- final_out = {{"owl_v2": owl_v2_out, "florencev2_object_detection": florencev2_out, "countgd_counting": cgd_out}}
99
+ final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_counting": cgd_out}}
100
+ print(final_out)
101
+
102
+ --- EXAMPLE2 ---
103
+ plan1:
104
+ - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
105
+ - Use the 'owl_v2_image' tool with the prompt 'person' to detect where the people are in the video.
106
+ plan2:
107
+ - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
108
+ - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
109
+ plan3:
110
+ - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
111
+ - Use the 'countgd_counting' tool with the prompt 'person' to detect where the people are in the video.
112
+
113
+
114
+ ```python
115
+ from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, countgd_counting
116
+
117
+ # sample at 1 FPS and use the first 10 frames to reduce processing time
118
+ frames = extract_frames("video.mp4", 1)
119
+ frames = [f[0] for f in frames][:10]
120
+
121
+ # plan1
122
+ owl_v2_out = [owl_v2_image("person", f) for f in frames]
123
+
124
+ # plan2
125
+ florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
126
+
127
+ # plan3
128
+ countgd_out = [countgd_counting(f) for f in frames]
129
+
130
+ final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "countgd_counting": countgd_out}}
97
131
  print(final_out)
98
132
  ```
99
133
  """
@@ -27,7 +27,7 @@ from .tools import (
27
27
  florence2_phrase_grounding,
28
28
  florence2_roberta_vqa,
29
29
  florence2_sam2_image,
30
- florence2_sam2_video,
30
+ florence2_sam2_video_tracking,
31
31
  generate_pose_image,
32
32
  generate_soft_edge_image,
33
33
  get_tool_documentation,
@@ -46,7 +46,8 @@ from .tools import (
46
46
  overlay_counting_results,
47
47
  overlay_heat_map,
48
48
  overlay_segmentation_masks,
49
- owl_v2,
49
+ owl_v2_image,
50
+ owl_v2_video,
50
51
  save_image,
51
52
  save_json,
52
53
  save_video,
@@ -1,3 +1,4 @@
1
+ from base64 import b64encode
1
2
  import inspect
2
3
  import logging
3
4
  import os
@@ -27,6 +28,7 @@ class ToolCallTrace(BaseModel):
27
28
  request: MutableMapping[str, Any]
28
29
  response: MutableMapping[str, Any]
29
30
  error: Optional[Error]
31
+ files: Optional[List[tuple[str, str]]]
30
32
 
31
33
 
32
34
  def send_inference_request(
@@ -202,12 +204,16 @@ def _call_post(
202
204
  files: Optional[List[Tuple[Any, ...]]] = None,
203
205
  function_name: str = "unknown",
204
206
  ) -> Any:
207
+ files_in_b64 = None
208
+ if files:
209
+ files_in_b64 = [(file[0], b64encode(file[1]).decode("utf-8")) for file in files]
205
210
  try:
206
211
  tool_call_trace = ToolCallTrace(
207
212
  endpoint_url=url,
208
213
  request=payload,
209
214
  response={},
210
215
  error=None,
216
+ files=files_in_b64,
211
217
  )
212
218
 
213
219
  if files is not None:
@@ -145,15 +145,15 @@ def grounding_dino(
145
145
  return return_data
146
146
 
147
147
 
148
- def owl_v2(
148
+ def owl_v2_image(
149
149
  prompt: str,
150
150
  image: np.ndarray,
151
151
  box_threshold: float = 0.10,
152
152
  ) -> List[Dict[str, Any]]:
153
- """'owl_v2' is a tool that can detect and count multiple objects given a text
154
- prompt such as category names or referring expressions. The categories in text
155
- prompt are separated by commas. It returns a list of bounding boxes with normalized
156
- coordinates, label names and associated probability scores.
153
+ """'owl_v2_image' is a tool that can detect and count multiple objects given a text
154
+ prompt such as category names or referring expressions on images. The categories in
155
+ text prompt are separated by commas. It returns a list of bounding boxes with
156
+ normalized coordinates, label names and associated probability scores.
157
157
 
158
158
  Parameters:
159
159
  prompt (str): The prompt to ground to the image.
@@ -170,32 +170,103 @@ def owl_v2(
170
170
 
171
171
  Example
172
172
  -------
173
- >>> owl_v2("car, dinosaur", image)
173
+ >>> owl_v2_image("car, dinosaur", image)
174
174
  [
175
175
  {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
176
176
  {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
177
177
  ]
178
178
  """
179
179
  image_size = image.shape[:2]
180
- image_b64 = convert_to_b64(image)
181
- request_data = {
180
+ buffer_bytes = numpy_to_bytes(image)
181
+ files = [("image", buffer_bytes)]
182
+ payload = {
182
183
  "prompts": [s.strip() for s in prompt.split(",")],
183
- "image": image_b64,
184
- "confidence": box_threshold,
185
- "function_name": "owl_v2",
184
+ "model": "owlv2",
185
+ "function_name": "owl_v2_image",
186
186
  }
187
- data: Dict[str, Any] = send_inference_request(request_data, "owlv2", v2=True)
188
- return_data = []
187
+ resp_data = send_inference_request(
188
+ payload, "text-to-object-detection", files=files, v2=True
189
+ )
190
+ bboxes = resp_data[0]
191
+ bboxes_formatted = [
192
+ ODResponseData(
193
+ label=bbox["label"],
194
+ bbox=normalize_bbox(bbox["bounding_box"], image_size),
195
+ score=round(bbox["score"], 2),
196
+ )
197
+ for bbox in bboxes
198
+ ]
199
+ filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
200
+ return [bbox.model_dump() for bbox in filtered_bboxes]
201
+
202
+
203
+ def owl_v2_video(
204
+ prompt: str,
205
+ frames: List[np.ndarray],
206
+ box_threshold: float = 0.10,
207
+ ) -> List[List[Dict[str, Any]]]:
208
+ """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
209
+ objects per frame given a text prompt such as a category name or referring
210
+ expression. The categories in text prompt are separated by commas. It returns a list
211
+ of lists where each inner list contains the score, label, and bounding box of the
212
+ detections for that frame.
213
+
214
+ Parameters:
215
+ prompt (str): The prompt to ground to the video.
216
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
217
+ box_threshold (float, optional): The threshold for the box detection. Defaults
218
+ to 0.10.
219
+
220
+ Returns:
221
+ List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
222
+ score, label, and bounding box of the detected objects with normalized
223
+ coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the
224
+ coordinates of the top-left and xmax and ymax are the coordinates of the
225
+ bottom-right of the bounding box.
226
+
227
+ Example
228
+ -------
229
+ >>> owl_v2_video("car, dinosaur", frames)
230
+ [
231
+ [
232
+ {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
233
+ {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
234
+ ],
235
+ ...
236
+ ]
237
+ """
238
+ if len(frames) == 0:
239
+ raise ValueError("No frames provided")
240
+
241
+ image_size = frames[0].shape[:2]
242
+ buffer_bytes = frames_to_bytes(frames)
243
+ files = [("video", buffer_bytes)]
244
+ payload = {
245
+ "prompts": [s.strip() for s in prompt.split(",")],
246
+ "model": "owlv2",
247
+ "function_name": "owl_v2_video",
248
+ }
249
+ data: Dict[str, Any] = send_inference_request(
250
+ payload, "text-to-object-detection", files=files, v2=True
251
+ )
252
+ bboxes_formatted = []
189
253
  if data is not None:
190
- for elt in data:
191
- return_data.append(
192
- {
193
- "bbox": normalize_bbox(elt["bbox"], image_size), # type: ignore
194
- "label": elt["label"], # type: ignore
195
- "score": round(elt["score"], 2), # type: ignore
196
- }
197
- )
198
- return return_data
254
+ for frame_data in data:
255
+ bboxes_formated_frame = []
256
+ for elt in frame_data:
257
+ bboxes_formated_frame.append(
258
+ ODResponseData(
259
+ label=elt["label"], # type: ignore
260
+ bbox=normalize_bbox(elt["bounding_box"], image_size), # type: ignore
261
+ score=round(elt["score"], 2), # type: ignore
262
+ )
263
+ )
264
+ bboxes_formatted.append(bboxes_formated_frame)
265
+
266
+ filtered_bboxes = [
267
+ filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted
268
+ ]
269
+ return [[bbox.model_dump() for bbox in frame] for frame in filtered_bboxes]
199
270
 
200
271
 
201
272
  def grounding_sam(
@@ -317,14 +388,14 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
317
388
  return return_data
318
389
 
319
390
 
320
- def florence2_sam2_video(
391
+ def florence2_sam2_video_tracking(
321
392
  prompt: str, frames: List[np.ndarray]
322
393
  ) -> List[List[Dict[str, Any]]]:
323
- """'florence2_sam2_video' is a tool that can segment and track multiple entities
324
- in a video given a text prompt such as category names or referring expressions. You
325
- can optionally separate the categories in the text with commas. It only tracks
326
- entities present in the first frame and only returns segmentation masks. It is
327
- useful for tracking and counting without duplicating counts.
394
+ """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
395
+ entities in a video given a text prompt such as category names or referring
396
+ expressions. You can optionally separate the categories in the text with commas. It
397
+ only tracks entities present in the first frame and only returns segmentation
398
+ masks. It is useful for tracking and counting without duplicating counts.
328
399
 
329
400
  Parameters:
330
401
  prompt (str): The prompt to ground to the video.
@@ -351,14 +422,15 @@ def florence2_sam2_video(
351
422
  [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
352
423
  },
353
424
  ],
425
+ ...
354
426
  ]
355
427
  """
356
428
 
357
429
  buffer_bytes = frames_to_bytes(frames)
358
430
  files = [("video", buffer_bytes)]
359
431
  payload = {
360
- "prompts": prompt.split(","),
361
- "function_name": "florence2_sam2_video",
432
+ "prompts": [s.strip() for s in prompt.split(",")],
433
+ "function_name": "florence2_sam2_video_tracking",
362
434
  }
363
435
  data: Dict[str, Any] = send_inference_request(
364
436
  payload, "florence2-sam2", files=files, v2=True
@@ -549,7 +621,14 @@ def countgd_counting(
549
621
  payload, "text-to-object-detection", files=files, metadata=metadata
550
622
  )
551
623
  bboxes_per_frame = resp_data[0]
552
- bboxes_formatted = [ODResponseData(**bbox) for bbox in bboxes_per_frame]
624
+ bboxes_formatted = [
625
+ ODResponseData(
626
+ label=bbox["label"],
627
+ bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
628
+ score=round(bbox["score"], 2),
629
+ )
630
+ for bbox in bboxes_per_frame
631
+ ]
553
632
  filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
554
633
  return [bbox.model_dump() for bbox in filtered_bboxes]
555
634
 
@@ -601,7 +680,14 @@ def countgd_example_based_counting(
601
680
  payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
602
681
  )
603
682
  bboxes_per_frame = resp_data[0]
604
- bboxes_formatted = [ODResponseData(**bbox) for bbox in bboxes_per_frame]
683
+ bboxes_formatted = [
684
+ ODResponseData(
685
+ label=bbox["label"],
686
+ bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
687
+ score=round(bbox["score"], 2),
688
+ )
689
+ for bbox in bboxes_per_frame
690
+ ]
605
691
  filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
606
692
  return [bbox.model_dump() for bbox in filtered_bboxes]
607
693
 
@@ -1374,12 +1460,12 @@ def closest_box_distance(
1374
1460
  def extract_frames(
1375
1461
  video_uri: Union[str, Path], fps: float = 1
1376
1462
  ) -> List[Tuple[np.ndarray, float]]:
1377
- """'extract_frames' extracts frames from a video which can be a file path or youtube
1378
- link, returns a list of tuples (frame, timestamp), where timestamp is the relative
1379
- time in seconds where the frame was captured. The frame is a numpy array.
1463
+ """'extract_frames' extracts frames from a video which can be a file path, url or
1464
+ youtube link, returns a list of tuples (frame, timestamp), where timestamp is the
1465
+ relative time in seconds where the frame was captured. The frame is a numpy array.
1380
1466
 
1381
1467
  Parameters:
1382
- video_uri (Union[str, Path]): The path to the video file or youtube link
1468
+ video_uri (Union[str, Path]): The path to the video file, url or youtube link
1383
1469
  fps (float, optional): The frame rate per second to extract the frames. Defaults
1384
1470
  to 1.
1385
1471
 
@@ -1820,7 +1906,8 @@ def overlay_counting_results(
1820
1906
 
1821
1907
 
1822
1908
  FUNCTION_TOOLS = [
1823
- owl_v2,
1909
+ owl_v2_image,
1910
+ owl_v2_video,
1824
1911
  ocr,
1825
1912
  clip,
1826
1913
  vit_image_classification,
@@ -1829,7 +1916,7 @@ FUNCTION_TOOLS = [
1829
1916
  florence2_image_caption,
1830
1917
  florence2_ocr,
1831
1918
  florence2_sam2_image,
1832
- florence2_sam2_video,
1919
+ florence2_sam2_video_tracking,
1833
1920
  florence2_phrase_grounding,
1834
1921
  ixc25_image_vqa,
1835
1922
  ixc25_video_vqa,
@@ -4,8 +4,8 @@ import tempfile
4
4
  from functools import lru_cache
5
5
  from typing import List, Optional, Tuple
6
6
 
7
- import cv2
8
7
  import av # type: ignore
8
+ import cv2
9
9
  import numpy as np
10
10
  from decord import VideoReader # type: ignore
11
11
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.127
3
+ Version: 0.2.129
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
4
4
  vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
5
5
  vision_agent/agent/vision_agent.py,sha256=WM1_o0VAQokAKlDr-0lpFxCRwUm_eFfFNWP-wSNjo7s,11180
6
6
  vision_agent/agent/vision_agent_coder.py,sha256=_2QQd_nTGojkk2ZOiMevVCY6-eUA9q1QdCWH7-Noq4w,34237
7
- vision_agent/agent/vision_agent_coder_prompts.py,sha256=Rg7-Ih7oFgFbHFFno0EHpaZEgm0SYj_nTdqqdp21YLo,11246
7
+ vision_agent/agent/vision_agent_coder_prompts.py,sha256=nj4iRRSAWYHjKqyUSp12aTCV1D5iUVCHeezVXoozS4M,12687
8
8
  vision_agent/agent/vision_agent_prompts.py,sha256=K1nLo3XKQ-IqCom1TRwh3cMoGZNxNwEgZqf3uJ6eL18,7221
9
9
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
@@ -14,11 +14,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
14
14
  vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
15
15
  vision_agent/lmm/lmm.py,sha256=092oefI65_QSRvQm2znXkjTdzlZTh-Ni_38610kfbJg,16836
16
16
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
17
- vision_agent/tools/__init__.py,sha256=T8Hi5aHf4J2QJDoPRvu5fxbiqMpAY-1Gi2EFIhJbf3A,2331
17
+ vision_agent/tools/__init__.py,sha256=nx60_hujcnLz3d2wQlCbcerUmT6R2vxRy66IsQjdB3M,2364
18
18
  vision_agent/tools/meta_tools.py,sha256=KeGiw2OtY8ARpGbtWjoNAoO1dwevt7LbCupaJX61MkE,18929
19
19
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
20
- vision_agent/tools/tool_utils.py,sha256=62NVlojPMf9MuJ-3yJEcrB3mzmOxN2HrNQzzjVa-FZg,7527
21
- vision_agent/tools/tools.py,sha256=sO0J-ts2CsJnf2UPcvxvmowE_G0X3f1iSChnS-cnPlk,65433
20
+ vision_agent/tools/tool_utils.py,sha256=PjdataKjPpiFSq1QBAAWHJUGPPn4p4dr07TPSlhXvFk,7758
21
+ vision_agent/tools/tools.py,sha256=p6QUo7V03UZOKBAGfabVWdPm9vUT9tyP_utCv0yKfcY,68659
22
22
  vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
23
23
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
24
24
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -26,8 +26,8 @@ vision_agent/utils/execute.py,sha256=gc4R_0BKUrZyhiKvIxOpYuzQPYVWQEqxr3ANy1lJAw4
26
26
  vision_agent/utils/image_utils.py,sha256=zTTOJFOieMzwIquTFnW7T6ssx9o6XfoZ0Unqyk7GJrg,10746
27
27
  vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
28
28
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
29
- vision_agent/utils/video.py,sha256=oDTCuTv1dFMYvwqis7y0frt9U2iDF9KGN1g21bOVjvE,4528
30
- vision_agent-0.2.127.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
- vision_agent-0.2.127.dist-info/METADATA,sha256=r3fKbSB79F3MsBsOTV0z054Qno3DTpf3Pa-xwkdIgD0,12295
32
- vision_agent-0.2.127.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
- vision_agent-0.2.127.dist-info/RECORD,,
29
+ vision_agent/utils/video.py,sha256=GmJqu_3WhBMEwP4HToMMp8EwgftliHSpv5nd-QEDOcs,4528
30
+ vision_agent-0.2.129.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
+ vision_agent-0.2.129.dist-info/METADATA,sha256=uMW3dpm48GLsgwAA_grlCghpKPA32caHy2SrCWhYtdI,12295
32
+ vision_agent-0.2.129.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
+ vision_agent-0.2.129.dist-info/RECORD,,