vision-agent 0.2.127__tar.gz → 0.2.128__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.127 → vision_agent-0.2.128}/PKG-INFO +1 -1
- {vision_agent-0.2.127 → vision_agent-0.2.128}/pyproject.toml +1 -1
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/agent/vision_agent_coder_prompts.py +41 -7
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/tools/__init__.py +3 -2
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/tools/tools.py +125 -38
- {vision_agent-0.2.127 → vision_agent-0.2.128}/LICENSE +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/README.md +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/utils/video.py +1 -1
{vision_agent-0.2.127 → vision_agent-0.2.128}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
@@ -70,30 +70,64 @@ This is the documentation for the functions you have access to. You may call any
|
|
70
70
|
2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
|
71
71
|
3. Your test case MUST run only on the given images which are {media}
|
72
72
|
4. Print this final dictionary.
|
73
|
+
5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
|
73
74
|
|
74
75
|
**Example**:
|
76
|
+
--- EXAMPLE1 ---
|
75
77
|
plan1:
|
76
78
|
- Load the image from the provided file path 'image.jpg'.
|
77
|
-
- Use the '
|
79
|
+
- Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
|
78
80
|
plan2:
|
79
81
|
- Load the image from the provided file path 'image.jpg'.
|
80
|
-
- Use the '
|
82
|
+
- Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
|
81
83
|
- Count the number of detected objects labeled as 'person'.
|
82
84
|
plan3:
|
83
85
|
- Load the image from the provided file path 'image.jpg'.
|
84
86
|
- Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
|
85
87
|
|
86
88
|
```python
|
87
|
-
from vision_agent.tools import load_image,
|
89
|
+
from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
|
88
90
|
image = load_image("image.jpg")
|
89
|
-
owl_v2_out =
|
91
|
+
owl_v2_out = owl_v2_image("person", image)
|
90
92
|
|
91
|
-
|
92
|
-
|
93
|
+
f2s2_out = florence2_sam2_image("person", image)
|
94
|
+
# strip out the masks from the output because they don't provide useful information when printed
|
95
|
+
f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
|
93
96
|
|
94
97
|
cgd_out = countgd_counting(image)
|
95
98
|
|
96
|
-
final_out = {{"
|
99
|
+
final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_counting": cgd_out}}
|
100
|
+
print(final_out)
|
101
|
+
|
102
|
+
--- EXAMPLE2 ---
|
103
|
+
plan1:
|
104
|
+
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
|
105
|
+
- Use the 'owl_v2_image' tool with the prompt 'person' to detect where the people are in the video.
|
106
|
+
plan2:
|
107
|
+
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
|
108
|
+
- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
|
109
|
+
plan3:
|
110
|
+
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
|
111
|
+
- Use the 'countgd_counting' tool with the prompt 'person' to detect where the people are in the video.
|
112
|
+
|
113
|
+
|
114
|
+
```python
|
115
|
+
from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, countgd_counting
|
116
|
+
|
117
|
+
# sample at 1 FPS and use the first 10 frames to reduce processing time
|
118
|
+
frames = extract_frames("video.mp4", 1)
|
119
|
+
frames = [f[0] for f in frames][:10]
|
120
|
+
|
121
|
+
# plan1
|
122
|
+
owl_v2_out = [owl_v2_image("person", f) for f in frames]
|
123
|
+
|
124
|
+
# plan2
|
125
|
+
florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
|
126
|
+
|
127
|
+
# plan3
|
128
|
+
countgd_out = [countgd_counting(f) for f in frames]
|
129
|
+
|
130
|
+
final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "countgd_counting": countgd_out}}
|
97
131
|
print(final_out)
|
98
132
|
```
|
99
133
|
"""
|
@@ -27,7 +27,7 @@ from .tools import (
|
|
27
27
|
florence2_phrase_grounding,
|
28
28
|
florence2_roberta_vqa,
|
29
29
|
florence2_sam2_image,
|
30
|
-
|
30
|
+
florence2_sam2_video_tracking,
|
31
31
|
generate_pose_image,
|
32
32
|
generate_soft_edge_image,
|
33
33
|
get_tool_documentation,
|
@@ -46,7 +46,8 @@ from .tools import (
|
|
46
46
|
overlay_counting_results,
|
47
47
|
overlay_heat_map,
|
48
48
|
overlay_segmentation_masks,
|
49
|
-
|
49
|
+
owl_v2_image,
|
50
|
+
owl_v2_video,
|
50
51
|
save_image,
|
51
52
|
save_json,
|
52
53
|
save_video,
|
@@ -145,15 +145,15 @@ def grounding_dino(
|
|
145
145
|
return return_data
|
146
146
|
|
147
147
|
|
148
|
-
def
|
148
|
+
def owl_v2_image(
|
149
149
|
prompt: str,
|
150
150
|
image: np.ndarray,
|
151
151
|
box_threshold: float = 0.10,
|
152
152
|
) -> List[Dict[str, Any]]:
|
153
|
-
"""'
|
154
|
-
prompt such as category names or referring expressions. The categories in
|
155
|
-
prompt are separated by commas. It returns a list of bounding boxes with
|
156
|
-
coordinates, label names and associated probability scores.
|
153
|
+
"""'owl_v2_image' is a tool that can detect and count multiple objects given a text
|
154
|
+
prompt such as category names or referring expressions on images. The categories in
|
155
|
+
text prompt are separated by commas. It returns a list of bounding boxes with
|
156
|
+
normalized coordinates, label names and associated probability scores.
|
157
157
|
|
158
158
|
Parameters:
|
159
159
|
prompt (str): The prompt to ground to the image.
|
@@ -170,32 +170,103 @@ def owl_v2(
|
|
170
170
|
|
171
171
|
Example
|
172
172
|
-------
|
173
|
-
>>>
|
173
|
+
>>> owl_v2_image("car, dinosaur", image)
|
174
174
|
[
|
175
175
|
{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
176
176
|
{'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
|
177
177
|
]
|
178
178
|
"""
|
179
179
|
image_size = image.shape[:2]
|
180
|
-
|
181
|
-
|
180
|
+
buffer_bytes = numpy_to_bytes(image)
|
181
|
+
files = [("image", buffer_bytes)]
|
182
|
+
payload = {
|
182
183
|
"prompts": [s.strip() for s in prompt.split(",")],
|
183
|
-
"
|
184
|
-
"
|
185
|
-
"function_name": "owl_v2",
|
184
|
+
"model": "owlv2",
|
185
|
+
"function_name": "owl_v2_image",
|
186
186
|
}
|
187
|
-
|
188
|
-
|
187
|
+
resp_data = send_inference_request(
|
188
|
+
payload, "text-to-object-detection", files=files, v2=True
|
189
|
+
)
|
190
|
+
bboxes = resp_data[0]
|
191
|
+
bboxes_formatted = [
|
192
|
+
ODResponseData(
|
193
|
+
label=bbox["label"],
|
194
|
+
bbox=normalize_bbox(bbox["bounding_box"], image_size),
|
195
|
+
score=round(bbox["score"], 2),
|
196
|
+
)
|
197
|
+
for bbox in bboxes
|
198
|
+
]
|
199
|
+
filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
|
200
|
+
return [bbox.model_dump() for bbox in filtered_bboxes]
|
201
|
+
|
202
|
+
|
203
|
+
def owl_v2_video(
|
204
|
+
prompt: str,
|
205
|
+
frames: List[np.ndarray],
|
206
|
+
box_threshold: float = 0.10,
|
207
|
+
) -> List[List[Dict[str, Any]]]:
|
208
|
+
"""'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
|
209
|
+
objects per frame given a text prompt such as a category name or referring
|
210
|
+
expression. The categories in text prompt are separated by commas. It returns a list
|
211
|
+
of lists where each inner list contains the score, label, and bounding box of the
|
212
|
+
detections for that frame.
|
213
|
+
|
214
|
+
Parameters:
|
215
|
+
prompt (str): The prompt to ground to the video.
|
216
|
+
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
217
|
+
box_threshold (float, optional): The threshold for the box detection. Defaults
|
218
|
+
to 0.10.
|
219
|
+
|
220
|
+
Returns:
|
221
|
+
List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
|
222
|
+
score, label, and bounding box of the detected objects with normalized
|
223
|
+
coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the
|
224
|
+
coordinates of the top-left and xmax and ymax are the coordinates of the
|
225
|
+
bottom-right of the bounding box.
|
226
|
+
|
227
|
+
Example
|
228
|
+
-------
|
229
|
+
>>> owl_v2_video("car, dinosaur", frames)
|
230
|
+
[
|
231
|
+
[
|
232
|
+
{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
233
|
+
{'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
|
234
|
+
],
|
235
|
+
...
|
236
|
+
]
|
237
|
+
"""
|
238
|
+
if len(frames) == 0:
|
239
|
+
raise ValueError("No frames provided")
|
240
|
+
|
241
|
+
image_size = frames[0].shape[:2]
|
242
|
+
buffer_bytes = frames_to_bytes(frames)
|
243
|
+
files = [("video", buffer_bytes)]
|
244
|
+
payload = {
|
245
|
+
"prompts": [s.strip() for s in prompt.split(",")],
|
246
|
+
"model": "owlv2",
|
247
|
+
"function_name": "owl_v2_video",
|
248
|
+
}
|
249
|
+
data: Dict[str, Any] = send_inference_request(
|
250
|
+
payload, "text-to-object-detection", files=files, v2=True
|
251
|
+
)
|
252
|
+
bboxes_formatted = []
|
189
253
|
if data is not None:
|
190
|
-
for
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
254
|
+
for frame_data in data:
|
255
|
+
bboxes_formated_frame = []
|
256
|
+
for elt in frame_data:
|
257
|
+
bboxes_formated_frame.append(
|
258
|
+
ODResponseData(
|
259
|
+
label=elt["label"], # type: ignore
|
260
|
+
bbox=normalize_bbox(elt["bounding_box"], image_size), # type: ignore
|
261
|
+
score=round(elt["score"], 2), # type: ignore
|
262
|
+
)
|
263
|
+
)
|
264
|
+
bboxes_formatted.append(bboxes_formated_frame)
|
265
|
+
|
266
|
+
filtered_bboxes = [
|
267
|
+
filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted
|
268
|
+
]
|
269
|
+
return [[bbox.model_dump() for bbox in frame] for frame in filtered_bboxes]
|
199
270
|
|
200
271
|
|
201
272
|
def grounding_sam(
|
@@ -317,14 +388,14 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
|
|
317
388
|
return return_data
|
318
389
|
|
319
390
|
|
320
|
-
def
|
391
|
+
def florence2_sam2_video_tracking(
|
321
392
|
prompt: str, frames: List[np.ndarray]
|
322
393
|
) -> List[List[Dict[str, Any]]]:
|
323
|
-
"""'
|
324
|
-
in a video given a text prompt such as category names or referring
|
325
|
-
can optionally separate the categories in the text with commas. It
|
326
|
-
entities present in the first frame and only returns segmentation
|
327
|
-
useful for tracking and counting without duplicating counts.
|
394
|
+
"""'florence2_sam2_video_tracking' is a tool that can segment and track multiple
|
395
|
+
entities in a video given a text prompt such as category names or referring
|
396
|
+
expressions. You can optionally separate the categories in the text with commas. It
|
397
|
+
only tracks entities present in the first frame and only returns segmentation
|
398
|
+
masks. It is useful for tracking and counting without duplicating counts.
|
328
399
|
|
329
400
|
Parameters:
|
330
401
|
prompt (str): The prompt to ground to the video.
|
@@ -351,14 +422,15 @@ def florence2_sam2_video(
|
|
351
422
|
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
352
423
|
},
|
353
424
|
],
|
425
|
+
...
|
354
426
|
]
|
355
427
|
"""
|
356
428
|
|
357
429
|
buffer_bytes = frames_to_bytes(frames)
|
358
430
|
files = [("video", buffer_bytes)]
|
359
431
|
payload = {
|
360
|
-
"prompts": prompt.split(","),
|
361
|
-
"function_name": "
|
432
|
+
"prompts": [s.strip() for s in prompt.split(",")],
|
433
|
+
"function_name": "florence2_sam2_video_tracking",
|
362
434
|
}
|
363
435
|
data: Dict[str, Any] = send_inference_request(
|
364
436
|
payload, "florence2-sam2", files=files, v2=True
|
@@ -549,7 +621,14 @@ def countgd_counting(
|
|
549
621
|
payload, "text-to-object-detection", files=files, metadata=metadata
|
550
622
|
)
|
551
623
|
bboxes_per_frame = resp_data[0]
|
552
|
-
bboxes_formatted = [
|
624
|
+
bboxes_formatted = [
|
625
|
+
ODResponseData(
|
626
|
+
label=bbox["label"],
|
627
|
+
bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
|
628
|
+
score=round(bbox["score"], 2),
|
629
|
+
)
|
630
|
+
for bbox in bboxes_per_frame
|
631
|
+
]
|
553
632
|
filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
|
554
633
|
return [bbox.model_dump() for bbox in filtered_bboxes]
|
555
634
|
|
@@ -601,7 +680,14 @@ def countgd_example_based_counting(
|
|
601
680
|
payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
|
602
681
|
)
|
603
682
|
bboxes_per_frame = resp_data[0]
|
604
|
-
bboxes_formatted = [
|
683
|
+
bboxes_formatted = [
|
684
|
+
ODResponseData(
|
685
|
+
label=bbox["label"],
|
686
|
+
bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
|
687
|
+
score=round(bbox["score"], 2),
|
688
|
+
)
|
689
|
+
for bbox in bboxes_per_frame
|
690
|
+
]
|
605
691
|
filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
|
606
692
|
return [bbox.model_dump() for bbox in filtered_bboxes]
|
607
693
|
|
@@ -1374,12 +1460,12 @@ def closest_box_distance(
|
|
1374
1460
|
def extract_frames(
|
1375
1461
|
video_uri: Union[str, Path], fps: float = 1
|
1376
1462
|
) -> List[Tuple[np.ndarray, float]]:
|
1377
|
-
"""'extract_frames' extracts frames from a video which can be a file path or
|
1378
|
-
link, returns a list of tuples (frame, timestamp), where timestamp is the
|
1379
|
-
time in seconds where the frame was captured. The frame is a numpy array.
|
1463
|
+
"""'extract_frames' extracts frames from a video which can be a file path, url or
|
1464
|
+
youtube link, returns a list of tuples (frame, timestamp), where timestamp is the
|
1465
|
+
relative time in seconds where the frame was captured. The frame is a numpy array.
|
1380
1466
|
|
1381
1467
|
Parameters:
|
1382
|
-
video_uri (Union[str, Path]): The path to the video file or youtube link
|
1468
|
+
video_uri (Union[str, Path]): The path to the video file, url or youtube link
|
1383
1469
|
fps (float, optional): The frame rate per second to extract the frames. Defaults
|
1384
1470
|
to 1.
|
1385
1471
|
|
@@ -1820,7 +1906,8 @@ def overlay_counting_results(
|
|
1820
1906
|
|
1821
1907
|
|
1822
1908
|
FUNCTION_TOOLS = [
|
1823
|
-
|
1909
|
+
owl_v2_image,
|
1910
|
+
owl_v2_video,
|
1824
1911
|
ocr,
|
1825
1912
|
clip,
|
1826
1913
|
vit_image_classification,
|
@@ -1829,7 +1916,7 @@ FUNCTION_TOOLS = [
|
|
1829
1916
|
florence2_image_caption,
|
1830
1917
|
florence2_ocr,
|
1831
1918
|
florence2_sam2_image,
|
1832
|
-
|
1919
|
+
florence2_sam2_video_tracking,
|
1833
1920
|
florence2_phrase_grounding,
|
1834
1921
|
ixc25_image_vqa,
|
1835
1922
|
ixc25_video_vqa,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|