vision-agent 0.2.227__py3-none-any.whl → 0.2.229__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +3 -0
- vision_agent/tools/tool_utils.py +1 -2
- vision_agent/tools/tools.py +244 -0
- vision_agent/utils/video_tracking.py +1 -0
- {vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/METADATA +1 -1
- {vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/RECORD +8 -8
- {vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tool_utils.py
CHANGED
@@ -72,8 +72,7 @@ def send_inference_request(
 
     response = _call_post(url, payload, session, files, function_name, is_form)
 
-
-    return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
+    return response["data"]
 
 
 def send_task_inference_request(
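The net effect of this hunk: send_inference_request no longer returns the raw response envelope when TOOL_ENDPOINT_AUTH is set in the environment; it now unwraps the "data" field unconditionally. A minimal before/after sketch, with a hypothetical _call_post_stub standing in for the real transport (the {"data": ...} envelope shape is inferred from the indexing above):

import os
from typing import Any, Dict

def _call_post_stub(url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    # Hypothetical stand-in for _call_post: the tool endpoint wraps its
    # result in a {"data": ...} envelope, as the indexing in the diff implies.
    return {"data": [{"label": "car", "score": 0.99}]}

response = _call_post_stub("https://example.invalid/tool", {})

# 0.2.227 behavior: the raw envelope leaked through when TOOL_ENDPOINT_AUTH was set.
os.environ["TOOL_ENDPOINT_AUTH"] = "token"
old = response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
assert old == {"data": [{"label": "car", "score": 0.99}]}

# 0.2.229 behavior: the envelope is always unwrapped.
new = response["data"]
assert new == [{"label": "car", "score": 0.99}]

Any caller that relied on receiving the raw envelope under TOOL_ENDPOINT_AUTH now gets only the payload.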
vision_agent/tools/tools.py
CHANGED
@@ -290,6 +290,14 @@ def od_sam2_video_tracking(
         )
         function_name = "florence2_object_detection"
 
+    elif od_model == ODModels.AGENTIC:
+        segment_results = agentic_object_detection(
+            prompt=prompt,
+            image=segment_frames[frame_number],
+            fine_tune_id=fine_tune_id,
+        )
+        function_name = "agentic_object_detection"
+
     elif od_model == ODModels.CUSTOM:
         segment_results = custom_object_detection(
             deployment_id=fine_tune_id,
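This branch wires the new agentic detector into od_sam2_video_tracking's per-chunk detection dispatch, alongside the existing florence2 and custom branches. A sketch of how the branch is reached, which is essentially what the agentic_sam2_video_tracking wrapper added further down does; the ODModels import path is inferred from the one-line change to vision_agent/utils/video_tracking.py, and the frames are placeholders:

import numpy as np

from vision_agent.tools.tools import od_sam2_video_tracking
from vision_agent.utils.video_tracking import ODModels  # inferred location of the enum

# Placeholder frames; real callers would decode a video into HxWx3 arrays.
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(30)]

ret = od_sam2_video_tracking(
    ODModels.AGENTIC,   # selects the agentic_object_detection branch above
    prompt="car, truck",
    frames=frames,
    chunk_length=10,    # re-run detection every 10 frames to pick up new objects
)
per_frame_detections = ret["return_data"]  # dict shape taken from the wrapper below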
@@ -2140,6 +2148,242 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
     return response
 
 
+# agentic od tools
+
+
+def _agentic_object_detection(
+    prompt: str,
+    image: np.ndarray,
+    image_size: Tuple[int, ...],
+    image_bytes: Optional[bytes] = None,
+    fine_tune_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    if image_bytes is None:
+        image_bytes = numpy_to_bytes(image)
+
+    files = [("image", image_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "model": "agentic",
+    }
+    metadata = {"function_name": "agentic_object_detection"}
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        # we can only execute fine-tuned models with florence2
+        payload = {
+            "prompts": payload["prompts"],
+            "jobId": fine_tune_id,
+            "model": "florence2",
+        }
+
+    detections = send_task_inference_request(
+        payload,
+        "text-to-object-detection",
+        files=files,
+        metadata=metadata,
+    )
+
+    # get the first frame
+    bboxes = detections[0]
+    bboxes_formatted = [
+        {
+            "label": bbox["label"],
+            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    display_data = [
+        {
+            "label": bbox["label"],
+            "bbox": bbox["bounding_box"],
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    return {
+        "files": files,
+        "return_data": bboxes_formatted,
+        "display_data": display_data,
+    }
+
+
+def agentic_object_detection(
+    prompt: str,
+    image: np.ndarray,
+    fine_tune_id: Optional[str] = None,
+) -> List[Dict[str, Any]]:
+    """'agentic_object_detection' is a tool that can detect and count multiple objects
+    given a text prompt such as category names or referring expressions on images. The
+    categories in the text prompt are separated by commas. It returns a list of bounding
+    boxes with normalized coordinates, label names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
+
+    Example
+    -------
+        >>> agentic_object_detection("car", image)
+        [
+            {'score': 0.99, 'label': 'car', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+        ]
+    """
+
+    image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
+
+    ret = _agentic_object_detection(
+        prompt, image, image_size, fine_tune_id=fine_tune_id
+    )
+
+    _display_tool_trace(
+        agentic_object_detection.__name__,
+        {"prompts": prompt},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
+def agentic_sam2_instance_segmentation(
+    prompt: str, image: np.ndarray
+) -> List[Dict[str, Any]]:
+    """'agentic_sam2_instance_segmentation' is a tool that can detect and count multiple
+    instances of objects given a text prompt such as category names or referring
+    expressions on images. The categories in the text prompt are separated by commas. It
+    returns a list of bounding boxes with normalized coordinates, label names, masks
+    and associated probability scores.
+
+    Parameters:
+        prompt (str): The object that needs to be counted.
+        image (np.ndarray): The image that contains multiple instances of the object.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is a binary 2D numpy array where 1 indicates the object and 0
+            indicates the background.
+
+    Example
+    -------
+        >>> agentic_sam2_instance_segmentation("flower", image)
+        [
+            {
+                'score': 0.49,
+                'label': 'flower',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
+    """
+
+    od_ret = _agentic_object_detection(prompt, image, image.shape[:2])
+    seg_ret = _sam2(
+        image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
+    )
+
+    _display_tool_trace(
+        agentic_sam2_instance_segmentation.__name__,
+        {
+            "prompts": prompt,
+        },
+        seg_ret["display_data"],
+        seg_ret["files"],
+    )
+
+    return seg_ret["return_data"]  # type: ignore
+
+
+def agentic_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+    """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run agentic object
+            detection to find new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
+            label, segmentation mask and bounding boxes. The outer list represents each
+            frame and the inner list is the entities per frame. The detected objects
+            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+            and ymin are the coordinates of the top-left and xmax and ymax are the
+            coordinates of the bottom-right of the bounding box. The mask is a binary 2D
+            numpy array where 1 indicates the object and 0 indicates the background.
+            The label names are prefixed with their ID to represent the total count.
+
+    Example
+    -------
+        >>> agentic_sam2_video_tracking("dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    """
+
+    ret = od_sam2_video_tracking(
+        ODModels.AGENTIC,
+        prompt=prompt,
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=fine_tune_id,
+    )
+    _display_tool_trace(
+        agentic_sam2_video_tracking.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
 def minimum_distance(
     det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
 ) -> float:
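Taken together, this hunk adds one private helper (_agentic_object_detection, which posts a "text-to-object-detection" request with model "agentic", switching to "florence2" for fine-tuned jobs) and three public tools; the three-line addition to vision_agent/tools/__init__.py presumably exports exactly these. A hedged usage sketch based on the docstrings above; it assumes the package-level imports resolve as suggested, that access to the hosted inference endpoints is configured, and the zero image is a stand-in for real data:

import numpy as np

from vision_agent.tools import (
    agentic_object_detection,
    agentic_sam2_instance_segmentation,
    agentic_sam2_video_tracking,
)

image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a real HxWx3 frame

# Boxes only: label, score, and normalized (xmin, ymin, xmax, ymax).
for det in agentic_object_detection("car, truck", image):
    print(det["label"], det["score"], det["bbox"])

# Boxes plus binary masks: the agentic detections are passed through SAM2.
segments = agentic_sam2_instance_segmentation("flower", image)
if segments:
    print("first mask covers", int(segments[0]["mask"].sum()), "pixels")

# Video tracking: one list of detections per frame; labels carry an
# instance-ID prefix such as '0: dinosaur', so distinct IDs give a
# de-duplicated count across frames.
frames = [image] * 20
tracks = agentic_sam2_video_tracking("dinosaur", frames, chunk_length=10)
unique_ids = {det["label"] for frame_dets in tracks for det in frame_dets}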
{vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/RECORD
CHANGED
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=8VpAC8zEk8OwcMLcTn7gEAfw6ihqlsEfzjEaW5yd5-4,2897
 vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
 vision_agent/tools/planner_tools.py,sha256=qQvPuCif-KbFi7KsXKkTCfpgEQEJJ6oq6WB3gOuG2Xg,13686
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tool_utils.py,sha256=kXB0F-HwmiChpQgKk7tMo-Acsl3UXxjaJV9mYo_q6n4,10076
+vision_agent/tools/tools.py,sha256=M_kk17Yr5c6ODKet26GcxZAlGDwl0AwMMD4wCrBhR6Y,105157
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,8 +40,8 @@ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16N
 vision_agent/utils/sim.py,sha256=qr-6UWAxxGwtwIAKZjZCY_pu9VwBI_TTB8bfrGsaABg,9282
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
-vision_agent/utils/video_tracking.py,sha256=
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent/utils/video_tracking.py,sha256=wK5dOutqV2t2aeaxedstCBa7xy-NNQE0-QZqKu1QUds,9498
+vision_agent-0.2.229.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.229.dist-info/METADATA,sha256=ver5sB_NI_dkek1GxY9GsvktACS1Rl6-tgrr_B5p1Zc,20039
+vision_agent-0.2.229.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.229.dist-info/RECORD,,
{vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/LICENSE
File without changes
{vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/WHEEL
File without changes