vision-agent 0.2.211__py3-none-any.whl → 0.2.212__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +4 -1
- vision_agent/tools/tools.py +192 -0
- {vision_agent-0.2.211.dist-info → vision_agent-0.2.212.dist-info}/METADATA +1 -1
- {vision_agent-0.2.211.dist-info → vision_agent-0.2.212.dist-info}/RECORD +6 -6
- {vision_agent-0.2.211.dist-info → vision_agent-0.2.212.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.211.dist-info → vision_agent-0.2.212.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
@@ -26,9 +26,10 @@ from .tools import (
|
|
26
26
|
claude35_text_extraction,
|
27
27
|
closest_box_distance,
|
28
28
|
closest_mask_distance,
|
29
|
+
countgd_example_based_counting,
|
29
30
|
countgd_object_detection,
|
30
31
|
countgd_sam2_object_detection,
|
31
|
-
|
32
|
+
countgd_sam2_video_tracking,
|
32
33
|
depth_anything_v2,
|
33
34
|
detr_segmentation,
|
34
35
|
extract_frames_and_timestamps,
|
@@ -46,11 +47,13 @@ from .tools import (
|
|
46
47
|
load_image,
|
47
48
|
minimum_distance,
|
48
49
|
ocr,
|
50
|
+
od_sam2_video_tracking,
|
49
51
|
overlay_bounding_boxes,
|
50
52
|
overlay_heat_map,
|
51
53
|
overlay_segmentation_masks,
|
52
54
|
owl_v2_image,
|
53
55
|
owl_v2_video,
|
56
|
+
owlv2_sam2_video_tracking,
|
54
57
|
qwen2_vl_images_vqa,
|
55
58
|
qwen2_vl_video_vqa,
|
56
59
|
sam2,
|
vision_agent/tools/tools.py
CHANGED
@@ -6,6 +6,7 @@ import tempfile
|
|
6
6
|
import urllib.request
|
7
7
|
from base64 import b64encode
|
8
8
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
9
|
+
from enum import Enum
|
9
10
|
from functools import lru_cache
|
10
11
|
from importlib import resources
|
11
12
|
from pathlib import Path
|
@@ -2394,6 +2395,197 @@ def _plot_counting(
|
|
2394
2395
|
return image
|
2395
2396
|
|
2396
2397
|
|
2398
|
+
class ODModels(str, Enum):
    """Object-detection backends selectable in `od_sam2_video_tracking`.

    The class mixes in `str`, so each member compares equal to its plain string
    value (e.g. `ODModels.COUNTGD == "countgd"`).
    """

    # Selects `countgd_object_detection` as the per-frame detector.
    COUNTGD = "countgd"
    # Selects `florence2_sam2_image` as the per-frame detector.
    FLORENCE2 = "florence2"
    # Selects `owl_v2_image` as the per-frame detector.
    OWLV2 = "owlv2"
|
2402
|
+
|
2403
|
+
|
2404
|
+
def od_sam2_video_tracking(
    od_model: ODModels,
    prompt: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
    fine_tune_id: Optional[str] = None,
) -> List[List[Dict[str, Any]]]:
    """Detect objects on sampled frames and track them through the video with SAM2.

    The selected object detector is run on every `chunk_length`-th frame (on
    every frame when `chunk_length` is None). The resulting labels and boxes are
    sent, together with the encoded video, to the "sam2" inference task, whose
    per-frame masks are decoded and post-processed into the returned structure.

    Parameters:
        od_model (ODModels): The object detector used to seed the tracker.
        prompt (str): The text prompt forwarded to the object detector.
        frames (List[np.ndarray]): The video frames. The size of the first frame
            is used to denormalize the boxes of every frame, so all frames are
            assumed to share the same size.
        chunk_length (Optional[int]): Distance, in frames, between two runs of
            the object detector. None runs the detector on every frame.
        fine_tune_id (Optional[str]): Fine-tuned model id forwarded to the
            OWLv2 and Florence2 detectors. It is not used for CountGD.

    Returns:
        List[List[Dict[str, Any]]]: One list per frame returned by the "sam2"
            task. Each detection has a 'label' of the form "<id>: <label>", a
            decoded 'mask', a 'score' that is always 1.0 and a bounding box
            added by `add_bboxes_from_masks`; the lists are filtered with `nms`
            at an IoU threshold of 0.95. An empty `frames` list yields [].

    Raises:
        ValueError: If `chunk_length` is zero or negative.
        NotImplementedError: If `od_model` is not a supported detector.
    """
    if chunk_length is None:
        step = 1  # Process every frame
    elif chunk_length <= 0:
        raise ValueError("chunk_length must be a positive integer or None.")
    else:
        step = chunk_length  # Process frames with the specified step size

    # Resolve the detector once: it does not depend on the frame. `==` is used
    # (rather than a dict keyed by enum members) so that plain strings such as
    # "countgd" keep matching through the str mixin of ODModels.
    detector: Any
    detector_kwargs: Dict[str, Any]
    if od_model == ODModels.COUNTGD:
        detector = countgd_object_detection
        detector_kwargs = {}
        function_name = "countgd_object_detection"
    elif od_model == ODModels.OWLV2:
        detector = owl_v2_image
        detector_kwargs = {"fine_tune_id": fine_tune_id}
        function_name = "owl_v2_image"
    elif od_model == ODModels.FLORENCE2:
        detector = florence2_sam2_image
        detector_kwargs = {"fine_tune_id": fine_tune_id}
        function_name = "florence2_sam2_image"
    else:
        raise NotImplementedError(
            f"Object detection model '{od_model}' is not implemented."
        )

    # Nothing to detect or track; also avoids indexing frames[0] below.
    if not frames:
        return []

    # Frames skipped by the step keep None so the tracker knows they carry no
    # detector prompt.
    results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
    for idx in range(0, len(frames), step):
        results[idx] = detector(prompt=prompt, image=frames[idx], **detector_kwargs)

    image_size = frames[0].shape[:2]

    # Reshape per-frame detections into the {"labels", "bboxes"} layout sent to
    # the "sam2" task, converting boxes with `denormalize_bbox`.
    output: List[Optional[Dict[str, Any]]] = [
        None
        if frame_detections is None
        else {
            "labels": [detection["label"] for detection in frame_detections],
            "bboxes": [
                denormalize_bbox(detection["bbox"], image_size)
                for detection in frame_detections
            ],
        }
        for frame_detections in results
    ]

    buffer_bytes = frames_to_bytes(frames)
    files = [("video", buffer_bytes)]
    payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
    metadata = {"function_name": function_name}

    detections = send_task_inference_request(
        payload,
        "sam2",
        files=files,
        metadata=metadata,
    )

    return_data = []
    for frame in detections:
        return_frame_data = []
        for detection in frame:
            mask = rle_decode_array(detection["mask"])
            # Prefix the label with the track id so instances stay distinguishable.
            label = str(detection["id"]) + ": " + detection["label"]
            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
        return_data.append(return_frame_data)
    return_data = add_bboxes_from_masks(return_data)
    return nms(return_data, iou_threshold=0.95)
|
2490
|
+
|
2491
|
+
|
2492
|
+
def countgd_sam2_video_tracking(
    prompt: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
) -> List[List[Dict[str, Any]]]:
    """'countgd_sam2_video_tracking' is a tool that can segment and track multiple
    objects in a video given a text prompt such as category names or referring
    expressions. The categories in the text prompt are separated by commas. It
    returns, for every frame, a list of bounding boxes, label names, masks and
    associated probability scores.

    Parameters:
        prompt (str): The prompt to ground to the video.
        frames (List[np.ndarray]): The list of frames to ground the prompt to.
        chunk_length (Optional[int]): The number of frames between two runs of
            the object detector. If None, the detector is run on every frame.

    Returns:
        List[List[Dict[str, Any]]]: A list with one entry per frame, each entry
            being a list of dictionaries containing the score, label, bounding
            box, and mask of the detected objects with normalized coordinates
            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
            top-left and xmax and ymax are the coordinates of the bottom-right of
            the bounding box. The mask is binary 2D numpy array where 1 indicates
            the object and 0 indicates the background. The label is prefixed
            with the id of the tracked object.

    Example
    -------
        >>> countgd_sam2_video_tracking("car, dinosaur", frames)
        [
            [
                {
                    'label': '0: dinosaur',
                    'bbox': [0.1, 0.11, 0.35, 0.4],
                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0],
                        ...,
                        [0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                },
            ],
            ...
        ]
    """

    return od_sam2_video_tracking(
        ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
    )
|
2536
|
+
|
2537
|
+
|
2538
|
+
def owlv2_sam2_video_tracking(
    prompt: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
    fine_tune_id: Optional[str] = None,
) -> List[List[Dict[str, Any]]]:
    """'owlv2_sam2_video_tracking' is a tool that can segment and track multiple
    objects in a video given a text prompt such as category names or referring
    expressions. The categories in the text prompt are separated by commas. It
    returns, for every frame, a list of bounding boxes, label names, masks and
    associated probability scores.

    Parameters:
        prompt (str): The prompt to ground to the video.
        frames (List[np.ndarray]): The list of frames to ground the prompt to.
        chunk_length (Optional[int]): The number of frames between two runs of
            the object detector. If None, the detector is run on every frame.
        fine_tune_id (Optional[str]): If provided, the id of the fine-tuned
            model that is forwarded to the object detector.

    Returns:
        List[List[Dict[str, Any]]]: A list with one entry per frame, each entry
            being a list of dictionaries containing the score, label, bounding
            box, and mask of the detected objects with normalized coordinates
            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
            top-left and xmax and ymax are the coordinates of the bottom-right of
            the bounding box. The mask is binary 2D numpy array where 1 indicates
            the object and 0 indicates the background. The label is prefixed
            with the id of the tracked object.

    Example
    -------
        >>> owlv2_sam2_video_tracking("car, dinosaur", frames)
        [
            [
                {
                    'label': '0: dinosaur',
                    'bbox': [0.1, 0.11, 0.35, 0.4],
                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0],
                        ...,
                        [0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                },
            ],
            ...
        ]
    """

    return od_sam2_video_tracking(
        ODModels.OWLV2,
        prompt=prompt,
        frames=frames,
        chunk_length=chunk_length,
        fine_tune_id=fine_tune_id,
    )
|
2587
|
+
|
2588
|
+
|
2397
2589
|
FUNCTION_TOOLS = [
|
2398
2590
|
owl_v2_image,
|
2399
2591
|
owl_v2_video,
|
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
26
26
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
27
27
|
vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
|
28
28
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
29
|
-
vision_agent/tools/__init__.py,sha256
|
29
|
+
vision_agent/tools/__init__.py,sha256=InL8zUTRN8i_9J6r2wAtYdtNrVkElqdO_p-e2OA8q5A,2770
|
30
30
|
vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
|
31
31
|
vision_agent/tools/planner_tools.py,sha256=k7PPu-HhwDwusQgFSPTCWKRVVHBzPMeYB6h2xSEjdUo,13273
|
32
32
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
33
33
|
vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
|
34
|
-
vision_agent/tools/tools.py,sha256=
|
34
|
+
vision_agent/tools/tools.py,sha256=f7M-93fdnDOYiQTNQs4qEGu6qLe4Zqvp59PW93x8828,90563
|
35
35
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
36
36
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
37
37
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -40,7 +40,7 @@ vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50
|
|
40
40
|
vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
|
41
41
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
42
42
|
vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
|
43
|
-
vision_agent-0.2.
|
44
|
-
vision_agent-0.2.
|
45
|
-
vision_agent-0.2.
|
46
|
-
vision_agent-0.2.
|
43
|
+
vision_agent-0.2.212.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
44
|
+
vision_agent-0.2.212.dist-info/METADATA,sha256=Q5-h8xTNiE4Mynr66ihl3bzYFTgfs79mh0O9_Mt8fE0,19071
|
45
|
+
vision_agent-0.2.212.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
46
|
+
vision_agent-0.2.212.dist-info/RECORD,,
|
File without changes
|
File without changes
|