vision-agent 0.2.211__py3-none-any.whl → 0.2.213__py3-none-any.whl
- vision_agent/tools/__init__.py +4 -1
- vision_agent/tools/tools.py +194 -0
- {vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/METADATA +1 -1
- {vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/RECORD +6 -6
- {vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
@@ -26,9 +26,10 @@ from .tools import (
     claude35_text_extraction,
     closest_box_distance,
     closest_mask_distance,
+    countgd_example_based_counting,
     countgd_object_detection,
     countgd_sam2_object_detection,
-
+    countgd_sam2_video_tracking,
     depth_anything_v2,
     detr_segmentation,
     extract_frames_and_timestamps,
@@ -46,11 +47,13 @@ from .tools import (
     load_image,
     minimum_distance,
     ocr,
+    od_sam2_video_tracking,
     overlay_bounding_boxes,
     overlay_heat_map,
     overlay_segmentation_masks,
     owl_v2_image,
     owl_v2_video,
+    owlv2_sam2_video_tracking,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
     sam2,
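The lines above re-export the new tools from the package namespace. A minimal import sketch, assuming vision-agent 0.2.213 is installed (these are exactly the names added in this diff):

# Hedged sketch: the four names newly exported from vision_agent.tools in 0.2.213.
from vision_agent.tools import (
    countgd_example_based_counting,
    countgd_sam2_video_tracking,
    od_sam2_video_tracking,
    owlv2_sam2_video_tracking,
)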
vision_agent/tools/tools.py
CHANGED
@@ -6,6 +6,7 @@ import tempfile
 import urllib.request
 from base64 import b64encode
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from enum import Enum
 from functools import lru_cache
 from importlib import resources
 from pathlib import Path
@@ -2394,6 +2395,197 @@ def _plot_counting(
     return image


+class ODModels(str, Enum):
+    COUNTGD = "countgd"
+    FLORENCE2 = "florence2"
+    OWLV2 = "owlv2"
+
+
+def od_sam2_video_tracking(
+    od_model: ODModels,
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+
+    results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
+
+    if chunk_length is None:
+        step = 1  # Process every frame
+    elif chunk_length <= 0:
+        raise ValueError("chunk_length must be a positive integer or None.")
+    else:
+        step = chunk_length  # Process frames with the specified step size
+
+    for idx in range(0, len(frames), step):
+        if od_model == ODModels.COUNTGD:
+            results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
+            function_name = "countgd_object_detection"
+        elif od_model == ODModels.OWLV2:
+            results[idx] = owl_v2_image(
+                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            )
+            function_name = "owl_v2_image"
+        elif od_model == ODModels.FLORENCE2:
+            results[idx] = florence2_sam2_image(
+                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            )
+            function_name = "florence2_sam2_image"
+        else:
+            raise NotImplementedError(
+                f"Object detection model '{od_model}' is not implemented."
+            )
+
+    image_size = frames[0].shape[:2]
+
+    def _transform_detections(
+        input_list: List[Optional[List[Dict[str, Any]]]]
+    ) -> List[Optional[Dict[str, Any]]]:
+        output_list: List[Optional[Dict[str, Any]]] = []
+
+        for idx, frame in enumerate(input_list):
+            if frame is not None:
+                labels = [detection["label"] for detection in frame]
+                bboxes = [
+                    denormalize_bbox(detection["bbox"], image_size)
+                    for detection in frame
+                ]
+
+                output_list.append(
+                    {
+                        "labels": labels,
+                        "bboxes": bboxes,
+                    }
+                )
+            else:
+                output_list.append(None)
+
+        return output_list
+
+    output = _transform_detections(results)
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
+    metadata = {"function_name": function_name}
+
+    detections = send_task_inference_request(
+        payload,
+        "sam2",
+        files=files,
+        metadata=metadata,
+    )
+
+    return_data = []
+    for frame in detections:
+        return_frame_data = []
+        for detection in frame:
+            mask = rle_decode_array(detection["mask"])
+            label = str(detection["id"]) + ": " + detection["label"]
+            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
+        return_data.append(return_frame_data)
+    return_data = add_bboxes_from_masks(return_data)
+    return nms(return_data, iou_threshold=0.95)
+
+
+def countgd_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+) -> List[List[Dict[str, Any]]]:
+    """'countgd_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It
+    returns, for each frame, a list of bounding boxes, label names, masks and
+    associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames between detection
+            keyframes; None runs detection on every frame.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list, one entry per frame, of dictionaries
+        containing the score, label, bounding box, and mask of the detected
+        objects with normalized coordinates (xmin, ymin, xmax, ymax). xmin and
+        ymin are the coordinates of the top-left and xmax and ymax are the
+        coordinates of the bottom-right of the bounding box. The mask is a
+        binary 2D numpy array where 1 indicates the object and 0 indicates the
+        background.
+
+    Example
+    -------
+        >>> countgd_sam2_video_tracking("car, dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    """
+
+    return od_sam2_video_tracking(
+        ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
+    )
+
+
+def owlv2_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+    """'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It
+    returns, for each frame, a list of bounding boxes, label names, masks and
+    associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames between detection
+            keyframes; None runs detection on every frame.
+        fine_tune_id (Optional[str]): If provided, the fine-tuned OWLv2 model
+            to use for detection.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list, one entry per frame, of dictionaries
+        containing the score, label, bounding box, and mask of the detected
+        objects with normalized coordinates (xmin, ymin, xmax, ymax). xmin and
+        ymin are the coordinates of the top-left and xmax and ymax are the
+        coordinates of the bottom-right of the bounding box. The mask is a
+        binary 2D numpy array where 1 indicates the object and 0 indicates the
+        background.
+
+    Example
+    -------
+        >>> owlv2_sam2_video_tracking("car, dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    """
+
+    return od_sam2_video_tracking(
+        ODModels.OWLV2,
+        prompt=prompt,
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=fine_tune_id,
+    )
+
+
 FUNCTION_TOOLS = [
     owl_v2_image,
     owl_v2_video,
@@ -2416,6 +2608,8 @@ FUNCTION_TOOLS = [
     video_temporal_localization,
     flux_image_inpainting,
     siglip_classification,
+    owlv2_sam2_video_tracking,
+    countgd_sam2_video_tracking,
 ]

 UTIL_TOOLS = [
{vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/RECORD
CHANGED
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=…
+vision_agent/tools/__init__.py,sha256=InL8zUTRN8i_9J6r2wAtYdtNrVkElqdO_p-e2OA8q5A,2770
 vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
 vision_agent/tools/planner_tools.py,sha256=k7PPu-HhwDwusQgFSPTCWKRVVHBzPMeYB6h2xSEjdUo,13273
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
-vision_agent/tools/tools.py,sha256=…
+vision_agent/tools/tools.py,sha256=ZcXEI0Pb54OGXnLWi690SFx22k7JlEmQ-N16LzRLHlk,90627
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,7 +40,7 @@ vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50
 vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
-vision_agent-0.2.211.dist-info/LICENSE,…
-vision_agent-0.2.211.dist-info/METADATA,…
-vision_agent-0.2.211.dist-info/WHEEL,…
-vision_agent-0.2.211.dist-info/RECORD,…
+vision_agent-0.2.213.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.213.dist-info/METADATA,sha256=iXy6vkFwSXz6UQW1LjuZMCj6YT8YwmjGklhmulFOoIc,19071
+vision_agent-0.2.213.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.213.dist-info/RECORD,,
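Each RECORD entry has the form `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64 SHA-256 of the file with the trailing `=` padding stripped (the RECORD file itself carries no hash, hence the trailing `,,`). A small sketch for recomputing one of these lines locally; the path is illustrative:

# Hedged sketch: recompute a RECORD-style hash line for an installed file.
import base64
import hashlib
from pathlib import Path

path = Path("vision_agent/tools/tools.py")  # illustrative path

digest = hashlib.sha256(path.read_bytes()).digest()
encoded = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Should reproduce the RECORD line, e.g. ...tools.py,sha256=ZcXEI0...,90627
print(f"{path.as_posix()},sha256={encoded},{path.stat().st_size}")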
{vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/LICENSE
File without changes
{vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/WHEEL
File without changes