vision-agent 0.2.211 → 0.2.212 (py3-none-any.whl)

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -26,9 +26,10 @@ from .tools import (
     claude35_text_extraction,
     closest_box_distance,
    closest_mask_distance,
+    countgd_example_based_counting,
     countgd_object_detection,
     countgd_sam2_object_detection,
-    countgd_example_based_counting,
+    countgd_sam2_video_tracking,
     depth_anything_v2,
     detr_segmentation,
     extract_frames_and_timestamps,
@@ -46,11 +47,13 @@ from .tools import (
     load_image,
     minimum_distance,
     ocr,
+    od_sam2_video_tracking,
     overlay_bounding_boxes,
     overlay_heat_map,
     overlay_segmentation_masks,
     owl_v2_image,
     owl_v2_video,
+    owlv2_sam2_video_tracking,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
     sam2,
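In the tools package's __init__.py import list, countgd_example_based_counting is only re-sorted into alphabetical position; the genuinely new public exports in 0.2.212 are countgd_sam2_video_tracking, od_sam2_video_tracking, and owlv2_sam2_video_tracking. A minimal, purely illustrative sketch of importing the new names (assumes the 0.2.212 wheel is installed):

    from vision_agent.tools import (
        countgd_sam2_video_tracking,  # CountGD detections tracked with SAM2
        owlv2_sam2_video_tracking,    # OWLv2 detections tracked with SAM2
        od_sam2_video_tracking,       # lower-level dispatcher over CountGD/OWLv2/Florence-2
    )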
@@ -6,6 +6,7 @@ import tempfile
 import urllib.request
 from base64 import b64encode
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from enum import Enum
 from functools import lru_cache
 from importlib import resources
 from pathlib import Path
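The new `from enum import Enum` import in tools.py backs the ODModels string enum introduced further down. Because ODModels mixes in str, a member compares equal to its raw value, so comparisons such as `od_model == ODModels.COUNTGD` also succeed if a plain string is passed. A standalone illustrative sketch of that behaviour (not part of the package source):

    from enum import Enum

    class ODModels(str, Enum):
        COUNTGD = "countgd"
        FLORENCE2 = "florence2"
        OWLV2 = "owlv2"

    # A str-mixin enum member is equal to its underlying value...
    assert ODModels.COUNTGD == "countgd"
    # ...and the raw string converts back to the member.
    assert ODModels("owlv2") is ODModels.OWLV2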
@@ -2394,6 +2395,197 @@ def _plot_counting(
     return image
 
 
+class ODModels(str, Enum):
+    COUNTGD = "countgd"
+    FLORENCE2 = "florence2"
+    OWLV2 = "owlv2"
+
+
+def od_sam2_video_tracking(
+    od_model: ODModels,
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+
+    results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
+
+    if chunk_length is None:
+        step = 1  # Process every frame
+    elif chunk_length <= 0:
+        raise ValueError("chunk_length must be a positive integer or None.")
+    else:
+        step = chunk_length  # Process frames with the specified step size
+
+    for idx in range(0, len(frames), step):
+        if od_model == ODModels.COUNTGD:
+            results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
+            function_name = "countgd_object_detection"
+        elif od_model == ODModels.OWLV2:
+            results[idx] = owl_v2_image(
+                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            )
+            function_name = "owl_v2_image"
+        elif od_model == ODModels.FLORENCE2:
+            results[idx] = florence2_sam2_image(
+                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            )
+            function_name = "florence2_sam2_image"
+        else:
+            raise NotImplementedError(
+                f"Object detection model '{od_model}' is not implemented."
+            )
+
+    image_size = frames[0].shape[:2]
+
+    def _transform_detections(
+        input_list: List[Optional[List[Dict[str, Any]]]]
+    ) -> List[Optional[Dict[str, Any]]]:
+        output_list: List[Optional[Dict[str, Any]]] = []
+
+        for idx, frame in enumerate(input_list):
+            if frame is not None:
+                labels = [detection["label"] for detection in frame]
+                bboxes = [
+                    denormalize_bbox(detection["bbox"], image_size)
+                    for detection in frame
+                ]
+
+                output_list.append(
+                    {
+                        "labels": labels,
+                        "bboxes": bboxes,
+                    }
+                )
+            else:
+                output_list.append(None)
+
+        return output_list
+
+    output = _transform_detections(results)
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
+    metadata = {"function_name": function_name}
+
+    detections = send_task_inference_request(
+        payload,
+        "sam2",
+        files=files,
+        metadata=metadata,
+    )
+
+    return_data = []
+    for frame in detections:
+        return_frame_data = []
+        for detection in frame:
+            mask = rle_decode_array(detection["mask"])
+            label = str(detection["id"]) + ": " + detection["label"]
+            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
+        return_data.append(return_frame_data)
+    return_data = add_bboxes_from_masks(return_data)
+    return nms(return_data, iou_threshold=0.95)
+
+
+def countgd_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+) -> List[List[Dict[str, Any]]]:
+    """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    masks and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries, one list per frame,
+        containing the score, label, bounding box, and mask of the detected objects with
+        normalized coordinates (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+        of the top-left and xmax and ymax are the coordinates of the bottom-right of the
+        bounding box. The mask is a binary 2D numpy array where 1 indicates the object
+        and 0 indicates the background.
+
+    Example
+    -------
+    >>> countgd_sam2_video_tracking("car, dinosaur", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
+    """
+
+    return od_sam2_video_tracking(
+        ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
+    )
+
+
+def owlv2_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+    """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    masks and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries, one list per frame,
+        containing the score, label, bounding box, and mask of the detected objects with
+        normalized coordinates (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+        of the top-left and xmax and ymax are the coordinates of the bottom-right of the
+        bounding box. The mask is a binary 2D numpy array where 1 indicates the object
+        and 0 indicates the background.
+
+    Example
+    -------
+    >>> owlv2_sam2_video_tracking("car, dinosaur", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
+    """
+
+    return od_sam2_video_tracking(
+        ODModels.OWLV2,
+        prompt=prompt,
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=fine_tune_id,
+    )
+
+
 FUNCTION_TOOLS = [
     owl_v2_image,
     owl_v2_video,
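Taken together, the additions implement a chunked detect-then-track pipeline: od_sam2_video_tracking runs the chosen detector (CountGD, OWLv2, or Florence-2) on every chunk_length-th frame, denormalizes the boxes to pixel coordinates, and sends them with the encoded video to the hosted "sam2" task, which returns per-frame masks and track IDs; countgd_sam2_video_tracking and owlv2_sam2_video_tracking are thin wrappers that fix the detector. A hedged usage sketch of the public wrappers (the video path and prompt are placeholders, and the exact return shape of extract_frames_and_timestamps is assumed, not shown in this diff):

    from vision_agent.tools import (
        countgd_sam2_video_tracking,
        extract_frames_and_timestamps,
        overlay_segmentation_masks,
    )

    # Placeholder input video; extract_frames_and_timestamps is assumed to return
    # a list of {"frame": np.ndarray, "timestamp": float} dicts.
    frames = [f["frame"] for f in extract_frames_and_timestamps("street.mp4")]

    # Detect "car" and "pedestrian" with CountGD every 10th frame (the default
    # chunk_length) and let SAM2 propagate masks across the frames in between.
    tracks = countgd_sam2_video_tracking("car, pedestrian", frames, chunk_length=10)

    # tracks[i] holds the detections for frame i; labels are "<track id>: <name>",
    # e.g. "0: car", each with a bounding box, a binary mask, and a score of 1.0.
    annotated = overlay_segmentation_masks(frames[0], tracks[0])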
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.211
+Version: 0.2.212
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=-49o3X7bWG7sMxk0pMifO7BmN_cwDFcuGfzll48qAV4,2678
+vision_agent/tools/__init__.py,sha256=InL8zUTRN8i_9J6r2wAtYdtNrVkElqdO_p-e2OA8q5A,2770
 vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
 vision_agent/tools/planner_tools.py,sha256=k7PPu-HhwDwusQgFSPTCWKRVVHBzPMeYB6h2xSEjdUo,13273
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
-vision_agent/tools/tools.py,sha256=SZVKbPwNRbjul5BBKEZcrzcPJKdnWQXjet4tC5Zkkfw,83797
+vision_agent/tools/tools.py,sha256=f7M-93fdnDOYiQTNQs4qEGu6qLe4Zqvp59PW93x8828,90563
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,7 +40,7 @@ vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50
 vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
-vision_agent-0.2.211.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.211.dist-info/METADATA,sha256=zbTzutwYFj7D_r8zNm92DmlXZkn4VuEeNDAPoFMq4Ks,19071
-vision_agent-0.2.211.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.211.dist-info/RECORD,,
+vision_agent-0.2.212.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.212.dist-info/METADATA,sha256=Q5-h8xTNiE4Mynr66ihl3bzYFTgfs79mh0O9_Mt8fE0,19071
+vision_agent-0.2.212.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.212.dist-info/RECORD,,