vision-agent 0.2.211__py3-none-any.whl → 0.2.212__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -26,9 +26,10 @@ from .tools import (
26
26
  claude35_text_extraction,
27
27
  closest_box_distance,
28
28
  closest_mask_distance,
29
+ countgd_example_based_counting,
29
30
  countgd_object_detection,
30
31
  countgd_sam2_object_detection,
31
- countgd_example_based_counting,
32
+ countgd_sam2_video_tracking,
32
33
  depth_anything_v2,
33
34
  detr_segmentation,
34
35
  extract_frames_and_timestamps,
@@ -46,11 +47,13 @@ from .tools import (
46
47
  load_image,
47
48
  minimum_distance,
48
49
  ocr,
50
+ od_sam2_video_tracking,
49
51
  overlay_bounding_boxes,
50
52
  overlay_heat_map,
51
53
  overlay_segmentation_masks,
52
54
  owl_v2_image,
53
55
  owl_v2_video,
56
+ owlv2_sam2_video_tracking,
54
57
  qwen2_vl_images_vqa,
55
58
  qwen2_vl_video_vqa,
56
59
  sam2,
@@ -6,6 +6,7 @@ import tempfile
6
6
  import urllib.request
7
7
  from base64 import b64encode
8
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from enum import Enum
9
10
  from functools import lru_cache
10
11
  from importlib import resources
11
12
  from pathlib import Path
@@ -2394,6 +2395,197 @@ def _plot_counting(
2394
2395
  return image
2395
2396
 
2396
2397
 
2398
class ODModels(str, Enum):
    """Object detection back-ends selectable for SAM2 video tracking.

    The enum mixes in ``str``, so every member compares equal to its plain
    string value (for example ``ODModels.COUNTGD == "countgd"``).
    """

    # Each member's value is the lowercase identifier of the detector.
    COUNTGD = "countgd"
    FLORENCE2 = "florence2"
    OWLV2 = "owlv2"
2402
+
2403
+
2404
def od_sam2_video_tracking(
    od_model: ODModels,
    prompt: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
    fine_tune_id: Optional[str] = None,
) -> List[List[Dict[str, Any]]]:
    """Detect objects on key frames and track them through the video with SAM2.

    The selected object detector is run on every ``chunk_length``-th frame
    (starting at frame 0). The resulting labels and boxes are sent, together
    with the encoded video, to the "sam2" inference task, whose per-frame
    masks are decoded and returned.

    Parameters:
        od_model (ODModels): Which detector produces the key-frame boxes.
        prompt (str): The text prompt passed to the detector.
        frames (List[np.ndarray]): The video frames. The size of the first
            frame is used to denormalize every detected box, so all frames
            are assumed to share that size.
        chunk_length (Optional[int]): Step between two frames on which the
            detector is run; it is also forwarded to the SAM2 request. None
            runs the detector on every frame.
        fine_tune_id (Optional[str]): Forwarded to the OWLv2 and Florence2
            detectors. It is not passed to the CountGD detector.

    Returns:
        List[List[Dict[str, Any]]]: One inner list per frame returned by the
            tracker. Each dictionary holds a 'label' of the form
            "<id>: <label>", the decoded 'mask', a 'score' fixed at 1.0 and
            the box derived from the mask by `add_bboxes_from_masks`; the
            result is filtered by `nms` with an IoU threshold of 0.95. An
            empty `frames` input yields an empty list.

    Raises:
        ValueError: If `chunk_length` is zero or negative.
        NotImplementedError: If `od_model` is not a supported detector.
    """
    if chunk_length is None:
        step = 1  # Process every frame
    elif chunk_length <= 0:
        raise ValueError("chunk_length must be a positive integer or None.")
    else:
        step = chunk_length  # Process frames with the specified step size

    # Nothing to detect or track. Without this guard the function failed with
    # an unhelpful IndexError on frames[0]. len() is used instead of
    # truthiness so that an array of frames is handled as well.
    if len(frames) == 0:
        return []

    # Resolve the detector once instead of on every key frame, so that an
    # unsupported model is reported before any inference request is made.
    detector: Any
    detector_kwargs: Dict[str, Any] = {}
    if od_model == ODModels.COUNTGD:
        detector = countgd_object_detection
        function_name = "countgd_object_detection"
    elif od_model == ODModels.OWLV2:
        detector = owl_v2_image
        function_name = "owl_v2_image"
        detector_kwargs["fine_tune_id"] = fine_tune_id
    elif od_model == ODModels.FLORENCE2:
        detector = florence2_sam2_image
        function_name = "florence2_sam2_image"
        detector_kwargs["fine_tune_id"] = fine_tune_id
    else:
        raise NotImplementedError(
            f"Object detection model '{od_model}' is not implemented."
        )

    image_size = frames[0].shape[:2]

    # One entry per frame: None for frames the detector skipped, otherwise
    # the labels and denormalized boxes that seed the tracker on that frame.
    sam2_prompts: List[Optional[Dict[str, Any]]] = [None] * len(frames)
    for idx in range(0, len(frames), step):
        frame_detections = detector(
            prompt=prompt, image=frames[idx], **detector_kwargs
        )
        sam2_prompts[idx] = {
            "labels": [detection["label"] for detection in frame_detections],
            "bboxes": [
                denormalize_bbox(detection["bbox"], image_size)
                for detection in frame_detections
            ],
        }

    buffer_bytes = frames_to_bytes(frames)
    files = [("video", buffer_bytes)]
    payload = {"bboxes": json.dumps(sam2_prompts), "chunk_length": chunk_length}
    metadata = {"function_name": function_name}

    detections = send_task_inference_request(
        payload,
        "sam2",
        files=files,
        metadata=metadata,
    )

    return_data = []
    for frame in detections:
        return_frame_data = []
        for detection in frame:
            mask = rle_decode_array(detection["mask"])
            label = str(detection["id"]) + ": " + detection["label"]
            # The tracker response carries no confidence; a constant is used.
            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
        return_data.append(return_frame_data)
    return_data = add_bboxes_from_masks(return_data)
    return nms(return_data, iou_threshold=0.95)
2490
+
2491
+
2492
def countgd_sam2_video_tracking(
    prompt: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
) -> List[List[Dict[str, Any]]]:
    """'countgd_sam2_video_tracking' is a tool that can track and segment multiple
    objects in a video given a text prompt such as category names or referring
    expressions. The categories in the text prompt are separated by commas. It
    returns, for each frame, a list of bounding boxes, label names, masks and
    associated scores.

    Parameters:
        prompt (str): The prompt to ground to the video.
        frames (List[np.ndarray]): The list of frames to ground the prompt to.
        chunk_length (Optional[int]): The number of frames between two runs of
            the CountGD detector. If None, the detector is run on every frame.
            Must be a positive integer otherwise.

    Returns:
        List[List[Dict[str, Any]]]: A list with one entry per frame, each entry
            being a list of dictionaries containing the score, label, bounding
            box, and mask of the detected objects with normalized coordinates
            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
            top-left and xmax and ymax are the coordinates of the bottom-right of
            the bounding box. The mask is binary 2D numpy array where 1 indicates
            the object and 0 indicates the background. The label is prefixed with
            the tracking id of the object.

    Example
    -------
        >>> countgd_sam2_video_tracking("car, dinosaur", frames)
        [
            [
                {
                    'label': '0: dinosaur',
                    'bbox': [0.1, 0.11, 0.35, 0.4],
                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0],
                        ...,
                        [0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                },
            ],
            ...
        ]
    """

    return od_sam2_video_tracking(
        ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
    )
2536
+
2537
+
2538
def owlv2_sam2_video_tracking(
    prompt: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
    fine_tune_id: Optional[str] = None,
) -> List[List[Dict[str, Any]]]:
    """'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
    objects in a video given a text prompt such as category names or referring
    expressions. The categories in the text prompt are separated by commas. It
    returns, for each frame, a list of bounding boxes, label names, masks and
    associated scores.

    Parameters:
        prompt (str): The prompt to ground to the video.
        frames (List[np.ndarray]): The list of frames to ground the prompt to.
        chunk_length (Optional[int]): The number of frames between two runs of
            the OWLv2 detector. If None, the detector is run on every frame.
            Must be a positive integer otherwise.
        fine_tune_id (Optional[str]): If provided, it is passed on to the OWLv2
            detector to select a fine-tuned model.

    Returns:
        List[List[Dict[str, Any]]]: A list with one entry per frame, each entry
            being a list of dictionaries containing the score, label, bounding
            box, and mask of the detected objects with normalized coordinates
            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
            top-left and xmax and ymax are the coordinates of the bottom-right of
            the bounding box. The mask is binary 2D numpy array where 1 indicates
            the object and 0 indicates the background. The label is prefixed with
            the tracking id of the object.

    Example
    -------
        >>> owlv2_sam2_video_tracking("car, dinosaur", frames)
        [
            [
                {
                    'label': '0: dinosaur',
                    'bbox': [0.1, 0.11, 0.35, 0.4],
                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0],
                        ...,
                        [0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                },
            ],
            ...
        ]
    """

    return od_sam2_video_tracking(
        ODModels.OWLV2,
        prompt=prompt,
        frames=frames,
        chunk_length=chunk_length,
        fine_tune_id=fine_tune_id,
    )
2587
+
2588
+
2397
2589
  FUNCTION_TOOLS = [
2398
2590
  owl_v2_image,
2399
2591
  owl_v2_video,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.211
3
+ Version: 0.2.212
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
26
26
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
27
27
  vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
28
28
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
29
- vision_agent/tools/__init__.py,sha256=-49o3X7bWG7sMxk0pMifO7BmN_cwDFcuGfzll48qAV4,2678
29
+ vision_agent/tools/__init__.py,sha256=InL8zUTRN8i_9J6r2wAtYdtNrVkElqdO_p-e2OA8q5A,2770
30
30
  vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
31
31
  vision_agent/tools/planner_tools.py,sha256=k7PPu-HhwDwusQgFSPTCWKRVVHBzPMeYB6h2xSEjdUo,13273
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
33
  vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
34
- vision_agent/tools/tools.py,sha256=SZVKbPwNRbjul5BBKEZcrzcPJKdnWQXjet4tC5Zkkfw,83797
34
+ vision_agent/tools/tools.py,sha256=f7M-93fdnDOYiQTNQs4qEGu6qLe4Zqvp59PW93x8828,90563
35
35
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
36
36
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
37
37
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,7 +40,7 @@ vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50
40
40
  vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
41
41
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
42
42
  vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
43
- vision_agent-0.2.211.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
- vision_agent-0.2.211.dist-info/METADATA,sha256=zbTzutwYFj7D_r8zNm92DmlXZkn4VuEeNDAPoFMq4Ks,19071
45
- vision_agent-0.2.211.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
- vision_agent-0.2.211.dist-info/RECORD,,
43
+ vision_agent-0.2.212.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
+ vision_agent-0.2.212.dist-info/METADATA,sha256=Q5-h8xTNiE4Mynr66ihl3bzYFTgfs79mh0O9_Mt8fE0,19071
45
+ vision_agent-0.2.212.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
+ vision_agent-0.2.212.dist-info/RECORD,,