vision-agent 0.2.211__tar.gz → 0.2.213__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. {vision_agent-0.2.211 → vision_agent-0.2.213}/PKG-INFO +1 -1
  2. {vision_agent-0.2.211 → vision_agent-0.2.213}/pyproject.toml +1 -1
  3. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/tools/__init__.py +4 -1
  4. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/tools/tools.py +194 -0
  5. {vision_agent-0.2.211 → vision_agent-0.2.213}/LICENSE +0 -0
  6. {vision_agent-0.2.211 → vision_agent-0.2.213}/README.md +0 -0
  7. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/.sim_tools/df.csv +0 -0
  8. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/.sim_tools/embs.npy +0 -0
  9. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/__init__.py +0 -0
  10. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/README.md +0 -0
  11. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/__init__.py +0 -0
  12. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/agent.py +0 -0
  13. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/agent_utils.py +0 -0
  14. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/types.py +0 -0
  15. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/vision_agent.py +0 -0
  16. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/vision_agent_coder.py +0 -0
  17. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  18. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  19. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  20. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/vision_agent_planner.py +0 -0
  21. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  22. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
  23. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  24. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/vision_agent_prompts.py +0 -0
  25. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  26. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/agent/vision_agent_v2.py +0 -0
  27. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/clients/__init__.py +0 -0
  28. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/clients/http.py +0 -0
  29. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/clients/landing_public_api.py +0 -0
  30. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/fonts/__init__.py +0 -0
  31. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  32. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/lmm/__init__.py +0 -0
  33. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/lmm/lmm.py +0 -0
  34. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/lmm/types.py +0 -0
  35. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/tools/meta_tools.py +0 -0
  36. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/tools/planner_tools.py +0 -0
  37. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/tools/prompts.py +0 -0
  38. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/tools/tool_utils.py +0 -0
  39. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/tools/tools_types.py +0 -0
  40. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/utils/__init__.py +0 -0
  41. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/utils/exceptions.py +0 -0
  42. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/utils/execute.py +0 -0
  43. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/utils/image_utils.py +0 -0
  44. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/utils/sim.py +0 -0
  45. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/utils/type_defs.py +0 -0
  46. {vision_agent-0.2.211 → vision_agent-0.2.213}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.211
3
+ Version: 0.2.213
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.211"
7
+ version = "0.2.213"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -26,9 +26,10 @@ from .tools import (
26
26
  claude35_text_extraction,
27
27
  closest_box_distance,
28
28
  closest_mask_distance,
29
+ countgd_example_based_counting,
29
30
  countgd_object_detection,
30
31
  countgd_sam2_object_detection,
31
- countgd_example_based_counting,
32
+ countgd_sam2_video_tracking,
32
33
  depth_anything_v2,
33
34
  detr_segmentation,
34
35
  extract_frames_and_timestamps,
@@ -46,11 +47,13 @@ from .tools import (
46
47
  load_image,
47
48
  minimum_distance,
48
49
  ocr,
50
+ od_sam2_video_tracking,
49
51
  overlay_bounding_boxes,
50
52
  overlay_heat_map,
51
53
  overlay_segmentation_masks,
52
54
  owl_v2_image,
53
55
  owl_v2_video,
56
+ owlv2_sam2_video_tracking,
54
57
  qwen2_vl_images_vqa,
55
58
  qwen2_vl_video_vqa,
56
59
  sam2,
@@ -6,6 +6,7 @@ import tempfile
6
6
  import urllib.request
7
7
  from base64 import b64encode
8
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from enum import Enum
9
10
  from functools import lru_cache
10
11
  from importlib import resources
11
12
  from pathlib import Path
@@ -2394,6 +2395,197 @@ def _plot_counting(
2394
2395
  return image
2395
2396
 
2396
2397
 
2398
class ODModels(str, Enum):
    """Object detection models that can seed 'od_sam2_video_tracking'."""

    COUNTGD = "countgd"
    FLORENCE2 = "florence2"
    OWLV2 = "owlv2"


def od_sam2_video_tracking(
    od_model: ODModels,
    prompt: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
    fine_tune_id: Optional[str] = None,
) -> List[List[Dict[str, Any]]]:
    """'od_sam2_video_tracking' runs an object detector on selected frames of a video
    and sends the frames together with the detected boxes to the "sam2" inference
    task to obtain a mask for every object in every returned frame.

    Parameters:
        od_model (ODModels): The object detection model used to produce the boxes.
        prompt (str): The prompt passed to the object detection model.
        frames (List[np.ndarray]): The frames of the video. The size of the first
            frame is used to convert every detected box with 'denormalize_bbox'.
        chunk_length (Optional[int]): The object detector is run on every
            'chunk_length'-th frame, starting with the first one. None runs it on
            every frame. The value is also forwarded unchanged to the "sam2" task.
        fine_tune_id (Optional[str]): Forwarded to the OWLV2 and FLORENCE2
            detectors; not used for COUNTGD.

    Returns:
        List[List[Dict[str, Any]]]: One list per frame returned by the "sam2" task.
            Each dictionary holds a 'label' of the form '<id>: <label>', the decoded
            'mask' and a 'score' that is always 1.0. The result is then passed
            through 'add_bboxes_from_masks' (presumably adding a 'bbox' entry --
            confirm against that helper) and 'nms' with an IoU threshold of 0.95.
            An empty list is returned when 'frames' is empty.

    Raises:
        ValueError: If 'chunk_length' is zero or negative.
        NotImplementedError: If 'od_model' is not a supported model.
    """

    if chunk_length is None:
        step = 1  # Process every frame
    elif chunk_length <= 0:
        raise ValueError("chunk_length must be a positive integer or None.")
    else:
        step = chunk_length  # Process frames with the specified step size

    # The model choice does not depend on the frame, so resolve it once up front.
    # Equality comparison (rather than a dict lookup) is used on purpose: it also
    # accepts the plain string values such as "countgd".
    if od_model == ODModels.COUNTGD:
        function_name = "countgd_object_detection"

        def _detect(frame: np.ndarray) -> List[Dict[str, Any]]:
            return countgd_object_detection(prompt=prompt, image=frame)

    elif od_model == ODModels.OWLV2:
        function_name = "owl_v2_image"

        def _detect(frame: np.ndarray) -> List[Dict[str, Any]]:
            return owl_v2_image(
                prompt=prompt, image=frame, fine_tune_id=fine_tune_id
            )

    elif od_model == ODModels.FLORENCE2:
        function_name = "florence2_sam2_image"

        def _detect(frame: np.ndarray) -> List[Dict[str, Any]]:
            return florence2_sam2_image(
                prompt=prompt, image=frame, fine_tune_id=fine_tune_id
            )

    else:
        raise NotImplementedError(
            f"Object detection model '{od_model}' is not implemented."
        )

    # Nothing to detect or track; avoids indexing frames[0] on an empty list.
    if not frames:
        return []

    # Frames that are skipped by the step keep a None entry.
    results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
    for idx in range(0, len(frames), step):
        results[idx] = _detect(frames[idx])

    image_size = frames[0].shape[:2]

    # Reshape the per-frame detections into the labels/bboxes structure that is
    # serialized into the request payload.
    output: List[Optional[Dict[str, Any]]] = [
        None
        if frame_detections is None
        else {
            "labels": [detection["label"] for detection in frame_detections],
            "bboxes": [
                denormalize_bbox(detection["bbox"], image_size)
                for detection in frame_detections
            ],
        }
        for frame_detections in results
    ]

    buffer_bytes = frames_to_bytes(frames)
    files = [("video", buffer_bytes)]
    payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
    metadata = {"function_name": function_name}

    detections = send_task_inference_request(
        payload,
        "sam2",
        files=files,
        metadata=metadata,
    )

    return_data = []
    for frame in detections:
        return_frame_data = []
        for detection in frame:
            mask = rle_decode_array(detection["mask"])
            label = f"{detection['id']}: {detection['label']}"
            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
        return_data.append(return_frame_data)
    return_data = add_bboxes_from_masks(return_data)
    return nms(return_data, iou_threshold=0.95)
2490
+
2491
+
2492
def countgd_sam2_video_tracking(
    prompt: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
) -> List[List[Dict[str, Any]]]:
    """'countgd_sam2_video_tracking' is a tool that can segment multiple objects across
    the frames of a video given a text prompt such as category names or referring
    expressions. The categories in the text prompt are separated by commas. It
    returns, for every frame, a list of bounding boxes, label names, masks and
    scores.

    Parameters:
        prompt (str): The prompt to ground to the video.
        frames (List[np.ndarray]): The list of frames to ground the prompt to.
        chunk_length (Optional[int]): The number of frames between two runs of the
            object detector. None runs the detector on every frame. Must be a
            positive integer when given.

    Returns:
        List[List[Dict[str, Any]]]: A list with one entry per frame. Each entry is a
            list of dictionaries containing the score, label, bounding box, and
            mask of the detected objects with normalized coordinates
            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
            top-left and xmax and ymax are the coordinates of the bottom-right of
            the bounding box. The label has the form '<id>: <label>' and the score
            is always 1.0. The mask is binary 2D numpy array where 1 indicates the
            object and 0 indicates the background.

    Example
    -------
        >>> countgd_sam2_video_tracking("car, dinosaur", frames)
        [
            [
                {
                    'label': '0: dinosaur',
                    'bbox': [0.1, 0.11, 0.35, 0.4],
                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0],
                        ...,
                        [0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                },
            ],
            ...
        ]
    """

    return od_sam2_video_tracking(
        ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
    )
2536
+
2537
+
2538
def owlv2_sam2_video_tracking(
    prompt: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
    fine_tune_id: Optional[str] = None,
) -> List[List[Dict[str, Any]]]:
    """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects across
    the frames of a video given a text prompt such as category names or referring
    expressions. The categories in the text prompt are separated by commas. It
    returns, for every frame, a list of bounding boxes, label names, masks and
    scores.

    Parameters:
        prompt (str): The prompt to ground to the video.
        frames (List[np.ndarray]): The list of frames to ground the prompt to.
        chunk_length (Optional[int]): The number of frames between two runs of the
            object detector. None runs the detector on every frame. Must be a
            positive integer when given.
        fine_tune_id (Optional[str]): Forwarded to the OWLv2 object detector.

    Returns:
        List[List[Dict[str, Any]]]: A list with one entry per frame. Each entry is a
            list of dictionaries containing the score, label, bounding box, and
            mask of the detected objects with normalized coordinates
            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
            top-left and xmax and ymax are the coordinates of the bottom-right of
            the bounding box. The label has the form '<id>: <label>' and the score
            is always 1.0. The mask is binary 2D numpy array where 1 indicates the
            object and 0 indicates the background.

    Example
    -------
        >>> owlv2_sam2_video_tracking("car, dinosaur", frames)
        [
            [
                {
                    'label': '0: dinosaur',
                    'bbox': [0.1, 0.11, 0.35, 0.4],
                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0],
                        ...,
                        [0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                },
            ],
            ...
        ]
    """

    return od_sam2_video_tracking(
        ODModels.OWLV2,
        prompt=prompt,
        frames=frames,
        chunk_length=chunk_length,
        fine_tune_id=fine_tune_id,
    )
2587
+
2588
+
2397
2589
  FUNCTION_TOOLS = [
2398
2590
  owl_v2_image,
2399
2591
  owl_v2_video,
@@ -2416,6 +2608,8 @@ FUNCTION_TOOLS = [
2416
2608
  video_temporal_localization,
2417
2609
  flux_image_inpainting,
2418
2610
  siglip_classification,
2611
+ owlv2_sam2_video_tracking,
2612
+ countgd_sam2_video_tracking,
2419
2613
  ]
2420
2614
 
2421
2615
  UTIL_TOOLS = [
File without changes
File without changes