nnInteractive 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. nnInteractive/__init__.py +3 -0
  2. nnInteractive/inference/__init__.py +0 -0
  3. nnInteractive/inference/cvpr2025_challenge_baseline/__init__.py +0 -0
  4. nnInteractive/inference/cvpr2025_challenge_baseline/predict.py +173 -0
  5. nnInteractive/inference/inference_session.py +1400 -0
  6. nnInteractive/interaction/__init__.py +0 -0
  7. nnInteractive/interaction/point.py +166 -0
  8. nnInteractive/supervoxel/setup.py +4 -0
  9. nnInteractive/supervoxel/src/metadata.py +118 -0
  10. nnInteractive/supervoxel/src/reader.py +175 -0
  11. nnInteractive/supervoxel/src/run.py +136 -0
  12. nnInteractive/supervoxel/src/sam2/__init__.py +2 -0
  13. nnInteractive/supervoxel/src/sam2/sam2/__init__.py +11 -0
  14. nnInteractive/supervoxel/src/sam2/sam2/automatic_mask_generator.py +434 -0
  15. nnInteractive/supervoxel/src/sam2/sam2/benchmark.py +86 -0
  16. nnInteractive/supervoxel/src/sam2/sam2/build_sam.py +172 -0
  17. nnInteractive/supervoxel/src/sam2/sam2/modeling/__init__.py +5 -0
  18. nnInteractive/supervoxel/src/sam2/sam2/modeling/backbones/__init__.py +5 -0
  19. nnInteractive/supervoxel/src/sam2/sam2/modeling/backbones/hieradet.py +305 -0
  20. nnInteractive/supervoxel/src/sam2/sam2/modeling/backbones/image_encoder.py +132 -0
  21. nnInteractive/supervoxel/src/sam2/sam2/modeling/backbones/utils.py +89 -0
  22. nnInteractive/supervoxel/src/sam2/sam2/modeling/memory_attention.py +167 -0
  23. nnInteractive/supervoxel/src/sam2/sam2/modeling/memory_encoder.py +179 -0
  24. nnInteractive/supervoxel/src/sam2/sam2/modeling/position_encoding.py +217 -0
  25. nnInteractive/supervoxel/src/sam2/sam2/modeling/sam/__init__.py +5 -0
  26. nnInteractive/supervoxel/src/sam2/sam2/modeling/sam/mask_decoder.py +274 -0
  27. nnInteractive/supervoxel/src/sam2/sam2/modeling/sam/prompt_encoder.py +194 -0
  28. nnInteractive/supervoxel/src/sam2/sam2/modeling/sam/transformer.py +293 -0
  29. nnInteractive/supervoxel/src/sam2/sam2/modeling/sam2_base.py +879 -0
  30. nnInteractive/supervoxel/src/sam2/sam2/modeling/sam2_utils.py +315 -0
  31. nnInteractive/supervoxel/src/sam2/sam2/sam2_image_predictor.py +433 -0
  32. nnInteractive/supervoxel/src/sam2/sam2/sam2_video_predictor.py +1171 -0
  33. nnInteractive/supervoxel/src/sam2/sam2/sam2_video_predictor_legacy.py +1125 -0
  34. nnInteractive/supervoxel/src/sam2/sam2/utils/__init__.py +5 -0
  35. nnInteractive/supervoxel/src/sam2/sam2/utils/amg.py +332 -0
  36. nnInteractive/supervoxel/src/sam2/sam2/utils/misc.py +488 -0
  37. nnInteractive/supervoxel/src/sam2/sam2/utils/transforms.py +108 -0
  38. nnInteractive/supervoxel/src/sam2/setup.py +174 -0
  39. nnInteractive/supervoxel/src/sam2/training/__init__.py +5 -0
  40. nnInteractive/supervoxel/src/sam2/training/dataset/__init__.py +5 -0
  41. nnInteractive/supervoxel/src/sam2/training/dataset/sam2_datasets.py +176 -0
  42. nnInteractive/supervoxel/src/sam2/training/dataset/transforms.py +481 -0
  43. nnInteractive/supervoxel/src/sam2/training/dataset/utils.py +102 -0
  44. nnInteractive/supervoxel/src/sam2/training/dataset/vos_dataset.py +154 -0
  45. nnInteractive/supervoxel/src/sam2/training/dataset/vos_raw_dataset.py +290 -0
  46. nnInteractive/supervoxel/src/sam2/training/dataset/vos_sampler.py +103 -0
  47. nnInteractive/supervoxel/src/sam2/training/dataset/vos_segment_loader.py +289 -0
  48. nnInteractive/supervoxel/src/sam2/training/loss_fns.py +290 -0
  49. nnInteractive/supervoxel/src/sam2/training/model/__init__.py +5 -0
  50. nnInteractive/supervoxel/src/sam2/training/model/sam2.py +515 -0
  51. nnInteractive/supervoxel/src/sam2/training/optimizer.py +462 -0
  52. nnInteractive/supervoxel/src/sam2/training/scripts/sav_frame_extraction_submitit.py +157 -0
  53. nnInteractive/supervoxel/src/sam2/training/train.py +232 -0
  54. nnInteractive/supervoxel/src/sam2/training/trainer.py +1051 -0
  55. nnInteractive/supervoxel/src/sam2/training/utils/__init__.py +5 -0
  56. nnInteractive/supervoxel/src/sam2/training/utils/checkpoint_utils.py +328 -0
  57. nnInteractive/supervoxel/src/sam2/training/utils/data_utils.py +166 -0
  58. nnInteractive/supervoxel/src/sam2/training/utils/distributed.py +560 -0
  59. nnInteractive/supervoxel/src/sam2/training/utils/logger.py +236 -0
  60. nnInteractive/supervoxel/src/sam2/training/utils/train_utils.py +275 -0
  61. nnInteractive/supervoxel/src/supervoxel.py +198 -0
  62. nnInteractive/trainer/__init__.py +0 -0
  63. nnInteractive/trainer/nnInteractiveTrainer.py +24 -0
  64. nnInteractive/utils/__init__.py +0 -0
  65. nnInteractive/utils/bboxes.py +217 -0
  66. nnInteractive/utils/checkpoint_cleansing.py +9 -0
  67. nnInteractive/utils/crop.py +268 -0
  68. nnInteractive/utils/erosion_dilation.py +48 -0
  69. nnInteractive/utils/inference_helpers.py +45 -0
  70. nnInteractive/utils/os_shennanigans.py +16 -0
  71. nnInteractive/utils/rounding.py +13 -0
  72. nninteractive-2.0.0.dist-info/METADATA +511 -0
  73. nninteractive-2.0.0.dist-info/RECORD +76 -0
  74. nninteractive-2.0.0.dist-info/WHEEL +5 -0
  75. nninteractive-2.0.0.dist-info/licenses/LICENSE +201 -0
  76. nninteractive-2.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,488 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ import warnings
9
+ from threading import Thread
10
+
11
+ import numpy as np
12
+ import torch
13
+ from PIL import Image
14
+ from tqdm import tqdm
15
+
16
+
17
def get_sdpa_settings():
    """
    Decide which scaled-dot-product-attention kernels should be enabled.

    Returns:
    - old_gpu: True when CUDA is unavailable or device 0 has a compute capability
      major version below 7.
    - use_flash_attn: True only when device 0 has a compute capability major
      version of 8 or above.
    - math_kernel_on: True when Flash Attention is not used or the running
      PyTorch version is older than 2.2.
    """
    # Without CUDA there is nothing to probe: fall back to the most conservative answer.
    if not torch.cuda.is_available():
        return True, False, True

    capability_major = torch.cuda.get_device_properties(0).major
    old_gpu = capability_major < 7
    # only use Flash Attention on Ampere (8.0) or newer GPUs
    use_flash_attn = capability_major >= 8
    if not use_flash_attn:
        warnings.warn(
            "Flash Attention is disabled as it requires a GPU with Ampere (8.0) CUDA capability.",
            category=UserWarning,
            stacklevel=2,
        )

    # keep math kernel for PyTorch versions before 2.2 (Flash Attention v2 is only
    # available on PyTorch 2.2+, while Flash Attention v1 cannot handle all cases)
    major_minor = tuple(map(int, torch.__version__.split(".")[:2]))
    predates_flash_v2 = major_minor < (2, 2)
    if predates_flash_v2:
        warnings.warn(
            f"You are using PyTorch {torch.__version__} without Flash Attention v2 support. "
            "Consider upgrading to PyTorch 2.2+ for Flash Attention v2 (which could be faster).",
            category=UserWarning,
            stacklevel=2,
        )

    math_kernel_on = predates_flash_v2 or not use_flash_attn
    return old_gpu, use_flash_attn, math_kernel_on
45
+
46
+
47
def get_connected_components(mask):
    """
    Label the connected components (8-connectivity) of binary masks of shape (N, 1, H, W).

    Inputs:
    - mask: A binary mask tensor of shape (N, 1, H, W), where 1 is foreground and 0 is
            background.

    Outputs:
    - labels: A tensor of shape (N, 1, H, W) containing the connected component labels
              for foreground pixels and 0 for background pixels.
    - counts: A tensor of shape (N, 1, H, W) containing the area of the connected
              components for foreground pixels and 0 for background pixels.
    """
    # Imported lazily so that this module can be imported without the compiled extension.
    from sam2 import _C

    # The extension is handed a contiguous uint8 tensor.
    mask_u8 = mask.to(torch.uint8).contiguous()
    # NOTE(review): "componnets" is the spelling this code calls on the extension;
    # keep it in sync with the symbol the extension actually exports.
    return _C.get_connected_componnets(mask_u8)
64
+
65
+
66
def mask_to_box(masks: torch.Tensor):
    """
    Compute the tight bounding box of every mask in a batch.

    Inputs:
    - masks: [B, 1, H, W] masks; used directly as the condition of `torch.where`,
      so it is expected to be a boolean tensor.

    Returns:
    - box_coords: [B, 1, 4], contains (x, y) coordinates of top left and bottom right
      box corners, dtype=torch.Tensor. An empty mask yields (W, H, -1, -1).
    """
    batch, _, height, width = masks.shape
    dev = masks.device
    col_idx = torch.arange(width, device=dev, dtype=torch.int32)
    row_idx = torch.arange(height, device=dev, dtype=torch.int32)
    # With indexing="xy" both grids have shape (H, W).
    x_grid, y_grid = torch.meshgrid(col_idx, row_idx, indexing="xy")
    x_grid = x_grid[None, None, ...].expand(batch, 1, height, width)
    y_grid = y_grid[None, None, ...].expand(batch, 1, height, width)

    def _extreme(grid, fill, reducer):
        # Background pixels are replaced by `fill` so they can never win the reduction.
        values, _ = reducer(torch.where(masks, grid, fill).flatten(-2), dim=-1)
        return values

    x_min = _extreme(x_grid, width, torch.min)
    x_max = _extreme(x_grid, -1, torch.max)
    y_min = _extreme(y_grid, height, torch.min)
    y_max = _extreme(y_grid, -1, torch.max)

    return torch.stack((x_min, y_min, x_max, y_max), dim=-1)
90
+
91
+
92
def _load_img_as_tensor(img_path, image_size):
    """
    Read one image file and return it as a channel-first tensor scaled to [0, 1].

    The image is converted to RGB and resized to image_size x image_size.

    Returns:
    - img: tensor of shape (3, image_size, image_size)
    - video_height, video_width: size of the image before resizing
    """
    pil_img = Image.open(img_path)
    resized = pil_img.convert("RGB").resize((image_size, image_size))
    pixels = np.array(resized)
    # np.uint8 is expected for JPEG images; anything else is rejected.
    if pixels.dtype != np.uint8:
        raise RuntimeError(f"Unknown image dtype: {pixels.dtype} on {img_path}")
    pixels = pixels / 255.0
    # (H, W, C) -> (C, H, W)
    img = torch.from_numpy(pixels).permute(2, 0, 1)
    # PIL reports size as (width, height); this is the original video size.
    video_width, video_height = pil_img.size
    return img, video_height, video_width
102
+
103
+
104
+ from skimage.transform import resize
105
+
106
+
107
def _load_np_as_tensor(img_arr, image_size):
    """
    Resize a 3-D image array to (3, image_size, image_size) and wrap it in a tensor.

    Inputs:
    - img_arr: 3-D array; assumed to be channel-first (C, H, W), matching how
      `load_video_frames_from_numpy_array` reads axis 1 as height and axis 2
      as width -- TODO confirm against callers.
    - image_size: side length of the square output.

    Returns:
    - img: tensor built from the resized array
    - video_height, video_width: spatial size of `img_arr` before resizing
    """
    # Axis 1 is height (rows) and axis 2 is width (columns). The previous code
    # unpacked them in the opposite order, so the returned height/width were
    # swapped for non-square inputs.
    _, video_height, video_width = img_arr.shape  # the original video size
    img_arr = resize(img_arr, (3, image_size, image_size))

    # The array is already channel-first, so no permute is needed.
    img = torch.from_numpy(img_arr)
    return img, video_height, video_width
113
+
114
+
115
class AsyncVideoFrameLoader:
    """
    A list-like container of video frames that are loaded in a background thread,
    so that session start is not blocked by reading every frame.
    """

    def __init__(
        self,
        img_paths,
        image_size,
        offload_video_to_cpu,
        img_mean,
        img_std,
        compute_device,
    ):
        self.img_paths = img_paths
        self.image_size = image_size
        self.offload_video_to_cpu = offload_video_to_cpu
        self.compute_device = compute_device
        self.img_mean = img_mean
        self.img_std = img_std
        # one slot per frame; a slot stays None until its frame has been loaded
        self.images = [None] * len(img_paths)
        # holds any exception raised by the loading thread so it can be re-raised later
        self.exception = None
        # filled in as soon as the first image has been read
        self.video_height = None
        self.video_width = None

        # Load frame 0 synchronously: it provides video_height / video_width and is
        # cached up front (it is most likely where the user will click).
        self[0]

        def _prefetch_all():
            # Walk every index; frames that are already cached return immediately.
            try:
                for idx in tqdm(range(len(self)), desc="frame loading (JPEG)"):
                    self[idx]
            except Exception as err:
                self.exception = err

        # Daemon thread: the remaining frames load without blocking the session start.
        self.thread = Thread(target=_prefetch_all, daemon=True)
        self.thread.start()

    def __getitem__(self, index):
        # Surface a failure of the background thread to whoever asks for a frame next.
        if self.exception is not None:
            raise RuntimeError("Failure in frame loading thread") from self.exception

        cached = self.images[index]
        if cached is not None:
            return cached

        frame, height, width = _load_img_as_tensor(self.img_paths[index], self.image_size)
        self.video_height, self.video_width = height, width
        # normalize by mean and std (in place)
        frame -= self.img_mean
        frame /= self.img_std
        if not self.offload_video_to_cpu:
            frame = frame.to(self.compute_device, non_blocking=True)
        self.images[index] = frame
        return frame

    def __len__(self):
        return len(self.images)
179
+
180
+
181
def load_video_frames(
    video_path,
    image_size,
    offload_video_to_cpu,
    img_mean=(0.485, 0.456, 0.406),
    img_std=(0.229, 0.224, 0.225),
    async_loading_frames=False,
    compute_device=torch.device("cuda"),
):
    """
    Load video frames from `video_path` by dispatching to the matching loader.

    Supported inputs, checked in this order:
    - `bytes`, or a string path with an ".mp4" extension: decoded as a video file
    - a string path with a ".nii" or ".gz" extension: read as a NIfTI volume
      (any path ending in ".gz" is routed here, not only ".nii.gz")
    - a `numpy.ndarray`: treated as an in-memory volume
    - a string path to an existing directory: read as a folder of JPEG frames

    Extensions are compared case-insensitively. The frames are resized to
    image_size as in the model and are loaded to `compute_device` if
    offload_video_to_cpu=False. This is used by the demo.

    Raises:
    - NotImplementedError: if `video_path` matches none of the inputs above.
    """
    is_str = isinstance(video_path, str)
    # Lower-cased so that e.g. ".MP4", ".Mp4" and ".NII" are recognised as well.
    extension = os.path.splitext(video_path)[-1].lower() if is_str else ""

    # Arguments shared by every loader.
    shared_kwargs = dict(
        image_size=image_size,
        offload_video_to_cpu=offload_video_to_cpu,
        img_mean=img_mean,
        img_std=img_std,
        compute_device=compute_device,
    )

    if isinstance(video_path, bytes) or extension == ".mp4":
        # The video-file loader has no async mode, so the flag is not forwarded.
        return load_video_frames_from_video_file(video_path=video_path, **shared_kwargs)
    if extension in (".nii", ".gz"):
        return load_video_frames_from_nifti_images(
            video_path=video_path,
            async_loading_frames=async_loading_frames,
            **shared_kwargs,
        )
    if isinstance(video_path, np.ndarray):
        return load_video_frames_from_numpy_array(
            video_path,
            async_loading_frames=async_loading_frames,
            **shared_kwargs,
        )
    if is_str and os.path.isdir(video_path):
        return load_video_frames_from_jpg_images(
            video_path=video_path,
            async_loading_frames=async_loading_frames,
            **shared_kwargs,
        )

    raise NotImplementedError(
        "Only MP4 video, NIfTI volume (.nii / .nii.gz), NumPy array and JPEG folder "
        "inputs are supported at this moment"
    )
240
+
241
+
242
def load_video_frames_from_jpg_images(
    video_path,
    image_size,
    offload_video_to_cpu,
    img_mean=(0.485, 0.456, 0.406),
    img_std=(0.229, 0.224, 0.225),
    async_loading_frames=False,
    compute_device=torch.device("cuda"),
):
    """
    Load the video frames from a directory of JPEG files ("<frame_index>.jpg" format).

    File names (without extension) must parse as integers; they define the frame order.
    The frames are resized to image_size x image_size, normalized by `img_mean` /
    `img_std`, and placed on `compute_device` unless `offload_video_to_cpu` is `True`.

    With `async_loading_frames=True` an `AsyncVideoFrameLoader` is returned instead of
    a tensor and the frames are read in the background.

    Returns `(images, video_height, video_width)`.
    """
    if not (isinstance(video_path, str) and os.path.isdir(video_path)):
        raise NotImplementedError(
            "Only JPEG frames are supported at this moment. For video files, you may use "
            "ffmpeg (https://ffmpeg.org/) to extract frames into a folder of JPEG files, such as \n"
            "```\n"
            "ffmpeg -i <your_video>.mp4 -q:v 2 -start_number 0 <output_dir>/'%05d.jpg'\n"
            "```\n"
            "where `-q:v` generates high-quality JPEG frames and `-start_number 0` asks "
            "ffmpeg to start the JPEG file from 00000.jpg."
        )
    jpg_folder = video_path

    jpeg_suffixes = [".jpg", ".jpeg", ".JPG", ".JPEG"]
    # Keep only JPEG files and order them by their integer frame index.
    frame_names = sorted(
        (name for name in os.listdir(jpg_folder) if os.path.splitext(name)[-1] in jpeg_suffixes),
        key=lambda name: int(os.path.splitext(name)[0]),
    )
    num_frames = len(frame_names)
    if num_frames == 0:
        raise RuntimeError(f"no images found in {jpg_folder}")
    img_paths = [os.path.join(jpg_folder, name) for name in frame_names]

    # Shape (3, 1, 1) so the statistics broadcast over (3, H, W) frames.
    img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
    img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]

    if async_loading_frames:
        lazy_images = AsyncVideoFrameLoader(
            img_paths,
            image_size,
            offload_video_to_cpu,
            img_mean,
            img_std,
            compute_device,
        )
        return lazy_images, lazy_images.video_height, lazy_images.video_width

    # Eager path: fill a preallocated float32 buffer frame by frame.
    images = torch.zeros(num_frames, 3, image_size, image_size, dtype=torch.float32)
    for n, img_path in enumerate(tqdm(img_paths, desc="frame loading (JPEG)")):
        frame, video_height, video_width = _load_img_as_tensor(img_path, image_size)
        images[n] = frame

    if not offload_video_to_cpu:
        images = images.to(compute_device)
        img_mean = img_mean.to(compute_device)
        img_std = img_std.to(compute_device)

    # normalize by mean and std (in place)
    images -= img_mean
    images /= img_std
    return images, video_height, video_width
303
+
304
+
305
def load_video_frames_from_nifti_images(
    video_path,
    image_size,
    offload_video_to_cpu,
    img_mean=(0.485, 0.456, 0.406),
    img_std=(0.229, 0.224, 0.225),
    async_loading_frames=False,
    compute_device=torch.device("cuda"),
):
    """
    Load a volume with SimpleITK and expose its slices as pseudo-RGB video frames.

    Processing steps:
    1. read `video_path` and convert it to an array; axis 0 is used as the frame
       axis, axis 1 as height and axis 2 as width
    2. clip intensities to [-100, 400] (presumably a CT intensity window -- confirm
       before using other modalities)
    3. min-max normalize to [0, 1]; a constant volume becomes all zeros
    4. resize every slice to image_size x image_size
    5. replicate the single channel three times to mimic RGB

    `img_mean`, `img_std` and `async_loading_frames` are accepted for signature
    parity with the other loaders but are not used: no mean/std normalization is
    applied and loading is always synchronous.

    Returns:
    - images: float32 tensor of shape (num_slices, 3, image_size, image_size), on
      `compute_device` unless `offload_video_to_cpu` is `True`
    - video_height, video_width: in-plane size of the volume before resizing
    """
    import SimpleITK as sitk
    from skimage.transform import resize

    vol_data = sitk.GetArrayFromImage(sitk.ReadImage(video_path))

    # Work in floating point so that the subtraction and range below cannot wrap
    # around in narrow integer dtypes. Values are unchanged for integer inputs,
    # which true division would promote to float64 anyway.
    if not np.issubdtype(vol_data.dtype, np.floating):
        vol_data = vol_data.astype(np.float64)

    # Clip
    vol_data = np.clip(vol_data, -100, 400)

    # Normalize; guard the constant-volume case, which used to divide 0 by 0 and
    # fill every frame with NaN.
    lowest = vol_data.min()
    span = vol_data.max() - lowest
    if span == 0:
        vol_data = np.zeros_like(vol_data)
    else:
        vol_data = (vol_data - lowest) / span

    video_height, video_width = vol_data.shape[1], vol_data.shape[2]

    # resize
    vol_data = resize(vol_data, (vol_data.shape[0], image_size, image_size))

    # add fake RGB: (N, H, W) -> (N, 3, H, W)
    vol_data = np.stack([vol_data, vol_data, vol_data], axis=1)

    images = torch.from_numpy(vol_data).float()
    if not offload_video_to_cpu:
        images = images.to(compute_device)
    return images, video_height, video_width
381
+
382
+
383
def load_video_frames_from_numpy_array(
    vol_data,
    image_size,
    offload_video_to_cpu,
    img_mean=(0.485, 0.456, 0.406),
    img_std=(0.229, 0.224, 0.225),
    async_loading_frames=False,
    compute_device=torch.device("cuda"),
):
    """
    Turn an in-memory 3-D array into pseudo-RGB video frames.

    Inputs:
    - vol_data: array whose axis 0 is used as the frame axis, axis 1 as height and
      axis 2 as width.

    The array is min-max normalized to [0, 1] (a constant array becomes all zeros),
    every frame is resized to image_size x image_size, and the single channel is
    replicated three times to mimic RGB.

    `img_mean`, `img_std` and `async_loading_frames` are accepted for signature
    parity with the other loaders but are not used: no mean/std normalization is
    applied and loading is always synchronous.

    Returns:
    - images: float32 tensor of shape (num_frames, 3, image_size, image_size), on
      `compute_device` unless `offload_video_to_cpu` is `True`
    - video_height, video_width: in-plane size of the array before resizing
    """
    from skimage.transform import resize

    video_height, video_width = vol_data.shape[1], vol_data.shape[2]

    # Integer arrays can overflow in `vol - min` / `max - min` (e.g. int16 data whose
    # range exceeds 32767), silently corrupting the normalization. Do the arithmetic
    # in float instead; the result is identical whenever no overflow occurred.
    if not np.issubdtype(vol_data.dtype, np.floating):
        vol_data = vol_data.astype(np.float64)

    # normalize; guard the constant-array case, which used to divide 0 by 0 and
    # fill every frame with NaN
    lowest = vol_data.min()
    span = vol_data.max() - lowest
    if span == 0:
        vol_data = np.zeros_like(vol_data)
    else:
        vol_data = (vol_data - lowest) / span

    # resize
    vol_data = resize(vol_data, (vol_data.shape[0], image_size, image_size))

    # add fake RGB: (N, H, W) -> (N, 3, H, W)
    vol_data = np.stack([vol_data, vol_data, vol_data], axis=1)

    images = torch.from_numpy(vol_data).float()
    if not offload_video_to_cpu:
        images = images.to(compute_device)
    return images, video_height, video_width
417
+
418
+
419
def load_video_frames_from_video_file(
    video_path,
    image_size,
    offload_video_to_cpu,
    img_mean=(0.485, 0.456, 0.406),
    img_std=(0.229, 0.224, 0.225),
    compute_device=torch.device("cuda"),
):
    """
    Load the video frames from a video file using decord.

    Every frame is decoded at image_size x image_size, scaled to [0, 1], normalized
    by `img_mean` / `img_std`, and placed on `compute_device` unless
    `offload_video_to_cpu` is `True`.

    Returns `(images, video_height, video_width)`, where height and width are taken
    from the first frame decoded at the native resolution.
    """
    import decord

    # Shape (3, 1, 1) so the statistics broadcast over (N, 3, H, W).
    mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
    std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]

    # Make decord hand back torch tensors.
    decord.bridge.set_bridge("torch")

    # Get the original video height and width from an un-resized reader.
    first_frame = decord.VideoReader(video_path).next()
    video_height, video_width, _ = first_frame.shape

    # Iterate over all frames in the video, decoded at the model resolution;
    # each frame goes (H, W, C) -> (C, H, W).
    resized_reader = decord.VideoReader(video_path, width=image_size, height=image_size)
    frames = [frame.permute(2, 0, 1) for frame in resized_reader]

    images = torch.stack(frames, dim=0).float() / 255.0
    if not offload_video_to_cpu:
        images = images.to(compute_device)
        mean = mean.to(compute_device)
        std = std.to(compute_device)

    # normalize by mean and std (in place)
    images -= mean
    images /= std
    return images, video_height, video_width
449
+
450
+
451
def fill_holes_in_mask_scores(mask, max_area):
    """
    A post processor to fill small holes in mask scores with area under `max_area`.

    Holes are connected components of the background (mask scores <= 0) whose area
    is at most `max_area`. If computing the components fails for any reason, a
    warning is emitted and the input mask is returned unchanged.
    """
    assert max_area > 0, "max_area must be positive"

    try:
        labels, areas = get_connected_components(mask <= 0)
        small_hole = (labels > 0) & (areas <= max_area)
        # We fill holes with a small positive mask score (0.1) to change them to foreground.
        filled = torch.where(small_hole, 0.1, mask)
    except Exception as e:
        # Skip the post-processing step on removing small holes if the CUDA kernel fails
        warnings.warn(
            f"{e}\n\nSkipping the post-processing step due to the error above. You can "
            "still use SAM 2 and it's OK to ignore the error above, although some post-processing "
            "functionality may be limited (which doesn't affect the results in most cases; see "
            "https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).",
            category=UserWarning,
            stacklevel=2,
        )
        return mask

    return filled
478
+
479
+
480
def concat_points(old_point_inputs, new_points, new_labels):
    """
    Append new points and labels to previous point inputs.

    When `old_point_inputs` is None the new tensors are used as they are; otherwise
    the new entries are concatenated after the existing ones along dim 1.

    Returns a dict with the keys "point_coords" and "point_labels".
    """
    if old_point_inputs is not None:
        new_points = torch.cat([old_point_inputs["point_coords"], new_points], dim=1)
        new_labels = torch.cat([old_point_inputs["point_labels"], new_labels], dim=1)

    return {"point_coords": new_points, "point_labels": new_labels}
@@ -0,0 +1,108 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import warnings
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from torchvision.transforms import Normalize, Resize, ToTensor
13
+
14
+
15
class SAM2Transforms(nn.Module):
    def __init__(self, resolution, mask_threshold, max_hole_area=0.0, max_sprinkle_area=0.0):
        """
        Transforms for SAM2.

        - resolution: side length of the square image the inputs are resized to
        - mask_threshold: score separating foreground from background in masks
        - max_hole_area: background components up to this area are filled
          (the step is skipped when not > 0)
        - max_sprinkle_area: foreground components up to this area are removed
          (the step is skipped when not > 0)
        """
        super().__init__()
        self.resolution = resolution
        self.mask_threshold = mask_threshold
        self.max_hole_area = max_hole_area
        self.max_sprinkle_area = max_sprinkle_area
        # per-channel statistics used by the Normalize step below
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]
        self.to_tensor = ToTensor()
        resize_then_normalize = nn.Sequential(
            Resize((self.resolution, self.resolution)),
            Normalize(self.mean, self.std),
        )
        self.transforms = torch.jit.script(resize_then_normalize)

    def __call__(self, x):
        """Convert a single image to a tensor, then resize and normalize it."""
        return self.transforms(self.to_tensor(x))

    def forward_batch(self, img_list):
        """Transform every image in `img_list` and stack the results along dim 0."""
        transformed = []
        for img in img_list:
            as_tensor = self.to_tensor(img)
            transformed.append(self.transforms(as_tensor))
        return torch.stack(transformed, dim=0)

    def transform_coords(self, coords: torch.Tensor, normalize=False, orig_hw=None) -> torch.Tensor:
        """
        Expects a torch tensor with length 2 in the last dimension, holding (x, y).
        The coordinates can be absolute image coordinates or normalized ones. For
        absolute coordinates set `normalize=True` and pass the original image size
        as `orig_hw`; they are then divided by (w, h) on a copy of the input.

        Returns
            The coordinates multiplied by `self.resolution`, i.e. expressed in the
            resized image frame expected by the SAM2 model.
        """
        if normalize:
            assert orig_hw is not None
            orig_h, orig_w = orig_hw
            # clone so that the caller's tensor is left untouched
            coords = coords.clone()
            coords[..., 0] = coords[..., 0] / orig_w
            coords[..., 1] = coords[..., 1] / orig_h

        # scale normalized coordinates up to the model resolution
        return coords * self.resolution

    def transform_boxes(self, boxes: torch.Tensor, normalize=False, orig_hw=None) -> torch.Tensor:
        """
        Expects a tensor of shape Bx4. Each box is viewed as two (x, y) corner points
        (result shape Bx2x2) and handled exactly like `transform_coords`: for absolute
        image coordinates set `normalize=True` and pass the original image size.
        """
        corner_points = boxes.reshape(-1, 2, 2)
        return self.transform_coords(corner_points, normalize, orig_hw)

    def postprocess_masks(self, masks: torch.Tensor, orig_hw) -> torch.Tensor:
        """
        Perform post-processing on output masks: optionally fill small holes and
        remove small sprinkles, then resize the masks to `orig_hw` bilinearly.

        If the connected-component step fails, a warning is emitted and the masks
        are only resized.
        """
        from sam2.utils.misc import get_connected_components

        masks = masks.float()
        untouched_masks = masks
        # flatten as 1-channel image
        flat_masks = masks.flatten(0, 1).unsqueeze(1)
        try:
            if self.max_hole_area > 0:
                # Holes are those connected components in background with area <= self.max_hole_area
                # (background regions are those with mask scores <= self.mask_threshold)
                labels, areas = get_connected_components(flat_masks <= self.mask_threshold)
                small_background = ((labels > 0) & (areas <= self.max_hole_area)).reshape_as(masks)
                # Holes get a score 10.0 above the threshold, turning them into foreground.
                masks = torch.where(small_background, self.mask_threshold + 10.0, masks)

            if self.max_sprinkle_area > 0:
                # Components are computed on the flattened input masks (before hole filling).
                labels, areas = get_connected_components(flat_masks > self.mask_threshold)
                small_foreground = ((labels > 0) & (areas <= self.max_sprinkle_area)).reshape_as(masks)
                # Sprinkles get a score 10.0 below the threshold, turning them into background.
                masks = torch.where(small_foreground, self.mask_threshold - 10.0, masks)
        except Exception as e:
            # Skip the post-processing step if the CUDA kernel fails
            warnings.warn(
                f"{e}\n\nSkipping the post-processing step due to the error above. You can "
                "still use SAM 2 and it's OK to ignore the error above, although some post-processing "
                "functionality may be limited (which doesn't affect the results in most cases; see "
                "https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).",
                category=UserWarning,
                stacklevel=2,
            )
            masks = untouched_masks

        masks = F.interpolate(masks, orig_hw, mode="bilinear", align_corners=False)
        return masks