frontveg-0.1.dev1-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. frontveg/__init__.py +11 -0
  2. frontveg/_tests/__init__.py +0 -0
  3. frontveg/_tests/test_widget.py +66 -0
  4. frontveg/_version.py +21 -0
  5. frontveg/_widget.py +132 -0
  6. frontveg/napari.yaml +14 -0
  7. frontveg/utils.py +95 -0
  8. frontveg-0.1.dev1.dist-info/METADATA +143 -0
  9. frontveg-0.1.dev1.dist-info/RECORD +44 -0
  10. frontveg-0.1.dev1.dist-info/WHEEL +5 -0
  11. frontveg-0.1.dev1.dist-info/entry_points.txt +2 -0
  12. frontveg-0.1.dev1.dist-info/licenses/LICENSE +28 -0
  13. frontveg-0.1.dev1.dist-info/top_level.txt +2 -0
  14. sam2/__init__.py +11 -0
  15. sam2/automatic_mask_generator.py +454 -0
  16. sam2/build_sam.py +167 -0
  17. sam2/configs/sam2/sam2_hiera_b+.yaml +113 -0
  18. sam2/configs/sam2/sam2_hiera_l.yaml +117 -0
  19. sam2/configs/sam2/sam2_hiera_s.yaml +116 -0
  20. sam2/configs/sam2/sam2_hiera_t.yaml +118 -0
  21. sam2/modeling/__init__.py +5 -0
  22. sam2/modeling/backbones/__init__.py +5 -0
  23. sam2/modeling/backbones/hieradet.py +317 -0
  24. sam2/modeling/backbones/image_encoder.py +134 -0
  25. sam2/modeling/backbones/utils.py +95 -0
  26. sam2/modeling/memory_attention.py +169 -0
  27. sam2/modeling/memory_encoder.py +181 -0
  28. sam2/modeling/position_encoding.py +221 -0
  29. sam2/modeling/sam/__init__.py +5 -0
  30. sam2/modeling/sam/mask_decoder.py +295 -0
  31. sam2/modeling/sam/prompt_encoder.py +182 -0
  32. sam2/modeling/sam/transformer.py +360 -0
  33. sam2/modeling/sam2_base.py +907 -0
  34. sam2/modeling/sam2_utils.py +323 -0
  35. sam2/sam2_hiera_b+.yaml +1 -0
  36. sam2/sam2_hiera_l.yaml +1 -0
  37. sam2/sam2_hiera_s.yaml +1 -0
  38. sam2/sam2_hiera_t.yaml +1 -0
  39. sam2/sam2_image_predictor.py +466 -0
  40. sam2/sam2_video_predictor.py +1172 -0
  41. sam2/utils/__init__.py +5 -0
  42. sam2/utils/amg.py +348 -0
  43. sam2/utils/misc.py +349 -0
  44. sam2/utils/transforms.py +118 -0
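
For context, the file listing above can be reproduced locally with Python's standard zipfile module, since a wheel is just a zip archive. This is a minimal sketch, not part of the package; the local wheel filename is assumed to match the header above.

```python
# Sketch: list the files bundled in the wheel (a wheel is a zip archive).
# Assumes the wheel has been downloaded locally under the name shown in the header.
from zipfile import ZipFile

with ZipFile("frontveg-0.1.dev1-py3-none-any.whl") as whl:
    for info in whl.infolist():
        print(f"{info.file_size:>10}  {info.filename}")
```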
sam2/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
sam2/utils/amg.py ADDED
@@ -0,0 +1,348 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import math
+ from copy import deepcopy
+ from itertools import product
+ from typing import Any, Dict, Generator, ItemsView, List, Tuple
+
+ import numpy as np
+ import torch
+
+ # Very lightly adapted from https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/utils/amg.py
+
+
+ class MaskData:
+     """
+     A structure for storing masks and their related data in batched format.
+     Implements basic filtering and concatenation.
+     """
+
+     def __init__(self, **kwargs) -> None:
+         for v in kwargs.values():
+             assert isinstance(
+                 v, (list, np.ndarray, torch.Tensor)
+             ), "MaskData only supports list, numpy arrays, and torch tensors."
+         self._stats = dict(**kwargs)
+
+     def __setitem__(self, key: str, item: Any) -> None:
+         assert isinstance(
+             item, (list, np.ndarray, torch.Tensor)
+         ), "MaskData only supports list, numpy arrays, and torch tensors."
+         self._stats[key] = item
+
+     def __delitem__(self, key: str) -> None:
+         del self._stats[key]
+
+     def __getitem__(self, key: str) -> Any:
+         return self._stats[key]
+
+     def items(self) -> ItemsView[str, Any]:
+         return self._stats.items()
+
+     def filter(self, keep: torch.Tensor) -> None:
+         for k, v in self._stats.items():
+             if v is None:
+                 self._stats[k] = None
+             elif isinstance(v, torch.Tensor):
+                 self._stats[k] = v[torch.as_tensor(keep, device=v.device)]
+             elif isinstance(v, np.ndarray):
+                 self._stats[k] = v[keep.detach().cpu().numpy()]
+             elif isinstance(v, list) and keep.dtype == torch.bool:
+                 self._stats[k] = [a for i, a in enumerate(v) if keep[i]]
+             elif isinstance(v, list):
+                 self._stats[k] = [v[i] for i in keep]
+             else:
+                 raise TypeError(f"MaskData key {k} has an unsupported type {type(v)}.")
+
+     def cat(self, new_stats: "MaskData") -> None:
+         for k, v in new_stats.items():
+             if k not in self._stats or self._stats[k] is None:
+                 self._stats[k] = deepcopy(v)
+             elif isinstance(v, torch.Tensor):
+                 self._stats[k] = torch.cat([self._stats[k], v], dim=0)
+             elif isinstance(v, np.ndarray):
+                 self._stats[k] = np.concatenate([self._stats[k], v], axis=0)
+             elif isinstance(v, list):
+                 self._stats[k] = self._stats[k] + deepcopy(v)
+             else:
+                 raise TypeError(f"MaskData key {k} has an unsupported type {type(v)}.")
+
+     def to_numpy(self) -> None:
+         for k, v in self._stats.items():
+             if isinstance(v, torch.Tensor):
+                 self._stats[k] = v.float().detach().cpu().numpy()
+
+
+ def is_box_near_crop_edge(
+     boxes: torch.Tensor, crop_box: List[int], orig_box: List[int], atol: float = 20.0
+ ) -> torch.Tensor:
+     """Filter masks at the edge of a crop, but not at the edge of the original image."""
+     crop_box_torch = torch.as_tensor(crop_box, dtype=torch.float, device=boxes.device)
+     orig_box_torch = torch.as_tensor(orig_box, dtype=torch.float, device=boxes.device)
+     boxes = uncrop_boxes_xyxy(boxes, crop_box).float()
+     near_crop_edge = torch.isclose(boxes, crop_box_torch[None, :], atol=atol, rtol=0)
+     near_image_edge = torch.isclose(boxes, orig_box_torch[None, :], atol=atol, rtol=0)
+     near_crop_edge = torch.logical_and(near_crop_edge, ~near_image_edge)
+     return torch.any(near_crop_edge, dim=1)
+
+
+ def box_xyxy_to_xywh(box_xyxy: torch.Tensor) -> torch.Tensor:
+     box_xywh = deepcopy(box_xyxy)
+     box_xywh[2] = box_xywh[2] - box_xywh[0]
+     box_xywh[3] = box_xywh[3] - box_xywh[1]
+     return box_xywh
+
+
+ def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
+     assert len(args) > 0 and all(
+         len(a) == len(args[0]) for a in args
+     ), "Batched iteration must have inputs of all the same size."
+     n_batches = len(args[0]) // batch_size + int(len(args[0]) % batch_size != 0)
+     for b in range(n_batches):
+         yield [arg[b * batch_size : (b + 1) * batch_size] for arg in args]
+
+
+ def mask_to_rle_pytorch(tensor: torch.Tensor) -> List[Dict[str, Any]]:
+     """
+     Encodes masks to an uncompressed RLE, in the format expected by
+     pycoco tools.
+     """
+     # Put in fortran order and flatten h,w
+     b, h, w = tensor.shape
+     tensor = tensor.permute(0, 2, 1).flatten(1)
+
+     # Compute change indices
+     diff = tensor[:, 1:] ^ tensor[:, :-1]
+     change_indices = diff.nonzero()
+
+     # Encode run length
+     out = []
+     for i in range(b):
+         cur_idxs = change_indices[change_indices[:, 0] == i, 1]
+         cur_idxs = torch.cat(
+             [
+                 torch.tensor([0], dtype=cur_idxs.dtype, device=cur_idxs.device),
+                 cur_idxs + 1,
+                 torch.tensor([h * w], dtype=cur_idxs.dtype, device=cur_idxs.device),
+             ]
+         )
+         btw_idxs = cur_idxs[1:] - cur_idxs[:-1]
+         counts = [] if tensor[i, 0] == 0 else [0]
+         counts.extend(btw_idxs.detach().cpu().tolist())
+         out.append({"size": [h, w], "counts": counts})
+     return out
+
+
+ def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray:
+     """Compute a binary mask from an uncompressed RLE."""
+     h, w = rle["size"]
+     mask = np.empty(h * w, dtype=bool)
+     idx = 0
+     parity = False
+     for count in rle["counts"]:
+         mask[idx : idx + count] = parity
+         idx += count
+         parity ^= True
+     mask = mask.reshape(w, h)
+     return mask.transpose()  # Put in C order
+
+
+ def area_from_rle(rle: Dict[str, Any]) -> int:
+     return sum(rle["counts"][1::2])
+
+
+ def calculate_stability_score(
+     masks: torch.Tensor, mask_threshold: float, threshold_offset: float
+ ) -> torch.Tensor:
+     """
+     Computes the stability score for a batch of masks. The stability
+     score is the IoU between the binary masks obtained by thresholding
+     the predicted mask logits at high and low values.
+     """
+     # One mask is always contained inside the other.
+     # Save memory by preventing unnecessary cast to torch.int64
+     intersections = (
+         (masks > (mask_threshold + threshold_offset))
+         .sum(-1, dtype=torch.int16)
+         .sum(-1, dtype=torch.int32)
+     )
+     unions = (
+         (masks > (mask_threshold - threshold_offset))
+         .sum(-1, dtype=torch.int16)
+         .sum(-1, dtype=torch.int32)
+     )
+     return intersections / unions
+
+
+ def build_point_grid(n_per_side: int) -> np.ndarray:
+     """Generates a 2D grid of points evenly spaced in [0,1]x[0,1]."""
+     offset = 1 / (2 * n_per_side)
+     points_one_side = np.linspace(offset, 1 - offset, n_per_side)
+     points_x = np.tile(points_one_side[None, :], (n_per_side, 1))
+     points_y = np.tile(points_one_side[:, None], (1, n_per_side))
+     points = np.stack([points_x, points_y], axis=-1).reshape(-1, 2)
+     return points
+
+
+ def build_all_layer_point_grids(
+     n_per_side: int, n_layers: int, scale_per_layer: int
+ ) -> List[np.ndarray]:
+     """Generates point grids for all crop layers."""
+     points_by_layer = []
+     for i in range(n_layers + 1):
+         n_points = int(n_per_side / (scale_per_layer**i))
+         points_by_layer.append(build_point_grid(n_points))
+     return points_by_layer
+
+
+ def generate_crop_boxes(
+     im_size: Tuple[int, ...], n_layers: int, overlap_ratio: float
+ ) -> Tuple[List[List[int]], List[int]]:
+     """
+     Generates a list of crop boxes of different sizes. Each layer
+     has (2**i)**2 boxes for the ith layer.
+     """
+     crop_boxes, layer_idxs = [], []
+     im_h, im_w = im_size
+     short_side = min(im_h, im_w)
+
+     # Original image
+     crop_boxes.append([0, 0, im_w, im_h])
+     layer_idxs.append(0)
+
+     def crop_len(orig_len, n_crops, overlap):
+         return int(math.ceil((overlap * (n_crops - 1) + orig_len) / n_crops))
+
+     for i_layer in range(n_layers):
+         n_crops_per_side = 2 ** (i_layer + 1)
+         overlap = int(overlap_ratio * short_side * (2 / n_crops_per_side))
+
+         crop_w = crop_len(im_w, n_crops_per_side, overlap)
+         crop_h = crop_len(im_h, n_crops_per_side, overlap)
+
+         crop_box_x0 = [int((crop_w - overlap) * i) for i in range(n_crops_per_side)]
+         crop_box_y0 = [int((crop_h - overlap) * i) for i in range(n_crops_per_side)]
+
+         # Crops in XYWH format
+         for x0, y0 in product(crop_box_x0, crop_box_y0):
+             box = [x0, y0, min(x0 + crop_w, im_w), min(y0 + crop_h, im_h)]
+             crop_boxes.append(box)
+             layer_idxs.append(i_layer + 1)
+
+     return crop_boxes, layer_idxs
+
+
+ def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+     x0, y0, _, _ = crop_box
+     offset = torch.tensor([[x0, y0, x0, y0]], device=boxes.device)
+     # Check if boxes has a channel dimension
+     if len(boxes.shape) == 3:
+         offset = offset.unsqueeze(1)
+     return boxes + offset
+
+
+ def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+     x0, y0, _, _ = crop_box
+     offset = torch.tensor([[x0, y0]], device=points.device)
+     # Check if points has a channel dimension
+     if len(points.shape) == 3:
+         offset = offset.unsqueeze(1)
+     return points + offset
+
+
+ def uncrop_masks(
+     masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w: int
+ ) -> torch.Tensor:
+     x0, y0, x1, y1 = crop_box
+     if x0 == 0 and y0 == 0 and x1 == orig_w and y1 == orig_h:
+         return masks
+     # Coordinate transform masks
+     pad_x, pad_y = orig_w - (x1 - x0), orig_h - (y1 - y0)
+     pad = (x0, pad_x - x0, y0, pad_y - y0)
+     return torch.nn.functional.pad(masks, pad, value=0)
+
+
+ def remove_small_regions(
+     mask: np.ndarray, area_thresh: float, mode: str
+ ) -> Tuple[np.ndarray, bool]:
+     """
+     Removes small disconnected regions and holes in a mask. Returns the
+     mask and an indicator of if the mask has been modified.
+     """
+     import cv2  # type: ignore
+
+     assert mode in ["holes", "islands"]
+     correct_holes = mode == "holes"
+     working_mask = (correct_holes ^ mask).astype(np.uint8)
+     n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
+     sizes = stats[:, -1][1:]  # Row 0 is background label
+     small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
+     if len(small_regions) == 0:
+         return mask, False
+     fill_labels = [0] + small_regions
+     if not correct_holes:
+         fill_labels = [i for i in range(n_labels) if i not in fill_labels]
+         # If every region is below threshold, keep largest
+         if len(fill_labels) == 0:
+             fill_labels = [int(np.argmax(sizes)) + 1]
+     mask = np.isin(regions, fill_labels)
+     return mask, True
+
+
+ def coco_encode_rle(uncompressed_rle: Dict[str, Any]) -> Dict[str, Any]:
+     from pycocotools import mask as mask_utils  # type: ignore
+
+     h, w = uncompressed_rle["size"]
+     rle = mask_utils.frPyObjects(uncompressed_rle, h, w)
+     rle["counts"] = rle["counts"].decode("utf-8")  # Necessary to serialize with json
+     return rle
+
+
+ def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor:
+     """
+     Calculates boxes in XYXY format around masks. Return [0,0,0,0] for
+     an empty mask. For input shape C1xC2x...xHxW, the output shape is C1xC2x...x4.
+     """
+     # torch.max below raises an error on empty inputs, just skip in this case
+     if torch.numel(masks) == 0:
+         return torch.zeros(*masks.shape[:-2], 4, device=masks.device)
+
+     # Normalize shape to CxHxW
+     shape = masks.shape
+     h, w = shape[-2:]
+     if len(shape) > 2:
+         masks = masks.flatten(0, -3)
+     else:
+         masks = masks.unsqueeze(0)
+
+     # Get top and bottom edges
+     in_height, _ = torch.max(masks, dim=-1)
+     in_height_coords = in_height * torch.arange(h, device=in_height.device)[None, :]
+     bottom_edges, _ = torch.max(in_height_coords, dim=-1)
+     in_height_coords = in_height_coords + h * (~in_height)
+     top_edges, _ = torch.min(in_height_coords, dim=-1)
+
+     # Get left and right edges
+     in_width, _ = torch.max(masks, dim=-2)
+     in_width_coords = in_width * torch.arange(w, device=in_width.device)[None, :]
+     right_edges, _ = torch.max(in_width_coords, dim=-1)
+     in_width_coords = in_width_coords + w * (~in_width)
+     left_edges, _ = torch.min(in_width_coords, dim=-1)
+
+     # If the mask is empty the right edge will be to the left of the left edge.
+     # Replace these boxes with [0, 0, 0, 0]
+     empty_filter = (right_edges < left_edges) | (bottom_edges < top_edges)
+     out = torch.stack([left_edges, top_edges, right_edges, bottom_edges], dim=-1)
+     out = out * (~empty_filter).unsqueeze(-1)
+
+     # Return to original shape
+     if len(shape) > 2:
+         out = out.reshape(*shape[:-2], 4)
+     else:
+         out = out[0]
+
+     return out
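
As a quick illustration of how the RLE helpers above fit together, here is a hedged sketch (not part of the package): it round-trips a random boolean mask through `mask_to_rle_pytorch` and `rle_to_mask`, assuming the bundled `sam2` package is importable; the random mask is purely illustrative.

```python
# Sketch: round-trip boolean masks through the uncompressed RLE helpers above.
import torch
from sam2.utils.amg import (
    area_from_rle,
    batched_mask_to_box,
    mask_to_rle_pytorch,
    rle_to_mask,
)

masks = torch.rand(2, 32, 32) > 0.5      # (B, H, W) boolean masks, made up for the example
rles = mask_to_rle_pytorch(masks)        # uncompressed RLE, one dict per mask
recovered = rle_to_mask(rles[0])         # back to an (H, W) numpy bool array

assert (recovered == masks[0].numpy()).all()  # encode/decode is lossless
print(area_from_rle(rles[0]))                 # foreground pixel count of the first mask
print(batched_mask_to_box(masks))             # XYXY boxes, shape (B, 4)
```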
sam2/utils/misc.py ADDED
@@ -0,0 +1,349 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import os
+ import warnings
+ from threading import Thread
+
+ import numpy as np
+ import torch
+ from PIL import Image
+ from tqdm import tqdm
+
+
+ def get_sdpa_settings():
+     if torch.cuda.is_available():
+         old_gpu = torch.cuda.get_device_properties(0).major < 7
+         # only use Flash Attention on Ampere (8.0) or newer GPUs
+         use_flash_attn = torch.cuda.get_device_properties(0).major >= 8
+         if not use_flash_attn:
+             warnings.warn(
+                 "Flash Attention is disabled as it requires a GPU with Ampere (8.0) CUDA capability.",
+                 category=UserWarning,
+                 stacklevel=2,
+             )
+         # keep math kernel for PyTorch versions before 2.2 (Flash Attention v2 is only
+         # available on PyTorch 2.2+, while Flash Attention v1 cannot handle all cases)
+         pytorch_version = tuple(int(v) for v in torch.__version__.split(".")[:2])
+         if pytorch_version < (2, 2):
+             warnings.warn(
+                 f"You are using PyTorch {torch.__version__} without Flash Attention v2 support. "
+                 "Consider upgrading to PyTorch 2.2+ for Flash Attention v2 (which could be faster).",
+                 category=UserWarning,
+                 stacklevel=2,
+             )
+         math_kernel_on = pytorch_version < (2, 2) or not use_flash_attn
+     else:
+         old_gpu = True
+         use_flash_attn = False
+         math_kernel_on = True
+
+     return old_gpu, use_flash_attn, math_kernel_on
+
+
+ def get_connected_components(mask):
+     """
+     Get the connected components (8-connectivity) of binary masks of shape (N, 1, H, W).
+
+     Inputs:
+     - mask: A binary mask tensor of shape (N, 1, H, W), where 1 is foreground and 0 is
+       background.
+
+     Outputs:
+     - labels: A tensor of shape (N, 1, H, W) containing the connected component labels
+       for foreground pixels and 0 for background pixels.
+     - counts: A tensor of shape (N, 1, H, W) containing the area of the connected
+       components for foreground pixels and 0 for background pixels.
+     """
+     from sam2 import _C
+
+     return _C.get_connected_componnets(mask.to(torch.uint8).contiguous())
+
+
+ def mask_to_box(masks: torch.Tensor):
+     """
+     compute bounding box given an input mask
+
+     Inputs:
+     - masks: [B, 1, H, W] masks, dtype=torch.Tensor
+
+     Returns:
+     - box_coords: [B, 1, 4], contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.Tensor
+     """
+     B, _, h, w = masks.shape
+     device = masks.device
+     xs = torch.arange(w, device=device, dtype=torch.int32)
+     ys = torch.arange(h, device=device, dtype=torch.int32)
+     grid_xs, grid_ys = torch.meshgrid(xs, ys, indexing="xy")
+     grid_xs = grid_xs[None, None, ...].expand(B, 1, h, w)
+     grid_ys = grid_ys[None, None, ...].expand(B, 1, h, w)
+     min_xs, _ = torch.min(torch.where(masks, grid_xs, w).flatten(-2), dim=-1)
+     max_xs, _ = torch.max(torch.where(masks, grid_xs, -1).flatten(-2), dim=-1)
+     min_ys, _ = torch.min(torch.where(masks, grid_ys, h).flatten(-2), dim=-1)
+     max_ys, _ = torch.max(torch.where(masks, grid_ys, -1).flatten(-2), dim=-1)
+     bbox_coords = torch.stack((min_xs, min_ys, max_xs, max_ys), dim=-1)
+
+     return bbox_coords
+
+
+ def _load_img_as_tensor(img_path, image_size):
+     img_pil = Image.open(img_path)
+     img_np = np.array(img_pil.convert("RGB").resize((image_size, image_size)))
+     if img_np.dtype == np.uint8:  # np.uint8 is expected for JPEG images
+         img_np = img_np / 255.0
+     else:
+         raise RuntimeError(f"Unknown image dtype: {img_np.dtype} on {img_path}")
+     img = torch.from_numpy(img_np).permute(2, 0, 1)
+     video_width, video_height = img_pil.size  # the original video size
+     return img, video_height, video_width
+
+
+ class AsyncVideoFrameLoader:
+     """
+     A list of video frames to be loaded asynchronously without blocking session start.
+     """
+
+     def __init__(
+         self,
+         img_paths,
+         image_size,
+         offload_video_to_cpu,
+         img_mean,
+         img_std,
+         compute_device,
+     ):
+         self.img_paths = img_paths
+         self.image_size = image_size
+         self.offload_video_to_cpu = offload_video_to_cpu
+         self.img_mean = img_mean
+         self.img_std = img_std
+         # items in `self.images` will be loaded asynchronously
+         self.images = [None] * len(img_paths)
+         # catch and raise any exceptions in the async loading thread
+         self.exception = None
+         # video_height and video_width will be filled when loading the first image
+         self.video_height = None
+         self.video_width = None
+         self.compute_device = compute_device
+
+         # load the first frame to fill video_height and video_width and also
+         # to cache it (since it's most likely where the user will click)
+         self.__getitem__(0)
+
+         # load the rest of frames asynchronously without blocking the session start
+         def _load_frames():
+             try:
+                 for n in tqdm(range(len(self.images)), desc="frame loading (JPEG)"):
+                     self.__getitem__(n)
+             except Exception as e:
+                 self.exception = e
+
+         self.thread = Thread(target=_load_frames, daemon=True)
+         self.thread.start()
+
+     def __getitem__(self, index):
+         if self.exception is not None:
+             raise RuntimeError("Failure in frame loading thread") from self.exception
+
+         img = self.images[index]
+         if img is not None:
+             return img
+
+         img, video_height, video_width = _load_img_as_tensor(
+             self.img_paths[index], self.image_size
+         )
+         self.video_height = video_height
+         self.video_width = video_width
+         # normalize by mean and std
+         img -= self.img_mean
+         img /= self.img_std
+         if not self.offload_video_to_cpu:
+             img = img.to(self.compute_device, non_blocking=True)
+         self.images[index] = img
+         return img
+
+     def __len__(self):
+         return len(self.images)
+
+
+ def load_video_frames(
+     video_path,
+     image_size,
+     offload_video_to_cpu,
+     img_mean=(0.485, 0.456, 0.406),
+     img_std=(0.229, 0.224, 0.225),
+     async_loading_frames=False,
+     compute_device=torch.device("cuda"),
+ ):
+     """
+     Load the video frames from video_path. The frames are resized to image_size as in
+     the model and are loaded to GPU if offload_video_to_cpu=False. This is used by the demo.
+     """
+     is_bytes = isinstance(video_path, bytes)
+     is_str = isinstance(video_path, str)
+     is_mp4_path = is_str and os.path.splitext(video_path)[-1] in [".mp4", ".MP4"]
+     if is_bytes or is_mp4_path:
+         return load_video_frames_from_video_file(
+             video_path=video_path,
+             image_size=image_size,
+             offload_video_to_cpu=offload_video_to_cpu,
+             img_mean=img_mean,
+             img_std=img_std,
+             compute_device=compute_device,
+         )
+     elif is_str and os.path.isdir(video_path):
+         return load_video_frames_from_jpg_images(
+             video_path=video_path,
+             image_size=image_size,
+             offload_video_to_cpu=offload_video_to_cpu,
+             img_mean=img_mean,
+             img_std=img_std,
+             async_loading_frames=async_loading_frames,
+             compute_device=compute_device,
+         )
+     else:
+         raise NotImplementedError(
+             "Only MP4 video and JPEG folder are supported at this moment"
+         )
+
+
+ def load_video_frames_from_jpg_images(
+     video_path,
+     image_size,
+     offload_video_to_cpu,
+     img_mean=(0.485, 0.456, 0.406),
+     img_std=(0.229, 0.224, 0.225),
+     async_loading_frames=False,
+     compute_device=torch.device("cuda"),
+ ):
+     """
+     Load the video frames from a directory of JPEG files ("<frame_index>.jpg" format).
+
+     The frames are resized to image_size x image_size and are loaded to GPU if
+     `offload_video_to_cpu` is `False` and to CPU if `offload_video_to_cpu` is `True`.
+
+     You can load a frame asynchronously by setting `async_loading_frames` to `True`.
+     """
+     if isinstance(video_path, str) and os.path.isdir(video_path):
+         jpg_folder = video_path
+     else:
+         raise NotImplementedError(
+             "Only JPEG frames are supported at this moment. For video files, you may use "
+             "ffmpeg (https://ffmpeg.org/) to extract frames into a folder of JPEG files, such as \n"
+             "```\n"
+             "ffmpeg -i <your_video>.mp4 -q:v 2 -start_number 0 <output_dir>/'%05d.jpg'\n"
+             "```\n"
+             "where `-q:v` generates high-quality JPEG frames and `-start_number 0` asks "
+             "ffmpeg to start the JPEG file from 00000.jpg."
+         )
+
+     frame_names = [
+         p
+         for p in os.listdir(jpg_folder)
+         if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]
+     ]
+     frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))
+     num_frames = len(frame_names)
+     if num_frames == 0:
+         raise RuntimeError(f"no images found in {jpg_folder}")
+     img_paths = [os.path.join(jpg_folder, frame_name) for frame_name in frame_names]
+     img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
+     img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]
+
+     if async_loading_frames:
+         lazy_images = AsyncVideoFrameLoader(
+             img_paths,
+             image_size,
+             offload_video_to_cpu,
+             img_mean,
+             img_std,
+             compute_device,
+         )
+         return lazy_images, lazy_images.video_height, lazy_images.video_width
+
+     images = torch.zeros(num_frames, 3, image_size, image_size, dtype=torch.float32)
+     for n, img_path in enumerate(tqdm(img_paths, desc="frame loading (JPEG)")):
+         images[n], video_height, video_width = _load_img_as_tensor(img_path, image_size)
+     if not offload_video_to_cpu:
+         images = images.to(compute_device)
+         img_mean = img_mean.to(compute_device)
+         img_std = img_std.to(compute_device)
+     # normalize by mean and std
+     images -= img_mean
+     images /= img_std
+     return images, video_height, video_width
+
+
+ def load_video_frames_from_video_file(
+     video_path,
+     image_size,
+     offload_video_to_cpu,
+     img_mean=(0.485, 0.456, 0.406),
+     img_std=(0.229, 0.224, 0.225),
+     compute_device=torch.device("cuda"),
+ ):
+     """Load the video frames from a video file."""
+     import decord
+
+     img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
+     img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]
+     # Get the original video height and width
+     decord.bridge.set_bridge("torch")
+     video_height, video_width, _ = decord.VideoReader(video_path).next().shape
+     # Iterate over all frames in the video
+     images = []
+     for frame in decord.VideoReader(video_path, width=image_size, height=image_size):
+         images.append(frame.permute(2, 0, 1))
+
+     images = torch.stack(images, dim=0).float() / 255.0
+     if not offload_video_to_cpu:
+         images = images.to(compute_device)
+         img_mean = img_mean.to(compute_device)
+         img_std = img_std.to(compute_device)
+     # normalize by mean and std
+     images -= img_mean
+     images /= img_std
+     return images, video_height, video_width
+
+
+ def fill_holes_in_mask_scores(mask, max_area):
+     """
+     A post processor to fill small holes in mask scores with area under `max_area`.
+     """
+     # Holes are those connected components in background with area <= self.max_area
+     # (background regions are those with mask scores <= 0)
+     assert max_area > 0, "max_area must be positive"
+
+     input_mask = mask
+     try:
+         labels, areas = get_connected_components(mask <= 0)
+         is_hole = (labels > 0) & (areas <= max_area)
+         # We fill holes with a small positive mask score (0.1) to change them to foreground.
+         mask = torch.where(is_hole, 0.1, mask)
+     except Exception as e:
+         # Skip the post-processing step on removing small holes if the CUDA kernel fails
+         warnings.warn(
+             f"{e}\n\nSkipping the post-processing step due to the error above. You can "
+             "still use SAM 2 and it's OK to ignore the error above, although some post-processing "
+             "functionality may be limited (which doesn't affect the results in most cases; see "
+             "https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).",
+             category=UserWarning,
+             stacklevel=2,
+         )
+         mask = input_mask
+
+     return mask
+
+
+ def concat_points(old_point_inputs, new_points, new_labels):
+     """Add new points and labels to previous point inputs (add at the end)."""
+     if old_point_inputs is None:
+         points, labels = new_points, new_labels
+     else:
+         points = torch.cat([old_point_inputs["point_coords"], new_points], dim=1)
+         labels = torch.cat([old_point_inputs["point_labels"], new_labels], dim=1)
+
+     return {"point_coords": points, "point_labels": labels}