dgenerate-ultralytics-headless 8.3.187__py3-none-any.whl → 8.3.190__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {dgenerate_ultralytics_headless-8.3.187.dist-info → dgenerate_ultralytics_headless-8.3.190.dist-info}/METADATA +3 -2
  2. {dgenerate_ultralytics_headless-8.3.187.dist-info → dgenerate_ultralytics_headless-8.3.190.dist-info}/RECORD +38 -37
  3. ultralytics/__init__.py +1 -1
  4. ultralytics/data/utils.py +2 -2
  5. ultralytics/engine/exporter.py +9 -6
  6. ultralytics/engine/predictor.py +1 -1
  7. ultralytics/engine/results.py +5 -5
  8. ultralytics/engine/trainer.py +2 -0
  9. ultralytics/engine/validator.py +3 -1
  10. ultralytics/hub/__init__.py +6 -2
  11. ultralytics/hub/auth.py +2 -2
  12. ultralytics/hub/google/__init__.py +2 -2
  13. ultralytics/hub/session.py +3 -5
  14. ultralytics/hub/utils.py +5 -5
  15. ultralytics/models/rtdetr/val.py +3 -1
  16. ultralytics/models/yolo/detect/predict.py +2 -2
  17. ultralytics/models/yolo/detect/val.py +15 -4
  18. ultralytics/models/yolo/obb/val.py +5 -2
  19. ultralytics/models/yolo/segment/val.py +0 -3
  20. ultralytics/nn/autobackend.py +29 -36
  21. ultralytics/nn/modules/__init__.py +3 -3
  22. ultralytics/nn/modules/head.py +5 -1
  23. ultralytics/nn/tasks.py +2 -2
  24. ultralytics/utils/__init__.py +49 -14
  25. ultralytics/utils/benchmarks.py +12 -6
  26. ultralytics/utils/callbacks/platform.py +2 -1
  27. ultralytics/utils/checks.py +3 -3
  28. ultralytics/utils/downloads.py +46 -40
  29. ultralytics/utils/logger.py +7 -6
  30. ultralytics/utils/nms.py +346 -0
  31. ultralytics/utils/ops.py +80 -249
  32. ultralytics/utils/tal.py +1 -1
  33. ultralytics/utils/torch_utils.py +50 -47
  34. ultralytics/utils/tqdm.py +58 -59
  35. {dgenerate_ultralytics_headless-8.3.187.dist-info → dgenerate_ultralytics_headless-8.3.190.dist-info}/WHEEL +0 -0
  36. {dgenerate_ultralytics_headless-8.3.187.dist-info → dgenerate_ultralytics_headless-8.3.190.dist-info}/entry_points.txt +0 -0
  37. {dgenerate_ultralytics_headless-8.3.187.dist-info → dgenerate_ultralytics_headless-8.3.190.dist-info}/licenses/LICENSE +0 -0
  38. {dgenerate_ultralytics_headless-8.3.187.dist-info → dgenerate_ultralytics_headless-8.3.190.dist-info}/top_level.txt +0 -0
ultralytics/utils/ops.py CHANGED
@@ -11,8 +11,7 @@ import numpy as np
 import torch
 import torch.nn.functional as F
 
-from ultralytics.utils import LOGGER
-from ultralytics.utils.metrics import batch_probiou
+from ultralytics.utils import NOT_MACOS14
 
 
 class Profile(contextlib.ContextDecorator):
@@ -122,20 +121,18 @@ def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding: bool = True):
     """
     if ratio_pad is None:  # calculate from img0_shape
         gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
-        pad = (
-            round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1),
-            round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1),
-        )  # wh padding
+        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
+        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)
     else:
         gain = ratio_pad[0][0]
-        pad = ratio_pad[1]
+        pad_x, pad_y = ratio_pad[1]
 
     if padding:
-        boxes[..., 0] -= pad[0]  # x padding
-        boxes[..., 1] -= pad[1]  # y padding
+        boxes[..., 0] -= pad_x  # x padding
+        boxes[..., 1] -= pad_y  # y padding
         if not xywh:
-            boxes[..., 2] -= pad[0]  # x padding
-            boxes[..., 3] -= pad[1]  # y padding
+            boxes[..., 2] -= pad_x  # x padding
+            boxes[..., 3] -= pad_y  # y padding
     boxes[..., :4] /= gain
     return clip_boxes(boxes, img0_shape)
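A note on the refactor above: the letterbox math is unchanged. Boxes predicted on the padded img1 canvas are shifted by the padding offsets and divided by the resize gain to land on img0. A minimal numeric sketch of that mapping (standalone and illustrative only, not the library function):

# Map a box from a 640x640 letterboxed canvas back to a 480x640 original.
img1_shape = (640, 640)  # inference size (h, w)
img0_shape = (480, 640)  # original size (h, w)
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # 1.0
pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)  # 0
pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)  # 80
x1, y1, x2, y2 = 100.0, 120.0, 300.0, 400.0  # box on the padded canvas
box = ((x1 - pad_x) / gain, (y1 - pad_y) / gain, (x2 - pad_x) / gain, (y2 - pad_y) / gain)
print(box)  # (100.0, 40.0, 300.0, 320.0) in original-image pixels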
@@ -156,207 +153,32 @@ def make_divisible(x: int, divisor):
     return math.ceil(x / divisor) * divisor
 
 
-def nms_rotated(boxes, scores, threshold: float = 0.45, use_triu: bool = True):
-    """
-    Perform NMS on oriented bounding boxes using probiou and fast-nms.
-
-    Args:
-        boxes (torch.Tensor): Rotated bounding boxes with shape (N, 5) in xywhr format.
-        scores (torch.Tensor): Confidence scores with shape (N,).
-        threshold (float): IoU threshold for NMS.
-        use_triu (bool): Whether to use torch.triu operator for upper triangular matrix operations.
-
-    Returns:
-        (torch.Tensor): Indices of boxes to keep after NMS.
-    """
-    sorted_idx = torch.argsort(scores, descending=True)
-    boxes = boxes[sorted_idx]
-    ious = batch_probiou(boxes, boxes)
-    if use_triu:
-        ious = ious.triu_(diagonal=1)
-        # NOTE: handle the case when len(boxes) hence exportable by eliminating if-else condition
-        pick = torch.nonzero((ious >= threshold).sum(0) <= 0).squeeze_(-1)
-    else:
-        n = boxes.shape[0]
-        row_idx = torch.arange(n, device=boxes.device).view(-1, 1).expand(-1, n)
-        col_idx = torch.arange(n, device=boxes.device).view(1, -1).expand(n, -1)
-        upper_mask = row_idx < col_idx
-        ious = ious * upper_mask
-        # Zeroing these scores ensures the additional indices would not affect the final results
-        scores[~((ious >= threshold).sum(0) <= 0)] = 0
-        # NOTE: return indices with fixed length to avoid TFLite reshape error
-        pick = torch.topk(scores, scores.shape[0]).indices
-    return sorted_idx[pick]
-
-
-def non_max_suppression(
-    prediction,
-    conf_thres: float = 0.25,
-    iou_thres: float = 0.45,
-    classes=None,
-    agnostic: bool = False,
-    multi_label: bool = False,
-    labels=(),
-    max_det: int = 300,
-    nc: int = 0,  # number of classes (optional)
-    max_time_img: float = 0.05,
-    max_nms: int = 30000,
-    max_wh: int = 7680,
-    in_place: bool = True,
-    rotated: bool = False,
-    end2end: bool = False,
-    return_idxs: bool = False,
-):
-    """
-    Perform non-maximum suppression (NMS) on prediction results.
-
-    Applies NMS to filter overlapping bounding boxes based on confidence and IoU thresholds. Supports multiple
-    detection formats including standard boxes, rotated boxes, and masks.
-
-    Args:
-        prediction (torch.Tensor): Predictions with shape (batch_size, num_classes + 4 + num_masks, num_boxes)
-            containing boxes, classes, and optional masks.
-        conf_thres (float): Confidence threshold for filtering detections. Valid values are between 0.0 and 1.0.
-        iou_thres (float): IoU threshold for NMS filtering. Valid values are between 0.0 and 1.0.
-        classes (List[int], optional): List of class indices to consider. If None, all classes are considered.
-        agnostic (bool): Whether to perform class-agnostic NMS.
-        multi_label (bool): Whether each box can have multiple labels.
-        labels (List[List[Union[int, float, torch.Tensor]]]): A priori labels for each image.
-        max_det (int): Maximum number of detections to keep per image.
-        nc (int): Number of classes. Indices after this are considered masks.
-        max_time_img (float): Maximum time in seconds for processing one image.
-        max_nms (int): Maximum number of boxes for torchvision.ops.nms().
-        max_wh (int): Maximum box width and height in pixels.
-        in_place (bool): Whether to modify the input prediction tensor in place.
-        rotated (bool): Whether to handle Oriented Bounding Boxes (OBB).
-        end2end (bool): Whether the model is end-to-end and doesn't require NMS.
-        return_idxs (bool): Whether to return the indices of kept detections.
-
-    Returns:
-        output (List[torch.Tensor]): List of detections per image with shape (num_boxes, 6 + num_masks)
-            containing (x1, y1, x2, y2, confidence, class, mask1, mask2, ...).
-        keepi (List[torch.Tensor]): Indices of kept detections if return_idxs=True.
-    """
-    import torchvision  # scope for faster 'import ultralytics'
-
-    # Checks
-    assert 0 <= conf_thres <= 1, f"Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0"
-    assert 0 <= iou_thres <= 1, f"Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0"
-    if isinstance(prediction, (list, tuple)):  # YOLOv8 model in validation model, output = (inference_out, loss_out)
-        prediction = prediction[0]  # select only inference output
-    if classes is not None:
-        classes = torch.tensor(classes, device=prediction.device)
-
-    if prediction.shape[-1] == 6 or end2end:  # end-to-end model (BNC, i.e. 1,300,6)
-        output = [pred[pred[:, 4] > conf_thres][:max_det] for pred in prediction]
-        if classes is not None:
-            output = [pred[(pred[:, 5:6] == classes).any(1)] for pred in output]
-        return output
-
-    bs = prediction.shape[0]  # batch size (BCN, i.e. 1,84,6300)
-    nc = nc or (prediction.shape[1] - 4)  # number of classes
-    extra = prediction.shape[1] - nc - 4  # number of extra info
-    mi = 4 + nc  # mask start index
-    xc = prediction[:, 4:mi].amax(1) > conf_thres  # candidates
-    xinds = torch.stack([torch.arange(len(i), device=prediction.device) for i in xc])[..., None]  # to track idxs
-
-    # Settings
-    # min_wh = 2  # (pixels) minimum box width and height
-    time_limit = 2.0 + max_time_img * bs  # seconds to quit after
-    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
-
-    prediction = prediction.transpose(-1, -2)  # shape(1,84,6300) to shape(1,6300,84)
-    if not rotated:
-        if in_place:
-            prediction[..., :4] = xywh2xyxy(prediction[..., :4])  # xywh to xyxy
-        else:
-            prediction = torch.cat((xywh2xyxy(prediction[..., :4]), prediction[..., 4:]), dim=-1)  # xywh to xyxy
-
-    t = time.time()
-    output = [torch.zeros((0, 6 + extra), device=prediction.device)] * bs
-    keepi = [torch.zeros((0, 1), device=prediction.device)] * bs  # to store the kept idxs
-    for xi, (x, xk) in enumerate(zip(prediction, xinds)):  # image index, (preds, preds indices)
-        # Apply constraints
-        # x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0  # width-height
-        filt = xc[xi]  # confidence
-        x, xk = x[filt], xk[filt]
-
-        # Cat apriori labels if autolabelling
-        if labels and len(labels[xi]) and not rotated:
-            lb = labels[xi]
-            v = torch.zeros((len(lb), nc + extra + 4), device=x.device)
-            v[:, :4] = xywh2xyxy(lb[:, 1:5])  # box
-            v[range(len(lb)), lb[:, 0].long() + 4] = 1.0  # cls
-            x = torch.cat((x, v), 0)
-
-        # If none remain process next image
-        if not x.shape[0]:
-            continue
-
-        # Detections matrix nx6 (xyxy, conf, cls)
-        box, cls, mask = x.split((4, nc, extra), 1)
-
-        if multi_label:
-            i, j = torch.where(cls > conf_thres)
-            x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
-            xk = xk[i]
-        else:  # best class only
-            conf, j = cls.max(1, keepdim=True)
-            filt = conf.view(-1) > conf_thres
-            x = torch.cat((box, conf, j.float(), mask), 1)[filt]
-            xk = xk[filt]
-
-        # Filter by class
-        if classes is not None:
-            filt = (x[:, 5:6] == classes).any(1)
-            x, xk = x[filt], xk[filt]
-
-        # Check shape
-        n = x.shape[0]  # number of boxes
-        if not n:  # no boxes
-            continue
-        if n > max_nms:  # excess boxes
-            filt = x[:, 4].argsort(descending=True)[:max_nms]  # sort by confidence and remove excess boxes
-            x, xk = x[filt], xk[filt]
-
-        # Batched NMS
-        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
-        scores = x[:, 4]  # scores
-        if rotated:
-            boxes = torch.cat((x[:, :2] + c, x[:, 2:4], x[:, -1:]), dim=-1)  # xywhr
-            i = nms_rotated(boxes, scores, iou_thres)
-        else:
-            boxes = x[:, :4] + c  # boxes (offset by class)
-            i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
-        i = i[:max_det]  # limit detections
-
-        output[xi], keepi[xi] = x[i], xk[i].reshape(-1)
-        if (time.time() - t) > time_limit:
-            LOGGER.warning(f"NMS time limit {time_limit:.3f}s exceeded")
-            break  # time limit exceeded
-
-    return (output, keepi) if return_idxs else output
-
-
 def clip_boxes(boxes, shape):
     """
     Clip bounding boxes to image boundaries.
 
     Args:
         boxes (torch.Tensor | np.ndarray): Bounding boxes to clip.
-        shape (tuple): Image shape as (height, width).
+        shape (tuple): Image shape as HWC or HW (supports both).
 
     Returns:
         (torch.Tensor | np.ndarray): Clipped bounding boxes.
     """
-    if isinstance(boxes, torch.Tensor):  # faster individually (WARNING: inplace .clamp_() Apple MPS bug)
-        boxes[..., 0] = boxes[..., 0].clamp(0, shape[1])  # x1
-        boxes[..., 1] = boxes[..., 1].clamp(0, shape[0])  # y1
-        boxes[..., 2] = boxes[..., 2].clamp(0, shape[1])  # x2
-        boxes[..., 3] = boxes[..., 3].clamp(0, shape[0])  # y2
+    h, w = shape[:2]  # supports both HWC or HW shapes
+    if isinstance(boxes, torch.Tensor):  # faster individually
+        if NOT_MACOS14:
+            boxes[..., 0].clamp_(0, w)  # x1
+            boxes[..., 1].clamp_(0, h)  # y1
+            boxes[..., 2].clamp_(0, w)  # x2
+            boxes[..., 3].clamp_(0, h)  # y2
+        else:  # Apple macOS14 MPS bug https://github.com/ultralytics/ultralytics/pull/21878
+            boxes[..., 0] = boxes[..., 0].clamp(0, w)
+            boxes[..., 1] = boxes[..., 1].clamp(0, h)
+            boxes[..., 2] = boxes[..., 2].clamp(0, w)
+            boxes[..., 3] = boxes[..., 3].clamp(0, h)
     else:  # np.array (faster grouped)
-        boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1])  # x1, x2
-        boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0])  # y1, y2
+        boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, w)  # x1, x2
+        boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, h)  # y1, y2
     return boxes
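The two functions removed above were relocated, not dropped: the file list shows a new ultralytics/utils/nms.py gaining 346 lines in this release, which evidently now hosts the NMS code. The core trick in the rotated fast-NMS path is upper-triangular suppression: with boxes sorted by descending score, a box survives only if no higher-scoring box overlaps it above the threshold. A minimal sketch of that pattern using a plain precomputed IoU matrix (hypothetical standalone code, not the library's probiou-based implementation):

import torch

def fast_nms_keep(ious: torch.Tensor, threshold: float = 0.45) -> torch.Tensor:
    """Keep indices from an (N, N) IoU matrix of boxes pre-sorted by descending score."""
    ious = ious.triu(diagonal=1)  # zero the diagonal and lower triangle
    keep = (ious >= threshold).sum(0) <= 0  # suppressed by no earlier (higher-scoring) box
    return torch.nonzero(keep).squeeze(-1)

iou = torch.tensor([[1.0, 0.6, 0.1], [0.6, 1.0, 0.2], [0.1, 0.2, 1.0]])
print(fast_nms_keep(iou))  # tensor([0, 2]): box 1 is suppressed by box 0 (IoU 0.6 >= 0.45)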
@@ -366,17 +188,22 @@ def clip_coords(coords, shape):
 
     Args:
         coords (torch.Tensor | np.ndarray): Line coordinates to clip.
-        shape (tuple): Image shape as (height, width).
+        shape (tuple): Image shape as HWC or HW (supports both).
 
     Returns:
         (torch.Tensor | np.ndarray): Clipped coordinates.
     """
-    if isinstance(coords, torch.Tensor):  # faster individually (WARNING: inplace .clamp_() Apple MPS bug)
-        coords[..., 0] = coords[..., 0].clamp(0, shape[1])  # x
-        coords[..., 1] = coords[..., 1].clamp(0, shape[0])  # y
-    else:  # np.array (faster grouped)
-        coords[..., 0] = coords[..., 0].clip(0, shape[1])  # x
-        coords[..., 1] = coords[..., 1].clip(0, shape[0])  # y
+    h, w = shape[:2]  # supports both HWC or HW shapes
+    if isinstance(coords, torch.Tensor):
+        if NOT_MACOS14:
+            coords[..., 0].clamp_(0, w)  # x
+            coords[..., 1].clamp_(0, h)  # y
+        else:  # Apple macOS14 MPS bug https://github.com/ultralytics/ultralytics/pull/21878
+            coords[..., 0] = coords[..., 0].clamp(0, w)
+            coords[..., 1] = coords[..., 1].clamp(0, h)
+    else:  # np.array
+        coords[..., 0] = coords[..., 0].clip(0, w)  # x
+        coords[..., 1] = coords[..., 1].clip(0, h)  # y
    return coords
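NOT_MACOS14, imported at the top of this file, gates the fast in-place .clamp_() calls: per the linked PR, in-place clamp on Apple's MPS backend misbehaves on macOS 14, so that platform falls back to out-of-place .clamp(). The flag's real definition lives in ultralytics/utils/__init__.py (also changed in this release); a hedged sketch of how such a constant can be derived, assuming it is a simple platform check:

import platform

# Assumed shape of the flag: True everywhere except macOS 14.x (illustrative, not the exact source).
MACOS = platform.system() == "Darwin"
MACOS_VERSION = platform.mac_ver()[0] if MACOS else ""
NOT_MACOS14 = not (MACOS and MACOS_VERSION.startswith("14."))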
@@ -389,32 +216,34 @@ def scale_image(masks, im0_shape, ratio_pad=None):
 
     Args:
         masks (np.ndarray): Resized and padded masks with shape [H, W, N] or [H, W, 3].
-        im0_shape (tuple): Original image shape as (height, width).
+        im0_shape (tuple): Original image shape as HWC or HW (supports both).
         ratio_pad (tuple, optional): Ratio and padding values as ((ratio_h, ratio_w), (pad_h, pad_w)).
 
     Returns:
         (np.ndarray): Rescaled masks with shape [H, W, N] matching original image dimensions.
     """
     # Rescale coordinates (xyxy) from im1_shape to im0_shape
-    im1_shape = masks.shape
-    if im1_shape[:2] == im0_shape[:2]:
+    im0_h, im0_w = im0_shape[:2]  # supports both HWC or HW shapes
+    im1_h, im1_w, _ = masks.shape
+    if im1_h == im0_h and im1_w == im0_w:
         return masks
+
     if ratio_pad is None:  # calculate from im0_shape
-        gain = min(im1_shape[0] / im0_shape[0], im1_shape[1] / im0_shape[1])  # gain = old / new
-        pad = (im1_shape[1] - im0_shape[1] * gain) / 2, (im1_shape[0] - im0_shape[0] * gain) / 2  # wh padding
+        gain = min(im1_h / im0_h, im1_w / im0_w)  # gain = old / new
+        pad = (im1_w - im0_w * gain) / 2, (im1_h - im0_h * gain) / 2  # wh padding
     else:
         pad = ratio_pad[1]
 
-    top, left = (int(round(pad[1] - 0.1)), int(round(pad[0] - 0.1)))
-    bottom, right = (
-        im1_shape[0] - int(round(pad[1] + 0.1)),
-        im1_shape[1] - int(round(pad[0] + 0.1)),
-    )
+    pad_w, pad_h = pad
+    top = int(round(pad_h - 0.1))
+    left = int(round(pad_w - 0.1))
+    bottom = im1_h - int(round(pad_h + 0.1))
+    right = im1_w - int(round(pad_w + 0.1))
 
     if len(masks.shape) < 2:
         raise ValueError(f'"len of masks shape" should be 2 or 3, but got {len(masks.shape)}')
     masks = masks[top:bottom, left:right]
-    masks = cv2.resize(masks, (im0_shape[1], im0_shape[0]))
+    masks = cv2.resize(masks, (im0_w, im0_h))
     if len(masks.shape) == 2:
         masks = masks[:, :, None]
 
@@ -434,10 +263,11 @@ def xyxy2xywh(x):
     """
     assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
     y = empty_like(x)  # faster than clone/copy
-    y[..., 0] = (x[..., 0] + x[..., 2]) / 2  # x center
-    y[..., 1] = (x[..., 1] + x[..., 3]) / 2  # y center
-    y[..., 2] = x[..., 2] - x[..., 0]  # width
-    y[..., 3] = x[..., 3] - x[..., 1]  # height
+    x1, y1, x2, y2 = x[..., 0], x[..., 1], x[..., 2], x[..., 3]
+    y[..., 0] = (x1 + x2) / 2  # x center
+    y[..., 1] = (y1 + y2) / 2  # y center
+    y[..., 2] = x2 - x1  # width
+    y[..., 3] = y2 - y1  # height
     return y
 
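The unpacked x1/y1/x2/y2 names make the corner-to-center arithmetic explicit: the center is the midpoint of opposite corners and the size is their difference. A quick numeric check (illustrative):

x1, y1, x2, y2 = 60.0, 40.0, 100.0, 120.0  # xyxy box
xc, yc = (x1 + x2) / 2, (y1 + y2) / 2  # 80.0, 80.0 center
w, h = x2 - x1, y2 - y1  # 40.0, 80.0 width and height
print(xc, yc, w, h)  # 80.0 80.0 40.0 80.0 in xywh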
@@ -478,10 +308,12 @@ def xywhn2xyxy(x, w: int = 640, h: int = 640, padw: int = 0, padh: int = 0):
     """
     assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
     y = empty_like(x)  # faster than clone/copy
-    y[..., 0] = w * (x[..., 0] - x[..., 2] / 2) + padw  # top left x
-    y[..., 1] = h * (x[..., 1] - x[..., 3] / 2) + padh  # top left y
-    y[..., 2] = w * (x[..., 0] + x[..., 2] / 2) + padw  # bottom right x
-    y[..., 3] = h * (x[..., 1] + x[..., 3] / 2) + padh  # bottom right y
+    xc, yc, xw, xh = x[..., 0], x[..., 1], x[..., 2], x[..., 3]
+    half_w, half_h = xw / 2, xh / 2
+    y[..., 0] = w * (xc - half_w) + padw  # top left x
+    y[..., 1] = h * (yc - half_h) + padh  # top left y
+    y[..., 2] = w * (xc + half_w) + padw  # bottom right x
+    y[..., 3] = h * (yc + half_h) + padh  # bottom right y
     return y
 
@@ -504,10 +336,11 @@ def xyxy2xywhn(x, w: int = 640, h: int = 640, clip: bool = False, eps: float = 0.0):
         x = clip_boxes(x, (h - eps, w - eps))
     assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
     y = empty_like(x)  # faster than clone/copy
-    y[..., 0] = ((x[..., 0] + x[..., 2]) / 2) / w  # x center
-    y[..., 1] = ((x[..., 1] + x[..., 3]) / 2) / h  # y center
-    y[..., 2] = (x[..., 2] - x[..., 0]) / w  # width
-    y[..., 3] = (x[..., 3] - x[..., 1]) / h  # height
+    x1, y1, x2, y2 = x[..., 0], x[..., 1], x[..., 2], x[..., 3]
+    y[..., 0] = ((x1 + x2) / 2) / w  # x center
+    y[..., 1] = ((y1 + y2) / 2) / h  # y center
+    y[..., 2] = (x2 - x1) / w  # width
+    y[..., 3] = (y2 - y1) / h  # height
     return y
 
@@ -756,19 +589,15 @@ def scale_masks(masks, shape, padding: bool = True):
     """
     mh, mw = masks.shape[2:]
     gain = min(mh / shape[0], mw / shape[1])  # gain = old / new
-    pad = [mw - shape[1] * gain, mh - shape[0] * gain]  # wh padding
+    pad_w = mw - shape[1] * gain
+    pad_h = mh - shape[0] * gain
     if padding:
-        pad[0] /= 2
-        pad[1] /= 2
-    top, left = (int(round(pad[1] - 0.1)), int(round(pad[0] - 0.1))) if padding else (0, 0)  # y, x
-    bottom, right = (
-        mh - int(round(pad[1] + 0.1)),
-        mw - int(round(pad[0] + 0.1)),
-    )
-    masks = masks[..., top:bottom, left:right]
-
-    masks = F.interpolate(masks, shape, mode="bilinear", align_corners=False)  # NCHW
-    return masks
+        pad_w /= 2
+        pad_h /= 2
+    top, left = (int(round(pad_h - 0.1)), int(round(pad_w - 0.1))) if padding else (0, 0)
+    bottom = mh - int(round(pad_h + 0.1))
+    right = mw - int(round(pad_w + 0.1))
+    return F.interpolate(masks[..., top:bottom, left:right], shape, mode="bilinear", align_corners=False)  # NCHW masks
 
 
 def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None, normalize: bool = False, padding: bool = True):
@@ -776,9 +605,9 @@ def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None, normalize: bool = False, padding: bool = True):
     Rescale segment coordinates from img1_shape to img0_shape.
 
     Args:
-        img1_shape (tuple): Shape of the source image.
+        img1_shape (tuple): Source image shape as HWC or HW (supports both).
         coords (torch.Tensor): Coordinates to scale with shape (N, 2).
-        img0_shape (tuple): Shape of the target image.
+        img0_shape (tuple): Image 0 shape as HWC or HW (supports both).
         ratio_pad (tuple, optional): Ratio and padding values as ((ratio_h, ratio_w), (pad_h, pad_w)).
         normalize (bool): Whether to normalize coordinates to range [0, 1].
         padding (bool): Whether coordinates are based on YOLO-style augmented images with padding.
@@ -786,9 +615,11 @@ def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None, normalize: bool = False, padding: bool = True):
     Returns:
         (torch.Tensor): Scaled coordinates.
     """
+    img0_h, img0_w = img0_shape[:2]  # supports both HWC or HW shapes
     if ratio_pad is None:  # calculate from img0_shape
-        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
-        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
+        img1_h, img1_w = img1_shape[:2]  # supports both HWC or HW shapes
+        gain = min(img1_h / img0_h, img1_w / img0_w)  # gain = old / new
+        pad = (img1_w - img0_w * gain) / 2, (img1_h - img0_h * gain) / 2  # wh padding
     else:
         gain = ratio_pad[0][0]
         pad = ratio_pad[1]
@@ -800,8 +631,8 @@ def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None, normalize: bool = False, padding: bool = True):
     coords[..., 1] /= gain
     coords = clip_coords(coords, img0_shape)
     if normalize:
-        coords[..., 0] /= img0_shape[1]  # width
-        coords[..., 1] /= img0_shape[0]  # height
+        coords[..., 0] /= img0_w  # width
+        coords[..., 1] /= img0_h  # height
     return coords
 
ultralytics/utils/tal.py CHANGED
@@ -387,7 +387,7 @@ def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
     if xywh:
         c_xy = (x1y1 + x2y2) / 2
         wh = x2y2 - x1y1
-        return torch.cat((c_xy, wh), dim)  # xywh bbox
+        return torch.cat([c_xy, wh], dim)  # xywh bbox
     return torch.cat((x1y1, x2y2), dim)  # xyxy bbox
 
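For context, dist2bbox decodes per-anchor distances (left, top, right, bottom) into box corners: x1y1 = anchor - lt and x2y2 = anchor + rb, with the xywh branch shown above returning midpoint and size instead; the tuple-to-list change in torch.cat is behavior-neutral. A one-anchor sketch of the decode (illustrative values):

import torch

anchor = torch.tensor([10.0, 10.0])  # anchor point (x, y)
lt = torch.tensor([3.0, 2.0])  # distances to left and top edges
rb = torch.tensor([5.0, 6.0])  # distances to right and bottom edges
x1y1, x2y2 = anchor - lt, anchor + rb  # (7., 8.) and (15., 16.)
c_xy, wh = (x1y1 + x2y2) / 2, x2y2 - x1y1
print(torch.cat([c_xy, wh], -1))  # tensor([11., 12.,  8.,  8.]) in xywh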
ultralytics/utils/torch_utils.py CHANGED
@@ -250,68 +250,71 @@ def time_sync():
 
 
 def fuse_conv_and_bn(conv, bn):
-    """Fuse Conv2d() and BatchNorm2d() layers."""
-    fusedconv = (
-        nn.Conv2d(
-            conv.in_channels,
-            conv.out_channels,
-            kernel_size=conv.kernel_size,
-            stride=conv.stride,
-            padding=conv.padding,
-            dilation=conv.dilation,
-            groups=conv.groups,
-            bias=True,
-        )
-        .requires_grad_(False)
-        .to(conv.weight.device)
-    )
+    """
+    Fuse Conv2d and BatchNorm2d layers for inference optimization.
+
+    Args:
+        conv (nn.Conv2d): Convolutional layer to fuse.
+        bn (nn.BatchNorm2d): Batch normalization layer to fuse.
+
+    Returns:
+        (nn.Conv2d): The fused convolutional layer with gradients disabled.
 
-    # Prepare filters
+    Example:
+        >>> conv = nn.Conv2d(3, 16, 3)
+        >>> bn = nn.BatchNorm2d(16)
+        >>> fused_conv = fuse_conv_and_bn(conv, bn)
+    """
+    # Compute fused weights
    w_conv = conv.weight.view(conv.out_channels, -1)
    w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
-    fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape))
+    conv.weight.data = torch.mm(w_bn, w_conv).view(conv.weight.shape)
 
-    # Prepare spatial bias
-    b_conv = (
-        torch.zeros(conv.weight.shape[0], dtype=conv.weight.dtype, device=conv.weight.device)
-        if conv.bias is None
-        else conv.bias
-    )
+    # Compute fused bias
+    b_conv = torch.zeros(conv.out_channels, device=conv.weight.device) if conv.bias is None else conv.bias
    b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
-    fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
+    fused_bias = torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn
+
+    if conv.bias is None:
+        conv.register_parameter("bias", nn.Parameter(fused_bias))
+    else:
+        conv.bias.data = fused_bias
 
-    return fusedconv
+    return conv.requires_grad_(False)
 
 
 def fuse_deconv_and_bn(deconv, bn):
-    """Fuse ConvTranspose2d() and BatchNorm2d() layers."""
-    fuseddconv = (
-        nn.ConvTranspose2d(
-            deconv.in_channels,
-            deconv.out_channels,
-            kernel_size=deconv.kernel_size,
-            stride=deconv.stride,
-            padding=deconv.padding,
-            output_padding=deconv.output_padding,
-            dilation=deconv.dilation,
-            groups=deconv.groups,
-            bias=True,
-        )
-        .requires_grad_(False)
-        .to(deconv.weight.device)
-    )
+    """
+    Fuse ConvTranspose2d and BatchNorm2d layers for inference optimization.
 
-    # Prepare filters
+    Args:
+        deconv (nn.ConvTranspose2d): Transposed convolutional layer to fuse.
+        bn (nn.BatchNorm2d): Batch normalization layer to fuse.
+
+    Returns:
+        (nn.ConvTranspose2d): The fused transposed convolutional layer with gradients disabled.
+
+    Example:
+        >>> deconv = nn.ConvTranspose2d(16, 3, 3)
+        >>> bn = nn.BatchNorm2d(3)
+        >>> fused_deconv = fuse_deconv_and_bn(deconv, bn)
+    """
+    # Compute fused weights
    w_deconv = deconv.weight.view(deconv.out_channels, -1)
    w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
-    fuseddconv.weight.copy_(torch.mm(w_bn, w_deconv).view(fuseddconv.weight.shape))
+    deconv.weight.data = torch.mm(w_bn, w_deconv).view(deconv.weight.shape)
 
-    # Prepare spatial bias
-    b_conv = torch.zeros(deconv.weight.shape[1], device=deconv.weight.device) if deconv.bias is None else deconv.bias
+    # Compute fused bias
+    b_conv = torch.zeros(deconv.out_channels, device=deconv.weight.device) if deconv.bias is None else deconv.bias
    b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
-    fuseddconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
+    fused_bias = torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn
+
+    if deconv.bias is None:
+        deconv.register_parameter("bias", nn.Parameter(fused_bias))
+    else:
+        deconv.bias.data = fused_bias
 
-    return fuseddconv
+    return deconv.requires_grad_(False)
 
 
 def model_info(model, detailed=False, verbose=True, imgsz=640):
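Because the rewritten fusers now fold BatchNorm statistics into the existing layer in place (rather than building a fresh nn.Conv2d or nn.ConvTranspose2d), an output-equivalence check is a useful sanity test. A short sketch, assuming the module path shown in the file list above; deepcopy keeps the unfused layer around for comparison, and the tolerance is illustrative:

import copy
import torch
import torch.nn as nn
from ultralytics.utils.torch_utils import fuse_conv_and_bn

conv = nn.Conv2d(3, 16, 3, bias=False)
bn = nn.BatchNorm2d(16).eval()  # eval() so running stats, not batch stats, are used
x = torch.randn(1, 3, 32, 32)

with torch.no_grad():
    y_ref = bn(conv(x))  # unfused reference output
    fused = fuse_conv_and_bn(copy.deepcopy(conv), bn)  # fuse a copy in place
    y_fused = fused(x)

print(torch.allclose(y_ref, y_fused, atol=1e-5))  # True: BN folded into conv weight and bias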