ultralytics 8.3.163__py3-none-any.whl → 8.3.164__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. ultralytics/__init__.py +1 -1
  2. ultralytics/data/augment.py +182 -153
  3. ultralytics/data/build.py +23 -3
  4. ultralytics/data/dataset.py +6 -2
  5. ultralytics/data/loaders.py +2 -2
  6. ultralytics/data/utils.py +9 -7
  7. ultralytics/engine/exporter.py +7 -3
  8. ultralytics/engine/results.py +42 -42
  9. ultralytics/models/fastsam/model.py +1 -1
  10. ultralytics/models/fastsam/predict.py +1 -1
  11. ultralytics/models/sam/model.py +4 -4
  12. ultralytics/models/sam/modules/blocks.py +5 -5
  13. ultralytics/models/sam/modules/memory_attention.py +19 -19
  14. ultralytics/models/sam/modules/transformer.py +24 -22
  15. ultralytics/models/yolo/detect/val.py +2 -2
  16. ultralytics/models/yolo/world/train_world.py +9 -1
  17. ultralytics/solutions/distance_calculation.py +1 -1
  18. ultralytics/solutions/instance_segmentation.py +2 -2
  19. ultralytics/solutions/object_blurrer.py +2 -2
  20. ultralytics/solutions/object_counter.py +2 -2
  21. ultralytics/solutions/object_cropper.py +1 -1
  22. ultralytics/solutions/queue_management.py +1 -1
  23. ultralytics/solutions/security_alarm.py +2 -2
  24. ultralytics/solutions/templates/similarity-search.html +0 -24
  25. ultralytics/solutions/vision_eye.py +1 -1
  26. ultralytics/utils/benchmarks.py +2 -2
  27. ultralytics/utils/export.py +0 -2
  28. ultralytics/utils/instance.py +32 -25
  29. ultralytics/utils/ops.py +8 -8
  30. {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/METADATA +1 -1
  31. {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/RECORD +35 -35
  32. {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/WHEEL +0 -0
  33. {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/entry_points.txt +0 -0
  34. {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/licenses/LICENSE +0 -0
  35. {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,7 @@ from PIL import Image
12
12
  from torch.nn import functional as F
13
13
 
14
14
  from ultralytics.data.utils import polygons2masks, polygons2masks_overlap
15
- from ultralytics.utils import LOGGER, colorstr
15
+ from ultralytics.utils import LOGGER, IterableSimpleNamespace, colorstr
16
16
  from ultralytics.utils.checks import check_version
17
17
  from ultralytics.utils.instance import Instances
18
18
  from ultralytics.utils.metrics import bbox_ioa
@@ -366,7 +366,7 @@ class BaseMixTransform:
366
366
  self.pre_transform = pre_transform
367
367
  self.p = p
368
368
 
369
- def __call__(self, labels):
369
+ def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
370
370
  """
371
371
  Apply pre-processing transforms and cutmix/mixup/mosaic transforms to labels data.
372
372
 
@@ -374,10 +374,10 @@ class BaseMixTransform:
374
374
  selects additional images, applies pre-transforms if specified, and then performs the mix transform.
375
375
 
376
376
  Args:
377
- labels (dict): A dictionary containing label data for an image.
377
+ labels (Dict[str, Any]): A dictionary containing label data for an image.
378
378
 
379
379
  Returns:
380
- (dict): The transformed labels dictionary, which may include mixed data from other images.
380
+ (Dict[str, Any]): The transformed labels dictionary, which may include mixed data from other images.
381
381
 
382
382
  Examples:
383
383
  >>> transform = BaseMixTransform(dataset, pre_transform=None, p=0.5)
@@ -406,7 +406,7 @@ class BaseMixTransform:
406
406
  labels.pop("mix_labels", None)
407
407
  return labels
408
408
 
409
- def _mix_transform(self, labels):
409
+ def _mix_transform(self, labels: Dict[str, Any]):
410
410
  """
411
411
  Apply CutMix, MixUp or Mosaic augmentation to the label dictionary.
412
412
 
@@ -414,11 +414,11 @@ class BaseMixTransform:
414
414
  Mosaic. It modifies the input label dictionary in-place with the augmented data.
415
415
 
416
416
  Args:
417
- labels (dict): A dictionary containing image and label data. Expected to have a 'mix_labels' key
417
+ labels (Dict[str, Any]): A dictionary containing image and label data. Expected to have a 'mix_labels' key
418
418
  with a list of additional image and label data for mixing.
419
419
 
420
420
  Returns:
421
- (dict): The modified labels dictionary with augmented data after applying the mix transform.
421
+ (Dict[str, Any]): The modified labels dictionary with augmented data after applying the mix transform.
422
422
 
423
423
  Examples:
424
424
  >>> transform = BaseMixTransform(dataset)
@@ -442,7 +442,7 @@ class BaseMixTransform:
442
442
  return random.randint(0, len(self.dataset) - 1)
443
443
 
444
444
  @staticmethod
445
- def _update_label_text(labels):
445
+ def _update_label_text(labels: Dict[str, Any]) -> Dict[str, Any]:
446
446
  """
447
447
  Update label text and class IDs for mixed labels in image augmentation.
448
448
 
@@ -450,11 +450,11 @@ class BaseMixTransform:
450
450
  creating a unified set of text labels and updating class IDs accordingly.
451
451
 
452
452
  Args:
453
- labels (dict): A dictionary containing label information, including 'texts' and 'cls' fields,
453
+ labels (Dict[str, Any]): A dictionary containing label information, including 'texts' and 'cls' fields,
454
454
  and optionally a 'mix_labels' field with additional label dictionaries.
455
455
 
456
456
  Returns:
457
- (dict): The updated labels dictionary with unified text labels and updated class IDs.
457
+ (Dict[str, Any]): The updated labels dictionary with unified text labels and updated class IDs.
458
458
 
459
459
  Examples:
460
460
  >>> labels = {
@@ -517,7 +517,7 @@ class Mosaic(BaseMixTransform):
517
517
  >>> augmented_labels = mosaic_aug(original_labels)
518
518
  """
519
519
 
520
- def __init__(self, dataset, imgsz=640, p=1.0, n=4):
520
+ def __init__(self, dataset, imgsz: int = 640, p: float = 1.0, n: int = 4):
521
521
  """
522
522
  Initialize the Mosaic augmentation object.
523
523
 
@@ -564,7 +564,7 @@ class Mosaic(BaseMixTransform):
564
564
  else: # select any images
565
565
  return [random.randint(0, len(self.dataset) - 1) for _ in range(self.n - 1)]
566
566
 
567
- def _mix_transform(self, labels):
567
+ def _mix_transform(self, labels: Dict[str, Any]) -> Dict[str, Any]:
568
568
  """
569
569
  Apply mosaic augmentation to the input image and labels.
570
570
 
@@ -573,12 +573,12 @@ class Mosaic(BaseMixTransform):
573
573
  mosaic augmentation.
574
574
 
575
575
  Args:
576
- labels (dict): A dictionary containing image data and annotations. Expected keys include:
576
+ labels (Dict[str, Any]): A dictionary containing image data and annotations. Expected keys include:
577
577
  - 'rect_shape': Should be None as rect and mosaic are mutually exclusive.
578
578
  - 'mix_labels': A list of dictionaries containing data for other images to be used in the mosaic.
579
579
 
580
580
  Returns:
581
- (dict): A dictionary containing the mosaic-augmented image and updated annotations.
581
+ (Dict[str, Any]): A dictionary containing the mosaic-augmented image and updated annotations.
582
582
 
583
583
  Raises:
584
584
  AssertionError: If 'rect_shape' is not None or if 'mix_labels' is empty.
@@ -593,7 +593,7 @@ class Mosaic(BaseMixTransform):
593
593
  self._mosaic3(labels) if self.n == 3 else self._mosaic4(labels) if self.n == 4 else self._mosaic9(labels)
594
594
  ) # This code is modified for mosaic3 method.
595
595
 
596
- def _mosaic3(self, labels):
596
+ def _mosaic3(self, labels: Dict[str, Any]) -> Dict[str, Any]:
597
597
  """
598
598
  Create a 1x3 image mosaic by combining three images.
599
599
 
@@ -601,12 +601,12 @@ class Mosaic(BaseMixTransform):
601
601
  additional images on either side. It's part of the Mosaic augmentation technique used in object detection.
602
602
 
603
603
  Args:
604
- labels (dict): A dictionary containing image and label information for the main (center) image.
604
+ labels (Dict[str, Any]): A dictionary containing image and label information for the main (center) image.
605
605
  Must include 'img' key with the image array, and 'mix_labels' key with a list of two
606
606
  dictionaries containing information for the side images.
607
607
 
608
608
  Returns:
609
- (dict): A dictionary with the mosaic image and updated labels. Keys include:
609
+ (Dict[str, Any]): A dictionary with the mosaic image and updated labels. Keys include:
610
610
  - 'img' (np.ndarray): The mosaic image array with shape (H, W, C).
611
611
  - Other keys from the input labels, updated to reflect the new image dimensions.
612
612
 
@@ -652,7 +652,7 @@ class Mosaic(BaseMixTransform):
652
652
  final_labels["img"] = img3[-self.border[0] : self.border[0], -self.border[1] : self.border[1]]
653
653
  return final_labels
654
654
 
655
- def _mosaic4(self, labels):
655
+ def _mosaic4(self, labels: Dict[str, Any]) -> Dict[str, Any]:
656
656
  """
657
657
  Create a 2x2 image mosaic from four input images.
658
658
 
@@ -660,11 +660,11 @@ class Mosaic(BaseMixTransform):
660
660
  updates the corresponding labels for each image in the mosaic.
661
661
 
662
662
  Args:
663
- labels (dict): A dictionary containing image data and labels for the base image (index 0) and three
663
+ labels (Dict[str, Any]): A dictionary containing image data and labels for the base image (index 0) and three
664
664
  additional images (indices 1-3) in the 'mix_labels' key.
665
665
 
666
666
  Returns:
667
- (dict): A dictionary containing the mosaic image and updated labels. The 'img' key contains the mosaic
667
+ (Dict[str, Any]): A dictionary containing the mosaic image and updated labels. The 'img' key contains the mosaic
668
668
  image as a numpy array, and other keys contain the combined and adjusted labels for all four images.
669
669
 
670
670
  Examples:
@@ -710,7 +710,7 @@ class Mosaic(BaseMixTransform):
710
710
  final_labels["img"] = img4
711
711
  return final_labels
712
712
 
713
- def _mosaic9(self, labels):
713
+ def _mosaic9(self, labels: Dict[str, Any]) -> Dict[str, Any]:
714
714
  """
715
715
  Create a 3x3 image mosaic from the input image and eight additional images.
716
716
 
@@ -718,16 +718,16 @@ class Mosaic(BaseMixTransform):
718
718
  and eight additional images from the dataset are placed around it in a 3x3 grid pattern.
719
719
 
720
720
  Args:
721
- labels (dict): A dictionary containing the input image and its associated labels. It should have
721
+ labels (Dict[str, Any]): A dictionary containing the input image and its associated labels. It should have
722
722
  the following keys:
723
- - 'img' (numpy.ndarray): The input image.
723
+ - 'img' (np.ndarray): The input image.
724
724
  - 'resized_shape' (Tuple[int, int]): The shape of the resized image (height, width).
725
725
  - 'mix_labels' (List[Dict]): A list of dictionaries containing information for the additional
726
726
  eight images, each with the same structure as the input labels.
727
727
 
728
728
  Returns:
729
- (dict): A dictionary containing the mosaic image and updated labels. It includes the following keys:
730
- - 'img' (numpy.ndarray): The final mosaic image.
729
+ (Dict[str, Any]): A dictionary containing the mosaic image and updated labels. It includes the following keys:
730
+ - 'img' (np.ndarray): The final mosaic image.
731
731
  - Other keys from the input labels, updated to reflect the new mosaic arrangement.
732
732
 
733
733
  Examples:
@@ -783,7 +783,7 @@ class Mosaic(BaseMixTransform):
783
783
  return final_labels
784
784
 
785
785
  @staticmethod
786
- def _update_labels(labels, padw, padh):
786
+ def _update_labels(labels, padw: int, padh: int) -> Dict[str, Any]:
787
787
  """
788
788
  Update label coordinates with padding values.
789
789
 
@@ -791,7 +791,7 @@ class Mosaic(BaseMixTransform):
791
791
  values. It also denormalizes the coordinates if they were previously normalized.
792
792
 
793
793
  Args:
794
- labels (dict): A dictionary containing image and instance information.
794
+ labels (Dict[str, Any]): A dictionary containing image and instance information.
795
795
  padw (int): Padding width to be added to the x-coordinates.
796
796
  padh (int): Padding height to be added to the y-coordinates.
797
797
 
@@ -809,7 +809,7 @@ class Mosaic(BaseMixTransform):
809
809
  labels["instances"].add_padding(padw, padh)
810
810
  return labels
811
811
 
812
- def _cat_labels(self, mosaic_labels):
812
+ def _cat_labels(self, mosaic_labels: List[Dict[str, Any]]) -> Dict[str, Any]:
813
813
  """
814
814
  Concatenate and process labels for mosaic augmentation.
815
815
 
@@ -817,10 +817,10 @@ class Mosaic(BaseMixTransform):
817
817
  mosaic border, and removes zero-area boxes.
818
818
 
819
819
  Args:
820
- mosaic_labels (List[Dict]): A list of label dictionaries for each image in the mosaic.
820
+ mosaic_labels (List[Dict[str, Any]]): A list of label dictionaries for each image in the mosaic.
821
821
 
822
822
  Returns:
823
- (dict): A dictionary containing concatenated and processed labels for the mosaic image, including:
823
+ (Dict[str, Any]): A dictionary containing concatenated and processed labels for the mosaic image, including:
824
824
  - im_file (str): File path of the first image in the mosaic.
825
825
  - ori_shape (Tuple[int, int]): Original shape of the first image.
826
826
  - resized_shape (Tuple[int, int]): Shape of the mosaic image (imgsz * 2, imgsz * 2).
@@ -883,7 +883,7 @@ class MixUp(BaseMixTransform):
883
883
  >>> augmented_labels = mixup(original_labels)
884
884
  """
885
885
 
886
- def __init__(self, dataset, pre_transform=None, p=0.0) -> None:
886
+ def __init__(self, dataset, pre_transform=None, p: float = 0.0) -> None:
887
887
  """
888
888
  Initialize the MixUp augmentation object.
889
889
 
@@ -902,7 +902,7 @@ class MixUp(BaseMixTransform):
902
902
  """
903
903
  super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
904
904
 
905
- def _mix_transform(self, labels):
905
+ def _mix_transform(self, labels: Dict[str, Any]) -> Dict[str, Any]:
906
906
  """
907
907
  Apply MixUp augmentation to the input labels.
908
908
 
@@ -910,10 +910,10 @@ class MixUp(BaseMixTransform):
910
910
  "mixup: Beyond Empirical Risk Minimization" (https://arxiv.org/abs/1710.09412).
911
911
 
912
912
  Args:
913
- labels (dict): A dictionary containing the original image and label information.
913
+ labels (Dict[str, Any]): A dictionary containing the original image and label information.
914
914
 
915
915
  Returns:
916
- (dict): A dictionary containing the mixed-up image and combined label information.
916
+ (Dict[str, Any]): A dictionary containing the mixed-up image and combined label information.
917
917
 
918
918
  Examples:
919
919
  >>> mixer = MixUp(dataset)
@@ -952,7 +952,7 @@ class CutMix(BaseMixTransform):
952
952
  >>> augmented_labels = cutmix(original_labels)
953
953
  """
954
954
 
955
- def __init__(self, dataset, pre_transform=None, p=0.0, beta=1.0, num_areas=3) -> None:
955
+ def __init__(self, dataset, pre_transform=None, p: float = 0.0, beta: float = 1.0, num_areas: int = 3) -> None:
956
956
  """
957
957
  Initialize the CutMix augmentation object.
958
958
 
@@ -967,7 +967,7 @@ class CutMix(BaseMixTransform):
967
967
  self.beta = beta
968
968
  self.num_areas = num_areas
969
969
 
970
- def _rand_bbox(self, width, height):
970
+ def _rand_bbox(self, width: int, height: int) -> Tuple[int, int, int, int]:
971
971
  """
972
972
  Generate random bounding box coordinates for the cut region.
973
973
 
@@ -976,7 +976,7 @@ class CutMix(BaseMixTransform):
976
976
  height (int): Height of the image.
977
977
 
978
978
  Returns:
979
- (tuple): (x1, y1, x2, y2) coordinates of the bounding box.
979
+ (Tuple[int, int, int, int]): (x1, y1, x2, y2) coordinates of the bounding box.
980
980
  """
981
981
  # Sample mixing ratio from Beta distribution
982
982
  lam = np.random.beta(self.beta, self.beta)
@@ -997,15 +997,15 @@ class CutMix(BaseMixTransform):
997
997
 
998
998
  return x1, y1, x2, y2
999
999
 
1000
- def _mix_transform(self, labels):
1000
+ def _mix_transform(self, labels: Dict[str, Any]) -> Dict[str, Any]:
1001
1001
  """
1002
1002
  Apply CutMix augmentation to the input labels.
1003
1003
 
1004
1004
  Args:
1005
- labels (dict): A dictionary containing the original image and label information.
1005
+ labels (Dict[str, Any]): A dictionary containing the original image and label information.
1006
1006
 
1007
1007
  Returns:
1008
- (dict): A dictionary containing the mixed image and adjusted labels.
1008
+ (Dict[str, Any]): A dictionary containing the mixed image and adjusted labels.
1009
1009
 
1010
1010
  Examples:
1011
1011
  >>> cutter = CutMix(dataset)
@@ -1080,7 +1080,14 @@ class RandomPerspective:
1080
1080
  """
1081
1081
 
1082
1082
  def __init__(
1083
- self, degrees=0.0, translate=0.1, scale=0.5, shear=0.0, perspective=0.0, border=(0, 0), pre_transform=None
1083
+ self,
1084
+ degrees: float = 0.0,
1085
+ translate: float = 0.1,
1086
+ scale: float = 0.5,
1087
+ shear: float = 0.0,
1088
+ perspective: float = 0.0,
1089
+ border: Tuple[int, int] = (0, 0),
1090
+ pre_transform=None,
1084
1091
  ):
1085
1092
  """
1086
1093
  Initialize RandomPerspective object with transformation parameters.
@@ -1110,7 +1117,7 @@ class RandomPerspective:
1110
1117
  self.border = border # mosaic border
1111
1118
  self.pre_transform = pre_transform
1112
1119
 
1113
- def affine_transform(self, img, border):
1120
+ def affine_transform(self, img: np.ndarray, border: Tuple[int, int]) -> Tuple[np.ndarray, np.ndarray, float]:
1114
1121
  """
1115
1122
  Apply a sequence of affine transformations centered around the image center.
1116
1123
 
@@ -1174,7 +1181,7 @@ class RandomPerspective:
1174
1181
  img = img[..., None]
1175
1182
  return img, M, s
1176
1183
 
1177
- def apply_bboxes(self, bboxes, M):
1184
+ def apply_bboxes(self, bboxes: np.ndarray, M: np.ndarray) -> np.ndarray:
1178
1185
  """
1179
1186
  Apply affine transformation to bounding boxes.
1180
1187
 
@@ -1182,12 +1189,12 @@ class RandomPerspective:
1182
1189
  transformation matrix.
1183
1190
 
1184
1191
  Args:
1185
- bboxes (torch.Tensor): Bounding boxes in xyxy format with shape (N, 4), where N is the number
1192
+ bboxes (np.ndarray): Bounding boxes in xyxy format with shape (N, 4), where N is the number
1186
1193
  of bounding boxes.
1187
- M (torch.Tensor): Affine transformation matrix with shape (3, 3).
1194
+ M (np.ndarray): Affine transformation matrix with shape (3, 3).
1188
1195
 
1189
1196
  Returns:
1190
- (torch.Tensor): Transformed bounding boxes in xyxy format with shape (N, 4).
1197
+ (np.ndarray): Transformed bounding boxes in xyxy format with shape (N, 4).
1191
1198
 
1192
1199
  Examples:
1193
1200
  >>> bboxes = torch.tensor([[10, 10, 20, 20], [30, 30, 40, 40]])
@@ -1208,7 +1215,7 @@ class RandomPerspective:
1208
1215
  y = xy[:, [1, 3, 5, 7]]
1209
1216
  return np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1)), dtype=bboxes.dtype).reshape(4, n).T
1210
1217
 
1211
- def apply_segments(self, segments, M):
1218
+ def apply_segments(self, segments: np.ndarray, M: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
1212
1219
  """
1213
1220
  Apply affine transformations to segments and generate new bounding boxes.
1214
1221
 
@@ -1244,7 +1251,7 @@ class RandomPerspective:
1244
1251
  segments[..., 1] = segments[..., 1].clip(bboxes[:, 1:2], bboxes[:, 3:4])
1245
1252
  return bboxes, segments
1246
1253
 
1247
- def apply_keypoints(self, keypoints, M):
1254
+ def apply_keypoints(self, keypoints: np.ndarray, M: np.ndarray) -> np.ndarray:
1248
1255
  """
1249
1256
  Apply affine transformation to keypoints.
1250
1257
 
@@ -1278,7 +1285,7 @@ class RandomPerspective:
1278
1285
  visible[out_mask] = 0
1279
1286
  return np.concatenate([xy, visible], axis=-1).reshape(n, nkpt, 3)
1280
1287
 
1281
- def __call__(self, labels):
1288
+ def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
1282
1289
  """
1283
1290
  Apply random perspective and affine transformations to an image and its associated labels.
1284
1291
 
@@ -1287,7 +1294,7 @@ class RandomPerspective:
1287
1294
  and keypoints accordingly.
1288
1295
 
1289
1296
  Args:
1290
- labels (dict): A dictionary containing image data and annotations.
1297
+ labels (Dict[str, Any]): A dictionary containing image data and annotations.
1291
1298
  Must include:
1292
1299
  'img' (np.ndarray): The input image.
1293
1300
  'cls' (np.ndarray): Class labels.
@@ -1296,7 +1303,7 @@ class RandomPerspective:
1296
1303
  'mosaic_border' (Tuple[int, int]): Border size for mosaic augmentation.
1297
1304
 
1298
1305
  Returns:
1299
- (dict): Transformed labels dictionary containing:
1306
+ (Dict[str, Any]): Transformed labels dictionary containing:
1300
1307
  - 'img' (np.ndarray): The transformed image.
1301
1308
  - 'cls' (np.ndarray): Updated class labels.
1302
1309
  - 'instances' (Instances): Updated object instances.
@@ -1357,7 +1364,14 @@ class RandomPerspective:
1357
1364
  return labels
1358
1365
 
1359
1366
  @staticmethod
1360
- def box_candidates(box1, box2, wh_thr=2, ar_thr=100, area_thr=0.1, eps=1e-16):
1367
+ def box_candidates(
1368
+ box1: np.ndarray,
1369
+ box2: np.ndarray,
1370
+ wh_thr: int = 2,
1371
+ ar_thr: int = 100,
1372
+ area_thr: float = 0.1,
1373
+ eps: float = 1e-16,
1374
+ ) -> np.ndarray:
1361
1375
  """
1362
1376
  Compute candidate boxes for further processing based on size and aspect ratio criteria.
1363
1377
 
@@ -1366,20 +1380,20 @@ class RandomPerspective:
1366
1380
  been overly distorted or reduced by the augmentation process.
1367
1381
 
1368
1382
  Args:
1369
- box1 (numpy.ndarray): Original boxes before augmentation, shape (4, N) where n is the
1383
+ box1 (np.ndarray): Original boxes before augmentation, shape (4, N) where N is the
1370
1384
  number of boxes. Format is [x1, y1, x2, y2] in absolute coordinates.
1371
- box2 (numpy.ndarray): Augmented boxes after transformation, shape (4, N). Format is
1385
+ box2 (np.ndarray): Augmented boxes after transformation, shape (4, N). Format is
1372
1386
  [x1, y1, x2, y2] in absolute coordinates.
1373
- wh_thr (float): Width and height threshold in pixels. Boxes smaller than this in either
1387
+ wh_thr (int): Width and height threshold in pixels. Boxes smaller than this in either
1374
1388
  dimension are rejected.
1375
- ar_thr (float): Aspect ratio threshold. Boxes with an aspect ratio greater than this
1389
+ ar_thr (int): Aspect ratio threshold. Boxes with an aspect ratio greater than this
1376
1390
  value are rejected.
1377
1391
  area_thr (float): Area ratio threshold. Boxes with an area ratio (new/old) less than
1378
1392
  this value are rejected.
1379
1393
  eps (float): Small epsilon value to prevent division by zero.
1380
1394
 
1381
1395
  Returns:
1382
- (numpy.ndarray): Boolean array of shape (n) indicating which boxes are candidates.
1396
+ (np.ndarray): Boolean array of shape (n) indicating which boxes are candidates.
1383
1397
  True values correspond to boxes that meet all criteria.
1384
1398
 
1385
1399
  Examples:
@@ -1420,7 +1434,7 @@ class RandomHSV:
1420
1434
  >>> augmented_image = augmented_labels["img"]
1421
1435
  """
1422
1436
 
1423
- def __init__(self, hgain=0.5, sgain=0.5, vgain=0.5) -> None:
1437
+ def __init__(self, hgain: float = 0.5, sgain: float = 0.5, vgain: float = 0.5) -> None:
1424
1438
  """
1425
1439
  Initialize the RandomHSV object for random HSV (Hue, Saturation, Value) augmentation.
1426
1440
 
@@ -1439,7 +1453,7 @@ class RandomHSV:
1439
1453
  self.sgain = sgain
1440
1454
  self.vgain = vgain
1441
1455
 
1442
- def __call__(self, labels):
1456
+ def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
1443
1457
  """
1444
1458
  Apply random HSV augmentation to an image within predefined limits.
1445
1459
 
@@ -1447,17 +1461,16 @@ class RandomHSV:
1447
1461
  The adjustments are made within the limits set by hgain, sgain, and vgain during initialization.
1448
1462
 
1449
1463
  Args:
1450
- labels (dict): A dictionary containing image data and metadata. Must include an 'img' key with
1464
+ labels (Dict[str, Any]): A dictionary containing image data and metadata. Must include an 'img' key with
1451
1465
  the image as a numpy array.
1452
1466
 
1453
1467
  Returns:
1454
- (None): The function modifies the input 'labels' dictionary in-place, updating the 'img' key
1455
- with the HSV-augmented image.
1468
+ (Dict[str, Any]): The input labels dictionary with the 'img' key updated to the HSV-augmented image.
1456
1469
 
1457
1470
  Examples:
1458
1471
  >>> hsv_augmenter = RandomHSV(hgain=0.5, sgain=0.5, vgain=0.5)
1459
1472
  >>> labels = {"img": np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)}
1460
- >>> hsv_augmenter(labels)
1473
+ >>> labels = hsv_augmenter(labels)
1461
1474
  >>> augmented_img = labels["img"]
1462
1475
  """
1463
1476
  img = labels["img"]
@@ -1502,7 +1515,7 @@ class RandomFlip:
1502
1515
  >>> flipped_instances = result["instances"]
1503
1516
  """
1504
1517
 
1505
- def __init__(self, p=0.5, direction="horizontal", flip_idx=None) -> None:
1518
+ def __init__(self, p: float = 0.5, direction: str = "horizontal", flip_idx: List[int] = None) -> None:
1506
1519
  """
1507
1520
  Initialize the RandomFlip class with probability and direction.
1508
1521
 
@@ -1528,7 +1541,7 @@ class RandomFlip:
1528
1541
  self.direction = direction
1529
1542
  self.flip_idx = flip_idx
1530
1543
 
1531
- def __call__(self, labels):
1544
+ def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
1532
1545
  """
1533
1546
  Apply random flip to an image and update any instances like bounding boxes or keypoints accordingly.
1534
1547
 
@@ -1537,14 +1550,14 @@ class RandomFlip:
1537
1550
  match the flipped image.
1538
1551
 
1539
1552
  Args:
1540
- labels (dict): A dictionary containing the following keys:
1541
- 'img' (numpy.ndarray): The image to be flipped.
1553
+ labels (Dict[str, Any]): A dictionary containing the following keys:
1554
+ 'img' (np.ndarray): The image to be flipped.
1542
1555
  'instances' (ultralytics.utils.instance.Instances): An object containing bounding boxes and
1543
1556
  optionally keypoints.
1544
1557
 
1545
1558
  Returns:
1546
- (dict): The same dictionary with the flipped image and updated instances:
1547
- 'img' (numpy.ndarray): The flipped image.
1559
+ (Dict[str, Any]): The same dictionary with the flipped image and updated instances:
1560
+ 'img' (np.ndarray): The flipped image.
1548
1561
  'instances' (ultralytics.utils.instance.Instances): Updated instances matching the flipped image.
1549
1562
 
1550
1563
  Examples:
@@ -1600,7 +1613,15 @@ class LetterBox:
1600
1613
  >>> updated_instances = result["instances"]
1601
1614
  """
1602
1615
 
1603
- def __init__(self, new_shape=(640, 640), auto=False, scale_fill=False, scaleup=True, center=True, stride=32):
1616
+ def __init__(
1617
+ self,
1618
+ new_shape: Tuple[int, int] = (640, 640),
1619
+ auto: bool = False,
1620
+ scale_fill: bool = False,
1621
+ scaleup: bool = True,
1622
+ center: bool = True,
1623
+ stride: int = 32,
1624
+ ):
1604
1625
  """
1605
1626
  Initialize LetterBox object for resizing and padding images.
1606
1627
 
@@ -1633,7 +1654,7 @@ class LetterBox:
1633
1654
  self.stride = stride
1634
1655
  self.center = center # Put the image in the middle or top-left
1635
1656
 
1636
- def __call__(self, labels=None, image=None):
1657
+ def __call__(self, labels: Dict[str, Any] = None, image: np.ndarray = None) -> Union[Dict[str, Any], np.ndarray]:
1637
1658
  """
1638
1659
  Resize and pad an image for object detection, instance segmentation, or pose estimation tasks.
1639
1660
 
@@ -1641,13 +1662,13 @@ class LetterBox:
1641
1662
  aspect ratio and adding padding to fit the new shape. It also updates any associated labels accordingly.
1642
1663
 
1643
1664
  Args:
1644
- labels (Dict | None): A dictionary containing image data and associated labels, or empty dict if None.
1665
+ labels (Dict[str, Any] | None): A dictionary containing image data and associated labels, or empty dict if None.
1645
1666
  image (np.ndarray | None): The input image as a numpy array. If None, the image is taken from 'labels'.
1646
1667
 
1647
1668
  Returns:
1648
- (Dict | Tuple): If 'labels' is provided, returns an updated dictionary with the resized and padded image,
1649
- updated labels, and additional metadata. If 'labels' is empty, returns a tuple containing the resized
1650
- and padded image, and a tuple of (ratio, (left_pad, top_pad)).
1669
+ (Dict[str, Any] | np.ndarray): If 'labels' is provided, returns an updated dictionary with the resized and padded image,
1670
+ updated labels, and additional metadata. If 'labels' is empty, returns the resized
1671
+ and padded image.
1651
1672
 
1652
1673
  Examples:
1653
1674
  >>> letterbox = LetterBox(new_shape=(640, 640))
@@ -1710,7 +1731,7 @@ class LetterBox:
1710
1731
  return img
1711
1732
 
1712
1733
  @staticmethod
1713
- def _update_labels(labels, ratio, padw, padh):
1734
+ def _update_labels(labels: Dict[str, Any], ratio: Tuple[float, float], padw: float, padh: float) -> Dict[str, Any]:
1714
1735
  """
1715
1736
  Update labels after applying letterboxing to an image.
1716
1737
 
@@ -1718,13 +1739,13 @@ class LetterBox:
1718
1739
  to account for resizing and padding applied during letterboxing.
1719
1740
 
1720
1741
  Args:
1721
- labels (dict): A dictionary containing image labels and instances.
1742
+ labels (Dict[str, Any]): A dictionary containing image labels and instances.
1722
1743
  ratio (Tuple[float, float]): Scaling ratios (width, height) applied to the image.
1723
1744
  padw (float): Padding width added to the image.
1724
1745
  padh (float): Padding height added to the image.
1725
1746
 
1726
1747
  Returns:
1727
- (dict): Updated labels dictionary with modified instance coordinates.
1748
+ (Dict[str, Any]): Updated labels dictionary with modified instance coordinates.
1728
1749
 
1729
1750
  Examples:
1730
1751
  >>> letterbox = LetterBox(new_shape=(640, 640))
@@ -1764,18 +1785,18 @@ class CopyPaste(BaseMixTransform):
1764
1785
  >>> augmented_labels = copypaste(original_labels)
1765
1786
  """
1766
1787
 
1767
- def __init__(self, dataset=None, pre_transform=None, p=0.5, mode="flip") -> None:
1788
+ def __init__(self, dataset=None, pre_transform=None, p: float = 0.5, mode: str = "flip") -> None:
1768
1789
  """Initialize CopyPaste object with dataset, pre_transform, probability of applying Copy-Paste, and mode."""
1769
1790
  super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
1770
1791
  assert mode in {"flip", "mixup"}, f"Expected `mode` to be `flip` or `mixup`, but got {mode}."
1771
1792
  self.mode = mode
1772
1793
 
1773
- def _mix_transform(self, labels):
1794
+ def _mix_transform(self, labels: Dict[str, Any]) -> Dict[str, Any]:
1774
1795
  """Apply Copy-Paste augmentation to combine objects from another image into the current image."""
1775
1796
  labels2 = labels["mix_labels"][0]
1776
1797
  return self._transform(labels, labels2)
1777
1798
 
1778
- def __call__(self, labels):
1799
+ def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
1779
1800
  """Apply Copy-Paste augmentation to an image and its labels."""
1780
1801
  if len(labels["instances"].segments) == 0 or self.p == 0:
1781
1802
  return labels
@@ -1802,7 +1823,7 @@ class CopyPaste(BaseMixTransform):
1802
1823
  labels.pop("mix_labels", None)
1803
1824
  return labels
1804
1825
 
1805
- def _transform(self, labels1, labels2={}):
1826
+ def _transform(self, labels1: Dict[str, Any], labels2: Dict[str, Any] = {}) -> Dict[str, Any]:
1806
1827
  """Apply Copy-Paste augmentation to combine objects from another image into the current image."""
1807
1828
  im = labels1["img"]
1808
1829
  if "mosaic_border" not in labels1:
@@ -1866,7 +1887,7 @@ class Albumentations:
1866
1887
  - Spatial transforms are handled differently and require special processing for bounding boxes.
1867
1888
  """
1868
1889
 
1869
- def __init__(self, p=1.0):
1890
+ def __init__(self, p: float = 1.0) -> None:
1870
1891
  """
1871
1892
  Initialize the Albumentations transform object for YOLO bbox formatted parameters.
1872
1893
 
@@ -1980,7 +2001,7 @@ class Albumentations:
1980
2001
  except Exception as e:
1981
2002
  LOGGER.info(f"{prefix}{e}")
1982
2003
 
1983
- def __call__(self, labels):
2004
+ def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
1984
2005
  """
1985
2006
  Apply Albumentations transformations to input labels.
1986
2007
 
@@ -1988,13 +2009,13 @@ class Albumentations:
1988
2009
  spatial and non-spatial transformations on the input image and its corresponding labels.
1989
2010
 
1990
2011
  Args:
1991
- labels (dict): A dictionary containing image data and annotations. Expected keys are:
1992
- - 'img': numpy.ndarray representing the image
1993
- - 'cls': numpy.ndarray of class labels
2012
+ labels (Dict[str, Any]): A dictionary containing image data and annotations. Expected keys are:
2013
+ - 'img': np.ndarray representing the image
2014
+ - 'cls': np.ndarray of class labels
1994
2015
  - 'instances': object containing bounding boxes and other instance information
1995
2016
 
1996
2017
  Returns:
1997
- (dict): The input dictionary with augmented image and updated annotations.
2018
+ (Dict[str, Any]): The input dictionary with augmented image and updated annotations.
1998
2019
 
1999
2020
  Examples:
2000
2021
  >>> transform = Albumentations(p=0.5)
@@ -2069,15 +2090,15 @@ class Format:
2069
2090
 
2070
2091
  def __init__(
2071
2092
  self,
2072
- bbox_format="xywh",
2073
- normalize=True,
2074
- return_mask=False,
2075
- return_keypoint=False,
2076
- return_obb=False,
2077
- mask_ratio=4,
2078
- mask_overlap=True,
2079
- batch_idx=True,
2080
- bgr=0.0,
2093
+ bbox_format: str = "xywh",
2094
+ normalize: bool = True,
2095
+ return_mask: bool = False,
2096
+ return_keypoint: bool = False,
2097
+ return_obb: bool = False,
2098
+ mask_ratio: int = 4,
2099
+ mask_overlap: bool = True,
2100
+ batch_idx: bool = True,
2101
+ bgr: float = 0.0,
2081
2102
  ):
2082
2103
  """
2083
2104
  Initialize the Format class with given parameters for image and instance annotation formatting.
@@ -2122,7 +2143,7 @@ class Format:
2122
2143
  self.batch_idx = batch_idx # keep the batch indexes
2123
2144
  self.bgr = bgr
2124
2145
 
2125
- def __call__(self, labels):
2146
+ def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
2126
2147
  """
2127
2148
  Format image annotations for object detection, instance segmentation, and pose estimation tasks.
2128
2149
 
@@ -2131,13 +2152,13 @@ class Format:
2131
2152
  applying normalization if required.
2132
2153
 
2133
2154
  Args:
2134
- labels (dict): A dictionary containing image and annotation data with the following keys:
2155
+ labels (Dict[str, Any]): A dictionary containing image and annotation data with the following keys:
2135
2156
  - 'img': The input image as a numpy array.
2136
2157
  - 'cls': Class labels for instances.
2137
2158
  - 'instances': An Instances object containing bounding boxes, segments, and keypoints.
2138
2159
 
2139
2160
  Returns:
2140
- (dict): A dictionary with formatted data, including:
2161
+ (Dict[str, Any]): A dictionary with formatted data, including:
2141
2162
  - 'img': Formatted image tensor.
2142
2163
  - 'cls': Class label's tensor.
2143
2164
  - 'bboxes': Bounding boxes tensor in the specified format.
@@ -2191,7 +2212,7 @@ class Format:
2191
2212
  labels["batch_idx"] = torch.zeros(nl)
2192
2213
  return labels
2193
2214
 
2194
- def _format_img(self, img):
2215
+ def _format_img(self, img: np.ndarray) -> torch.Tensor:
2195
2216
  """
2196
2217
  Format an image for YOLO from a Numpy array to a PyTorch tensor.
2197
2218
 
@@ -2222,20 +2243,22 @@ class Format:
2222
2243
  img = torch.from_numpy(img)
2223
2244
  return img
2224
2245
 
2225
- def _format_segments(self, instances, cls, w, h):
2246
+ def _format_segments(
2247
+ self, instances: Instances, cls: np.ndarray, w: int, h: int
2248
+ ) -> Tuple[np.ndarray, Instances, np.ndarray]:
2226
2249
  """
2227
2250
  Convert polygon segments to bitmap masks.
2228
2251
 
2229
2252
  Args:
2230
2253
  instances (Instances): Object containing segment information.
2231
- cls (numpy.ndarray): Class labels for each instance.
2254
+ cls (np.ndarray): Class labels for each instance.
2232
2255
  w (int): Width of the image.
2233
2256
  h (int): Height of the image.
2234
2257
 
2235
2258
  Returns:
2236
- masks (numpy.ndarray): Bitmap masks with shape (N, H, W) or (1, H, W) if mask_overlap is True.
2259
+ masks (np.ndarray): Bitmap masks with shape (N, H, W) or (1, H, W) if mask_overlap is True.
2237
2260
  instances (Instances): Updated instances object with sorted segments if mask_overlap is True.
2238
- cls (numpy.ndarray): Updated class labels, sorted if mask_overlap is True.
2261
+ cls (np.ndarray): Updated class labels, sorted if mask_overlap is True.
2239
2262
 
2240
2263
  Notes:
2241
2264
  - If self.mask_overlap is True, masks are overlapped and sorted by area.
@@ -2257,7 +2280,7 @@ class Format:
2257
2280
  class LoadVisualPrompt:
2258
2281
  """Create visual prompts from bounding boxes or masks for model input."""
2259
2282
 
2260
- def __init__(self, scale_factor=1 / 8):
2283
+ def __init__(self, scale_factor: float = 1 / 8) -> None:
2261
2284
  """
2262
2285
  Initialize the LoadVisualPrompt with a scale factor.
2263
2286
 
@@ -2266,7 +2289,7 @@ class LoadVisualPrompt:
2266
2289
  """
2267
2290
  self.scale_factor = scale_factor
2268
2291
 
2269
- def make_mask(self, boxes, h, w):
2292
+ def make_mask(self, boxes: torch.Tensor, h: int, w: int) -> torch.Tensor:
2270
2293
  """
2271
2294
  Create binary masks from bounding boxes.
2272
2295
 
@@ -2284,15 +2307,15 @@ class LoadVisualPrompt:
2284
2307
 
2285
2308
  return (r >= x1) * (r < x2) * (c >= y1) * (c < y2)
2286
2309
 
2287
- def __call__(self, labels):
2310
+ def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
2288
2311
  """
2289
2312
  Process labels to create visual prompts.
2290
2313
 
2291
2314
  Args:
2292
- labels (dict): Dictionary containing image data and annotations.
2315
+ labels (Dict[str, Any]): Dictionary containing image data and annotations.
2293
2316
 
2294
2317
  Returns:
2295
- (dict): Updated labels with visual prompts added.
2318
+ (Dict[str, Any]): Updated labels with visual prompts added.
2296
2319
  """
2297
2320
  imgsz = labels["img"].shape[1:]
2298
2321
  bboxes, masks = None, None
@@ -2305,13 +2328,19 @@ class LoadVisualPrompt:
2305
2328
  labels["visuals"] = visuals
2306
2329
  return labels
2307
2330
 
2308
- def get_visuals(self, category, shape, bboxes=None, masks=None):
2331
+ def get_visuals(
2332
+ self,
2333
+ category: Union[int, np.ndarray, torch.Tensor],
2334
+ shape: Tuple[int, int],
2335
+ bboxes: Union[np.ndarray, torch.Tensor] = None,
2336
+ masks: Union[np.ndarray, torch.Tensor] = None,
2337
+ ) -> torch.Tensor:
2309
2338
  """
2310
2339
  Generate visual masks based on bounding boxes or masks.
2311
2340
 
2312
2341
  Args:
2313
2342
  category (int | np.ndarray | torch.Tensor): The category labels for the objects.
2314
- shape (tuple): The shape of the image (height, width).
2343
+ shape (Tuple[int, int]): The shape of the image (height, width).
2315
2344
  bboxes (np.ndarray | torch.Tensor, optional): Bounding boxes for the objects, xyxy format.
2316
2345
  masks (np.ndarray | torch.Tensor, optional): Masks for the objects.
2317
2346
 
@@ -2429,10 +2458,10 @@ class RandomLoadText:
2429
2458
  new sampled text order.
2430
2459
 
2431
2460
  Args:
2432
- labels (dict): A dictionary containing image labels and metadata. Must include 'texts' and 'cls' keys.
2461
+ labels (Dict[str, Any]): A dictionary containing image labels and metadata. Must include 'texts' and 'cls' keys.
2433
2462
 
2434
2463
  Returns:
2435
- (dict): Updated labels dictionary with new 'cls' and 'texts' entries.
2464
+ (Dict[str, Any]): Updated labels dictionary with new 'cls' and 'texts' entries.
2436
2465
 
2437
2466
  Examples:
2438
2467
  >>> loader = RandomLoadText(prompt_format="A photo of {}", neg_samples=(5, 10), max_samples=20)
@@ -2486,7 +2515,7 @@ class RandomLoadText:
2486
2515
  return labels
2487
2516
 
2488
2517
 
2489
- def v8_transforms(dataset, imgsz, hyp, stretch=False):
2518
+ def v8_transforms(dataset, imgsz: int, hyp: IterableSimpleNamespace, stretch: bool = False):
2490
2519
  """
2491
2520
  Apply a series of image transformations for training.
2492
2521
 
@@ -2496,7 +2525,7 @@ def v8_transforms(dataset, imgsz, hyp, stretch=False):
2496
2525
  Args:
2497
2526
  dataset (Dataset): The dataset object containing image data and annotations.
2498
2527
  imgsz (int): The target image size for resizing.
2499
- hyp (Namespace): A dictionary of hyperparameters controlling various aspects of the transformations.
2528
+ hyp (IterableSimpleNamespace): A namespace of hyperparameters controlling various aspects of the transformations.
2500
2529
  stretch (bool): If True, applies stretching to the image. If False, uses LetterBox resizing.
2501
2530
 
2502
2531
  Returns:
@@ -2556,11 +2585,11 @@ def v8_transforms(dataset, imgsz, hyp, stretch=False):
2556
2585
 
2557
2586
  # Classification augmentations -----------------------------------------------------------------------------------------
2558
2587
  def classify_transforms(
2559
- size=224,
2560
- mean=DEFAULT_MEAN,
2561
- std=DEFAULT_STD,
2562
- interpolation="BILINEAR",
2563
- crop_fraction=None,
2588
+ size: Union[Tuple[int, int], int] = 224,
2589
+ mean: Tuple[float, float, float] = DEFAULT_MEAN,
2590
+ std: Tuple[float, float, float] = DEFAULT_STD,
2591
+ interpolation: str = "BILINEAR",
2592
+ crop_fraction: float = None,
2564
2593
  ):
2565
2594
  """
2566
2595
  Create a composition of image transforms for classification tasks.
@@ -2572,8 +2601,8 @@ def classify_transforms(
2572
2601
  Args:
2573
2602
  size (int | tuple): The target size for the transformed image. If an int, it defines the shortest edge. If a
2574
2603
  tuple, it defines (height, width).
2575
- mean (tuple): Mean values for each RGB channel used in normalization.
2576
- std (tuple): Standard deviation values for each RGB channel used in normalization.
2604
+ mean (Tuple[float, float, float]): Mean values for each RGB channel used in normalization.
2605
+ std (Tuple[float, float, float]): Standard deviation values for each RGB channel used in normalization.
2577
2606
  interpolation (str): Interpolation method of either 'NEAREST', 'BILINEAR' or 'BICUBIC'.
2578
2607
  crop_fraction (float): Deprecated, will be removed in a future version.
2579
2608
 
@@ -2607,20 +2636,20 @@ def classify_transforms(
2607
2636
 
2608
2637
  # Classification training augmentations --------------------------------------------------------------------------------
2609
2638
  def classify_augmentations(
2610
- size=224,
2611
- mean=DEFAULT_MEAN,
2612
- std=DEFAULT_STD,
2613
- scale=None,
2614
- ratio=None,
2615
- hflip=0.5,
2616
- vflip=0.0,
2617
- auto_augment=None,
2618
- hsv_h=0.015, # image HSV-Hue augmentation (fraction)
2619
- hsv_s=0.4, # image HSV-Saturation augmentation (fraction)
2620
- hsv_v=0.4, # image HSV-Value augmentation (fraction)
2621
- force_color_jitter=False,
2622
- erasing=0.0,
2623
- interpolation="BILINEAR",
2639
+ size: int = 224,
2640
+ mean: Tuple[float, float, float] = DEFAULT_MEAN,
2641
+ std: Tuple[float, float, float] = DEFAULT_STD,
2642
+ scale: Tuple[float, float] = None,
2643
+ ratio: Tuple[float, float] = None,
2644
+ hflip: float = 0.5,
2645
+ vflip: float = 0.0,
2646
+ auto_augment: str = None,
2647
+ hsv_h: float = 0.015, # image HSV-Hue augmentation (fraction)
2648
+ hsv_s: float = 0.4, # image HSV-Saturation augmentation (fraction)
2649
+ hsv_v: float = 0.4, # image HSV-Value augmentation (fraction)
2650
+ force_color_jitter: bool = False,
2651
+ erasing: float = 0.0,
2652
+ interpolation: str = "BILINEAR",
2624
2653
  ):
2625
2654
  """
2626
2655
  Create a composition of image augmentation transforms for classification tasks.
@@ -2630,10 +2659,10 @@ def classify_augmentations(
2630
2659
 
2631
2660
  Args:
2632
2661
  size (int): Target size for the image after transformations.
2633
- mean (tuple): Mean values for normalization, one per channel.
2634
- std (tuple): Standard deviation values for normalization, one per channel.
2635
- scale (tuple | None): Range of size of the origin size cropped.
2636
- ratio (tuple | None): Range of aspect ratio of the origin aspect ratio cropped.
2662
+ mean (Tuple[float, float, float]): Mean values for each RGB channel used in normalization.
2663
+ std (Tuple[float, float, float]): Standard deviation values for each RGB channel used in normalization.
2664
+ scale (Tuple[float, float] | None): Range of the crop size as a fraction of the original image size.
2665
+ ratio (Tuple[float, float] | None): Range of aspect ratios of the cropped region.
2637
2666
  hflip (float): Probability of horizontal flip.
2638
2667
  vflip (float): Probability of vertical flip.
2639
2668
  auto_augment (str | None): Auto augmentation policy. Can be 'randaugment', 'augmix', 'autoaugment' or None.
@@ -2655,7 +2684,7 @@ def classify_augmentations(
2655
2684
  import torchvision.transforms as T # scope for faster 'import ultralytics'
2656
2685
 
2657
2686
  if not isinstance(size, int):
2658
- raise TypeError(f"classify_transforms() size {size} must be integer, not (list, tuple)")
2687
+ raise TypeError(f"classify_augmentations() size {size} must be integer, not (list, tuple)")
2659
2688
  scale = tuple(scale or (0.08, 1.0)) # default imagenet scale range
2660
2689
  ratio = tuple(ratio or (3.0 / 4.0, 4.0 / 3.0)) # default imagenet ratio range
2661
2690
  interpolation = getattr(T.InterpolationMode, interpolation)
@@ -2734,7 +2763,7 @@ class ClassifyLetterBox:
2734
2763
  (640, 640, 3)
2735
2764
  """
2736
2765
 
2737
- def __init__(self, size=(640, 640), auto=False, stride=32):
2766
+ def __init__(self, size: Union[int, Tuple[int, int]] = (640, 640), auto: bool = False, stride: int = 32):
2738
2767
  """
2739
2768
  Initialize the ClassifyLetterBox object for image preprocessing.
2740
2769
 
@@ -2765,7 +2794,7 @@ class ClassifyLetterBox:
2765
2794
  self.auto = auto # pass max size integer, automatically solve for short side using stride
2766
2795
  self.stride = stride # used with auto
2767
2796
 
2768
- def __call__(self, im):
2797
+ def __call__(self, im: np.ndarray) -> np.ndarray:
2769
2798
  """
2770
2799
  Resize and pad an image using the letterbox method.
2771
2800
 
@@ -2773,10 +2802,10 @@ class ClassifyLetterBox:
2773
2802
  then pads the resized image to match the target size.
2774
2803
 
2775
2804
  Args:
2776
- im (numpy.ndarray): Input image as a numpy array with shape (H, W, C).
2805
+ im (np.ndarray): Input image as a numpy array with shape (H, W, C).
2777
2806
 
2778
2807
  Returns:
2779
- (numpy.ndarray): Resized and padded image as a numpy array with shape (hs, ws, 3), where hs and ws are
2808
+ (np.ndarray): Resized and padded image as a numpy array with shape (hs, ws, 3), where hs and ws are
2780
2809
  the target height and width respectively.
2781
2810
 
2782
2811
  Examples:
@@ -2823,7 +2852,7 @@ class CenterCrop:
2823
2852
  (640, 640, 3)
2824
2853
  """
2825
2854
 
2826
- def __init__(self, size=640):
2855
+ def __init__(self, size: Union[int, Tuple[int, int]] = (640, 640)):
2827
2856
  """
2828
2857
  Initialize the CenterCrop object for image preprocessing.
2829
2858
 
@@ -2847,7 +2876,7 @@ class CenterCrop:
2847
2876
  super().__init__()
2848
2877
  self.h, self.w = (size, size) if isinstance(size, int) else size
2849
2878
 
2850
- def __call__(self, im):
2879
+ def __call__(self, im: Union[Image.Image, np.ndarray]) -> np.ndarray:
2851
2880
  """
2852
2881
  Apply center cropping to an input image.
2853
2882
 
@@ -2855,11 +2884,11 @@ class CenterCrop:
2855
2884
  ratio of the original image while fitting it into the specified dimensions.
2856
2885
 
2857
2886
  Args:
2858
- im (numpy.ndarray | PIL.Image.Image): The input image as a numpy array of shape (H, W, C) or a
2887
+ im (np.ndarray | PIL.Image.Image): The input image as a numpy array of shape (H, W, C) or a
2859
2888
  PIL Image object.
2860
2889
 
2861
2890
  Returns:
2862
- (numpy.ndarray): The center-cropped and resized image as a numpy array of shape (self.h, self.w, C).
2891
+ (np.ndarray): The center-cropped and resized image as a numpy array of shape (self.h, self.w, C).
2863
2892
 
2864
2893
  Examples:
2865
2894
  >>> transform = CenterCrop(size=224)
@@ -2900,7 +2929,7 @@ class ToTensor:
2900
2929
  The output tensor will be in RGB format with shape (C, H, W), normalized to [0, 1].
2901
2930
  """
2902
2931
 
2903
- def __init__(self, half=False):
2932
+ def __init__(self, half: bool = False):
2904
2933
  """
2905
2934
  Initialize the ToTensor object for converting images to PyTorch tensors.
2906
2935
 
@@ -2921,7 +2950,7 @@ class ToTensor:
2921
2950
  super().__init__()
2922
2951
  self.half = half
2923
2952
 
2924
- def __call__(self, im):
2953
+ def __call__(self, im: np.ndarray) -> torch.Tensor:
2925
2954
  """
2926
2955
  Transform an image from a numpy array to a PyTorch tensor.
2927
2956
 
@@ -2930,7 +2959,7 @@ class ToTensor:
2930
2959
  the color channels are reversed from BGR to RGB.
2931
2960
 
2932
2961
  Args:
2933
- im (numpy.ndarray): Input image as a numpy array with shape (H, W, C) in RGB order.
2962
+ im (np.ndarray): Input image as a numpy array with shape (H, W, C) in BGR order.
2934
2963
 
2935
2964
  Returns:
2936
2965
  (torch.Tensor): The transformed image as a PyTorch tensor in float32 or float16, normalized