ultralytics 8.3.163__py3-none-any.whl → 8.3.164__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ultralytics/__init__.py +1 -1
- ultralytics/data/augment.py +182 -153
- ultralytics/data/build.py +23 -3
- ultralytics/data/dataset.py +6 -2
- ultralytics/data/loaders.py +2 -2
- ultralytics/data/utils.py +9 -7
- ultralytics/engine/exporter.py +7 -3
- ultralytics/engine/results.py +42 -42
- ultralytics/models/fastsam/model.py +1 -1
- ultralytics/models/fastsam/predict.py +1 -1
- ultralytics/models/sam/model.py +4 -4
- ultralytics/models/sam/modules/blocks.py +5 -5
- ultralytics/models/sam/modules/memory_attention.py +19 -19
- ultralytics/models/sam/modules/transformer.py +24 -22
- ultralytics/models/yolo/detect/val.py +2 -2
- ultralytics/models/yolo/world/train_world.py +9 -1
- ultralytics/solutions/distance_calculation.py +1 -1
- ultralytics/solutions/instance_segmentation.py +2 -2
- ultralytics/solutions/object_blurrer.py +2 -2
- ultralytics/solutions/object_counter.py +2 -2
- ultralytics/solutions/object_cropper.py +1 -1
- ultralytics/solutions/queue_management.py +1 -1
- ultralytics/solutions/security_alarm.py +2 -2
- ultralytics/solutions/templates/similarity-search.html +0 -24
- ultralytics/solutions/vision_eye.py +1 -1
- ultralytics/utils/benchmarks.py +2 -2
- ultralytics/utils/export.py +0 -2
- ultralytics/utils/instance.py +32 -25
- ultralytics/utils/ops.py +8 -8
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/METADATA +1 -1
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/RECORD +35 -35
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/licenses/LICENSE +0 -0
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/top_level.txt +0 -0
ultralytics/data/augment.py
CHANGED
```diff
@@ -12,7 +12,7 @@ from PIL import Image
 from torch.nn import functional as F
 
 from ultralytics.data.utils import polygons2masks, polygons2masks_overlap
-from ultralytics.utils import LOGGER, colorstr
+from ultralytics.utils import LOGGER, IterableSimpleNamespace, colorstr
 from ultralytics.utils.checks import check_version
 from ultralytics.utils.instance import Instances
 from ultralytics.utils.metrics import bbox_ioa
@@ -366,7 +366,7 @@ class BaseMixTransform:
         self.pre_transform = pre_transform
         self.p = p
 
-    def __call__(self, labels):
+    def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Apply pre-processing transforms and cutmix/mixup/mosaic transforms to labels data.
 
@@ -374,10 +374,10 @@ class BaseMixTransform:
         selects additional images, applies pre-transforms if specified, and then performs the mix transform.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing label data for an image.
 
         Returns:
-            (
+            (Dict[str, Any]): The transformed labels dictionary, which may include mixed data from other images.
 
         Examples:
             >>> transform = BaseMixTransform(dataset, pre_transform=None, p=0.5)
@@ -406,7 +406,7 @@ class BaseMixTransform:
         labels.pop("mix_labels", None)
         return labels
 
-    def _mix_transform(self, labels):
+    def _mix_transform(self, labels: Dict[str, Any]):
         """
         Apply CutMix, MixUp or Mosaic augmentation to the label dictionary.
 
@@ -414,11 +414,11 @@ class BaseMixTransform:
         Mosaic. It modifies the input label dictionary in-place with the augmented data.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing image and label data. Expected to have a 'mix_labels' key
                 with a list of additional image and label data for mixing.
 
         Returns:
-            (
+            (Dict[str, Any]): The modified labels dictionary with augmented data after applying the mix transform.
 
         Examples:
             >>> transform = BaseMixTransform(dataset)
@@ -442,7 +442,7 @@ class BaseMixTransform:
         return random.randint(0, len(self.dataset) - 1)
 
     @staticmethod
-    def _update_label_text(labels):
+    def _update_label_text(labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Update label text and class IDs for mixed labels in image augmentation.
 
@@ -450,11 +450,11 @@ class BaseMixTransform:
         creating a unified set of text labels and updating class IDs accordingly.
 
        Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing label information, including 'texts' and 'cls' fields,
                 and optionally a 'mix_labels' field with additional label dictionaries.
 
         Returns:
-            (
+            (Dict[str, Any]): The updated labels dictionary with unified text labels and updated class IDs.
 
         Examples:
             >>> labels = {
```
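All of the `Dict[str, Any]` annotations introduced in this release refer to the same internal labels dictionary that flows through the augmentation pipeline. A minimal sketch of its shape, with keys inferred from the docstrings in this diff and dummy placeholder values (the real dataset loader populates more metadata):

```python
import numpy as np

# Keys taken from the docstrings in this diff; values here are placeholders only.
labels = {
    "img": np.zeros((640, 640, 3), dtype=np.uint8),  # image array, shape (H, W, C)
    "cls": np.zeros((0, 1)),                         # per-instance class IDs
    "texts": [],                                     # optional text labels ('texts' field)
    "mix_labels": [],                                # extra label dicts used by mosaic/mixup/cutmix
}
```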
```diff
@@ -517,7 +517,7 @@ class Mosaic(BaseMixTransform):
         >>> augmented_labels = mosaic_aug(original_labels)
     """
 
-    def __init__(self, dataset, imgsz=640, p=1.0, n=4):
+    def __init__(self, dataset, imgsz: int = 640, p: float = 1.0, n: int = 4):
         """
         Initialize the Mosaic augmentation object.
 
@@ -564,7 +564,7 @@ class Mosaic(BaseMixTransform):
         else:  # select any images
             return [random.randint(0, len(self.dataset) - 1) for _ in range(self.n - 1)]
 
-    def _mix_transform(self, labels):
+    def _mix_transform(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Apply mosaic augmentation to the input image and labels.
 
@@ -573,12 +573,12 @@ class Mosaic(BaseMixTransform):
         mosaic augmentation.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing image data and annotations. Expected keys include:
                 - 'rect_shape': Should be None as rect and mosaic are mutually exclusive.
                 - 'mix_labels': A list of dictionaries containing data for other images to be used in the mosaic.
 
         Returns:
-            (
+            (Dict[str, Any]): A dictionary containing the mosaic-augmented image and updated annotations.
 
         Raises:
             AssertionError: If 'rect_shape' is not None or if 'mix_labels' is empty.
@@ -593,7 +593,7 @@ class Mosaic(BaseMixTransform):
             self._mosaic3(labels) if self.n == 3 else self._mosaic4(labels) if self.n == 4 else self._mosaic9(labels)
         )  # This code is modified for mosaic3 method.
 
-    def _mosaic3(self, labels):
+    def _mosaic3(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Create a 1x3 image mosaic by combining three images.
 
@@ -601,12 +601,12 @@ class Mosaic(BaseMixTransform):
         additional images on either side. It's part of the Mosaic augmentation technique used in object detection.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing image and label information for the main (center) image.
                 Must include 'img' key with the image array, and 'mix_labels' key with a list of two
                 dictionaries containing information for the side images.
 
         Returns:
-            (
+            (Dict[str, Any]): A dictionary with the mosaic image and updated labels. Keys include:
                 - 'img' (np.ndarray): The mosaic image array with shape (H, W, C).
                 - Other keys from the input labels, updated to reflect the new image dimensions.
 
@@ -652,7 +652,7 @@ class Mosaic(BaseMixTransform):
         final_labels["img"] = img3[-self.border[0] : self.border[0], -self.border[1] : self.border[1]]
         return final_labels
 
-    def _mosaic4(self, labels):
+    def _mosaic4(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Create a 2x2 image mosaic from four input images.
 
@@ -660,11 +660,11 @@ class Mosaic(BaseMixTransform):
         updates the corresponding labels for each image in the mosaic.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing image data and labels for the base image (index 0) and three
                 additional images (indices 1-3) in the 'mix_labels' key.
 
         Returns:
-            (
+            (Dict[str, Any]): A dictionary containing the mosaic image and updated labels. The 'img' key contains the mosaic
                 image as a numpy array, and other keys contain the combined and adjusted labels for all four images.
 
         Examples:
@@ -710,7 +710,7 @@ class Mosaic(BaseMixTransform):
         final_labels["img"] = img4
         return final_labels
 
-    def _mosaic9(self, labels):
+    def _mosaic9(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Create a 3x3 image mosaic from the input image and eight additional images.
 
@@ -718,16 +718,16 @@ class Mosaic(BaseMixTransform):
         and eight additional images from the dataset are placed around it in a 3x3 grid pattern.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing the input image and its associated labels. It should have
                 the following keys:
-                - 'img' (
+                - 'img' (np.ndarray): The input image.
                 - 'resized_shape' (Tuple[int, int]): The shape of the resized image (height, width).
                 - 'mix_labels' (List[Dict]): A list of dictionaries containing information for the additional
                   eight images, each with the same structure as the input labels.
 
         Returns:
-            (
-                - 'img' (
+            (Dict[str, Any]): A dictionary containing the mosaic image and updated labels. It includes the following keys:
+                - 'img' (np.ndarray): The final mosaic image.
                 - Other keys from the input labels, updated to reflect the new mosaic arrangement.
 
         Examples:
@@ -783,7 +783,7 @@ class Mosaic(BaseMixTransform):
         return final_labels
 
     @staticmethod
-    def _update_labels(labels, padw, padh):
+    def _update_labels(labels, padw: int, padh: int) -> Dict[str, Any]:
         """
         Update label coordinates with padding values.
 
@@ -791,7 +791,7 @@ class Mosaic(BaseMixTransform):
         values. It also denormalizes the coordinates if they were previously normalized.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing image and instance information.
             padw (int): Padding width to be added to the x-coordinates.
             padh (int): Padding height to be added to the y-coordinates.
 
@@ -809,7 +809,7 @@ class Mosaic(BaseMixTransform):
         labels["instances"].add_padding(padw, padh)
         return labels
 
-    def _cat_labels(self, mosaic_labels):
+    def _cat_labels(self, mosaic_labels: List[Dict[str, Any]]) -> Dict[str, Any]:
         """
         Concatenate and process labels for mosaic augmentation.
 
@@ -817,10 +817,10 @@ class Mosaic(BaseMixTransform):
         mosaic border, and removes zero-area boxes.
 
         Args:
-            mosaic_labels (List[Dict]): A list of label dictionaries for each image in the mosaic.
+            mosaic_labels (List[Dict[str, Any]]): A list of label dictionaries for each image in the mosaic.
 
         Returns:
-            (
+            (Dict[str, Any]): A dictionary containing concatenated and processed labels for the mosaic image, including:
                 - im_file (str): File path of the first image in the mosaic.
                 - ori_shape (Tuple[int, int]): Original shape of the first image.
                 - resized_shape (Tuple[int, int]): Shape of the mosaic image (imgsz * 2, imgsz * 2).
@@ -883,7 +883,7 @@ class MixUp(BaseMixTransform):
         >>> augmented_labels = mixup(original_labels)
     """
 
-    def __init__(self, dataset, pre_transform=None, p=0.0) -> None:
+    def __init__(self, dataset, pre_transform=None, p: float = 0.0) -> None:
         """
         Initialize the MixUp augmentation object.
 
@@ -902,7 +902,7 @@ class MixUp(BaseMixTransform):
         """
         super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
 
-    def _mix_transform(self, labels):
+    def _mix_transform(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Apply MixUp augmentation to the input labels.
 
@@ -910,10 +910,10 @@ class MixUp(BaseMixTransform):
         "mixup: Beyond Empirical Risk Minimization" (https://arxiv.org/abs/1710.09412).
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing the original image and label information.
 
         Returns:
-            (
+            (Dict[str, Any]): A dictionary containing the mixed-up image and combined label information.
 
         Examples:
             >>> mixer = MixUp(dataset)
@@ -952,7 +952,7 @@ class CutMix(BaseMixTransform):
         >>> augmented_labels = cutmix(original_labels)
     """
 
-    def __init__(self, dataset, pre_transform=None, p=0.0, beta=1.0, num_areas=3) -> None:
+    def __init__(self, dataset, pre_transform=None, p: float = 0.0, beta: float = 1.0, num_areas: int = 3) -> None:
         """
         Initialize the CutMix augmentation object.
 
@@ -967,7 +967,7 @@ class CutMix(BaseMixTransform):
         self.beta = beta
         self.num_areas = num_areas
 
-    def _rand_bbox(self, width, height):
+    def _rand_bbox(self, width: int, height: int) -> Tuple[int, int, int, int]:
         """
         Generate random bounding box coordinates for the cut region.
 
@@ -976,7 +976,7 @@ class CutMix(BaseMixTransform):
             height (int): Height of the image.
 
         Returns:
-            (
+            (Tuple[int]): (x1, y1, x2, y2) coordinates of the bounding box.
         """
         # Sample mixing ratio from Beta distribution
         lam = np.random.beta(self.beta, self.beta)
@@ -997,15 +997,15 @@ class CutMix(BaseMixTransform):
 
         return x1, y1, x2, y2
 
-    def _mix_transform(self, labels):
+    def _mix_transform(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Apply CutMix augmentation to the input labels.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing the original image and label information.
 
         Returns:
-            (
+            (Dict[str, Any]): A dictionary containing the mixed image and adjusted labels.
 
         Examples:
             >>> cutter = CutMix(dataset)
```
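The body of `_rand_bbox` between the Beta sample and the `return x1, y1, x2, y2` is elided from this diff. For orientation, a generic sketch of how a CutMix cut region is typically derived from the sampled mixing ratio (illustrative only, not the package's exact code):

```python
import numpy as np

def rand_bbox(width: int, height: int, beta: float = 1.0):
    """Sample an (x1, y1, x2, y2) cut region whose area fraction is roughly 1 - lam."""
    lam = np.random.beta(beta, beta)        # mixing ratio from Beta(beta, beta)
    cut_ratio = np.sqrt(1.0 - lam)          # side scale so the area ratio is 1 - lam
    cut_w, cut_h = int(width * cut_ratio), int(height * cut_ratio)
    cx, cy = np.random.randint(width), np.random.randint(height)  # region center
    x1 = np.clip(cx - cut_w // 2, 0, width)
    y1 = np.clip(cy - cut_h // 2, 0, height)
    x2 = np.clip(cx + cut_w // 2, 0, width)
    y2 = np.clip(cy + cut_h // 2, 0, height)
    return x1, y1, x2, y2
```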
```diff
@@ -1080,7 +1080,14 @@ class RandomPerspective:
     """
 
     def __init__(
-        self,
+        self,
+        degrees: float = 0.0,
+        translate: float = 0.1,
+        scale: float = 0.5,
+        shear: float = 0.0,
+        perspective: float = 0.0,
+        border: Tuple[int, int] = (0, 0),
+        pre_transform=None,
     ):
         """
         Initialize RandomPerspective object with transformation parameters.
@@ -1110,7 +1117,7 @@ class RandomPerspective:
         self.border = border  # mosaic border
         self.pre_transform = pre_transform
 
-    def affine_transform(self, img, border):
+    def affine_transform(self, img: np.ndarray, border: Tuple[int, int]) -> Tuple[np.ndarray, np.ndarray, float]:
         """
         Apply a sequence of affine transformations centered around the image center.
 
@@ -1174,7 +1181,7 @@ class RandomPerspective:
             img = img[..., None]
         return img, M, s
 
-    def apply_bboxes(self, bboxes, M):
+    def apply_bboxes(self, bboxes: np.ndarray, M: np.ndarray) -> np.ndarray:
         """
         Apply affine transformation to bounding boxes.
 
@@ -1182,12 +1189,12 @@ class RandomPerspective:
         transformation matrix.
 
         Args:
-            bboxes (
+            bboxes (np.ndarray): Bounding boxes in xyxy format with shape (N, 4), where N is the number
                 of bounding boxes.
-            M (
+            M (np.ndarray): Affine transformation matrix with shape (3, 3).
 
         Returns:
-            (
+            (np.ndarray): Transformed bounding boxes in xyxy format with shape (N, 4).
 
         Examples:
             >>> bboxes = torch.tensor([[10, 10, 20, 20], [30, 30, 40, 40]])
@@ -1208,7 +1215,7 @@ class RandomPerspective:
         y = xy[:, [1, 3, 5, 7]]
         return np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1)), dtype=bboxes.dtype).reshape(4, n).T
 
-    def apply_segments(self, segments, M):
+    def apply_segments(self, segments: np.ndarray, M: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         """
         Apply affine transformations to segments and generate new bounding boxes.
 
@@ -1244,7 +1251,7 @@ class RandomPerspective:
         segments[..., 1] = segments[..., 1].clip(bboxes[:, 1:2], bboxes[:, 3:4])
         return bboxes, segments
 
-    def apply_keypoints(self, keypoints, M):
+    def apply_keypoints(self, keypoints: np.ndarray, M: np.ndarray) -> np.ndarray:
         """
         Apply affine transformation to keypoints.
 
@@ -1278,7 +1285,7 @@ class RandomPerspective:
         visible[out_mask] = 0
         return np.concatenate([xy, visible], axis=-1).reshape(n, nkpt, 3)
 
-    def __call__(self, labels):
+    def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Apply random perspective and affine transformations to an image and its associated labels.
 
@@ -1287,7 +1294,7 @@ class RandomPerspective:
         and keypoints accordingly.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing image data and annotations.
                 Must include:
                     'img' (np.ndarray): The input image.
                     'cls' (np.ndarray): Class labels.
@@ -1296,7 +1303,7 @@ class RandomPerspective:
                     'mosaic_border' (Tuple[int, int]): Border size for mosaic augmentation.
 
         Returns:
-            (
+            (Dict[str, Any]): Transformed labels dictionary containing:
                 - 'img' (np.ndarray): The transformed image.
                 - 'cls' (np.ndarray): Updated class labels.
                 - 'instances' (Instances): Updated object instances.
@@ -1357,7 +1364,14 @@ class RandomPerspective:
         return labels
 
     @staticmethod
-    def box_candidates(
+    def box_candidates(
+        box1: np.ndarray,
+        box2: np.ndarray,
+        wh_thr: int = 2,
+        ar_thr: int = 100,
+        area_thr: float = 0.1,
+        eps: float = 1e-16,
+    ) -> np.ndarray:
         """
         Compute candidate boxes for further processing based on size and aspect ratio criteria.
 
@@ -1366,20 +1380,20 @@ class RandomPerspective:
         been overly distorted or reduced by the augmentation process.
 
         Args:
-            box1 (
+            box1 (np.ndarray): Original boxes before augmentation, shape (4, N) where n is the
                 number of boxes. Format is [x1, y1, x2, y2] in absolute coordinates.
-            box2 (
+            box2 (np.ndarray): Augmented boxes after transformation, shape (4, N). Format is
                 [x1, y1, x2, y2] in absolute coordinates.
-            wh_thr (
+            wh_thr (int): Width and height threshold in pixels. Boxes smaller than this in either
                 dimension are rejected.
-            ar_thr (
+            ar_thr (int): Aspect ratio threshold. Boxes with an aspect ratio greater than this
                 value are rejected.
             area_thr (float): Area ratio threshold. Boxes with an area ratio (new/old) less than
                 this value are rejected.
             eps (float): Small epsilon value to prevent division by zero.
 
         Returns:
-            (
+            (np.ndarray): Boolean array of shape (n) indicating which boxes are candidates.
                 True values correspond to boxes that meet all criteria.
 
         Examples:
```
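The `box_candidates` body is not included in this diff, but its contract is fully documented above: boxes of shape (4, N), pixel/aspect-ratio/area thresholds, boolean mask out. A sketch consistent with that contract (an illustration based on the documented semantics, not a verbatim copy of the package code):

```python
import numpy as np

def box_candidates(box1, box2, wh_thr=2, ar_thr=100, area_thr=0.1, eps=1e-16):
    """Boolean mask of augmented boxes that are still usable; box1/box2 have shape (4, N)."""
    w1, h1 = box1[2] - box1[0], box1[3] - box1[1]      # original widths and heights
    w2, h2 = box2[2] - box2[0], box2[3] - box2[1]      # augmented widths and heights
    ar = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps))  # aspect ratio of augmented boxes
    return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + eps) > area_thr) & (ar < ar_thr)
```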
```diff
@@ -1420,7 +1434,7 @@ class RandomHSV:
         >>> augmented_image = augmented_labels["img"]
     """
 
-    def __init__(self, hgain=0.5, sgain=0.5, vgain=0.5) -> None:
+    def __init__(self, hgain: float = 0.5, sgain: float = 0.5, vgain: float = 0.5) -> None:
         """
         Initialize the RandomHSV object for random HSV (Hue, Saturation, Value) augmentation.
 
@@ -1439,7 +1453,7 @@ class RandomHSV:
         self.sgain = sgain
         self.vgain = vgain
 
-    def __call__(self, labels):
+    def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Apply random HSV augmentation to an image within predefined limits.
 
@@ -1447,17 +1461,16 @@ class RandomHSV:
         The adjustments are made within the limits set by hgain, sgain, and vgain during initialization.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing image data and metadata. Must include an 'img' key with
                 the image as a numpy array.
 
         Returns:
-            (
-                with the HSV-augmented image.
+            (Dict[str, Any]): A dictionary containing the mixed image and adjusted labels.
 
         Examples:
             >>> hsv_augmenter = RandomHSV(hgain=0.5, sgain=0.5, vgain=0.5)
             >>> labels = {"img": np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)}
-            >>> hsv_augmenter(labels)
+            >>> labels = hsv_augmenter(labels)
             >>> augmented_img = labels["img"]
         """
         img = labels["img"]
```
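RandomHSV is one of the few transforms here that needs no dataset handle, so the docstring example above runs as-is; a standalone version, assuming `ultralytics` is installed:

```python
import numpy as np
from ultralytics.data.augment import RandomHSV

hsv_augmenter = RandomHSV(hgain=0.5, sgain=0.5, vgain=0.5)
labels = {"img": np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)}
labels = hsv_augmenter(labels)   # adjusts labels["img"] in place within the gain limits
augmented_img = labels["img"]
```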
```diff
@@ -1502,7 +1515,7 @@ class RandomFlip:
         >>> flipped_instances = result["instances"]
     """
 
-    def __init__(self, p=0.5, direction="horizontal", flip_idx=None) -> None:
+    def __init__(self, p: float = 0.5, direction: str = "horizontal", flip_idx: List[int] = None) -> None:
         """
         Initialize the RandomFlip class with probability and direction.
 
@@ -1528,7 +1541,7 @@ class RandomFlip:
         self.direction = direction
         self.flip_idx = flip_idx
 
-    def __call__(self, labels):
+    def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Apply random flip to an image and update any instances like bounding boxes or keypoints accordingly.
 
@@ -1537,14 +1550,14 @@ class RandomFlip:
         match the flipped image.
 
         Args:
-            labels (
-                'img' (
+            labels (Dict[str, Any]): A dictionary containing the following keys:
+                'img' (np.ndarray): The image to be flipped.
                 'instances' (ultralytics.utils.instance.Instances): An object containing bounding boxes and
                     optionally keypoints.
 
         Returns:
-            (
-                'img' (
+            (Dict[str, Any]): The same dictionary with the flipped image and updated instances:
+                'img' (np.ndarray): The flipped image.
                 'instances' (ultralytics.utils.instance.Instances): Updated instances matching the flipped image.
 
         Examples:
@@ -1600,7 +1613,15 @@ class LetterBox:
         >>> updated_instances = result["instances"]
     """
 
-    def __init__(
+    def __init__(
+        self,
+        new_shape: Tuple[int, int] = (640, 640),
+        auto: bool = False,
+        scale_fill: bool = False,
+        scaleup: bool = True,
+        center: bool = True,
+        stride: int = 32,
+    ):
         """
         Initialize LetterBox object for resizing and padding images.
 
@@ -1633,7 +1654,7 @@ class LetterBox:
         self.stride = stride
         self.center = center  # Put the image in the middle or top-left
 
-    def __call__(self, labels=None, image=None):
+    def __call__(self, labels: Dict[str, Any] = None, image: np.ndarray = None) -> Union[Dict[str, Any], np.ndarray]:
         """
         Resize and pad an image for object detection, instance segmentation, or pose estimation tasks.
 
@@ -1641,13 +1662,13 @@ class LetterBox:
         aspect ratio and adding padding to fit the new shape. It also updates any associated labels accordingly.
 
         Args:
-            labels (Dict | None): A dictionary containing image data and associated labels, or empty dict if None.
+            labels (Dict[str, Any] | None): A dictionary containing image data and associated labels, or empty dict if None.
             image (np.ndarray | None): The input image as a numpy array. If None, the image is taken from 'labels'.
 
         Returns:
-            (Dict |
-                updated labels, and additional metadata. If 'labels' is empty, returns
-                and padded image
+            (Dict[str, Any] | nd.ndarray): If 'labels' is provided, returns an updated dictionary with the resized and padded image,
+                updated labels, and additional metadata. If 'labels' is empty, returns the resized
+                and padded image.
 
         Examples:
             >>> letterbox = LetterBox(new_shape=(640, 640))
@@ -1710,7 +1731,7 @@ class LetterBox:
         return img
 
     @staticmethod
-    def _update_labels(labels, ratio, padw, padh):
+    def _update_labels(labels: Dict[str, Any], ratio: Tuple[float, float], padw: float, padh: float) -> Dict[str, Any]:
         """
         Update labels after applying letterboxing to an image.
 
@@ -1718,13 +1739,13 @@ class LetterBox:
         to account for resizing and padding applied during letterboxing.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing image labels and instances.
            ratio (Tuple[float, float]): Scaling ratios (width, height) applied to the image.
             padw (float): Padding width added to the image.
             padh (float): Padding height added to the image.
 
         Returns:
-            (
+            (Dict[str, Any]): Updated labels dictionary with modified instance coordinates.
 
         Examples:
             >>> letterbox = LetterBox(new_shape=(640, 640))
```
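Per the `__call__` signature above, LetterBox returns a bare array when called with only an image and a labels dict otherwise; a minimal image-only sketch, assuming `ultralytics` is installed:

```python
import numpy as np
from ultralytics.data.augment import LetterBox

letterbox = LetterBox(new_shape=(640, 640), scaleup=True, stride=32)
img = np.random.randint(0, 255, (480, 720, 3), dtype=np.uint8)  # arbitrary H x W input
padded = letterbox(image=img)  # no labels -> returns the resized and padded np.ndarray
print(padded.shape)            # (640, 640, 3)
```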
```diff
@@ -1764,18 +1785,18 @@ class CopyPaste(BaseMixTransform):
         >>> augmented_labels = copypaste(original_labels)
     """
 
-    def __init__(self, dataset=None, pre_transform=None, p=0.5, mode="flip") -> None:
+    def __init__(self, dataset=None, pre_transform=None, p: float = 0.5, mode: str = "flip") -> None:
         """Initialize CopyPaste object with dataset, pre_transform, and probability of applying MixUp."""
         super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
         assert mode in {"flip", "mixup"}, f"Expected `mode` to be `flip` or `mixup`, but got {mode}."
         self.mode = mode
 
-    def _mix_transform(self, labels):
+    def _mix_transform(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """Apply Copy-Paste augmentation to combine objects from another image into the current image."""
         labels2 = labels["mix_labels"][0]
         return self._transform(labels, labels2)
 
-    def __call__(self, labels):
+    def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """Apply Copy-Paste augmentation to an image and its labels."""
         if len(labels["instances"].segments) == 0 or self.p == 0:
             return labels
@@ -1802,7 +1823,7 @@ class CopyPaste(BaseMixTransform):
         labels.pop("mix_labels", None)
         return labels
 
-    def _transform(self, labels1, labels2={}):
+    def _transform(self, labels1: Dict[str, Any], labels2: Dict[str, Any] = {}) -> Dict[str, Any]:
         """Apply Copy-Paste augmentation to combine objects from another image into the current image."""
         im = labels1["img"]
         if "mosaic_border" not in labels1:
@@ -1866,7 +1887,7 @@ class Albumentations:
         - Spatial transforms are handled differently and require special processing for bounding boxes.
     """
 
-    def __init__(self, p=1.0):
+    def __init__(self, p: float = 1.0) -> None:
         """
         Initialize the Albumentations transform object for YOLO bbox formatted parameters.
 
@@ -1980,7 +2001,7 @@ class Albumentations:
         except Exception as e:
             LOGGER.info(f"{prefix}{e}")
 
-    def __call__(self, labels):
+    def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Apply Albumentations transformations to input labels.
 
@@ -1988,13 +2009,13 @@ class Albumentations:
         spatial and non-spatial transformations on the input image and its corresponding labels.
 
         Args:
-            labels (
-                - 'img':
-                - 'cls':
+            labels (Dict[str, Any]): A dictionary containing image data and annotations. Expected keys are:
+                - 'img': np.ndarray representing the image
+                - 'cls': np.ndarray of class labels
                 - 'instances': object containing bounding boxes and other instance information
 
         Returns:
-            (
+            (Dict[str, Any]): The input dictionary with augmented image and updated annotations.
 
         Examples:
             >>> transform = Albumentations(p=0.5)
@@ -2069,15 +2090,15 @@ class Format:
 
     def __init__(
         self,
-        bbox_format="xywh",
-        normalize=True,
-        return_mask=False,
-        return_keypoint=False,
-        return_obb=False,
-        mask_ratio=4,
-        mask_overlap=True,
-        batch_idx=True,
-        bgr=0.0,
+        bbox_format: str = "xywh",
+        normalize: bool = True,
+        return_mask: bool = False,
+        return_keypoint: bool = False,
+        return_obb: bool = False,
+        mask_ratio: int = 4,
+        mask_overlap: bool = True,
+        batch_idx: bool = True,
+        bgr: float = 0.0,
     ):
         """
         Initialize the Format class with given parameters for image and instance annotation formatting.
@@ -2122,7 +2143,7 @@ class Format:
         self.batch_idx = batch_idx  # keep the batch indexes
         self.bgr = bgr
 
-    def __call__(self, labels):
+    def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Format image annotations for object detection, instance segmentation, and pose estimation tasks.
 
@@ -2131,13 +2152,13 @@ class Format:
         applying normalization if required.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing image and annotation data with the following keys:
                 - 'img': The input image as a numpy array.
                 - 'cls': Class labels for instances.
                 - 'instances': An Instances object containing bounding boxes, segments, and keypoints.
 
         Returns:
-            (
+            (Dict[str, Any]): A dictionary with formatted data, including:
                 - 'img': Formatted image tensor.
                 - 'cls': Class label's tensor.
                 - 'bboxes': Bounding boxes tensor in the specified format.
@@ -2191,7 +2212,7 @@ class Format:
         labels["batch_idx"] = torch.zeros(nl)
         return labels
 
-    def _format_img(self, img):
+    def _format_img(self, img: np.ndarray) -> torch.Tensor:
         """
         Format an image for YOLO from a Numpy array to a PyTorch tensor.
 
@@ -2222,20 +2243,22 @@ class Format:
         img = torch.from_numpy(img)
         return img
 
-    def _format_segments(
+    def _format_segments(
+        self, instances: Instances, cls: np.ndarray, w: int, h: int
+    ) -> Tuple[np.ndarray, Instances, np.ndarray]:
         """
         Convert polygon segments to bitmap masks.
 
         Args:
             instances (Instances): Object containing segment information.
-            cls (
+            cls (np.ndarray): Class labels for each instance.
             w (int): Width of the image.
             h (int): Height of the image.
 
         Returns:
-            masks (
+            masks (np.ndarray): Bitmap masks with shape (N, H, W) or (1, H, W) if mask_overlap is True.
             instances (Instances): Updated instances object with sorted segments if mask_overlap is True.
-            cls (
+            cls (np.ndarray): Updated class labels, sorted if mask_overlap is True.
 
         Notes:
             - If self.mask_overlap is True, masks are overlapped and sorted by area.
@@ -2257,7 +2280,7 @@ class Format:
 class LoadVisualPrompt:
     """Create visual prompts from bounding boxes or masks for model input."""
 
-    def __init__(self, scale_factor=1 / 8):
+    def __init__(self, scale_factor: float = 1 / 8) -> None:
         """
         Initialize the LoadVisualPrompt with a scale factor.
 
@@ -2266,7 +2289,7 @@ class LoadVisualPrompt:
         """
         self.scale_factor = scale_factor
 
-    def make_mask(self, boxes, h, w):
+    def make_mask(self, boxes: torch.Tensor, h: int, w: int) -> torch.Tensor:
         """
         Create binary masks from bounding boxes.
 
@@ -2284,15 +2307,15 @@ class LoadVisualPrompt:
 
         return (r >= x1) * (r < x2) * (c >= y1) * (c < y2)
 
-    def __call__(self, labels):
+    def __call__(self, labels: Dict[str, Any]) -> Dict[str, Any]:
         """
         Process labels to create visual prompts.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): Dictionary containing image data and annotations.
 
         Returns:
-            (
+            (Dict[str, Any]): Updated labels with visual prompts added.
         """
         imgsz = labels["img"].shape[1:]
         bboxes, masks = None, None
@@ -2305,13 +2328,19 @@ class LoadVisualPrompt:
         labels["visuals"] = visuals
         return labels
 
-    def get_visuals(
+    def get_visuals(
+        self,
+        category: Union[int, np.ndarray, torch.Tensor],
+        shape: Tuple[int, int],
+        bboxes: Union[np.ndarray, torch.Tensor] = None,
+        masks: Union[np.ndarray, torch.Tensor] = None,
+    ) -> torch.Tensor:
         """
         Generate visual masks based on bounding boxes or masks.
 
         Args:
             category (int | np.ndarray | torch.Tensor): The category labels for the objects.
-            shape (
+            shape (Tuple[int, int]): The shape of the image (height, width).
             bboxes (np.ndarray | torch.Tensor, optional): Bounding boxes for the objects, xyxy format.
             masks (np.ndarray | torch.Tensor, optional): Masks for the objects.
 
@@ -2429,10 +2458,10 @@ class RandomLoadText:
         new sampled text order.
 
         Args:
-            labels (
+            labels (Dict[str, Any]): A dictionary containing image labels and metadata. Must include 'texts' and 'cls' keys.
 
         Returns:
-            (
+            (Dict[str, Any]): Updated labels dictionary with new 'cls' and 'texts' entries.
 
         Examples:
             >>> loader = RandomLoadText(prompt_format="A photo of {}", neg_samples=(5, 10), max_samples=20)
@@ -2486,7 +2515,7 @@ class RandomLoadText:
         return labels
 
 
-def v8_transforms(dataset, imgsz, hyp, stretch=False):
+def v8_transforms(dataset, imgsz: int, hyp: IterableSimpleNamespace, stretch: bool = False):
     """
     Apply a series of image transformations for training.
 
@@ -2496,7 +2525,7 @@ def v8_transforms(dataset, imgsz, hyp, stretch=False):
     Args:
         dataset (Dataset): The dataset object containing image data and annotations.
        imgsz (int): The target image size for resizing.
-        hyp (
+        hyp (IterableSimpleNamespace): A dictionary of hyperparameters controlling various aspects of the transformations.
         stretch (bool): If True, applies stretching to the image. If False, uses LetterBox resizing.
 
     Returns:
@@ -2556,11 +2585,11 @@ def v8_transforms(dataset, imgsz, hyp, stretch=False):
 
 # Classification augmentations -----------------------------------------------------------------------------------------
 def classify_transforms(
-    size=224,
-    mean=DEFAULT_MEAN,
-    std=DEFAULT_STD,
-    interpolation="BILINEAR",
-    crop_fraction=None,
+    size: Union[Tuple[int, int], int] = 224,
+    mean: Tuple[float, float, float] = DEFAULT_MEAN,
+    std: Tuple[float, float, float] = DEFAULT_STD,
+    interpolation: str = "BILINEAR",
+    crop_fraction: float = None,
 ):
     """
     Create a composition of image transforms for classification tasks.
@@ -2572,8 +2601,8 @@ def classify_transforms(
     Args:
         size (int | tuple): The target size for the transformed image. If an int, it defines the shortest edge. If a
            tuple, it defines (height, width).
-        mean (
-        std (
+        mean (Tuple[float, float, float]): Mean values for each RGB channel used in normalization.
+        std (Tuple[float, float, float]): Standard deviation values for each RGB channel used in normalization.
        interpolation (str): Interpolation method of either 'NEAREST', 'BILINEAR' or 'BICUBIC'.
         crop_fraction (float): Deprecated, will be removed in a future version.
 
```
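`classify_transforms` returns a torchvision `Compose`, so it can be applied directly to a PIL image; a quick sketch, assuming `ultralytics` is installed (the 224x224 output shape follows from `size=224`):

```python
import numpy as np
from PIL import Image
from ultralytics.data.augment import classify_transforms

tfm = classify_transforms(size=224)  # resize + center crop + tensor + normalize
img = Image.fromarray(np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8))
x = tfm(img)
print(x.shape)  # torch.Size([3, 224, 224])
```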
```diff
@@ -2607,20 +2636,20 @@ def classify_transforms(
 
 # Classification training augmentations --------------------------------------------------------------------------------
 def classify_augmentations(
-    size=224,
-    mean=DEFAULT_MEAN,
-    std=DEFAULT_STD,
-    scale=None,
-    ratio=None,
-    hflip=0.5,
-    vflip=0.0,
-    auto_augment=None,
-    hsv_h=0.015,  # image HSV-Hue augmentation (fraction)
-    hsv_s=0.4,  # image HSV-Saturation augmentation (fraction)
-    hsv_v=0.4,  # image HSV-Value augmentation (fraction)
-    force_color_jitter=False,
-    erasing=0.0,
-    interpolation="BILINEAR",
+    size: int = 224,
+    mean: Tuple[float, float, float] = DEFAULT_MEAN,
+    std: Tuple[float, float, float] = DEFAULT_STD,
+    scale: Tuple[float, float] = None,
+    ratio: Tuple[float, float] = None,
+    hflip: float = 0.5,
+    vflip: float = 0.0,
+    auto_augment: str = None,
+    hsv_h: float = 0.015,  # image HSV-Hue augmentation (fraction)
+    hsv_s: float = 0.4,  # image HSV-Saturation augmentation (fraction)
+    hsv_v: float = 0.4,  # image HSV-Value augmentation (fraction)
+    force_color_jitter: bool = False,
+    erasing: float = 0.0,
+    interpolation: str = "BILINEAR",
 ):
     """
     Create a composition of image augmentation transforms for classification tasks.
@@ -2630,10 +2659,10 @@ def classify_augmentations(
 
     Args:
         size (int): Target size for the image after transformations.
-        mean (
-        std (
-        scale (
-        ratio (
+        mean (Tuple[float, float, float]): Mean values for each RGB channel used in normalization.
+        std (Tuple[float, float, float]): Standard deviation values for each RGB channel used in normalization.
+        scale (Tuple[float, float] | None): Range of size of the origin size cropped.
+        ratio (Tuple[float, float] | None): Range of aspect ratio of the origin aspect ratio cropped.
         hflip (float): Probability of horizontal flip.
         vflip (float): Probability of vertical flip.
         auto_augment (str | None): Auto augmentation policy. Can be 'randaugment', 'augmix', 'autoaugment' or None.
@@ -2655,7 +2684,7 @@ def classify_augmentations(
     import torchvision.transforms as T  # scope for faster 'import ultralytics'
 
     if not isinstance(size, int):
-        raise TypeError(f"
+        raise TypeError(f"classify_augmentations() size {size} must be integer, not (list, tuple)")
     scale = tuple(scale or (0.08, 1.0))  # default imagenet scale range
     ratio = tuple(ratio or (3.0 / 4.0, 4.0 / 3.0))  # default imagenet ratio range
     interpolation = getattr(T.InterpolationMode, interpolation)
@@ -2734,7 +2763,7 @@ class ClassifyLetterBox:
         (640, 640, 3)
     """
 
-    def __init__(self, size=(640, 640), auto=False, stride=32):
+    def __init__(self, size: Union[int, Tuple[int, int]] = (640, 640), auto: bool = False, stride: int = 32):
         """
         Initialize the ClassifyLetterBox object for image preprocessing.
 
@@ -2765,7 +2794,7 @@ class ClassifyLetterBox:
         self.auto = auto  # pass max size integer, automatically solve for short side using stride
         self.stride = stride  # used with auto
 
-    def __call__(self, im):
+    def __call__(self, im: np.ndarray) -> np.ndarray:
         """
         Resize and pad an image using the letterbox method.
 
@@ -2773,10 +2802,10 @@ class ClassifyLetterBox:
         then pads the resized image to match the target size.
 
         Args:
-            im (
+            im (np.ndarray): Input image as a numpy array with shape (H, W, C).
 
         Returns:
-            (
+            (np.ndarray): Resized and padded image as a numpy array with shape (hs, ws, 3), where hs and ws are
                 the target height and width respectively.
 
         Examples:
```
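ClassifyLetterBox operates on bare numpy arrays rather than the labels dict; a standalone sketch matching the `(640, 640, 3)` output shown in the class docstring, assuming `ultralytics` is installed:

```python
import numpy as np
from ultralytics.data.augment import ClassifyLetterBox

letterbox = ClassifyLetterBox(size=(640, 640))
im = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
out = letterbox(im)  # aspect-preserving resize, then pad to the target size
print(out.shape)     # (640, 640, 3)
```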
```diff
@@ -2823,7 +2852,7 @@ class CenterCrop:
         (640, 640, 3)
     """
 
-    def __init__(self, size=640):
+    def __init__(self, size: Union[int, Tuple[int, int]] = (640, 640)):
         """
         Initialize the CenterCrop object for image preprocessing.
 
@@ -2847,7 +2876,7 @@ class CenterCrop:
         super().__init__()
         self.h, self.w = (size, size) if isinstance(size, int) else size
 
-    def __call__(self, im):
+    def __call__(self, im: Union[Image.Image, np.ndarray]) -> np.ndarray:
         """
         Apply center cropping to an input image.
 
@@ -2855,11 +2884,11 @@ class CenterCrop:
         ratio of the original image while fitting it into the specified dimensions.
 
         Args:
-            im (
+            im (np.ndarray | PIL.Image.Image): The input image as a numpy array of shape (H, W, C) or a
                 PIL Image object.
 
         Returns:
-            (
+            (np.ndarray): The center-cropped and resized image as a numpy array of shape (self.h, self.w, C).
 
         Examples:
             >>> transform = CenterCrop(size=224)
```
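Note that the first hunk above also changes the default `size` from `640` to `(640, 640)`, which is behaviorally equivalent since an int is expanded to a square. Usage per the docstring example, assuming `ultralytics` is installed:

```python
import numpy as np
from ultralytics.data.augment import CenterCrop

transform = CenterCrop(size=224)  # an int becomes (224, 224)
im = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
out = transform(im)  # center crop that preserves aspect ratio, then resize
print(out.shape)     # (224, 224, 3)
```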
```diff
@@ -2900,7 +2929,7 @@ class ToTensor:
         The output tensor will be in RGB format with shape (C, H, W), normalized to [0, 1].
     """
 
-    def __init__(self, half=False):
+    def __init__(self, half: bool = False):
         """
         Initialize the ToTensor object for converting images to PyTorch tensors.
 
@@ -2921,7 +2950,7 @@ class ToTensor:
         super().__init__()
         self.half = half
 
-    def __call__(self, im):
+    def __call__(self, im: np.ndarray) -> torch.Tensor:
         """
         Transform an image from a numpy array to a PyTorch tensor.
 
@@ -2930,7 +2959,7 @@ class ToTensor:
         the color channels are reversed from BGR to RGB.
 
         Args:
-            im (
+            im (np.ndarray): Input image as a numpy array with shape (H, W, C) in RGB order.
 
         Returns:
             (torch.Tensor): The transformed image as a PyTorch tensor in float32 or float16, normalized
```