ultralytics 8.3.163__py3-none-any.whl → 8.3.165__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. ultralytics/__init__.py +1 -1
  2. ultralytics/cfg/datasets/HomeObjects-3K.yaml +2 -3
  3. ultralytics/cfg/datasets/african-wildlife.yaml +3 -3
  4. ultralytics/cfg/datasets/brain-tumor.yaml +3 -4
  5. ultralytics/cfg/datasets/carparts-seg.yaml +4 -4
  6. ultralytics/cfg/datasets/crack-seg.yaml +4 -4
  7. ultralytics/cfg/datasets/dog-pose.yaml +2 -2
  8. ultralytics/cfg/datasets/hand-keypoints.yaml +2 -2
  9. ultralytics/cfg/datasets/medical-pills.yaml +2 -3
  10. ultralytics/cfg/datasets/package-seg.yaml +4 -4
  11. ultralytics/cfg/datasets/signature.yaml +3 -3
  12. ultralytics/cfg/datasets/tiger-pose.yaml +3 -3
  13. ultralytics/data/augment.py +182 -153
  14. ultralytics/data/build.py +23 -3
  15. ultralytics/data/dataset.py +6 -2
  16. ultralytics/data/loaders.py +2 -2
  17. ultralytics/data/utils.py +9 -7
  18. ultralytics/engine/exporter.py +7 -3
  19. ultralytics/engine/results.py +42 -42
  20. ultralytics/models/fastsam/model.py +1 -1
  21. ultralytics/models/fastsam/predict.py +1 -1
  22. ultralytics/models/sam/model.py +4 -4
  23. ultralytics/models/sam/modules/blocks.py +5 -5
  24. ultralytics/models/sam/modules/memory_attention.py +19 -19
  25. ultralytics/models/sam/modules/transformer.py +24 -22
  26. ultralytics/models/yolo/detect/val.py +2 -2
  27. ultralytics/models/yolo/world/train_world.py +9 -1
  28. ultralytics/solutions/distance_calculation.py +1 -1
  29. ultralytics/solutions/instance_segmentation.py +2 -2
  30. ultralytics/solutions/object_blurrer.py +2 -2
  31. ultralytics/solutions/object_counter.py +2 -2
  32. ultralytics/solutions/object_cropper.py +1 -1
  33. ultralytics/solutions/queue_management.py +1 -1
  34. ultralytics/solutions/security_alarm.py +2 -2
  35. ultralytics/solutions/templates/similarity-search.html +0 -24
  36. ultralytics/solutions/vision_eye.py +1 -1
  37. ultralytics/utils/benchmarks.py +2 -2
  38. ultralytics/utils/export.py +0 -2
  39. ultralytics/utils/instance.py +32 -25
  40. ultralytics/utils/ops.py +8 -8
  41. {ultralytics-8.3.163.dist-info → ultralytics-8.3.165.dist-info}/METADATA +1 -1
  42. {ultralytics-8.3.163.dist-info → ultralytics-8.3.165.dist-info}/RECORD +46 -46
  43. {ultralytics-8.3.163.dist-info → ultralytics-8.3.165.dist-info}/WHEEL +0 -0
  44. {ultralytics-8.3.163.dist-info → ultralytics-8.3.165.dist-info}/entry_points.txt +0 -0
  45. {ultralytics-8.3.163.dist-info → ultralytics-8.3.165.dist-info}/licenses/LICENSE +0 -0
  46. {ultralytics-8.3.163.dist-info → ultralytics-8.3.165.dist-info}/top_level.txt +0 -0
ultralytics/data/build.py CHANGED
@@ -3,13 +3,14 @@
 import os
 import random
 from pathlib import Path
-from typing import Any, Iterator
+from typing import Any, Dict, Iterator
 
 import numpy as np
 import torch
 from PIL import Image
 from torch.utils.data import dataloader, distributed
 
+from ultralytics.cfg import IterableSimpleNamespace
 from ultralytics.data.dataset import GroundingDataset, YOLODataset, YOLOMultiModalDataset
 from ultralytics.data.loaders import (
     LOADERS,
@@ -111,7 +112,16 @@ def seed_worker(worker_id: int): # noqa
     random.seed(worker_seed)
 
 
-def build_yolo_dataset(cfg, img_path, batch, data, mode="train", rect=False, stride=32, multi_modal=False):
+def build_yolo_dataset(
+    cfg: IterableSimpleNamespace,
+    img_path: str,
+    batch: int,
+    data: Dict[str, Any],
+    mode: str = "train",
+    rect: bool = False,
+    stride: int = 32,
+    multi_modal: bool = False,
+):
     """Build and return a YOLO dataset based on configuration parameters."""
     dataset = YOLOMultiModalDataset if multi_modal else YOLODataset
     return dataset(
@@ -133,11 +143,21 @@ def build_yolo_dataset(cfg, img_path, batch, data, mode="train", rect=False, str
     )
 
 
-def build_grounding(cfg, img_path, json_file, batch, mode="train", rect=False, stride=32):
+def build_grounding(
+    cfg: IterableSimpleNamespace,
+    img_path: str,
+    json_file: str,
+    batch: int,
+    mode: str = "train",
+    rect: bool = False,
+    stride: int = 32,
+    max_samples: int = 80,
+):
     """Build and return a GroundingDataset based on configuration parameters."""
     return GroundingDataset(
         img_path=img_path,
         json_file=json_file,
+        max_samples=max_samples,
         imgsz=cfg.imgsz,
         batch_size=batch,
         augment=mode == "train",  # augmentation
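
For orientation, a minimal sketch of calling the newly typed build_grounding with its max_samples argument; the cfg comes from get_cfg() defaults and both dataset paths are hypothetical placeholders:

    # Minimal sketch, assuming default training hyperparameters; paths are placeholders.
    from ultralytics.cfg import get_cfg
    from ultralytics.data.build import build_grounding

    cfg = get_cfg()  # IterableSimpleNamespace of default train settings
    dataset = build_grounding(
        cfg,
        img_path="datasets/grounding/images",  # hypothetical path
        json_file="datasets/grounding/annotations.json",  # hypothetical path
        batch=16,
        mode="train",
        max_samples=60,  # new argument, forwarded to GroundingDataset
    )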
ultralytics/data/dataset.py CHANGED
@@ -411,6 +411,7 @@ class YOLOMultiModalDataset(YOLODataset):
     @staticmethod
     def _get_neg_texts(category_freq: Dict, threshold: int = 100) -> List[str]:
         """Get negative text samples based on frequency threshold."""
+        threshold = min(max(category_freq.values()), 100)
         return [k for k, v in category_freq.items() if v >= threshold]
 
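The inserted line makes the negative-text cutoff adaptive: the effective threshold becomes the highest observed category frequency, capped at 100, so the most frequent categories still qualify in small datasets where nothing reaches 100. A standalone illustration with hypothetical counts:

    # Hypothetical category frequencies to show the new adaptive threshold.
    category_freq = {"dog": 40, "cat": 25, "horse": 7}

    old_neg = [k for k, v in category_freq.items() if v >= 100]  # [] -- fixed cutoff excludes everything
    threshold = min(max(category_freq.values()), 100)  # min(40, 100) = 40
    new_neg = [k for k, v in category_freq.items() if v >= threshold]  # ["dog"]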
 
@@ -434,18 +435,20 @@ class GroundingDataset(YOLODataset):
         >>> len(dataset)  # Number of valid images with annotations
     """
 
-    def __init__(self, *args, task: str = "detect", json_file: str = "", **kwargs):
+    def __init__(self, *args, task: str = "detect", json_file: str = "", max_samples: int = 80, **kwargs):
         """
         Initialize a GroundingDataset for object detection.
 
         Args:
             json_file (str): Path to the JSON file containing annotations.
             task (str): Must be 'detect' or 'segment' for GroundingDataset.
+            max_samples (int): Maximum number of samples to load for text augmentation.
             *args (Any): Additional positional arguments for the parent class.
             **kwargs (Any): Additional keyword arguments for the parent class.
         """
         assert task in {"detect", "segment"}, "GroundingDataset currently only supports `detect` and `segment` tasks"
         self.json_file = json_file
+        self.max_samples = max_samples
         super().__init__(*args, task=task, data={"channels": 3}, **kwargs)
 
     def get_img_files(self, img_path: str) -> List:
@@ -625,7 +628,7 @@
         # the strategy of selecting negative is restricted in one dataset,
         # while official pre-saved neg embeddings from all datasets at once.
         transform = RandomLoadText(
-            max_samples=80,
+            max_samples=min(self.max_samples, 80),
             padding=True,
             padding_value=self._get_neg_texts(self.category_freq),
         )
@@ -651,6 +654,7 @@
     @staticmethod
     def _get_neg_texts(category_freq: Dict, threshold: int = 100) -> List[str]:
         """Get negative text samples based on frequency threshold."""
+        threshold = min(max(category_freq.values()), 100)
         return [k for k, v in category_freq.items() if v >= threshold]
 
ultralytics/data/loaders.py CHANGED
@@ -451,9 +451,9 @@ class LoadImagesAndVideos:
             self.mode = "image"
             if path.rpartition(".")[-1].lower() == "heic":
                 # Load HEIC image using Pillow with pillow-heif
-                check_requirements("pillow-heif")
+                check_requirements("pi-heif")
 
-                from pillow_heif import register_heif_opener
+                from pi_heif import register_heif_opener
 
                 register_heif_opener()  # Register HEIF opener with Pillow
                 with Image.open(path) as img:
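
This swaps the HEIC dependency from pillow-heif to pi-heif, a lighter fork that exposes the same opener API. A minimal standalone sketch, assuming pip install pi-heif; the filename is a placeholder:

    # Minimal sketch, assuming `pip install pi-heif`; sample.heic is a placeholder.
    from PIL import Image
    from pi_heif import register_heif_opener

    register_heif_opener()  # lets Pillow open HEIF/HEIC files transparently
    with Image.open("sample.heic") as img:
        img.convert("RGB").save("sample.jpg")  # re-encode for tools without HEIC support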
ultralytics/data/utils.py CHANGED
@@ -9,7 +9,7 @@ import zipfile
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
 from tarfile import is_tarfile
-from typing import Dict, List, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union
 
 import cv2
 import numpy as np
@@ -284,7 +284,7 @@ def visualize_image_annotations(image_path: str, txt_path: str, label_map: Dict[
         w = width * img_width
         h = height * img_height
         annotations.append((x, y, w, h, int(class_id)))
-    fig, ax = plt.subplots(1)  # Plot the image and annotations
+    _, ax = plt.subplots(1)  # Plot the image and annotations
     for x, y, w, h, label in annotations:
         color = tuple(c / 255 for c in colors(label, True))  # Get and normalize the RGB color
         rect = plt.Rectangle((x, y), w, h, linewidth=2, edgecolor=color, facecolor="none")  # Create a rectangle
@@ -384,7 +384,7 @@ def find_dataset_yaml(path: Path) -> Path:
     return files[0]
 
 
-def check_det_dataset(dataset: str, autodownload: bool = True) -> Dict:
+def check_det_dataset(dataset: str, autodownload: bool = True) -> Dict[str, Any]:
     """
     Download, verify, and/or unzip a dataset if not found locally.
 
@@ -397,7 +397,7 @@ def check_det_dataset(dataset: str, autodownload: bool = True) -> Dict:
         autodownload (bool, optional): Whether to automatically download the dataset if not found.
 
     Returns:
-        (Dict): Parsed dataset information and paths.
+        (Dict[str, Any]): Parsed dataset information and paths.
     """
    file = check_file(dataset)
 
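The Dict[str, Any] annotations only tighten the documented return type; the returned structure is unchanged. A hedged sketch of consuming it, using the small built-in COCO8 sample:

    # Hedged sketch: inspecting the dict returned by check_det_dataset.
    from ultralytics.data.utils import check_det_dataset

    data = check_det_dataset("coco8.yaml")  # auto-downloads the sample dataset if missing
    print(data["train"], data["val"])  # resolved split paths
    print(data["nc"], data["names"])  # class count and {index: name} mapping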
 
@@ -479,7 +479,7 @@ def check_det_dataset(dataset: str, autodownload: bool = True) -> Dict:
     return data  # dictionary
 
 
-def check_cls_dataset(dataset: Union[str, Path], split: str = "") -> Dict:
+def check_cls_dataset(dataset: Union[str, Path], split: str = "") -> Dict[str, Any]:
     """
     Check a classification dataset such as Imagenet.
 
@@ -491,13 +491,13 @@ def check_cls_dataset(dataset: Union[str, Path], split: str = "") -> Dict:
         split (str, optional): The split of the dataset. Either 'val', 'test', or ''.
 
     Returns:
-        (Dict): A dictionary containing the following keys:
+        (Dict[str, Any]): A dictionary containing the following keys:
 
             - 'train' (Path): The directory path containing the training set of the dataset.
             - 'val' (Path): The directory path containing the validation set of the dataset.
             - 'test' (Path): The directory path containing the test set of the dataset.
             - 'nc' (int): The number of classes in the dataset.
-            - 'names' (Dict): A dictionary of class names in the dataset.
+            - 'names' (Dict[int, str]): A dictionary of class names in the dataset.
     """
     # Download (optional if dataset=https://file.zip is passed directly)
     if str(dataset).startswith(("http:/", "https:/")):
@@ -535,6 +535,8 @@ def check_cls_dataset(dataset: Union[str, Path], split: str = "") -> Dict[str, Any]:
         if (data_dir / "val").exists()
         else data_dir / "validation"
         if (data_dir / "validation").exists()
+        else data_dir / "valid"
+        if (data_dir / "valid").exists()
         else None
     )  # data/test or data/val
     test_set = data_dir / "test" if (data_dir / "test").exists() else None  # data/val or data/test
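
The new branch teaches check_cls_dataset to also recognize a valid/ directory (the naming used by Roboflow-style exports) when resolving the validation split. The resolution order, extracted into a standalone sketch with a hypothetical dataset root:

    # Standalone sketch of the split-resolution order; data_dir is hypothetical.
    from pathlib import Path

    data_dir = Path("datasets/my-classification-set")
    val_set = next(
        (data_dir / name for name in ("val", "validation", "valid") if (data_dir / name).exists()),
        None,  # no validation split found
    )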
ultralytics/engine/exporter.py CHANGED
@@ -294,10 +294,10 @@ class Exporter:
 
         # Device
         dla = None
-        if fmt == "engine" and self.args.device is None:
+        if engine and self.args.device is None:
             LOGGER.warning("TensorRT requires GPU export, automatically assigning device=0")
             self.args.device = "0"
-        if fmt == "engine" and "dla" in str(self.args.device):  # convert int/list to str first
+        if engine and "dla" in str(self.args.device):  # convert int/list to str first
             dla = self.args.device.rsplit(":", 1)[-1]
             self.args.device = "0"  # update device to "0"
             assert dla in {"0", "1"}, f"Expected self.args.device='dla:0' or 'dla:1, but got {self.args.device}."
@@ -348,6 +348,10 @@
             LOGGER.warning("'nms=True' is not available for end2end models. Forcing 'nms=False'.")
             self.args.nms = False
         self.args.conf = self.args.conf or 0.25  # set conf default value for nms export
+        if (engine or self.args.nms) and self.args.dynamic and self.args.batch == 1:
+            LOGGER.warning(
+                f"'dynamic=True' model with '{'nms=True' if self.args.nms else 'format=engine'}' requires max batch size, i.e. 'batch=16'"
+            )
         if edgetpu:
             if not LINUX or ARM64:
                 raise SystemError(
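The added warning fires when a dynamic model is exported at the default batch=1, since for dynamic TensorRT or NMS exports the batch argument acts as the maximum batch size. A short usage sketch:

    # Short sketch: exporting with dynamic shapes and an explicit max batch size.
    from ultralytics import YOLO

    model = YOLO("yolo11n.pt")
    # batch sets the largest batch the dynamic engine will accept;
    # leaving it at the default 1 now triggers the warning above.
    model.export(format="engine", dynamic=True, batch=16)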
@@ -516,7 +520,7 @@
                 f"work. Use export 'imgsz={max(self.imgsz)}' if val is required."
             )
         imgsz = self.imgsz[0] if square else str(self.imgsz)[1:-1].replace(" ", "")
-        predict_data = f"data={data}" if model.task == "segment" and fmt == "pb" else ""
+        predict_data = f"data={data}" if model.task == "segment" and pb else ""
         q = "int8" if self.args.int8 else "half" if self.args.half else ""  # quantization
         LOGGER.info(
             f"\nExport complete ({time.time() - t:.1f}s)"
ultralytics/engine/results.py CHANGED
@@ -196,7 +196,7 @@ class Results(SimpleClass, DataExportMixin):
     It supports visualization, data export, and various coordinate transformations.
 
     Attributes:
-        orig_img (numpy.ndarray): The original image as a numpy array.
+        orig_img (np.ndarray): The original image as a numpy array.
         orig_shape (Tuple[int, int]): Original image shape in (height, width) format.
         boxes (Boxes | None): Detected bounding boxes.
         masks (Masks | None): Segmentation masks.
@@ -254,7 +254,7 @@ class Results(SimpleClass, DataExportMixin):
         Initialize the Results class for storing and manipulating inference results.
 
         Args:
-            orig_img (numpy.ndarray): The original image as a numpy array.
+            orig_img (np.ndarray): The original image as a numpy array.
             path (str): The path to the image file.
             names (dict): A dictionary of class names.
             boxes (torch.Tensor | None): A 2D tensor of bounding box coordinates for each detection.
@@ -862,16 +862,16 @@ class Boxes(BaseTensor):
     methods for easy manipulation and conversion between different coordinate systems.
 
     Attributes:
-        data (torch.Tensor | numpy.ndarray): The raw tensor containing detection boxes and associated data.
+        data (torch.Tensor | np.ndarray): The raw tensor containing detection boxes and associated data.
         orig_shape (Tuple[int, int]): The original image dimensions (height, width).
         is_track (bool): Indicates whether tracking IDs are included in the box data.
-        xyxy (torch.Tensor | numpy.ndarray): Boxes in [x1, y1, x2, y2] format.
-        conf (torch.Tensor | numpy.ndarray): Confidence scores for each box.
-        cls (torch.Tensor | numpy.ndarray): Class labels for each box.
+        xyxy (torch.Tensor | np.ndarray): Boxes in [x1, y1, x2, y2] format.
+        conf (torch.Tensor | np.ndarray): Confidence scores for each box.
+        cls (torch.Tensor | np.ndarray): Class labels for each box.
         id (torch.Tensor | None): Tracking IDs for each box (if available).
-        xywh (torch.Tensor | numpy.ndarray): Boxes in [x, y, width, height] format.
-        xyxyn (torch.Tensor | numpy.ndarray): Normalized [x1, y1, x2, y2] boxes relative to orig_shape.
-        xywhn (torch.Tensor | numpy.ndarray): Normalized [x, y, width, height] boxes relative to orig_shape.
+        xywh (torch.Tensor | np.ndarray): Boxes in [x, y, width, height] format.
+        xyxyn (torch.Tensor | np.ndarray): Normalized [x1, y1, x2, y2] boxes relative to orig_shape.
+        xywhn (torch.Tensor | np.ndarray): Normalized [x, y, width, height] boxes relative to orig_shape.
 
     Methods:
         cpu: Return a copy of the object with all tensors on CPU memory.
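
These docstring edits standardize the type names (numpy.ndarray to np.ndarray) without touching behavior. For reference, a short sketch of the documented Boxes accessors:

    # Short sketch of the Boxes properties documented above.
    from ultralytics import YOLO

    results = YOLO("yolo11n.pt")("https://ultralytics.com/images/bus.jpg")
    boxes = results[0].boxes
    print(boxes.xyxy)   # (N, 4) pixel-space [x1, y1, x2, y2]
    print(boxes.conf)   # (N,) confidence scores
    print(boxes.cls)    # (N,) class indices
    print(boxes.xywhn)  # (N, 4) normalized [x, y, w, h]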
@@ -931,7 +931,7 @@ class Boxes(BaseTensor):
         Return bounding boxes in [x1, y1, x2, y2] format.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or numpy array of shape (n, 4) containing bounding box
+            (torch.Tensor | np.ndarray): A tensor or numpy array of shape (n, 4) containing bounding box
                 coordinates in [x1, y1, x2, y2] format, where n is the number of boxes.
 
         Examples:
@@ -948,7 +948,7 @@ class Boxes(BaseTensor):
         Return the confidence scores for each detection box.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A 1D tensor or array containing confidence scores for each detection,
+            (torch.Tensor | np.ndarray): A 1D tensor or array containing confidence scores for each detection,
                 with shape (N,) where N is the number of detections.
 
         Examples:
@@ -965,7 +965,7 @@ class Boxes(BaseTensor):
         Return the class ID tensor representing category predictions for each bounding box.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the class IDs for each detection box.
+            (torch.Tensor | np.ndarray): A tensor or numpy array containing the class IDs for each detection box.
                 The shape is (N,), where N is the number of boxes.
 
         Examples:
@@ -1008,7 +1008,7 @@ class Boxes(BaseTensor):
         Convert bounding boxes from [x1, y1, x2, y2] format to [x, y, width, height] format.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): Boxes in [x_center, y_center, width, height] format, where x_center,
+            (torch.Tensor | np.ndarray): Boxes in [x_center, y_center, width, height] format, where x_center,
                 y_center are the coordinates of the center point of the bounding box, width, height are the
                 dimensions of the bounding box and the shape of the returned tensor is (N, 4), where N is the
                 number of boxes.
@@ -1032,7 +1032,7 @@ class Boxes(BaseTensor):
         normalized to the range [0, 1] based on the original image dimensions.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): Normalized bounding box coordinates with shape (N, 4), where N is
+            (torch.Tensor | np.ndarray): Normalized bounding box coordinates with shape (N, 4), where N is
                 the number of boxes. Each row contains [x1, y1, x2, y2] values normalized to [0, 1].
 
         Examples:
@@ -1056,7 +1056,7 @@ class Boxes(BaseTensor):
         [x_center, y_center, width, height], where all values are relative to the original image dimensions.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): Normalized bounding boxes with shape (N, 4), where N is the
+            (torch.Tensor | np.ndarray): Normalized bounding boxes with shape (N, 4), where N is the
                 number of boxes. Each row contains [x_center, y_center, width, height] values normalized
                 to [0, 1] based on the original image dimensions.
 
@@ -1080,10 +1080,10 @@ class Masks(BaseTensor):
     including methods for converting between pixel and normalized coordinates.
 
     Attributes:
-        data (torch.Tensor | numpy.ndarray): The raw tensor or array containing mask data.
+        data (torch.Tensor | np.ndarray): The raw tensor or array containing mask data.
         orig_shape (tuple): Original image shape in (height, width) format.
-        xy (List[numpy.ndarray]): A list of segments in pixel coordinates.
-        xyn (List[numpy.ndarray]): A list of normalized segments.
+        xy (List[np.ndarray]): A list of segments in pixel coordinates.
+        xyn (List[np.ndarray]): A list of normalized segments.
 
     Methods:
         cpu: Return a copy of the Masks object with the mask tensor on CPU memory.
@@ -1128,7 +1128,7 @@ class Masks(BaseTensor):
         are normalized relative to the original image shape.
 
         Returns:
-            (List[numpy.ndarray]): A list of numpy arrays, where each array contains the normalized xy-coordinates
+            (List[np.ndarray]): A list of numpy arrays, where each array contains the normalized xy-coordinates
                 of a single segmentation mask. Each array has shape (N, 2), where N is the number of points in the
                 mask contour.
 
@@ -1153,7 +1153,7 @@ class Masks(BaseTensor):
         Masks object. The coordinates are scaled to match the original image dimensions.
 
         Returns:
-            (List[numpy.ndarray]): A list of numpy arrays, where each array contains the [x, y] pixel
+            (List[np.ndarray]): A list of numpy arrays, where each array contains the [x, y] pixel
                 coordinates for a single segmentation mask. Each array has shape (N, 2), where N is the
                 number of points in the segment.
 
@@ -1257,7 +1257,7 @@ class Keypoints(BaseTensor):
         Return normalized coordinates (x, y) of keypoints relative to the original image size.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or array of shape (N, K, 2) containing normalized keypoint
+            (torch.Tensor | np.ndarray): A tensor or array of shape (N, K, 2) containing normalized keypoint
                 coordinates, where N is the number of instances, K is the number of keypoints, and the last
                 dimension contains [x, y] values in the range [0, 1].
 
@@ -1299,12 +1299,12 @@ class Probs(BaseTensor):
     classification probabilities, including top-1 and top-5 predictions.
 
     Attributes:
-        data (torch.Tensor | numpy.ndarray): The raw tensor or array containing classification probabilities.
+        data (torch.Tensor | np.ndarray): The raw tensor or array containing classification probabilities.
         orig_shape (tuple | None): The original image shape as (height, width). Not used in this class.
         top1 (int): Index of the class with the highest probability.
         top5 (List[int]): Indices of the top 5 classes by probability.
-        top1conf (torch.Tensor | numpy.ndarray): Confidence score of the top 1 class.
-        top5conf (torch.Tensor | numpy.ndarray): Confidence scores of the top 5 classes.
+        top1conf (torch.Tensor | np.ndarray): Confidence score of the top 1 class.
+        top5conf (torch.Tensor | np.ndarray): Confidence scores of the top 5 classes.
 
     Methods:
         cpu: Return a copy of the probabilities tensor on CPU memory.
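
As with Boxes, these are documentation-only fixes. A brief sketch of the Probs accessors described above, using a classification checkpoint:

    # Brief sketch of the Probs accessors (classification model).
    from ultralytics import YOLO

    results = YOLO("yolo11n-cls.pt")("https://ultralytics.com/images/bus.jpg")
    probs = results[0].probs
    print(probs.top1, probs.top1conf)  # best class index and its confidence
    print(probs.top5, probs.top5conf)  # top-5 indices and confidences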
@@ -1399,7 +1399,7 @@ class Probs(BaseTensor):
         from the classification results.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor containing the confidence score of the top 1 class.
+            (torch.Tensor | np.ndarray): A tensor containing the confidence score of the top 1 class.
 
         Examples:
             >>> results = model("image.jpg")  # classify an image
@@ -1420,7 +1420,7 @@ class Probs(BaseTensor):
         along with their associated confidence levels.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or array containing the confidence scores for the
+            (torch.Tensor | np.ndarray): A tensor or array containing the confidence scores for the
                 top 5 predicted classes, sorted in descending order of probability.
 
         Examples:
@@ -1444,13 +1444,13 @@ class OBB(BaseTensor):
         data (torch.Tensor): The raw OBB tensor containing box coordinates and associated data.
         orig_shape (tuple): Original image size as (height, width).
         is_track (bool): Indicates whether tracking IDs are included in the box data.
-        xywhr (torch.Tensor | numpy.ndarray): Boxes in [x_center, y_center, width, height, rotation] format.
-        conf (torch.Tensor | numpy.ndarray): Confidence scores for each box.
-        cls (torch.Tensor | numpy.ndarray): Class labels for each box.
-        id (torch.Tensor | numpy.ndarray): Tracking IDs for each box, if available.
-        xyxyxyxy (torch.Tensor | numpy.ndarray): Boxes in 8-point [x1, y1, x2, y2, x3, y3, x4, y4] format.
-        xyxyxyxyn (torch.Tensor | numpy.ndarray): Normalized 8-point coordinates relative to orig_shape.
-        xyxy (torch.Tensor | numpy.ndarray): Axis-aligned bounding boxes in [x1, y1, x2, y2] format.
+        xywhr (torch.Tensor | np.ndarray): Boxes in [x_center, y_center, width, height, rotation] format.
+        conf (torch.Tensor | np.ndarray): Confidence scores for each box.
+        cls (torch.Tensor | np.ndarray): Class labels for each box.
+        id (torch.Tensor | np.ndarray): Tracking IDs for each box, if available.
+        xyxyxyxy (torch.Tensor | np.ndarray): Boxes in 8-point [x1, y1, x2, y2, x3, y3, x4, y4] format.
+        xyxyxyxyn (torch.Tensor | np.ndarray): Normalized 8-point coordinates relative to orig_shape.
+        xyxy (torch.Tensor | np.ndarray): Axis-aligned bounding boxes in [x1, y1, x2, y2] format.
 
     Methods:
         cpu: Return a copy of the OBB object with all tensors on CPU memory.
@@ -1474,13 +1474,13 @@ class OBB(BaseTensor):
         various properties and methods to access and transform the OBB data.
 
         Args:
-            boxes (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the detection boxes,
+            boxes (torch.Tensor | np.ndarray): A tensor or numpy array containing the detection boxes,
                 with shape (num_boxes, 7) or (num_boxes, 8). The last two columns contain confidence and class values.
                 If present, the third last column contains track IDs, and the fifth column contains rotation.
             orig_shape (Tuple[int, int]): Original image size, in the format (height, width).
 
         Attributes:
-            data (torch.Tensor | numpy.ndarray): The raw OBB tensor.
+            data (torch.Tensor | np.ndarray): The raw OBB tensor.
             orig_shape (Tuple[int, int]): The original image shape.
             is_track (bool): Whether the boxes include tracking IDs.
 
@@ -1508,7 +1508,7 @@ class OBB(BaseTensor):
         Return boxes in [x_center, y_center, width, height, rotation] format.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the oriented bounding boxes with format
+            (torch.Tensor | np.ndarray): A tensor or numpy array containing the oriented bounding boxes with format
                 [x_center, y_center, width, height, rotation]. The shape is (N, 5) where N is the number of boxes.
 
         Examples:
@@ -1529,7 +1529,7 @@ class OBB(BaseTensor):
         represents the model's certainty in the detection.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or numpy array of shape (N,) containing confidence scores
+            (torch.Tensor | np.ndarray): A tensor or numpy array of shape (N,) containing confidence scores
                 for N detections, where each score is in the range [0, 1].
 
         Examples:
@@ -1546,7 +1546,7 @@ class OBB(BaseTensor):
         Return the class values of the oriented bounding boxes.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the class values for each oriented
+            (torch.Tensor | np.ndarray): A tensor or numpy array containing the class values for each oriented
                 bounding box. The shape is (N,), where N is the number of boxes.
 
         Examples:
@@ -1564,7 +1564,7 @@ class OBB(BaseTensor):
         Return the tracking IDs of the oriented bounding boxes (if available).
 
         Returns:
-            (torch.Tensor | numpy.ndarray | None): A tensor or numpy array containing the tracking IDs for each
+            (torch.Tensor | np.ndarray | None): A tensor or numpy array containing the tracking IDs for each
                 oriented bounding box. Returns None if tracking IDs are not available.
 
         Examples:
@@ -1584,7 +1584,7 @@ class OBB(BaseTensor):
         Convert OBB format to 8-point (xyxyxyxy) coordinate format for rotated bounding boxes.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): Rotated bounding boxes in xyxyxyxy format with shape (N, 4, 2), where N is
+            (torch.Tensor | np.ndarray): Rotated bounding boxes in xyxyxyxy format with shape (N, 4, 2), where N is
                 the number of boxes. Each box is represented by 4 points (x, y), starting from the top-left corner and
                 moving clockwise.
 
@@ -1603,7 +1603,7 @@ class OBB(BaseTensor):
         Convert rotated bounding boxes to normalized xyxyxyxy format.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): Normalized rotated bounding boxes in xyxyxyxy format with shape (N, 4, 2),
+            (torch.Tensor | np.ndarray): Normalized rotated bounding boxes in xyxyxyxy format with shape (N, 4, 2),
                 where N is the number of boxes. Each box is represented by 4 points (x, y), normalized relative to
                 the original image dimensions.
 
@@ -1629,7 +1629,7 @@ class OBB(BaseTensor):
         as IoU calculation with non-rotated boxes.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): Axis-aligned bounding boxes in xyxy format with shape (N, 4), where N
+            (torch.Tensor | np.ndarray): Axis-aligned bounding boxes in xyxy format with shape (N, 4), where N
                 is the number of boxes. Each row contains [x1, y1, x2, y2] coordinates.
 
         Examples:
ultralytics/models/fastsam/model.py CHANGED
@@ -58,7 +58,7 @@ class FastSAM(Model):
         prompts and passes them to the parent class predict method for processing.
 
         Args:
-            source (str | PIL.Image | numpy.ndarray): Input source for prediction, can be a file path, URL, PIL image,
+            source (str | PIL.Image | np.ndarray): Input source for prediction, can be a file path, URL, PIL image,
                 or numpy array.
             stream (bool): Whether to enable real-time streaming mode for video inputs.
             bboxes (List, optional): Bounding box coordinates for prompted segmentation in format [[x1, y1, x2, y2]].
ultralytics/models/fastsam/predict.py CHANGED
@@ -54,7 +54,7 @@ class FastSAMPredictor(SegmentationPredictor):
         Args:
             preds (List[torch.Tensor]): Raw predictions from the model.
             img (torch.Tensor): Input image tensor that was fed to the model.
-            orig_imgs (List[numpy.ndarray]): Original images before preprocessing.
+            orig_imgs (List[np.ndarray]): Original images before preprocessing.
 
         Returns:
             (List[Results]): Processed results with prompts applied.
ultralytics/models/sam/model.py CHANGED
@@ -87,8 +87,8 @@ class SAM(Model):
         Perform segmentation prediction on the given image or video source.
 
         Args:
-            source (str | PIL.Image | numpy.ndarray): Path to the image or video file, or a PIL.Image object, or
-                a numpy.ndarray object.
+            source (str | PIL.Image | np.ndarray): Path to the image or video file, or a PIL.Image object, or
+                a np.ndarray object.
             stream (bool): If True, enables real-time streaming.
             bboxes (List[List[float]] | None): List of bounding box coordinates for prompted segmentation.
             points (List[List[float]] | None): List of points for prompted segmentation.
@@ -117,8 +117,8 @@ class SAM(Model):
         for segmentation tasks.
 
         Args:
-            source (str | PIL.Image | numpy.ndarray | None): Path to the image or video file, or a PIL.Image
-                object, or a numpy.ndarray object.
+            source (str | PIL.Image | np.ndarray | None): Path to the image or video file, or a PIL.Image
+                object, or a np.ndarray object.
             stream (bool): If True, enables real-time streaming.
             bboxes (List[List[float]] | None): List of bounding box coordinates for prompted segmentation.
             points (List[List[float]] | None): List of points for prompted segmentation.
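
The prompt arguments documented here are unchanged; a short usage sketch of source, bboxes, and points (weights download on first use):

    # Short sketch of SAM prompted prediction.
    from ultralytics import SAM

    model = SAM("sam2.1_b.pt")
    # source may be a path, URL, PIL.Image, or np.ndarray
    results = model("https://ultralytics.com/images/bus.jpg", bboxes=[[100, 100, 400, 500]])
    results = model("https://ultralytics.com/images/bus.jpg", points=[[250, 300]], labels=[1])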
ultralytics/models/sam/modules/blocks.py CHANGED
@@ -411,7 +411,7 @@ class RoPEAttention(Attention):
 
     Attributes:
         compute_cis (Callable): Function to compute axial complex numbers for rotary encoding.
-        freqs_cis (Tensor): Precomputed frequency tensor for rotary encoding.
+        freqs_cis (torch.Tensor): Precomputed frequency tensor for rotary encoding.
         rope_k_repeat (bool): Flag to repeat query RoPE to match key length for cross-attention to memories.
 
     Methods:
@@ -443,7 +443,7 @@ class RoPEAttention(Attention):
         self.freqs_cis = freqs_cis
         self.rope_k_repeat = rope_k_repeat  # repeat q rope to match k length, needed for cross-attention to memories
 
-    def forward(self, q: Tensor, k: Tensor, v: Tensor, num_k_exclude_rope: int = 0) -> Tensor:
+    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, num_k_exclude_rope: int = 0) -> torch.Tensor:
         """Apply rotary position encoding and compute attention between query, key, and value tensors."""
         q = self.q_proj(q)
         k = self.k_proj(k)
@@ -744,7 +744,7 @@ class PositionEmbeddingSine(nn.Module):
 
         self.cache = {}
 
-    def _encode_xy(self, x: Tensor, y: Tensor) -> Tuple[Tensor, Tensor]:
+    def _encode_xy(self, x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """Encode 2D positions using sine/cosine functions for transformer positional embeddings."""
         assert len(x) == len(y) and x.ndim == y.ndim == 1
         x_embed = x * self.scale
@@ -760,7 +760,7 @@ class PositionEmbeddingSine(nn.Module):
         return pos_x, pos_y
 
     @torch.no_grad()
-    def encode_boxes(self, x: Tensor, y: Tensor, w: Tensor, h: Tensor) -> Tensor:
+    def encode_boxes(self, x: torch.Tensor, y: torch.Tensor, w: torch.Tensor, h: torch.Tensor) -> torch.Tensor:
         """Encode box coordinates and dimensions into positional embeddings for detection."""
         pos_x, pos_y = self._encode_xy(x, y)
         return torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1)
@@ -768,7 +768,7 @@ class PositionEmbeddingSine(nn.Module):
     encode = encode_boxes  # Backwards compatibility
 
     @torch.no_grad()
-    def encode_points(self, x: Tensor, y: Tensor, labels: Tensor) -> Tensor:
+    def encode_points(self, x: torch.Tensor, y: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
         """Encode 2D points with sinusoidal embeddings and append labels."""
         (bx, nx), (by, ny), (bl, nl) = x.shape, y.shape, labels.shape
         assert bx == by and nx == ny and bx == bl and nx == nl
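
These hunks qualify the bare Tensor annotations as torch.Tensor throughout the SAM modules, so the files no longer need a separate from torch import Tensor. A minimal sketch showing the two styles are equivalent at runtime:

    # Minimal sketch: both annotation styles behave identically at runtime.
    import torch

    def scale(x: torch.Tensor) -> torch.Tensor:  # qualified style adopted here
        """Double the input tensor."""
        return x * 2.0

    print(scale(torch.ones(3)))  # tensor([2., 2., 2.])
    # The previous style needed an extra name import: from torch import Tensor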