ultralytics 8.3.163__py3-none-any.whl → 8.3.164__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ultralytics/__init__.py +1 -1
- ultralytics/data/augment.py +182 -153
- ultralytics/data/build.py +23 -3
- ultralytics/data/dataset.py +6 -2
- ultralytics/data/loaders.py +2 -2
- ultralytics/data/utils.py +9 -7
- ultralytics/engine/exporter.py +7 -3
- ultralytics/engine/results.py +42 -42
- ultralytics/models/fastsam/model.py +1 -1
- ultralytics/models/fastsam/predict.py +1 -1
- ultralytics/models/sam/model.py +4 -4
- ultralytics/models/sam/modules/blocks.py +5 -5
- ultralytics/models/sam/modules/memory_attention.py +19 -19
- ultralytics/models/sam/modules/transformer.py +24 -22
- ultralytics/models/yolo/detect/val.py +2 -2
- ultralytics/models/yolo/world/train_world.py +9 -1
- ultralytics/solutions/distance_calculation.py +1 -1
- ultralytics/solutions/instance_segmentation.py +2 -2
- ultralytics/solutions/object_blurrer.py +2 -2
- ultralytics/solutions/object_counter.py +2 -2
- ultralytics/solutions/object_cropper.py +1 -1
- ultralytics/solutions/queue_management.py +1 -1
- ultralytics/solutions/security_alarm.py +2 -2
- ultralytics/solutions/templates/similarity-search.html +0 -24
- ultralytics/solutions/vision_eye.py +1 -1
- ultralytics/utils/benchmarks.py +2 -2
- ultralytics/utils/export.py +0 -2
- ultralytics/utils/instance.py +32 -25
- ultralytics/utils/ops.py +8 -8
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/METADATA +1 -1
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/RECORD +35 -35
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/licenses/LICENSE +0 -0
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/top_level.txt +0 -0
ultralytics/data/build.py
CHANGED
@@ -3,13 +3,14 @@
 import os
 import random
 from pathlib import Path
-from typing import Any, Iterator
+from typing import Any, Dict, Iterator
 
 import numpy as np
 import torch
 from PIL import Image
 from torch.utils.data import dataloader, distributed
 
+from ultralytics.cfg import IterableSimpleNamespace
 from ultralytics.data.dataset import GroundingDataset, YOLODataset, YOLOMultiModalDataset
 from ultralytics.data.loaders import (
     LOADERS,
@@ -111,7 +112,16 @@ def seed_worker(worker_id: int):  # noqa
     random.seed(worker_seed)
 
 
-def build_yolo_dataset(cfg, img_path, batch, data, mode="train", rect=False, stride=32, multi_modal=False):
+def build_yolo_dataset(
+    cfg: IterableSimpleNamespace,
+    img_path: str,
+    batch: int,
+    data: Dict[str, Any],
+    mode: str = "train",
+    rect: bool = False,
+    stride: int = 32,
+    multi_modal: bool = False,
+):
     """Build and return a YOLO dataset based on configuration parameters."""
     dataset = YOLOMultiModalDataset if multi_modal else YOLODataset
     return dataset(
@@ -133,11 +143,21 @@ def build_yolo_dataset(cfg, img_path, batch, data, mode="train", rect=False, stride=32, multi_modal=False):
     )
 
 
-def build_grounding(cfg, img_path, json_file, batch, mode="train", rect=False, stride=32):
+def build_grounding(
+    cfg: IterableSimpleNamespace,
+    img_path: str,
+    json_file: str,
+    batch: int,
+    mode: str = "train",
+    rect: bool = False,
+    stride: int = 32,
+    max_samples: int = 80,
+):
     """Build and return a GroundingDataset based on configuration parameters."""
     return GroundingDataset(
         img_path=img_path,
         json_file=json_file,
+        max_samples=max_samples,
         imgsz=cfg.imgsz,
         batch_size=batch,
         augment=mode == "train",  # augmentation
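A minimal usage sketch of the retyped builders (the dataset paths below are hypothetical; get_cfg() returns the default IterableSimpleNamespace of training arguments):

from ultralytics.cfg import get_cfg
from ultralytics.data.build import build_grounding

cfg = get_cfg()  # default training arguments
dataset = build_grounding(
    cfg,
    img_path="datasets/flickr30k/images",  # hypothetical path
    json_file="datasets/flickr30k/annotations.json",  # hypothetical path
    batch=16,
    mode="train",
    max_samples=40,  # new in 8.3.164: caps the text samples handed to RandomLoadText
)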
ultralytics/data/dataset.py
CHANGED
@@ -411,6 +411,7 @@ class YOLOMultiModalDataset(YOLODataset):
     @staticmethod
     def _get_neg_texts(category_freq: Dict, threshold: int = 100) -> List[str]:
         """Get negative text samples based on frequency threshold."""
+        threshold = min(max(category_freq.values()), 100)
         return [k for k, v in category_freq.items() if v >= threshold]
 
 
@@ -434,18 +435,20 @@ class GroundingDataset(YOLODataset):
        >>> len(dataset)  # Number of valid images with annotations
        """
 
-    def __init__(self, *args, task: str = "detect", json_file: str = "", **kwargs):
+    def __init__(self, *args, task: str = "detect", json_file: str = "", max_samples: int = 80, **kwargs):
         """
         Initialize a GroundingDataset for object detection.
 
         Args:
             json_file (str): Path to the JSON file containing annotations.
             task (str): Must be 'detect' or 'segment' for GroundingDataset.
+            max_samples (int): Maximum number of samples to load for text augmentation.
             *args (Any): Additional positional arguments for the parent class.
             **kwargs (Any): Additional keyword arguments for the parent class.
         """
         assert task in {"detect", "segment"}, "GroundingDataset currently only supports `detect` and `segment` tasks"
         self.json_file = json_file
+        self.max_samples = max_samples
         super().__init__(*args, task=task, data={"channels": 3}, **kwargs)
 
     def get_img_files(self, img_path: str) -> List:
@@ -625,7 +628,7 @@ class GroundingDataset(YOLODataset):
         # the strategy of selecting negative is restricted in one dataset,
         # while official pre-saved neg embeddings from all datasets at once.
         transform = RandomLoadText(
-            max_samples=80,
+            max_samples=min(self.max_samples, 80),
             padding=True,
             padding_value=self._get_neg_texts(self.category_freq),
         )
@@ -651,6 +654,7 @@ class GroundingDataset(YOLODataset):
     @staticmethod
     def _get_neg_texts(category_freq: Dict, threshold: int = 100) -> List[str]:
         """Get negative text samples based on frequency threshold."""
+        threshold = min(max(category_freq.values()), 100)
         return [k for k, v in category_freq.items() if v >= threshold]
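A self-contained sketch of the adaptive threshold both _get_neg_texts implementations now apply: the cutoff becomes min(max frequency, 100), so datasets whose categories all appear fewer than 100 times still yield negative samples (the toy frequencies below are illustrative):

from typing import Dict, List

def get_neg_texts(category_freq: Dict[str, int]) -> List[str]:
    threshold = min(max(category_freq.values()), 100)  # adaptive cap from 8.3.164
    return [k for k, v in category_freq.items() if v >= threshold]

print(get_neg_texts({"cat": 40, "dog": 12}))    # ['cat'] -- threshold drops to 40
print(get_neg_texts({"cat": 400, "dog": 120}))  # ['cat', 'dog'] -- threshold stays 100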
ultralytics/data/loaders.py
CHANGED
@@ -451,9 +451,9 @@ class LoadImagesAndVideos:
             self.mode = "image"
             if path.rpartition(".")[-1].lower() == "heic":
                 # Load HEIC image using Pillow with pillow-heif
-                check_requirements("pillow-heif")
+                check_requirements("pi-heif")
 
-                from pillow_heif import register_heif_opener
+                from pi_heif import register_heif_opener
 
                 register_heif_opener()  # Register HEIF opener with Pillow
                 with Image.open(path) as img:
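A hedged sketch of the new HEIC decode path: pi_heif exposes the same register_heif_opener entry point as pillow_heif, after which PIL can open .heic files directly ("photo.heic" is a hypothetical file):

from PIL import Image
from pi_heif import register_heif_opener

register_heif_opener()  # teach PIL to decode HEIF/HEIC containers
with Image.open("photo.heic") as img:
    print(img.size, img.mode)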
ultralytics/data/utils.py
CHANGED
@@ -9,7 +9,7 @@ import zipfile
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
 from tarfile import is_tarfile
-from typing import Dict, List, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union
 
 import cv2
 import numpy as np
@@ -284,7 +284,7 @@ def visualize_image_annotations(image_path: str, txt_path: str, label_map: Dict[int, str]):
         w = width * img_width
         h = height * img_height
         annotations.append((x, y, w, h, int(class_id)))
-
+    _, ax = plt.subplots(1)  # Plot the image and annotations
     for x, y, w, h, label in annotations:
         color = tuple(c / 255 for c in colors(label, True))  # Get and normalize the RGB color
         rect = plt.Rectangle((x, y), w, h, linewidth=2, edgecolor=color, facecolor="none")  # Create a rectangle
@@ -384,7 +384,7 @@ def find_dataset_yaml(path: Path) -> Path:
     return files[0]
 
 
-def check_det_dataset(dataset: str, autodownload: bool = True) -> Dict:
+def check_det_dataset(dataset: str, autodownload: bool = True) -> Dict[str, Any]:
     """
     Download, verify, and/or unzip a dataset if not found locally.
 
@@ -397,7 +397,7 @@ def check_det_dataset(dataset: str, autodownload: bool = True) -> Dict:
         autodownload (bool, optional): Whether to automatically download the dataset if not found.
 
     Returns:
-        (Dict): Parsed dataset information and paths.
+        (Dict[str, Any]): Parsed dataset information and paths.
     """
     file = check_file(dataset)
 
@@ -479,7 +479,7 @@ def check_det_dataset(dataset: str, autodownload: bool = True) -> Dict:
     return data  # dictionary
 
 
-def check_cls_dataset(dataset: Union[str, Path], split: str = "") -> Dict:
+def check_cls_dataset(dataset: Union[str, Path], split: str = "") -> Dict[str, Any]:
     """
     Check a classification dataset such as Imagenet.
 
@@ -491,13 +491,13 @@ def check_cls_dataset(dataset: Union[str, Path], split: str = "") -> Dict:
         split (str, optional): The split of the dataset. Either 'val', 'test', or ''.
 
     Returns:
-        (Dict): A dictionary containing the following keys:
+        (Dict[str, Any]): A dictionary containing the following keys:
 
            - 'train' (Path): The directory path containing the training set of the dataset.
            - 'val' (Path): The directory path containing the validation set of the dataset.
            - 'test' (Path): The directory path containing the test set of the dataset.
            - 'nc' (int): The number of classes in the dataset.
-           - 'names' (Dict): A dictionary of class names in the dataset.
+           - 'names' (Dict[int, str]): A dictionary of class names in the dataset.
     """
     # Download (optional if dataset=https://file.zip is passed directly)
     if str(dataset).startswith(("http:/", "https:/")):
@@ -535,6 +535,8 @@ def check_cls_dataset(dataset: Union[str, Path], split: str = "") -> Dict:
         if (data_dir / "val").exists()
         else data_dir / "validation"
         if (data_dir / "validation").exists()
+        else data_dir / "valid"
+        if (data_dir / "valid").exists()
         else None
     )  # data/test or data/val
     test_set = data_dir / "test" if (data_dir / "test").exists() else None  # data/val or data/test
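A self-contained sketch of the extended split lookup: check_cls_dataset now falls back through val, validation, and (new in 8.3.164) valid directories before giving up:

from pathlib import Path

def resolve_val_dir(data_dir: Path):
    # Same precedence as the chained conditional above
    for name in ("val", "validation", "valid"):
        candidate = data_dir / name
        if candidate.exists():
            return candidate
    return None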
ultralytics/engine/exporter.py
CHANGED
@@ -294,10 +294,10 @@ class Exporter:
 
         # Device
         dla = None
-        if …
+        if engine and self.args.device is None:
             LOGGER.warning("TensorRT requires GPU export, automatically assigning device=0")
             self.args.device = "0"
-        if …
+        if engine and "dla" in str(self.args.device):  # convert int/list to str first
             dla = self.args.device.rsplit(":", 1)[-1]
             self.args.device = "0"  # update device to "0"
             assert dla in {"0", "1"}, f"Expected self.args.device='dla:0' or 'dla:1, but got {self.args.device}."
@@ -348,6 +348,10 @@ class Exporter:
             LOGGER.warning("'nms=True' is not available for end2end models. Forcing 'nms=False'.")
             self.args.nms = False
         self.args.conf = self.args.conf or 0.25  # set conf default value for nms export
+        if (engine or self.args.nms) and self.args.dynamic and self.args.batch == 1:
+            LOGGER.warning(
+                f"'dynamic=True' model with '{'nms=True' if self.args.nms else 'format=engine'}' requires max batch size, i.e. 'batch=16'"
+            )
         if edgetpu:
             if not LINUX or ARM64:
                 raise SystemError(
@@ -516,7 +520,7 @@ class Exporter:
                 f"work. Use export 'imgsz={max(self.imgsz)}' if val is required."
             )
         imgsz = self.imgsz[0] if square else str(self.imgsz)[1:-1].replace(" ", "")
-        predict_data = f"data={data}" if model.task == "segment" and …
+        predict_data = f"data={data}" if model.task == "segment" and pb else ""
         q = "int8" if self.args.int8 else "half" if self.args.half else ""  # quantization
         LOGGER.info(
             f"\nExport complete ({time.time() - t:.1f}s)"
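A hedged usage sketch of what the new warning asks for: a dynamic TensorRT (or NMS-bundled) export should declare the maximum batch size up front rather than keep the default batch=1 (a CUDA device with TensorRT installed is assumed; weights download on first use):

from ultralytics import YOLO

model = YOLO("yolo11n.pt")
model.export(format="engine", dynamic=True, batch=16)  # max batch, avoids the batch=1 warning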
ultralytics/engine/results.py
CHANGED
@@ -196,7 +196,7 @@ class Results(SimpleClass, DataExportMixin):
     It supports visualization, data export, and various coordinate transformations.
 
     Attributes:
-        orig_img (numpy.ndarray): The original image as a numpy array.
+        orig_img (np.ndarray): The original image as a numpy array.
         orig_shape (Tuple[int, int]): Original image shape in (height, width) format.
         boxes (Boxes | None): Detected bounding boxes.
         masks (Masks | None): Segmentation masks.
@@ -254,7 +254,7 @@ class Results(SimpleClass, DataExportMixin):
         Initialize the Results class for storing and manipulating inference results.
 
         Args:
-            orig_img (numpy.ndarray): The original image as a numpy array.
+            orig_img (np.ndarray): The original image as a numpy array.
             path (str): The path to the image file.
             names (dict): A dictionary of class names.
             boxes (torch.Tensor | None): A 2D tensor of bounding box coordinates for each detection.
@@ -862,16 +862,16 @@ class Boxes(BaseTensor):
     methods for easy manipulation and conversion between different coordinate systems.
 
     Attributes:
-        data (torch.Tensor | numpy.ndarray): The raw tensor containing detection boxes and associated data.
+        data (torch.Tensor | np.ndarray): The raw tensor containing detection boxes and associated data.
         orig_shape (Tuple[int, int]): The original image dimensions (height, width).
         is_track (bool): Indicates whether tracking IDs are included in the box data.
-        xyxy (torch.Tensor | numpy.ndarray): Boxes in [x1, y1, x2, y2] format.
-        conf (torch.Tensor | numpy.ndarray): Confidence scores for each box.
-        cls (torch.Tensor | numpy.ndarray): Class labels for each box.
+        xyxy (torch.Tensor | np.ndarray): Boxes in [x1, y1, x2, y2] format.
+        conf (torch.Tensor | np.ndarray): Confidence scores for each box.
+        cls (torch.Tensor | np.ndarray): Class labels for each box.
         id (torch.Tensor | None): Tracking IDs for each box (if available).
-        xywh (torch.Tensor | numpy.ndarray): Boxes in [x, y, width, height] format.
-        xyxyn (torch.Tensor | numpy.ndarray): Normalized [x1, y1, x2, y2] boxes relative to orig_shape.
-        xywhn (torch.Tensor | numpy.ndarray): Normalized [x, y, width, height] boxes relative to orig_shape.
+        xywh (torch.Tensor | np.ndarray): Boxes in [x, y, width, height] format.
+        xyxyn (torch.Tensor | np.ndarray): Normalized [x1, y1, x2, y2] boxes relative to orig_shape.
+        xywhn (torch.Tensor | np.ndarray): Normalized [x, y, width, height] boxes relative to orig_shape.
 
     Methods:
         cpu: Return a copy of the object with all tensors on CPU memory.
@@ -931,7 +931,7 @@ class Boxes(BaseTensor):
         Return bounding boxes in [x1, y1, x2, y2] format.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or numpy array of shape (n, 4) containing bounding box
+            (torch.Tensor | np.ndarray): A tensor or numpy array of shape (n, 4) containing bounding box
                 coordinates in [x1, y1, x2, y2] format, where n is the number of boxes.
 
         Examples:
@@ -948,7 +948,7 @@ class Boxes(BaseTensor):
         Return the confidence scores for each detection box.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A 1D tensor or array containing confidence scores for each detection,
+            (torch.Tensor | np.ndarray): A 1D tensor or array containing confidence scores for each detection,
                 with shape (N,) where N is the number of detections.
 
         Examples:
@@ -965,7 +965,7 @@ class Boxes(BaseTensor):
         Return the class ID tensor representing category predictions for each bounding box.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the class IDs for each detection box.
+            (torch.Tensor | np.ndarray): A tensor or numpy array containing the class IDs for each detection box.
                 The shape is (N,), where N is the number of boxes.
 
         Examples:
@@ -1008,7 +1008,7 @@ class Boxes(BaseTensor):
         Convert bounding boxes from [x1, y1, x2, y2] format to [x, y, width, height] format.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): Boxes in [x_center, y_center, width, height] format, where x_center,
+            (torch.Tensor | np.ndarray): Boxes in [x_center, y_center, width, height] format, where x_center,
                 y_center are the coordinates of the center point of the bounding box, width, height are the
                 dimensions of the bounding box and the shape of the returned tensor is (N, 4), where N is the
                 number of boxes.
@@ -1032,7 +1032,7 @@ class Boxes(BaseTensor):
         normalized to the range [0, 1] based on the original image dimensions.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): Normalized bounding box coordinates with shape (N, 4), where N is
+            (torch.Tensor | np.ndarray): Normalized bounding box coordinates with shape (N, 4), where N is
                 the number of boxes. Each row contains [x1, y1, x2, y2] values normalized to [0, 1].
 
         Examples:
@@ -1056,7 +1056,7 @@ class Boxes(BaseTensor):
         [x_center, y_center, width, height], where all values are relative to the original image dimensions.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): Normalized bounding boxes with shape (N, 4), where N is the
+            (torch.Tensor | np.ndarray): Normalized bounding boxes with shape (N, 4), where N is the
                 number of boxes. Each row contains [x_center, y_center, width, height] values normalized
                 to [0, 1] based on the original image dimensions.
 
@@ -1080,10 +1080,10 @@ class Masks(BaseTensor):
     including methods for converting between pixel and normalized coordinates.
 
     Attributes:
-        data (torch.Tensor | numpy.ndarray): The raw tensor or array containing mask data.
+        data (torch.Tensor | np.ndarray): The raw tensor or array containing mask data.
         orig_shape (tuple): Original image shape in (height, width) format.
-        xy (List[numpy.ndarray]): A list of segments in pixel coordinates.
-        xyn (List[numpy.ndarray]): A list of normalized segments.
+        xy (List[np.ndarray]): A list of segments in pixel coordinates.
+        xyn (List[np.ndarray]): A list of normalized segments.
 
     Methods:
         cpu: Return a copy of the Masks object with the mask tensor on CPU memory.
@@ -1128,7 +1128,7 @@ class Masks(BaseTensor):
         are normalized relative to the original image shape.
 
         Returns:
-            (List[numpy.ndarray]): A list of numpy arrays, where each array contains the normalized xy-coordinates
+            (List[np.ndarray]): A list of numpy arrays, where each array contains the normalized xy-coordinates
                 of a single segmentation mask. Each array has shape (N, 2), where N is the number of points in the
                 mask contour.
 
@@ -1153,7 +1153,7 @@ class Masks(BaseTensor):
         Masks object. The coordinates are scaled to match the original image dimensions.
 
         Returns:
-            (List[numpy.ndarray]): A list of numpy arrays, where each array contains the [x, y] pixel
+            (List[np.ndarray]): A list of numpy arrays, where each array contains the [x, y] pixel
                 coordinates for a single segmentation mask. Each array has shape (N, 2), where N is the
                 number of points in the segment.
 
@@ -1257,7 +1257,7 @@ class Keypoints(BaseTensor):
         Return normalized coordinates (x, y) of keypoints relative to the original image size.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or array of shape (N, K, 2) containing normalized keypoint
+            (torch.Tensor | np.ndarray): A tensor or array of shape (N, K, 2) containing normalized keypoint
                 coordinates, where N is the number of instances, K is the number of keypoints, and the last
                 dimension contains [x, y] values in the range [0, 1].
 
@@ -1299,12 +1299,12 @@ class Probs(BaseTensor):
     classification probabilities, including top-1 and top-5 predictions.
 
     Attributes:
-        data (torch.Tensor | numpy.ndarray): The raw tensor or array containing classification probabilities.
+        data (torch.Tensor | np.ndarray): The raw tensor or array containing classification probabilities.
         orig_shape (tuple | None): The original image shape as (height, width). Not used in this class.
         top1 (int): Index of the class with the highest probability.
         top5 (List[int]): Indices of the top 5 classes by probability.
-        top1conf (torch.Tensor | numpy.ndarray): Confidence score of the top 1 class.
-        top5conf (torch.Tensor | numpy.ndarray): Confidence scores of the top 5 classes.
+        top1conf (torch.Tensor | np.ndarray): Confidence score of the top 1 class.
+        top5conf (torch.Tensor | np.ndarray): Confidence scores of the top 5 classes.
 
     Methods:
         cpu: Return a copy of the probabilities tensor on CPU memory.
@@ -1399,7 +1399,7 @@ class Probs(BaseTensor):
         from the classification results.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor containing the confidence score of the top 1 class.
+            (torch.Tensor | np.ndarray): A tensor containing the confidence score of the top 1 class.
 
         Examples:
             >>> results = model("image.jpg")  # classify an image
@@ -1420,7 +1420,7 @@ class Probs(BaseTensor):
         along with their associated confidence levels.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or array containing the confidence scores for the
+            (torch.Tensor | np.ndarray): A tensor or array containing the confidence scores for the
                 top 5 predicted classes, sorted in descending order of probability.
 
         Examples:
@@ -1444,13 +1444,13 @@ class OBB(BaseTensor):
        data (torch.Tensor): The raw OBB tensor containing box coordinates and associated data.
        orig_shape (tuple): Original image size as (height, width).
        is_track (bool): Indicates whether tracking IDs are included in the box data.
-       xywhr (torch.Tensor | numpy.ndarray): Boxes in [x_center, y_center, width, height, rotation] format.
-       conf (torch.Tensor | numpy.ndarray): Confidence scores for each box.
-       cls (torch.Tensor | numpy.ndarray): Class labels for each box.
-       id (torch.Tensor | numpy.ndarray): Tracking IDs for each box, if available.
-       xyxyxyxy (torch.Tensor | numpy.ndarray): Boxes in 8-point [x1, y1, x2, y2, x3, y3, x4, y4] format.
-       xyxyxyxyn (torch.Tensor | numpy.ndarray): Normalized 8-point coordinates relative to orig_shape.
-       xyxy (torch.Tensor | numpy.ndarray): Axis-aligned bounding boxes in [x1, y1, x2, y2] format.
+       xywhr (torch.Tensor | np.ndarray): Boxes in [x_center, y_center, width, height, rotation] format.
+       conf (torch.Tensor | np.ndarray): Confidence scores for each box.
+       cls (torch.Tensor | np.ndarray): Class labels for each box.
+       id (torch.Tensor | np.ndarray): Tracking IDs for each box, if available.
+       xyxyxyxy (torch.Tensor | np.ndarray): Boxes in 8-point [x1, y1, x2, y2, x3, y3, x4, y4] format.
+       xyxyxyxyn (torch.Tensor | np.ndarray): Normalized 8-point coordinates relative to orig_shape.
+       xyxy (torch.Tensor | np.ndarray): Axis-aligned bounding boxes in [x1, y1, x2, y2] format.
 
     Methods:
         cpu: Return a copy of the OBB object with all tensors on CPU memory.
@@ -1474,13 +1474,13 @@ class OBB(BaseTensor):
         various properties and methods to access and transform the OBB data.
 
         Args:
-            boxes (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the detection boxes,
+            boxes (torch.Tensor | np.ndarray): A tensor or numpy array containing the detection boxes,
                 with shape (num_boxes, 7) or (num_boxes, 8). The last two columns contain confidence and class values.
                 If present, the third last column contains track IDs, and the fifth column contains rotation.
             orig_shape (Tuple[int, int]): Original image size, in the format (height, width).
 
         Attributes:
-            data (torch.Tensor | numpy.ndarray): The raw OBB tensor.
+            data (torch.Tensor | np.ndarray): The raw OBB tensor.
             orig_shape (Tuple[int, int]): The original image shape.
             is_track (bool): Whether the boxes include tracking IDs.
 
@@ -1508,7 +1508,7 @@ class OBB(BaseTensor):
         Return boxes in [x_center, y_center, width, height, rotation] format.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the oriented bounding boxes with format
+            (torch.Tensor | np.ndarray): A tensor or numpy array containing the oriented bounding boxes with format
                 [x_center, y_center, width, height, rotation]. The shape is (N, 5) where N is the number of boxes.
 
         Examples:
@@ -1529,7 +1529,7 @@ class OBB(BaseTensor):
         represents the model's certainty in the detection.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or numpy array of shape (N,) containing confidence scores
+            (torch.Tensor | np.ndarray): A tensor or numpy array of shape (N,) containing confidence scores
                 for N detections, where each score is in the range [0, 1].
 
         Examples:
@@ -1546,7 +1546,7 @@ class OBB(BaseTensor):
         Return the class values of the oriented bounding boxes.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the class values for each oriented
+            (torch.Tensor | np.ndarray): A tensor or numpy array containing the class values for each oriented
                 bounding box. The shape is (N,), where N is the number of boxes.
 
         Examples:
@@ -1564,7 +1564,7 @@ class OBB(BaseTensor):
         Return the tracking IDs of the oriented bounding boxes (if available).
 
         Returns:
-            (torch.Tensor | numpy.ndarray | None): A tensor or numpy array containing the tracking IDs for each
+            (torch.Tensor | np.ndarray | None): A tensor or numpy array containing the tracking IDs for each
                 oriented bounding box. Returns None if tracking IDs are not available.
 
         Examples:
@@ -1584,7 +1584,7 @@ class OBB(BaseTensor):
         Convert OBB format to 8-point (xyxyxyxy) coordinate format for rotated bounding boxes.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): Rotated bounding boxes in xyxyxyxy format with shape (N, 4, 2), where N is
+            (torch.Tensor | np.ndarray): Rotated bounding boxes in xyxyxyxy format with shape (N, 4, 2), where N is
                 the number of boxes. Each box is represented by 4 points (x, y), starting from the top-left corner and
                 moving clockwise.
 
@@ -1603,7 +1603,7 @@ class OBB(BaseTensor):
         Convert rotated bounding boxes to normalized xyxyxyxy format.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): Normalized rotated bounding boxes in xyxyxyxy format with shape (N, 4, 2),
+            (torch.Tensor | np.ndarray): Normalized rotated bounding boxes in xyxyxyxy format with shape (N, 4, 2),
                 where N is the number of boxes. Each box is represented by 4 points (x, y), normalized relative to
                 the original image dimensions.
 
@@ -1629,7 +1629,7 @@ class OBB(BaseTensor):
         as IoU calculation with non-rotated boxes.
 
         Returns:
-            (torch.Tensor | numpy.ndarray): Axis-aligned bounding boxes in xyxy format with shape (N, 4), where N
+            (torch.Tensor | np.ndarray): Axis-aligned bounding boxes in xyxy format with shape (N, 4), where N
                 is the number of boxes. Each row contains [x1, y1, x2, y2] coordinates.
 
         Examples:
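A short usage sketch of the accessors documented above ("image.jpg" is a hypothetical input; weights download on first use). The properties return torch.Tensor by default and np.ndarray after converting the result object:

from ultralytics import YOLO

results = YOLO("yolo11n.pt")("image.jpg")
boxes = results[0].boxes
print(boxes.xyxy.shape, boxes.conf, boxes.cls)  # torch.Tensor accessors
print(results[0].numpy().boxes.xyxy)            # same accessors as np.ndarray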
ultralytics/models/fastsam/model.py
CHANGED
@@ -58,7 +58,7 @@ class FastSAM(Model):
        prompts and passes them to the parent class predict method for processing.

        Args:
-           source (str | PIL.Image | numpy.ndarray): Input source for prediction, can be a file path, URL, PIL image,
+           source (str | PIL.Image | np.ndarray): Input source for prediction, can be a file path, URL, PIL image,
                or numpy array.
            stream (bool): Whether to enable real-time streaming mode for video inputs.
            bboxes (List, optional): Bounding box coordinates for prompted segmentation in format [[x1, y1, x2, y2]].
ultralytics/models/fastsam/predict.py
CHANGED
@@ -54,7 +54,7 @@ class FastSAMPredictor(SegmentationPredictor):
        Args:
            preds (List[torch.Tensor]): Raw predictions from the model.
            img (torch.Tensor): Input image tensor that was fed to the model.
-           orig_imgs (List[numpy.ndarray]): Original images before preprocessing.
+           orig_imgs (List[np.ndarray]): Original images before preprocessing.

        Returns:
            (List[Results]): Processed results with prompts applied.
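A hedged usage sketch of FastSAM box-prompted prediction ("image.jpg" is a hypothetical input; weights download on first use):

from ultralytics import FastSAM

model = FastSAM("FastSAM-s.pt")
results = model("image.jpg", bboxes=[[100, 100, 400, 400]])  # prompt with one box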
ultralytics/models/sam/model.py
CHANGED
@@ -87,8 +87,8 @@ class SAM(Model):
        Perform segmentation prediction on the given image or video source.

        Args:
-           source (str | PIL.Image | numpy.ndarray): Path to the image or video file, or a PIL.Image object, or
-               a numpy.ndarray object.
+           source (str | PIL.Image | np.ndarray): Path to the image or video file, or a PIL.Image object, or
+               a np.ndarray object.
            stream (bool): If True, enables real-time streaming.
            bboxes (List[List[float]] | None): List of bounding box coordinates for prompted segmentation.
            points (List[List[float]] | None): List of points for prompted segmentation.
@@ -117,8 +117,8 @@ class SAM(Model):
        for segmentation tasks.

        Args:
-           source (str | PIL.Image | numpy.ndarray | None): Path to the image or video file, or a PIL.Image
-               object, or a numpy.ndarray object.
+           source (str | PIL.Image | np.ndarray | None): Path to the image or video file, or a PIL.Image
+               object, or a np.ndarray object.
            stream (bool): If True, enables real-time streaming.
            bboxes (List[List[float]] | None): List of bounding box coordinates for prompted segmentation.
            points (List[List[float]] | None): List of points for prompted segmentation.
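A hedged usage sketch of SAM point-prompted prediction ("image.jpg" is a hypothetical input):

from ultralytics import SAM

model = SAM("sam2.1_b.pt")
results = model("image.jpg", points=[[450, 300]], labels=[1])  # single foreground point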
ultralytics/models/sam/modules/blocks.py
CHANGED
@@ -411,7 +411,7 @@ class RoPEAttention(Attention):

    Attributes:
        compute_cis (Callable): Function to compute axial complex numbers for rotary encoding.
-       freqs_cis (Tensor): Precomputed frequency tensor for rotary encoding.
+       freqs_cis (torch.Tensor): Precomputed frequency tensor for rotary encoding.
        rope_k_repeat (bool): Flag to repeat query RoPE to match key length for cross-attention to memories.

    Methods:
@@ -443,7 +443,7 @@ class RoPEAttention(Attention):
        self.freqs_cis = freqs_cis
        self.rope_k_repeat = rope_k_repeat  # repeat q rope to match k length, needed for cross-attention to memories

-   def forward(self, q: Tensor, k: Tensor, v: Tensor, num_k_exclude_rope: int = 0) -> Tensor:
+   def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, num_k_exclude_rope: int = 0) -> torch.Tensor:
        """Apply rotary position encoding and compute attention between query, key, and value tensors."""
        q = self.q_proj(q)
        k = self.k_proj(k)
@@ -744,7 +744,7 @@ class PositionEmbeddingSine(nn.Module):

        self.cache = {}

-   def _encode_xy(self, x: Tensor, y: Tensor) -> Tuple[Tensor, Tensor]:
+   def _encode_xy(self, x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode 2D positions using sine/cosine functions for transformer positional embeddings."""
        assert len(x) == len(y) and x.ndim == y.ndim == 1
        x_embed = x * self.scale
@@ -760,7 +760,7 @@ class PositionEmbeddingSine(nn.Module):
        return pos_x, pos_y

    @torch.no_grad()
-   def encode_boxes(self, x: Tensor, y: Tensor, w: Tensor, h: Tensor) -> Tensor:
+   def encode_boxes(self, x: torch.Tensor, y: torch.Tensor, w: torch.Tensor, h: torch.Tensor) -> torch.Tensor:
        """Encode box coordinates and dimensions into positional embeddings for detection."""
        pos_x, pos_y = self._encode_xy(x, y)
        return torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1)
@@ -768,7 +768,7 @@ class PositionEmbeddingSine(nn.Module):
    encode = encode_boxes  # Backwards compatibility

    @torch.no_grad()
-   def encode_points(self, x: Tensor, y: Tensor, labels: Tensor) -> Tensor:
+   def encode_points(self, x: torch.Tensor, y: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        """Encode 2D points with sinusoidal embeddings and append labels."""
        (bx, nx), (by, ny), (bl, nl) = x.shape, y.shape, labels.shape
        assert bx == by and nx == ny and bx == bl and nx == nl
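A self-contained sketch of the sine/cosine position-encoding pattern that PositionEmbeddingSine applies per axis (num_pos_feats=128 and the 10000 temperature are the usual transformer defaults, assumed here for illustration):

import torch

def encode_1d(pos: torch.Tensor, num_pos_feats: int = 128, temperature: float = 10000.0) -> torch.Tensor:
    # Geometric frequency schedule, paired so even slots take sin and odd slots take cos
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
    dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
    emb = pos[:, None] / dim_t  # (N, num_pos_feats)
    return torch.stack((emb[:, 0::2].sin(), emb[:, 1::2].cos()), dim=2).flatten(1)

print(encode_1d(torch.tensor([0.0, 0.5, 1.0])).shape)  # torch.Size([3, 128])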