ultralytics 8.3.163__py3-none-any.whl → 8.3.164__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ultralytics/__init__.py +1 -1
- ultralytics/data/augment.py +182 -153
- ultralytics/data/build.py +23 -3
- ultralytics/data/dataset.py +6 -2
- ultralytics/data/loaders.py +2 -2
- ultralytics/data/utils.py +9 -7
- ultralytics/engine/exporter.py +7 -3
- ultralytics/engine/results.py +42 -42
- ultralytics/models/fastsam/model.py +1 -1
- ultralytics/models/fastsam/predict.py +1 -1
- ultralytics/models/sam/model.py +4 -4
- ultralytics/models/sam/modules/blocks.py +5 -5
- ultralytics/models/sam/modules/memory_attention.py +19 -19
- ultralytics/models/sam/modules/transformer.py +24 -22
- ultralytics/models/yolo/detect/val.py +2 -2
- ultralytics/models/yolo/world/train_world.py +9 -1
- ultralytics/solutions/distance_calculation.py +1 -1
- ultralytics/solutions/instance_segmentation.py +2 -2
- ultralytics/solutions/object_blurrer.py +2 -2
- ultralytics/solutions/object_counter.py +2 -2
- ultralytics/solutions/object_cropper.py +1 -1
- ultralytics/solutions/queue_management.py +1 -1
- ultralytics/solutions/security_alarm.py +2 -2
- ultralytics/solutions/templates/similarity-search.html +0 -24
- ultralytics/solutions/vision_eye.py +1 -1
- ultralytics/utils/benchmarks.py +2 -2
- ultralytics/utils/export.py +0 -2
- ultralytics/utils/instance.py +32 -25
- ultralytics/utils/ops.py +8 -8
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/METADATA +1 -1
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/RECORD +35 -35
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/licenses/LICENSE +0 -0
- {ultralytics-8.3.163.dist-info → ultralytics-8.3.164.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/modules/memory_attention.py
CHANGED
@@ -4,7 +4,7 @@ import copy
 from typing import Optional

 import torch
-from torch import Tensor, nn
+from torch import nn

 from .blocks import RoPEAttention

@@ -103,7 +103,7 @@ class MemoryAttentionLayer(nn.Module):
         self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
         self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys

-    def _forward_sa(self, tgt: Tensor, query_pos: Optional[Tensor]) -> Tensor:
+    def _forward_sa(self, tgt: torch.Tensor, query_pos: Optional[torch.Tensor]) -> torch.Tensor:
         """Perform self-attention on input tensor using positional encoding and RoPE attention mechanism."""
         tgt2 = self.norm1(tgt)
         q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
@@ -113,12 +113,12 @@ class MemoryAttentionLayer(nn.Module):

     def _forward_ca(
         self,
-        tgt: Tensor,
-        memory: Tensor,
-        query_pos: Optional[Tensor],
-        pos: Optional[Tensor],
+        tgt: torch.Tensor,
+        memory: torch.Tensor,
+        query_pos: Optional[torch.Tensor],
+        pos: Optional[torch.Tensor],
         num_k_exclude_rope: int = 0,
-    ) -> Tensor:
+    ) -> torch.Tensor:
         """Perform cross-attention between target and memory tensors using RoPEAttention mechanism."""
         kwds = {}
         if num_k_exclude_rope > 0:
@@ -138,20 +138,20 @@ class MemoryAttentionLayer(nn.Module):

     def forward(
         self,
-        tgt: Tensor,
-        memory: Tensor,
-        pos: Optional[Tensor] = None,
-        query_pos: Optional[Tensor] = None,
+        tgt: torch.Tensor,
+        memory: torch.Tensor,
+        pos: Optional[torch.Tensor] = None,
+        query_pos: Optional[torch.Tensor] = None,
         num_k_exclude_rope: int = 0,
     ) -> torch.Tensor:
         """
         Process input tensors through self-attention, cross-attention, and feedforward network layers.

         Args:
-            tgt (Tensor): Target tensor for self-attention with shape (N, L, D).
-            memory (Tensor): Memory tensor for cross-attention with shape (N, S, D).
-            pos (Optional[Tensor]): Positional encoding for memory tensor.
-            query_pos (Optional[Tensor]): Positional encoding for target tensor.
+            tgt (torch.Tensor): Target tensor for self-attention with shape (N, L, D).
+            memory (torch.Tensor): Memory tensor for cross-attention with shape (N, S, D).
+            pos (Optional[torch.Tensor]): Positional encoding for memory tensor.
+            query_pos (Optional[torch.Tensor]): Positional encoding for target tensor.
             num_k_exclude_rope (int): Number of keys to exclude from rotary position embedding.

         Returns:
@@ -242,8 +242,8 @@ class MemoryAttention(nn.Module):
         self,
         curr: torch.Tensor,  # self-attention inputs
         memory: torch.Tensor,  # cross-attention inputs
-        curr_pos: Optional[Tensor] = None,  # pos_enc for self-attention inputs
-        memory_pos: Optional[Tensor] = None,  # pos_enc for cross-attention inputs
+        curr_pos: Optional[torch.Tensor] = None,  # pos_enc for self-attention inputs
+        memory_pos: Optional[torch.Tensor] = None,  # pos_enc for cross-attention inputs
         num_obj_ptr_tokens: int = 0,  # number of object pointer *tokens*
     ) -> torch.Tensor:
         """
@@ -252,8 +252,8 @@ class MemoryAttention(nn.Module):
         Args:
             curr (torch.Tensor): Self-attention input tensor, representing the current state.
             memory (torch.Tensor): Cross-attention input tensor, representing memory information.
-            curr_pos (Optional[Tensor]): Positional encoding for self-attention inputs.
-            memory_pos (Optional[Tensor]): Positional encoding for cross-attention inputs.
+            curr_pos (Optional[torch.Tensor]): Positional encoding for self-attention inputs.
+            memory_pos (Optional[torch.Tensor]): Positional encoding for cross-attention inputs.
             num_obj_ptr_tokens (int): Number of object pointer tokens to exclude from rotary position embedding.

         Returns:
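The change above is annotation-only: the bare `Tensor` name imported from `torch` is replaced by the fully qualified `torch.Tensor`, so the module only needs `from torch import nn`. A minimal sketch of the same pattern (an illustrative toy layer written for this note, not the SAM code):

```python
from typing import Optional

import torch
from torch import nn


class TinyAttentionLayer(nn.Module):
    """Toy layer showing fully qualified torch.Tensor annotations without importing Tensor."""

    def __init__(self, d_model: int = 256) -> None:
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.attn = nn.MultiheadAttention(d_model, num_heads=1, batch_first=True)

    def forward(self, tgt: torch.Tensor, query_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Self-attention with an optional positional encoding added to queries/keys."""
        x = self.norm(tgt)
        q = k = x + query_pos if query_pos is not None else x
        out, _ = self.attn(q, k, x)
        return tgt + out


layer = TinyAttentionLayer()
y = layer(torch.randn(2, 8, 256))  # (batch, tokens, d_model) in and out
```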
ultralytics/models/sam/modules/transformer.py
CHANGED
@@ -82,21 +82,21 @@ class TwoWayTransformer(nn.Module):

     def forward(
         self,
-        image_embedding: Tensor,
-        image_pe: Tensor,
-        point_embedding: Tensor,
-    ) -> Tuple[Tensor, Tensor]:
+        image_embedding: torch.Tensor,
+        image_pe: torch.Tensor,
+        point_embedding: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Process image and point embeddings through the Two-Way Transformer.

         Args:
-            image_embedding (Tensor): Image to attend to, with shape (B, embedding_dim, H, W).
-            image_pe (Tensor): Positional encoding to add to the image, with same shape as image_embedding.
-            point_embedding (Tensor): Embedding to add to query points, with shape (B, N_points, embedding_dim).
+            image_embedding (torch.Tensor): Image to attend to, with shape (B, embedding_dim, H, W).
+            image_pe (torch.Tensor): Positional encoding to add to the image, with same shape as image_embedding.
+            point_embedding (torch.Tensor): Embedding to add to query points, with shape (B, N_points, embedding_dim).

         Returns:
-            queries (Tensor): Processed point embeddings with shape (B, N_points, embedding_dim).
-            keys (Tensor): Processed image embeddings with shape (B, H*W, embedding_dim).
+            queries (torch.Tensor): Processed point embeddings with shape (B, N_points, embedding_dim).
+            keys (torch.Tensor): Processed image embeddings with shape (B, H*W, embedding_dim).
         """
         # BxCxHxW -> BxHWxC == B x N_image_tokens x C
         image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
@@ -196,19 +196,21 @@ class TwoWayAttentionBlock(nn.Module):

         self.skip_first_layer_pe = skip_first_layer_pe

-    def forward(self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor) -> Tuple[Tensor, Tensor]:
+    def forward(
+        self, queries: torch.Tensor, keys: torch.Tensor, query_pe: torch.Tensor, key_pe: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Apply two-way attention to process query and key embeddings in a transformer block.

         Args:
-            queries (Tensor): Query embeddings with shape (B, N_queries, embedding_dim).
-            keys (Tensor): Key embeddings with shape (B, N_keys, embedding_dim).
-            query_pe (Tensor): Positional encodings for queries with same shape as queries.
-            key_pe (Tensor): Positional encodings for keys with same shape as keys.
+            queries (torch.Tensor): Query embeddings with shape (B, N_queries, embedding_dim).
+            keys (torch.Tensor): Key embeddings with shape (B, N_keys, embedding_dim).
+            query_pe (torch.Tensor): Positional encodings for queries with same shape as queries.
+            key_pe (torch.Tensor): Positional encodings for keys with same shape as keys.

         Returns:
-            queries (Tensor): Processed query embeddings with shape (B, N_queries, embedding_dim).
-            keys (Tensor): Processed key embeddings with shape (B, N_keys, embedding_dim).
+            queries (torch.Tensor): Processed query embeddings with shape (B, N_queries, embedding_dim).
+            keys (torch.Tensor): Processed key embeddings with shape (B, N_keys, embedding_dim).
         """
         # Self attention block
         if self.skip_first_layer_pe:
@@ -304,7 +306,7 @@ class Attention(nn.Module):
         self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

     @staticmethod
-    def _separate_heads(x: Tensor, num_heads: int) -> Tensor:
+    def _separate_heads(x: torch.Tensor, num_heads: int) -> torch.Tensor:
         """Separate the input tensor into the specified number of attention heads."""
         b, n, c = x.shape
         x = x.reshape(b, n, num_heads, c // num_heads)
@@ -317,17 +319,17 @@ class Attention(nn.Module):
         x = x.transpose(1, 2)
         return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C

-    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
         """
         Apply multi-head attention to query, key, and value tensors with optional downsampling.

         Args:
-            q (Tensor): Query tensor with shape (B, N_q, embedding_dim).
-            k (Tensor): Key tensor with shape (B, N_k, embedding_dim).
-            v (Tensor): Value tensor with shape (B, N_k, embedding_dim).
+            q (torch.Tensor): Query tensor with shape (B, N_q, embedding_dim).
+            k (torch.Tensor): Key tensor with shape (B, N_k, embedding_dim).
+            v (torch.Tensor): Value tensor with shape (B, N_k, embedding_dim).

         Returns:
-            (Tensor): Output tensor after attention with shape (B, N_q, embedding_dim).
+            (torch.Tensor): Output tensor after attention with shape (B, N_q, embedding_dim).
         """
         # Input projections
         q = self.q_proj(q)
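The `Attention` hunks surround the head split/merge reshaping whose annotations were updated. A standalone sketch of just that reshaping, using the shapes named in the docstrings above (illustrative helpers, not the library's methods):

```python
import torch


def separate_heads(x: torch.Tensor, num_heads: int) -> torch.Tensor:
    """(B, N, C) -> (B, num_heads, N, C // num_heads)."""
    b, n, c = x.shape
    return x.reshape(b, n, num_heads, c // num_heads).transpose(1, 2)


def recombine_heads(x: torch.Tensor) -> torch.Tensor:
    """(B, num_heads, N_tokens, C_per_head) -> (B, N_tokens, num_heads * C_per_head)."""
    b, n_heads, n_tokens, c_per_head = x.shape
    return x.transpose(1, 2).reshape(b, n_tokens, n_heads * c_per_head)


q = torch.randn(2, 16, 256)                      # B x N_q x embedding_dim
assert recombine_heads(separate_heads(q, 8)).shape == q.shape  # round-trip preserves shape
```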
ultralytics/models/yolo/detect/val.py
CHANGED
@@ -450,8 +450,8 @@ class DetectionValidator(BaseValidator):
                 val.summarize()

                 # update mAP50-95 and mAP50
-                stats[f"metrics/mAP50({suffix[i][0]})"] = val.stats_as_dict["
-                stats[f"metrics/mAP50-95({suffix[i][0]})"] = val.stats_as_dict["
+                stats[f"metrics/mAP50({suffix[i][0]})"] = val.stats_as_dict["AP_50"]
+                stats[f"metrics/mAP50-95({suffix[i][0]})"] = val.stats_as_dict["AP_all"]

                 if self.is_lvis:
                     stats[f"metrics/APr({suffix[i][0]})"] = val.stats_as_dict["APr"]
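For context, these lines copy summary values out of the evaluator's `stats_as_dict` into Ultralytics metric keys. A hypothetical sketch with made-up values (only the `AP_50`/`AP_all`/`APr` key names come from the diff above):

```python
stats_as_dict = {"AP_50": 0.68, "AP_all": 0.51, "APr": 0.33}  # example values only
stats, suffix = {}, "B"  # "B" chosen here to stand for box metrics; naming assumed for illustration

stats[f"metrics/mAP50({suffix})"] = stats_as_dict["AP_50"]
stats[f"metrics/mAP50-95({suffix})"] = stats_as_dict["AP_all"]
stats[f"metrics/APr({suffix})"] = stats_as_dict["APr"]  # LVIS-only in the real validator
print(stats)
```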
ultralytics/models/yolo/world/train_world.py
CHANGED
@@ -107,7 +107,15 @@ class WorldTrainerFromScratch(WorldTrainer):
         datasets = [
             build_yolo_dataset(self.args, im_path, batch, self.training_data[im_path], stride=gs, multi_modal=True)
             if isinstance(im_path, str)
-            else build_grounding(self.args, im_path["img_path"], im_path["json_file"], batch, stride=gs)
+            else build_grounding(
+                # assign `nc` from validation set to max number of text samples for training consistency
+                self.args,
+                im_path["img_path"],
+                im_path["json_file"],
+                batch,
+                stride=gs,
+                max_samples=self.data["nc"],
+            )
             for im_path in img_path
         ]
         self.set_text_embeddings(datasets, batch)  # cache text embeddings to accelerate training
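The grounding branch now also receives `max_samples=self.data["nc"]`, i.e. the text-sample cap follows the validation set's class count. Grounding entries are dicts with `img_path`/`json_file` keys; a hedged sketch of the kind of mixed data config this trainer consumes (dataset names and paths are placeholders, not taken from the diff):

```python
# Placeholder config: yolo_data entries are ordinary detection datasets, while grounding_data
# entries are dicts whose img_path/json_file keys match the build_grounding() call above.
data = {
    "train": {
        "yolo_data": ["Objects365.yaml"],
        "grounding_data": [
            {"img_path": "flickr30k/images", "json_file": "flickr30k/final_flickr_separateGT_train.json"},
        ],
    },
    "val": {"yolo_data": ["lvis.yaml"]},  # its class count (nc) is what max_samples now reuses
}
```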
ultralytics/solutions/distance_calculation.py
CHANGED
@@ -76,7 +76,7 @@ class DistanceCalculation(BaseSolution):
         between two user-selected objects if they have been chosen.

         Args:
-            im0 (
+            im0 (np.ndarray): The input image frame to process.

         Returns:
             (SolutionResults): Contains processed image `plot_im`, `total_tracks` (int) representing the total number
ultralytics/solutions/instance_segmentation.py
CHANGED
@@ -19,7 +19,7 @@ class InstanceSegmentation(BaseSolution):
         names (Dict[int, str]): Dictionary mapping class indices to class names.
         clss (List[int]): List of detected class indices.
         track_ids (List[int]): List of track IDs for detected instances.
-        masks (List[
+        masks (List[np.ndarray]): List of segmentation masks for detected instances.
         show_conf (bool): Whether to display confidence scores.
         show_labels (bool): Whether to display class labels.
         show_boxes (bool): Whether to display bounding boxes.
@@ -55,7 +55,7 @@ class InstanceSegmentation(BaseSolution):
         Perform instance segmentation on the input image and annotate the results.

         Args:
-            im0 (
+            im0 (np.ndarray): The input image for segmentation.

         Returns:
             (SolutionResults): Object containing the annotated image and total number of tracked instances.
ultralytics/solutions/object_blurrer.py
CHANGED
@@ -56,11 +56,11 @@ class ObjectBlurrer(BaseSolution):
         and annotates the image with bounding boxes.

         Args:
-            im0 (
+            im0 (np.ndarray): The input image containing detected objects.

         Returns:
             (SolutionResults): Object containing the processed image and number of tracked objects.
-                - plot_im (
+                - plot_im (np.ndarray): The annotated output image with blurred objects.
                 - total_tracks (int): The total number of tracked objects in the frame.

         Examples:
ultralytics/solutions/object_counter.py
CHANGED
@@ -122,7 +122,7 @@ class ObjectCounter(BaseSolution):
         Display object counts on the input image or frame.

         Args:
-            plot_im (
+            plot_im (np.ndarray): The image or frame to display counts on.

         Examples:
             >>> counter = ObjectCounter()
@@ -146,7 +146,7 @@ class ObjectCounter(BaseSolution):
         object counts, and displays the results on the input image.

         Args:
-            im0 (
+            im0 (np.ndarray): The input image or frame to be processed.

         Returns:
             (SolutionResults): Contains processed image `im0`, 'in_count' (int, count of objects entering the region),
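All of these docstring fixes concern `np.ndarray` frames going into the solutions classes. A usage sketch under that assumption (constructor arguments such as `region` and `model`, and the video path, are illustrative and may differ by version):

```python
import cv2
from ultralytics import solutions

# Count objects crossing a horizontal line; region points and model name are examples.
counter = solutions.ObjectCounter(model="yolo11n.pt", region=[(20, 400), (1260, 400)], show=False)

cap = cv2.VideoCapture("traffic.mp4")  # hypothetical video path
while cap.isOpened():
    ok, im0 = cap.read()  # im0 is an np.ndarray frame, as the corrected docstrings state
    if not ok:
        break
    results = counter(im0)        # returns a SolutionResults object
    plot_im = results.plot_im     # annotated frame mentioned in the docstrings above
cap.release()
```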
ultralytics/solutions/object_cropper.py
CHANGED
@@ -57,7 +57,7 @@ class ObjectCropper(BaseSolution):
         Crop detected objects from the input image and save them as separate images.

         Args:
-            im0 (
+            im0 (np.ndarray): The input image containing detected objects.

         Returns:
             (SolutionResults): A SolutionResults object containing the total number of cropped objects and processed
ultralytics/solutions/queue_management.py
CHANGED
@@ -50,7 +50,7 @@ class QueueManager(BaseSolution):
         Process queue management for a single frame of video.

         Args:
-            im0 (
+            im0 (np.ndarray): Input image for processing, typically a frame from a video stream.

         Returns:
             (SolutionResults): Contains processed image `im0`, 'queue_count' (int, number of objects in the queue) and
ultralytics/solutions/security_alarm.py
CHANGED
@@ -76,7 +76,7 @@ class SecurityAlarm(BaseSolution):
         Send an email notification with an image attachment indicating the number of objects detected.

         Args:
-            im0 (
+            im0 (np.ndarray): The input image or frame to be attached to the email.
             records (int, optional): The number of detected objects to be included in the email message.

         This method encodes the input image, composes the email message with details about the detection, and sends it
@@ -121,7 +121,7 @@ class SecurityAlarm(BaseSolution):
         Monitor the frame, process object detections, and trigger alerts if thresholds are exceeded.

         Args:
-            im0 (
+            im0 (np.ndarray): The input image or frame to be processed and annotated.

         Returns:
             (SolutionResults): Contains processed image `plot_im`, 'total_tracks' (total number of tracked objects) and
ultralytics/solutions/templates/similarity-search.html
CHANGED
@@ -35,7 +35,6 @@
       align-items: center;
       gap: 1rem;
       margin-bottom: 3rem;
-      animation: fadeIn 1s ease-in-out;
     }

     input[type="text"] {
@@ -78,7 +77,6 @@
       gap: 1.5rem;
       max-width: 1600px;
       margin: auto;
-      animation: fadeInUp 1s ease-in-out;
     }

     .card {
@@ -102,28 +100,6 @@
       object-fit: cover;
       display: block;
     }
-
-    @keyframes fadeIn {
-      0% {
-        opacity: 0;
-        transform: scale(0.95);
-      }
-      100% {
-        opacity: 1;
-        transform: scale(1);
-      }
-    }
-
-    @keyframes fadeInUp {
-      0% {
-        opacity: 0;
-        transform: translateY(20px);
-      }
-      100% {
-        opacity: 1;
-        transform: translateY(0);
-      }
-    }
   </style>
 </head>
 <script>
ultralytics/solutions/vision_eye.py
CHANGED
@@ -42,7 +42,7 @@ class VisionEye(BaseSolution):
         Perform object detection, vision mapping, and annotation on the input image.

         Args:
-            im0 (
+            im0 (np.ndarray): The input image for detection and annotation.

         Returns:
             (SolutionResults): Object containing the annotated image and tracking statistics.
ultralytics/utils/benchmarks.py
CHANGED
@@ -520,12 +520,12 @@ class ProfileModels:
         Apply iterative sigma clipping to data to remove outliers.

         Args:
-            data (
+            data (np.ndarray): Input data array.
             sigma (float): Number of standard deviations to use for clipping.
             max_iters (int): Maximum number of iterations for the clipping process.

         Returns:
-            (
+            (np.ndarray): Clipped data array with outliers removed.
         """
         data = np.array(data)
         for _ in range(max_iters):
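The docstring now states the clipping helper takes and returns an `np.ndarray`. A generic, self-contained sketch of iterative sigma clipping (written for this note, not the `ProfileModels` method itself):

```python
import numpy as np


def iterative_sigma_clip(data, sigma: float = 2.0, max_iters: int = 3) -> np.ndarray:
    """Repeatedly drop points farther than `sigma` standard deviations from the mean."""
    data = np.array(data)
    for _ in range(max_iters):
        mean, std = data.mean(), data.std()
        clipped = data[np.abs(data - mean) <= sigma * std]
        if len(clipped) == len(data):  # converged, nothing removed this pass
            break
        data = clipped
    return data


print(iterative_sigma_clip([9.8, 10.1, 10.0, 9.9, 42.0]))  # the 42.0 outlier is removed
```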
ultralytics/utils/export.py
CHANGED
@@ -135,8 +135,6 @@ def export_engine(
             LOGGER.info(f'{prefix} output "{out.name}" with shape{out.shape} {out.dtype}')

     if dynamic:
-        if shape[0] <= 1:
-            LOGGER.warning(f"{prefix} 'dynamic=True' model requires max batch size, i.e. 'batch=16'")
         profile = builder.create_optimization_profile()
         min_shape = (1, shape[1], 32, 32)  # minimum input shape
         max_shape = (*shape[:2], *(int(max(2, workspace or 2) * d) for d in shape[2:]))  # max input shape
ultralytics/utils/instance.py
CHANGED
@@ -3,7 +3,7 @@
 from collections import abc
 from itertools import repeat
 from numbers import Number
-from typing import List
+from typing import List, Union

 import numpy as np

@@ -59,7 +59,7 @@ class Bboxes:
         This class does not handle normalization or denormalization of bounding boxes.
     """

-    def __init__(self, bboxes, format="xyxy") -> None:
+    def __init__(self, bboxes: np.ndarray, format: str = "xyxy") -> None:
         """
         Initialize the Bboxes class with bounding box data in a specified format.

@@ -74,7 +74,7 @@ class Bboxes:
         self.bboxes = bboxes
         self.format = format

-    def convert(self, format):
+    def convert(self, format: str) -> None:
         """
         Convert bounding box format from one type to another.

@@ -93,7 +93,7 @@ class Bboxes:
         self.bboxes = func(self.bboxes)
         self.format = format

-    def areas(self):
+    def areas(self) -> np.ndarray:
         """Calculate the area of bounding boxes."""
         return (
             (self.bboxes[:, 2] - self.bboxes[:, 0]) * (self.bboxes[:, 3] - self.bboxes[:, 1])  # format xyxy
@@ -101,7 +101,7 @@ class Bboxes:
             else self.bboxes[:, 3] * self.bboxes[:, 2]  # format xywh or ltwh
         )

-    def mul(self, scale):
+    def mul(self, scale: Union[int, tuple, list]) -> None:
         """
         Multiply bounding box coordinates by scale factor(s).

@@ -118,7 +118,7 @@ class Bboxes:
         self.bboxes[:, 2] *= scale[2]
         self.bboxes[:, 3] *= scale[3]

-    def add(self, offset):
+    def add(self, offset: Union[int, tuple, list]) -> None:
         """
         Add offset to bounding box coordinates.

@@ -135,12 +135,12 @@ class Bboxes:
         self.bboxes[:, 2] += offset[2]
         self.bboxes[:, 3] += offset[3]

-    def __len__(self):
+    def __len__(self) -> int:
         """Return the number of bounding boxes."""
         return len(self.bboxes)

     @classmethod
-    def concatenate(cls, boxes_list: List["Bboxes"], axis=0) -> "Bboxes":
+    def concatenate(cls, boxes_list: List["Bboxes"], axis: int = 0) -> "Bboxes":
         """
         Concatenate a list of Bboxes objects into a single Bboxes object.

@@ -163,7 +163,7 @@ class Bboxes:
             return boxes_list[0]
         return cls(np.concatenate([b.bboxes for b in boxes_list], axis=axis))

-    def __getitem__(self, index) -> "Bboxes":
+    def __getitem__(self, index: Union[int, np.ndarray, slice]) -> "Bboxes":
         """
         Retrieve a specific bounding box or a set of bounding boxes using indexing.

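A short usage sketch of `Bboxes` exercising the newly annotated methods (values are arbitrary):

```python
import numpy as np
from ultralytics.utils.instance import Bboxes

boxes = Bboxes(np.array([[10.0, 20.0, 60.0, 120.0]], dtype=np.float32), format="xyxy")
print(boxes.areas())   # [5000.] -> (60-10) * (120-20)
boxes.convert("xywh")  # in-place format conversion, returns None
boxes.mul(0.5)         # scale all four coordinates by 0.5
boxes.add(2)           # shift all four coordinates by 2
print(len(boxes))      # 1
```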
@@ -220,13 +220,20 @@ class Instances:
         ... )
     """

-    def __init__(self, bboxes, segments=None, keypoints=None, bbox_format="xywh", normalized=True) -> None:
+    def __init__(
+        self,
+        bboxes: np.ndarray,
+        segments: np.ndarray = None,
+        keypoints: np.ndarray = None,
+        bbox_format: str = "xywh",
+        normalized: bool = True,
+    ) -> None:
         """
         Initialize the Instances object with bounding boxes, segments, and keypoints.

         Args:
             bboxes (np.ndarray): Bounding boxes with shape (N, 4).
-            segments (
+            segments (np.ndarray, optional): Segmentation masks.
             keypoints (np.ndarray, optional): Keypoints with shape (N, 17, 3) in format (x, y, visible).
             bbox_format (str): Format of bboxes.
             normalized (bool): Whether the coordinates are normalized.
@@ -236,7 +243,7 @@ class Instances:
         self.normalized = normalized
         self.segments = segments

-    def convert_bbox(self, format):
+    def convert_bbox(self, format: str) -> None:
         """
         Convert bounding box format.

@@ -246,11 +253,11 @@ class Instances:
         self._bboxes.convert(format=format)

     @property
-    def bbox_areas(self):
+    def bbox_areas(self) -> np.ndarray:
         """Calculate the area of bounding boxes."""
         return self._bboxes.areas()

-    def scale(self, scale_w, scale_h, bbox_only=False):
+    def scale(self, scale_w: float, scale_h: float, bbox_only: bool = False):
         """
         Scale coordinates by given factors.

@@ -268,7 +275,7 @@ class Instances:
         self.keypoints[..., 0] *= scale_w
         self.keypoints[..., 1] *= scale_h

-    def denormalize(self, w, h):
+    def denormalize(self, w: int, h: int) -> None:
         """
         Convert normalized coordinates to absolute coordinates.

@@ -286,7 +293,7 @@ class Instances:
         self.keypoints[..., 1] *= h
         self.normalized = False

-    def normalize(self, w, h):
+    def normalize(self, w: int, h: int) -> None:
         """
         Convert absolute coordinates to normalized coordinates.

@@ -304,7 +311,7 @@ class Instances:
         self.keypoints[..., 1] /= h
         self.normalized = True

-    def add_padding(self, padw, padh):
+    def add_padding(self, padw: int, padh: int) -> None:
         """
         Add padding to coordinates.

@@ -320,7 +327,7 @@ class Instances:
         self.keypoints[..., 0] += padw
         self.keypoints[..., 1] += padh

-    def __getitem__(self, index) -> "Instances":
+    def __getitem__(self, index: Union[int, np.ndarray, slice]) -> "Instances":
         """
         Retrieve a specific instance or a set of instances using indexing.

@@ -346,7 +353,7 @@ class Instances:
             normalized=self.normalized,
         )

-    def flipud(self, h):
+    def flipud(self, h: int) -> None:
         """
         Flip coordinates vertically.

@@ -364,7 +371,7 @@ class Instances:
         if self.keypoints is not None:
             self.keypoints[..., 1] = h - self.keypoints[..., 1]

-    def fliplr(self, w):
+    def fliplr(self, w: int) -> None:
         """
         Flip coordinates horizontally.

@@ -382,7 +389,7 @@ class Instances:
         if self.keypoints is not None:
             self.keypoints[..., 0] = w - self.keypoints[..., 0]

-    def clip(self, w, h):
+    def clip(self, w: int, h: int) -> None:
         """
         Clip coordinates to stay within image boundaries.

@@ -409,7 +416,7 @@ class Instances:
             self.keypoints[..., 0] = self.keypoints[..., 0].clip(0, w)
             self.keypoints[..., 1] = self.keypoints[..., 1].clip(0, h)

-    def remove_zero_area_boxes(self):
+    def remove_zero_area_boxes(self) -> np.ndarray:
         """
         Remove zero-area boxes, i.e. after clipping some boxes may have zero width or height.

@@ -425,7 +432,7 @@ class Instances:
             self.keypoints = self.keypoints[good]
         return good

-    def update(self, bboxes, segments=None, keypoints=None):
+    def update(self, bboxes: np.ndarray, segments: np.ndarray = None, keypoints: np.ndarray = None):
         """
         Update instance variables.

@@ -440,7 +447,7 @@ class Instances:
         if keypoints is not None:
             self.keypoints = keypoints

-    def __len__(self):
+    def __len__(self) -> int:
         """Return the number of instances."""
         return len(self.bboxes)

@@ -492,6 +499,6 @@ class Instances:
         return cls(cat_boxes, cat_segments, cat_keypoints, bbox_format, normalized)

     @property
-    def bboxes(self):
+    def bboxes(self) -> np.ndarray:
         """Return bounding boxes."""
         return self._bboxes.bboxes
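And a similar sketch for `Instances`, exercising the flip/clip helpers whose signatures were annotated above (a single normalized xywh box with an empty segments array, chosen purely for illustration):

```python
import numpy as np
from ultralytics.utils.instance import Instances

inst = Instances(
    bboxes=np.array([[0.5, 0.5, 0.4, 0.6]], dtype=np.float32),  # (N, 4) in xywh
    segments=np.zeros((1, 0, 2), dtype=np.float32),             # no polygon points
    bbox_format="xywh",
    normalized=True,
)
inst.denormalize(w=640, h=480)        # to pixel coordinates
inst.fliplr(w=640)                    # horizontal flip
inst.clip(w=640, h=480)               # keep boxes inside the image
good = inst.remove_zero_area_boxes()  # boolean mask of surviving boxes
print(len(inst), inst.bboxes)
```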
ultralytics/utils/ops.py
CHANGED
@@ -343,11 +343,11 @@ def clip_boxes(boxes, shape):
     Clip bounding boxes to image boundaries.

     Args:
-        boxes (torch.Tensor |
+        boxes (torch.Tensor | np.ndarray): Bounding boxes to clip.
         shape (tuple): Image shape as (height, width).

     Returns:
-        (torch.Tensor |
+        (torch.Tensor | np.ndarray): Clipped bounding boxes.
     """
     if isinstance(boxes, torch.Tensor):  # faster individually (WARNING: inplace .clamp_() Apple MPS bug)
         boxes[..., 0] = boxes[..., 0].clamp(0, shape[1])  # x1
@@ -365,11 +365,11 @@ def clip_coords(coords, shape):
     Clip line coordinates to image boundaries.

     Args:
-        coords (torch.Tensor |
+        coords (torch.Tensor | np.ndarray): Line coordinates to clip.
         shape (tuple): Image shape as (height, width).

     Returns:
-        (torch.Tensor |
+        (torch.Tensor | np.ndarray): Clipped coordinates.
     """
     if isinstance(coords, torch.Tensor):  # faster individually (WARNING: inplace .clamp_() Apple MPS bug)
         coords[..., 0] = coords[..., 0].clamp(0, shape[1])  # x
@@ -564,10 +564,10 @@ def xyxyxyxy2xywhr(x):
     Convert batched Oriented Bounding Boxes (OBB) from [xy1, xy2, xy3, xy4] to [xywh, rotation] format.

     Args:
-        x (
+        x (np.ndarray | torch.Tensor): Input box corners with shape (N, 8) in [xy1, xy2, xy3, xy4] format.

     Returns:
-        (
+        (np.ndarray | torch.Tensor): Converted data in [cx, cy, w, h, rotation] format with shape (N, 5).
             Rotation values are in radians from 0 to pi/2.
     """
     is_torch = isinstance(x, torch.Tensor)
@@ -587,11 +587,11 @@ def xywhr2xyxyxyxy(x):
     Convert batched Oriented Bounding Boxes (OBB) from [xywh, rotation] to [xy1, xy2, xy3, xy4] format.

     Args:
-        x (
+        x (np.ndarray | torch.Tensor): Boxes in [cx, cy, w, h, rotation] format with shape (N, 5) or (B, N, 5).
             Rotation values should be in radians from 0 to pi/2.

     Returns:
-        (
+        (np.ndarray | torch.Tensor): Converted corner points with shape (N, 4, 2) or (B, N, 4, 2).
     """
     cos, sin, cat, stack = (
         (torch.cos, torch.sin, torch.cat, torch.stack)