ultralytics 8.0.195__py3-none-any.whl → 8.0.196__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of ultralytics has been flagged as a potentially problematic release.
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +5 -6
- ultralytics/data/augment.py +234 -29
- ultralytics/data/base.py +2 -1
- ultralytics/data/build.py +9 -3
- ultralytics/data/converter.py +5 -2
- ultralytics/data/dataset.py +16 -2
- ultralytics/data/loaders.py +111 -7
- ultralytics/data/utils.py +3 -3
- ultralytics/engine/exporter.py +1 -3
- ultralytics/engine/model.py +3 -9
- ultralytics/engine/predictor.py +10 -6
- ultralytics/engine/results.py +18 -8
- ultralytics/engine/trainer.py +19 -31
- ultralytics/engine/tuner.py +20 -20
- ultralytics/engine/validator.py +3 -4
- ultralytics/hub/__init__.py +2 -2
- ultralytics/hub/auth.py +18 -3
- ultralytics/hub/session.py +1 -0
- ultralytics/hub/utils.py +1 -3
- ultralytics/models/fastsam/model.py +2 -1
- ultralytics/models/fastsam/predict.py +2 -0
- ultralytics/models/fastsam/prompt.py +15 -1
- ultralytics/models/nas/model.py +3 -1
- ultralytics/models/rtdetr/model.py +4 -6
- ultralytics/models/rtdetr/predict.py +2 -1
- ultralytics/models/rtdetr/train.py +2 -1
- ultralytics/models/rtdetr/val.py +1 -0
- ultralytics/models/sam/amg.py +12 -6
- ultralytics/models/sam/model.py +5 -6
- ultralytics/models/sam/modules/decoders.py +5 -1
- ultralytics/models/sam/modules/encoders.py +15 -12
- ultralytics/models/sam/modules/tiny_encoder.py +38 -2
- ultralytics/models/sam/modules/transformer.py +2 -4
- ultralytics/models/sam/predict.py +8 -4
- ultralytics/models/utils/loss.py +35 -8
- ultralytics/models/utils/ops.py +14 -18
- ultralytics/models/yolo/classify/predict.py +1 -0
- ultralytics/models/yolo/classify/train.py +4 -2
- ultralytics/models/yolo/classify/val.py +1 -0
- ultralytics/models/yolo/detect/train.py +4 -3
- ultralytics/models/yolo/model.py +2 -4
- ultralytics/models/yolo/pose/predict.py +1 -0
- ultralytics/models/yolo/segment/predict.py +2 -0
- ultralytics/models/yolo/segment/val.py +1 -1
- ultralytics/nn/autobackend.py +45 -32
- ultralytics/nn/modules/__init__.py +13 -9
- ultralytics/nn/modules/block.py +11 -5
- ultralytics/nn/modules/conv.py +16 -7
- ultralytics/nn/modules/head.py +6 -3
- ultralytics/nn/modules/transformer.py +47 -15
- ultralytics/nn/modules/utils.py +6 -4
- ultralytics/nn/tasks.py +61 -21
- ultralytics/trackers/bot_sort.py +53 -6
- ultralytics/trackers/byte_tracker.py +71 -15
- ultralytics/trackers/track.py +0 -1
- ultralytics/trackers/utils/gmc.py +23 -0
- ultralytics/trackers/utils/kalman_filter.py +6 -6
- ultralytics/utils/__init__.py +31 -18
- ultralytics/utils/autobatch.py +1 -3
- ultralytics/utils/benchmarks.py +14 -1
- ultralytics/utils/callbacks/base.py +1 -3
- ultralytics/utils/callbacks/comet.py +11 -3
- ultralytics/utils/callbacks/dvc.py +9 -0
- ultralytics/utils/callbacks/neptune.py +5 -6
- ultralytics/utils/callbacks/wb.py +1 -0
- ultralytics/utils/checks.py +13 -9
- ultralytics/utils/dist.py +2 -1
- ultralytics/utils/downloads.py +7 -3
- ultralytics/utils/files.py +3 -3
- ultralytics/utils/instance.py +12 -3
- ultralytics/utils/loss.py +97 -22
- ultralytics/utils/metrics.py +34 -34
- ultralytics/utils/ops.py +10 -9
- ultralytics/utils/patches.py +9 -7
- ultralytics/utils/plotting.py +4 -3
- ultralytics/utils/torch_utils.py +8 -6
- ultralytics/utils/triton.py +2 -1
- {ultralytics-8.0.195.dist-info → ultralytics-8.0.196.dist-info}/METADATA +1 -1
- {ultralytics-8.0.195.dist-info → ultralytics-8.0.196.dist-info}/RECORD +84 -84
- {ultralytics-8.0.195.dist-info → ultralytics-8.0.196.dist-info}/LICENSE +0 -0
- {ultralytics-8.0.195.dist-info → ultralytics-8.0.196.dist-info}/WHEEL +0 -0
- {ultralytics-8.0.195.dist-info → ultralytics-8.0.196.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.0.195.dist-info → ultralytics-8.0.196.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,5 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
-"""
-Transformer modules
-"""
+"""Transformer modules."""

 import math

@@ -18,9 +16,10 @@ __all__ = ('TransformerEncoderLayer', 'TransformerLayer', 'TransformerBlock', 'M


 class TransformerEncoderLayer(nn.Module):
-    """
+    """Defines a single layer of the transformer encoder."""

     def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
+        """Initialize the TransformerEncoderLayer with specified parameters."""
         super().__init__()
         from ...utils.torch_utils import TORCH_1_9
         if not TORCH_1_9:
@@ -41,10 +40,11 @@ class TransformerEncoderLayer(nn.Module):
         self.normalize_before = normalize_before

     def with_pos_embed(self, tensor, pos=None):
-        """Add position embeddings if
+        """Add position embeddings to the tensor if provided."""
         return tensor if pos is None else tensor + pos

     def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
+        """Performs forward pass with post-normalization."""
         q = k = self.with_pos_embed(src, pos)
         src2 = self.ma(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
         src = src + self.dropout1(src2)
@@ -54,6 +54,7 @@ class TransformerEncoderLayer(nn.Module):
         return self.norm2(src)

     def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
+        """Performs forward pass with pre-normalization."""
         src2 = self.norm1(src)
         q = k = self.with_pos_embed(src2, pos)
         src2 = self.ma(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
@@ -70,11 +71,14 @@ class TransformerEncoderLayer(nn.Module):


 class AIFI(TransformerEncoderLayer):
+    """Defines the AIFI transformer layer."""

     def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
+        """Initialize the AIFI instance with specified parameters."""
         super().__init__(c1, cm, num_heads, dropout, act, normalize_before)

     def forward(self, x):
+        """Forward pass for the AIFI transformer layer."""
         c, h, w = x.shape[1:]
         pos_embed = self.build_2d_sincos_position_embedding(w, h, c)
         # flatten [B, C, H, W] to [B, HxW, C]
@@ -82,7 +86,8 @@ class AIFI(TransformerEncoderLayer):
         return x.permute(0, 2, 1).view([-1, c, h, w]).contiguous()

     @staticmethod
-    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.):
+    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
+        """Builds 2D sine-cosine position embedding."""
         grid_w = torch.arange(int(w), dtype=torch.float32)
         grid_h = torch.arange(int(h), dtype=torch.float32)
         grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
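Note: only the grid construction of build_2d_sincos_position_embedding is visible in the hunk above. A self-contained sketch of the DETR-style 2D sine-cosine embedding it builds is shown below; the omega frequency schedule is an assumption for illustration and is not taken from this diff:

    import torch

    def sincos_pos_embed_2d(w, h, embed_dim=256, temperature=10000.0):
        """Sketch of a DETR-style 2D sine-cosine position embedding of shape [1, w*h, embed_dim]."""
        grid_w = torch.arange(int(w), dtype=torch.float32)
        grid_h = torch.arange(int(h), dtype=torch.float32)
        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
        pos_dim = embed_dim // 4  # sin and cos for each of the two axes (embed_dim must be divisible by 4)
        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
        omega = 1.0 / (temperature ** omega)  # assumed frequency schedule, not shown in the hunk above
        out_w = grid_w.flatten()[..., None] @ omega[None]
        out_h = grid_h.flatten()[..., None] @ omega[None]
        return torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], 1)[None]

    print(sincos_pos_embed_2d(20, 20).shape)  # torch.Size([1, 400, 256])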
@@ -140,27 +145,32 @@ class TransformerBlock(nn.Module):


 class MLPBlock(nn.Module):
+    """Implements a single block of a multi-layer perceptron."""

     def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
+        """Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function."""
         super().__init__()
         self.lin1 = nn.Linear(embedding_dim, mlp_dim)
         self.lin2 = nn.Linear(mlp_dim, embedding_dim)
         self.act = act()

     def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass for the MLPBlock."""
         return self.lin2(self.act(self.lin1(x)))


 class MLP(nn.Module):
-    """
+    """Implements a simple multi-layer perceptron (also called FFN)."""

     def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        """Initialize the MLP with specified input, hidden, output dimensions and number of layers."""
         super().__init__()
         self.num_layers = num_layers
         h = [hidden_dim] * (num_layers - 1)
         self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

     def forward(self, x):
+        """Forward pass for the entire MLP."""
         for i, layer in enumerate(self.layers):
             x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
         return x
@@ -168,17 +178,22 @@ class MLP(nn.Module):

 class LayerNorm2d(nn.Module):
     """
-
+    2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
+
+    Original implementation at
+    https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
     https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
     """

     def __init__(self, num_channels, eps=1e-6):
+        """Initialize LayerNorm2d with the given parameters."""
         super().__init__()
         self.weight = nn.Parameter(torch.ones(num_channels))
         self.bias = nn.Parameter(torch.zeros(num_channels))
         self.eps = eps

     def forward(self, x):
+        """Perform forward pass for 2D layer normalization."""
         u = x.mean(1, keepdim=True)
         s = (x - u).pow(2).mean(1, keepdim=True)
         x = (x - u) / torch.sqrt(s + self.eps)
@@ -187,11 +202,13 @@ class LayerNorm2d(nn.Module):

 class MSDeformAttn(nn.Module):
     """
-
+    Multi-Scale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
+
     https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
     """

     def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
+        """Initialize MSDeformAttn with the given parameters."""
         super().__init__()
         if d_model % n_heads != 0:
             raise ValueError(f'd_model must be divisible by n_heads, but got {d_model} and {n_heads}')
@@ -214,6 +231,7 @@ class MSDeformAttn(nn.Module):
         self._reset_parameters()

     def _reset_parameters(self):
+        """Reset module parameters."""
         constant_(self.sampling_offsets.weight.data, 0.)
         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
@@ -232,7 +250,10 @@ class MSDeformAttn(nn.Module):

     def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
         """
+        Perform forward pass for multi-scale deformable attention.
+
         https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+
         Args:
             query (torch.Tensor): [bs, query_length, C]
             refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
@@ -272,24 +293,27 @@ class MSDeformAttn(nn.Module):

 class DeformableTransformerDecoderLayer(nn.Module):
     """
+    Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
+
     https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
     https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
     """

     def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0., act=nn.ReLU(), n_levels=4, n_points=4):
+        """Initialize the DeformableTransformerDecoderLayer with the given parameters."""
         super().__init__()

-        #
+        # Self attention
         self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
         self.dropout1 = nn.Dropout(dropout)
         self.norm1 = nn.LayerNorm(d_model)

-        #
+        # Cross attention
         self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
         self.dropout2 = nn.Dropout(dropout)
         self.norm2 = nn.LayerNorm(d_model)

-        #
+        # FFN
         self.linear1 = nn.Linear(d_model, d_ffn)
         self.act = act
         self.dropout3 = nn.Dropout(dropout)
@@ -299,37 +323,44 @@ class DeformableTransformerDecoderLayer(nn.Module):

     @staticmethod
     def with_pos_embed(tensor, pos):
+        """Add positional embeddings to the input tensor, if provided."""
         return tensor if pos is None else tensor + pos

     def forward_ffn(self, tgt):
+        """Perform forward pass through the Feed-Forward Network part of the layer."""
         tgt2 = self.linear2(self.dropout3(self.act(self.linear1(tgt))))
         tgt = tgt + self.dropout4(tgt2)
         return self.norm3(tgt)

     def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
-
+        """Perform the forward pass through the entire decoder layer."""
+
+        # Self attention
         q = k = self.with_pos_embed(embed, query_pos)
         tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
                              attn_mask=attn_mask)[0].transpose(0, 1)
         embed = embed + self.dropout1(tgt)
         embed = self.norm1(embed)

-        #
+        # Cross attention
         tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
                               padding_mask)
         embed = embed + self.dropout2(tgt)
         embed = self.norm2(embed)

-        #
+        # FFN
         return self.forward_ffn(embed)


 class DeformableTransformerDecoder(nn.Module):
     """
+    Implementation of Deformable Transformer Decoder based on PaddleDetection.
+
     https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
     """

     def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
+        """Initialize the DeformableTransformerDecoder with the given parameters."""
         super().__init__()
         self.layers = _get_clones(decoder_layer, num_layers)
         self.num_layers = num_layers
@@ -347,6 +378,7 @@ class DeformableTransformerDecoder(nn.Module):
                 pos_mlp,
                 attn_mask=None,
                 padding_mask=None):
+        """Perform the forward pass through the entire decoder."""
         output = embed
         dec_bboxes = []
         dec_cls = []
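Note: this release only adds docstrings to MLPBlock, MLP and LayerNorm2d; their signatures are unchanged. A minimal usage sketch based on the signatures shown above (shapes are arbitrary, chosen for illustration):

    import torch
    from ultralytics.nn.modules.transformer import MLP, LayerNorm2d  # the module shown in this diff

    x = torch.randn(2, 256, 20, 20)                # arbitrary [B, C, H, W] feature map
    y = LayerNorm2d(256)(x)                        # normalizes over the channel dimension, same shape as x

    head = MLP(input_dim=256, hidden_dim=512, output_dim=4, num_layers=3)  # e.g. a small box-regression FFN
    out = head(y.flatten(2).permute(0, 2, 1))      # [2, 400, 4]
    print(out.shape)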
ultralytics/nn/modules/utils.py CHANGED
@@ -1,7 +1,5 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
-"""
-Module utils
-"""
+"""Module utils."""

 import copy
 import math
@@ -16,15 +14,17 @@ __all__ = 'multi_scale_deformable_attn_pytorch', 'inverse_sigmoid'


 def _get_clones(module, n):
+    """Create a list of cloned modules from the given module."""
     return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])


 def bias_init_with_prob(prior_prob=0.01):
-    """
+    """Initialize conv/fc bias value according to a given probability value."""
     return float(-np.log((1 - prior_prob) / prior_prob))  # return bias_init


 def linear_init_(module):
+    """Initialize the weights and biases of a linear module."""
     bound = 1 / math.sqrt(module.weight.shape[0])
     uniform_(module.weight, -bound, bound)
     if hasattr(module, 'bias') and module.bias is not None:
@@ -32,6 +32,7 @@ def linear_init_(module):


 def inverse_sigmoid(x, eps=1e-5):
+    """Calculate the inverse sigmoid function for a tensor."""
     x = x.clamp(min=0, max=1)
     x1 = x.clamp(min=eps)
     x2 = (1 - x).clamp(min=eps)
@@ -43,6 +44,7 @@ def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shape
                                         attention_weights: torch.Tensor) -> torch.Tensor:
     """
     Multi-scale deformable attention.
+
     https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py
     """

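Note: the hunk above shows only the clamping half of inverse_sigmoid. A standalone sketch of the full clamped-logit computation; the final log-ratio step is an assumption, since it is not shown in this diff:

    import torch

    def inverse_sigmoid_sketch(x, eps=1e-5):
        """Numerically stable inverse of sigmoid: clamp to [0, 1] and away from the endpoints, then take the log-ratio."""
        x = x.clamp(min=0, max=1)
        x1 = x.clamp(min=eps)
        x2 = (1 - x).clamp(min=eps)
        return torch.log(x1 / x2)  # assumed final step, not shown in the hunk above

    t = torch.tensor([-2.0, 0.0, 3.0])
    print(inverse_sigmoid_sketch(torch.sigmoid(t)))  # approximately tensor([-2., 0., 3.])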
ultralytics/nn/tasks.py CHANGED
@@ -25,14 +25,11 @@ except ImportError:


 class BaseModel(nn.Module):
-    """
-    The BaseModel class serves as a base class for all the models in the Ultralytics YOLO family.
-    """
+    """The BaseModel class serves as a base class for all the models in the Ultralytics YOLO family."""

     def forward(self, x, *args, **kwargs):
         """
-        Forward pass of the model on a single scale.
-        Wrapper for `_forward_once` method.
+        Forward pass of the model on a single scale. Wrapper for `_forward_once` method.

         Args:
             x (torch.Tensor | dict): The input image tensor or a dict including image tensor and gt labels.
@@ -93,8 +90,8 @@ class BaseModel(nn.Module):

     def _profile_one_layer(self, m, x, dt):
         """
-        Profile the computation time and FLOPs of a single layer of the model on a given input.
-
+        Profile the computation time and FLOPs of a single layer of the model on a given input. Appends the results to
+        the provided list.

         Args:
             m (nn.Module): The layer to be profiled.
@@ -158,7 +155,7 @@ class BaseModel(nn.Module):

     def info(self, detailed=False, verbose=True, imgsz=640):
         """
-        Prints model information
+        Prints model information.

         Args:
             detailed (bool): if True, prints out detailed information about the model. Defaults to False
@@ -175,7 +172,7 @@ class BaseModel(nn.Module):
             fn (function): the function to apply to the model

         Returns:
-
+            (BaseModel): An updated BaseModel object.
         """
         self = super()._apply(fn)
         m = self.model[-1]  # Detect()
@@ -202,7 +199,7 @@ class BaseModel(nn.Module):

     def loss(self, batch, preds=None):
         """
-        Compute loss
+        Compute loss.

         Args:
             batch (dict): Batch to compute loss on
@@ -215,6 +212,7 @@ class BaseModel(nn.Module):
         return self.criterion(preds, batch)

     def init_criterion(self):
+        """Initialize the loss criterion for the BaseModel."""
         raise NotImplementedError('compute_loss() needs to be implemented by task heads')


@@ -222,6 +220,7 @@ class DetectionModel(BaseModel):
     """YOLOv8 detection model."""

     def __init__(self, cfg='yolov8n.yaml', ch=3, nc=None, verbose=True):  # model, input channels, number of classes
+        """Initialize the YOLOv8 detection model with the given config and parameters."""
         super().__init__()
         self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg)  # cfg dict

@@ -289,6 +288,7 @@ class DetectionModel(BaseModel):
         return y

     def init_criterion(self):
+        """Initialize the loss criterion for the DetectionModel."""
         return v8DetectionLoss(self)


@@ -300,6 +300,7 @@ class SegmentationModel(DetectionModel):
         super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)

     def init_criterion(self):
+        """Initialize the loss criterion for the SegmentationModel."""
         return v8SegmentationLoss(self)


@@ -316,6 +317,7 @@ class PoseModel(DetectionModel):
         super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)

     def init_criterion(self):
+        """Initialize the loss criterion for the PoseModel."""
         return v8PoseLoss(self)


@@ -365,22 +367,59 @@ class ClassificationModel(BaseModel):
             m[i] = nn.Conv2d(m[i].in_channels, nc, m[i].kernel_size, m[i].stride, bias=m[i].bias is not None)

     def init_criterion(self):
-        """
+        """Initialize the loss criterion for the ClassificationModel."""
         return v8ClassificationLoss()


 class RTDETRDetectionModel(DetectionModel):
+    """
+    RTDETR (Real-time DEtection and Tracking using Transformers) Detection Model class.
+
+    This class is responsible for constructing the RTDETR architecture, defining loss functions, and
+    facilitating both the training and inference processes. RTDETR is an object detection and tracking model
+    that extends from the DetectionModel base class.
+
+    Attributes:
+        cfg (str): The configuration file path or preset string. Default is 'rtdetr-l.yaml'.
+        ch (int): Number of input channels. Default is 3 (RGB).
+        nc (int, optional): Number of classes for object detection. Default is None.
+        verbose (bool): Specifies if summary statistics are shown during initialization. Default is True.
+
+    Methods:
+        init_criterion: Initializes the criterion used for loss calculation.
+        loss: Computes and returns the loss during training.
+        predict: Performs a forward pass through the network and returns the output.
+    """

     def __init__(self, cfg='rtdetr-l.yaml', ch=3, nc=None, verbose=True):
+        """
+        Initialize the RTDETRDetectionModel.
+
+        Args:
+            cfg (str): Configuration file name or path.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes. Defaults to None.
+            verbose (bool, optional): Print additional information during initialization. Defaults to True.
+        """
         super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)

     def init_criterion(self):
-        """
+        """Initialize the loss criterion for the RTDETRDetectionModel."""
         from ultralytics.models.utils.loss import RTDETRDetectionLoss

         return RTDETRDetectionLoss(nc=self.nc, use_vfl=True)

     def loss(self, batch, preds=None):
+        """
+        Compute the loss for the given batch of data.
+
+        Args:
+            batch (dict): Dictionary containing image and label data.
+            preds (torch.Tensor, optional): Precomputed model predictions. Defaults to None.
+
+        Returns:
+            tuple: A tuple containing the total loss and main three losses in a tensor.
+        """
         if not hasattr(self, 'criterion'):
             self.criterion = self.init_criterion()

@@ -417,16 +456,17 @@ class RTDETRDetectionModel(DetectionModel):

     def predict(self, x, profile=False, visualize=False, batch=None, augment=False):
         """
-        Perform a forward pass through the
+        Perform a forward pass through the model.

         Args:
-            x (torch.Tensor): The input tensor
-            profile (bool):
-            visualize (bool):
-            batch (dict):
+            x (torch.Tensor): The input tensor.
+            profile (bool, optional): If True, profile the computation time for each layer. Defaults to False.
+            visualize (bool, optional): If True, save feature maps for visualization. Defaults to False.
+            batch (dict, optional): Ground truth data for evaluation. Defaults to None.
+            augment (bool, optional): If True, perform data augmentation during inference. Defaults to False.

         Returns:
-
+            torch.Tensor: Model's output tensor.
         """
         y, dt = [], []  # outputs
         for m in self.model[:-1]:  # except the head part
@@ -708,9 +748,9 @@ def yaml_model_load(path):

 def guess_model_scale(model_path):
     """
-    Takes a path to a YOLO model's YAML file as input and extracts the size character of the model's scale.
-
-
+    Takes a path to a YOLO model's YAML file as input and extracts the size character of the model's scale. The function
+    uses regular expression matching to find the pattern of the model scale in the YAML file name, which is denoted by
+    n, s, m, l, or x. The function returns the size character of the model scale as a string.

     Args:
         model_path (str | Path): The path to the YOLO model's YAML file.
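Note: several of the hunks above document the same pattern: each task model overrides init_criterion(), and loss() builds the criterion lazily on first use. A toy stand-in for that pattern, shown only for illustration and not the ultralytics implementation itself:

    import torch
    import torch.nn as nn

    class TinyBaseModel(nn.Module):
        """Toy illustration of the lazy-criterion pattern used by BaseModel (not the ultralytics code)."""

        def loss(self, batch, preds=None):
            if not hasattr(self, 'criterion'):
                self.criterion = self.init_criterion()  # built once, on first call
            preds = self.forward(batch['img']) if preds is None else preds
            return self.criterion(preds, batch['label'])

        def init_criterion(self):
            raise NotImplementedError('compute_loss() needs to be implemented by task heads')

    class TinyClassifier(TinyBaseModel):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(8, 3)

        def forward(self, x):
            return self.fc(x)

        def init_criterion(self):
            return nn.CrossEntropyLoss()

    model = TinyClassifier()
    batch = {'img': torch.randn(4, 8), 'label': torch.randint(0, 3, (4,))}
    print(model.loss(batch))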
ultralytics/trackers/bot_sort.py CHANGED
@@ -12,6 +12,33 @@ from .utils.kalman_filter import KalmanFilterXYWH


 class BOTrack(STrack):
+    """
+    An extended version of the STrack class for YOLOv8, adding object tracking features.
+
+    Attributes:
+        shared_kalman (KalmanFilterXYWH): A shared Kalman filter for all instances of BOTrack.
+        smooth_feat (np.ndarray): Smoothed feature vector.
+        curr_feat (np.ndarray): Current feature vector.
+        features (deque): A deque to store feature vectors with a maximum length defined by `feat_history`.
+        alpha (float): Smoothing factor for the exponential moving average of features.
+        mean (np.ndarray): The mean state of the Kalman filter.
+        covariance (np.ndarray): The covariance matrix of the Kalman filter.
+
+    Methods:
+        update_features(feat): Update features vector and smooth it using exponential moving average.
+        predict(): Predicts the mean and covariance using Kalman filter.
+        re_activate(new_track, frame_id, new_id): Reactivates a track with updated features and optionally new ID.
+        update(new_track, frame_id): Update the YOLOv8 instance with new track and frame ID.
+        tlwh: Property that gets the current position in tlwh format `(top left x, top left y, width, height)`.
+        multi_predict(stracks): Predicts the mean and covariance of multiple object tracks using shared Kalman filter.
+        convert_coords(tlwh): Converts tlwh bounding box coordinates to xywh format.
+        tlwh_to_xywh(tlwh): Convert bounding box to xywh format `(center x, center y, width, height)`.
+
+    Usage:
+        bo_track = BOTrack(tlwh, score, cls, feat)
+        bo_track.predict()
+        bo_track.update(new_track, frame_id)
+    """
     shared_kalman = KalmanFilterXYWH()

     def __init__(self, tlwh, score, cls, feat=None, feat_history=50):
@@ -59,9 +86,7 @@ class BOTrack(STrack):

     @property
     def tlwh(self):
-        """Get current position in bounding box format `(top left x, top left y,
-        width, height)`.
-        """
+        """Get current position in bounding box format `(top left x, top left y, width, height)`."""
         if self.mean is None:
             return self._tlwh.copy()
         ret = self.mean[:4].copy()
@@ -90,15 +115,37 @@ class BOTrack(STrack):

     @staticmethod
     def tlwh_to_xywh(tlwh):
-        """Convert bounding box to format `(center x, center y, width,
-        height)`.
-        """
+        """Convert bounding box to format `(center x, center y, width, height)`."""
         ret = np.asarray(tlwh).copy()
         ret[:2] += ret[2:] / 2
         return ret


 class BOTSORT(BYTETracker):
+    """
+    An extended version of the BYTETracker class for YOLOv8, designed for object tracking with ReID and GMC algorithm.
+
+    Attributes:
+        proximity_thresh (float): Threshold for spatial proximity (IoU) between tracks and detections.
+        appearance_thresh (float): Threshold for appearance similarity (ReID embeddings) between tracks and detections.
+        encoder (object): Object to handle ReID embeddings, set to None if ReID is not enabled.
+        gmc (GMC): An instance of the GMC algorithm for data association.
+        args (object): Parsed command-line arguments containing tracking parameters.
+
+    Methods:
+        get_kalmanfilter(): Returns an instance of KalmanFilterXYWH for object tracking.
+        init_track(dets, scores, cls, img): Initialize track with detections, scores, and classes.
+        get_dists(tracks, detections): Get distances between tracks and detections using IoU and (optionally) ReID.
+        multi_predict(tracks): Predict and track multiple objects with YOLOv8 model.
+
+    Usage:
+        bot_sort = BOTSORT(args, frame_rate)
+        bot_sort.init_track(dets, scores, cls, img)
+        bot_sort.multi_predict(tracks)
+
+    Note:
+        The class is designed to work with the YOLOv8 object detection model and supports ReID only if enabled via args.
+    """

     def __init__(self, args, frame_rate=30):
         """Initialize YOLOv8 object with ReID module and GMC algorithm."""