ultralytics 8.0.194__py3-none-any.whl → 8.0.196__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ultralytics might be problematic.

Files changed (84)
  1. ultralytics/__init__.py +1 -1
  2. ultralytics/cfg/__init__.py +5 -6
  3. ultralytics/data/augment.py +234 -29
  4. ultralytics/data/base.py +2 -1
  5. ultralytics/data/build.py +9 -3
  6. ultralytics/data/converter.py +5 -2
  7. ultralytics/data/dataset.py +16 -2
  8. ultralytics/data/loaders.py +111 -7
  9. ultralytics/data/utils.py +3 -3
  10. ultralytics/engine/exporter.py +1 -3
  11. ultralytics/engine/model.py +16 -9
  12. ultralytics/engine/predictor.py +10 -6
  13. ultralytics/engine/results.py +18 -8
  14. ultralytics/engine/trainer.py +19 -31
  15. ultralytics/engine/tuner.py +20 -20
  16. ultralytics/engine/validator.py +3 -4
  17. ultralytics/hub/__init__.py +2 -2
  18. ultralytics/hub/auth.py +18 -3
  19. ultralytics/hub/session.py +1 -0
  20. ultralytics/hub/utils.py +1 -3
  21. ultralytics/models/fastsam/model.py +2 -1
  22. ultralytics/models/fastsam/predict.py +10 -7
  23. ultralytics/models/fastsam/prompt.py +15 -1
  24. ultralytics/models/nas/model.py +3 -1
  25. ultralytics/models/rtdetr/model.py +4 -6
  26. ultralytics/models/rtdetr/predict.py +2 -1
  27. ultralytics/models/rtdetr/train.py +2 -1
  28. ultralytics/models/rtdetr/val.py +1 -0
  29. ultralytics/models/sam/amg.py +12 -6
  30. ultralytics/models/sam/model.py +5 -6
  31. ultralytics/models/sam/modules/decoders.py +5 -1
  32. ultralytics/models/sam/modules/encoders.py +15 -12
  33. ultralytics/models/sam/modules/tiny_encoder.py +38 -2
  34. ultralytics/models/sam/modules/transformer.py +2 -4
  35. ultralytics/models/sam/predict.py +8 -4
  36. ultralytics/models/utils/loss.py +35 -8
  37. ultralytics/models/utils/ops.py +14 -18
  38. ultralytics/models/yolo/classify/predict.py +1 -0
  39. ultralytics/models/yolo/classify/train.py +4 -2
  40. ultralytics/models/yolo/classify/val.py +1 -0
  41. ultralytics/models/yolo/detect/train.py +4 -3
  42. ultralytics/models/yolo/model.py +2 -4
  43. ultralytics/models/yolo/pose/predict.py +1 -0
  44. ultralytics/models/yolo/segment/predict.py +2 -0
  45. ultralytics/models/yolo/segment/val.py +1 -1
  46. ultralytics/nn/autobackend.py +54 -43
  47. ultralytics/nn/modules/__init__.py +13 -9
  48. ultralytics/nn/modules/block.py +11 -5
  49. ultralytics/nn/modules/conv.py +16 -7
  50. ultralytics/nn/modules/head.py +6 -3
  51. ultralytics/nn/modules/transformer.py +47 -15
  52. ultralytics/nn/modules/utils.py +6 -4
  53. ultralytics/nn/tasks.py +61 -21
  54. ultralytics/trackers/bot_sort.py +53 -6
  55. ultralytics/trackers/byte_tracker.py +71 -15
  56. ultralytics/trackers/track.py +0 -1
  57. ultralytics/trackers/utils/gmc.py +23 -0
  58. ultralytics/trackers/utils/kalman_filter.py +6 -6
  59. ultralytics/utils/__init__.py +32 -19
  60. ultralytics/utils/autobatch.py +1 -3
  61. ultralytics/utils/benchmarks.py +14 -1
  62. ultralytics/utils/callbacks/base.py +1 -3
  63. ultralytics/utils/callbacks/comet.py +11 -3
  64. ultralytics/utils/callbacks/dvc.py +9 -0
  65. ultralytics/utils/callbacks/neptune.py +5 -6
  66. ultralytics/utils/callbacks/wb.py +1 -0
  67. ultralytics/utils/checks.py +13 -9
  68. ultralytics/utils/dist.py +2 -1
  69. ultralytics/utils/downloads.py +7 -3
  70. ultralytics/utils/files.py +3 -3
  71. ultralytics/utils/instance.py +12 -3
  72. ultralytics/utils/loss.py +97 -22
  73. ultralytics/utils/metrics.py +35 -34
  74. ultralytics/utils/ops.py +10 -9
  75. ultralytics/utils/patches.py +9 -7
  76. ultralytics/utils/plotting.py +4 -3
  77. ultralytics/utils/torch_utils.py +8 -6
  78. ultralytics/utils/triton.py +87 -0
  79. {ultralytics-8.0.194.dist-info → ultralytics-8.0.196.dist-info}/METADATA +1 -1
  80. {ultralytics-8.0.194.dist-info → ultralytics-8.0.196.dist-info}/RECORD +84 -83
  81. {ultralytics-8.0.194.dist-info → ultralytics-8.0.196.dist-info}/LICENSE +0 -0
  82. {ultralytics-8.0.194.dist-info → ultralytics-8.0.196.dist-info}/WHEEL +0 -0
  83. {ultralytics-8.0.194.dist-info → ultralytics-8.0.196.dist-info}/entry_points.txt +0 -0
  84. {ultralytics-8.0.194.dist-info → ultralytics-8.0.196.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/head.py CHANGED
@@ -1,7 +1,5 @@
  # Ultralytics YOLO 🚀, AGPL-3.0 license
- """
- Model head modules
- """
+ """Model head modules."""

  import math

@@ -229,6 +227,7 @@ class RTDETRDecoder(nn.Module):
  self._reset_parameters()

  def forward(self, x, batch=None):
+ """Runs the forward pass of the module, returning bounding box and classification scores for the input."""
  from ultralytics.models.utils.ops import get_cdn_group

  # input projection and embedding
@@ -265,6 +264,7 @@ class RTDETRDecoder(nn.Module):
  return y if self.export else (y, x)

  def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
+ """Generates anchor bounding boxes for given shapes with specific grid size and validates them."""
  anchors = []
  for i, (h, w) in enumerate(shapes):
  sy = torch.arange(end=h, dtype=dtype, device=device)
@@ -284,6 +284,7 @@ class RTDETRDecoder(nn.Module):
  return anchors, valid_mask

  def _get_encoder_input(self, x):
+ """Processes and returns encoder inputs by getting projection features from input and concatenating them."""
  # get projection features
  x = [self.input_proj[i](feat) for i, feat in enumerate(x)]
  # get encoder inputs
@@ -301,6 +302,7 @@ class RTDETRDecoder(nn.Module):
  return feats, shapes

  def _get_decoder_input(self, feats, shapes, dn_embed=None, dn_bbox=None):
+ """Generates and prepares the input required for the decoder from the provided features and shapes."""
  bs = len(feats)
  # prepare input for decoder
  anchors, valid_mask = self._generate_anchors(shapes, dtype=feats.dtype, device=feats.device)
@@ -339,6 +341,7 @@ class RTDETRDecoder(nn.Module):

  # TODO
  def _reset_parameters(self):
+ """Initializes or resets the parameters of the model's various components with predefined weights and biases."""
  # class and bbox head init
  bias_cls = bias_init_with_prob(0.01) / 80 * self.nc
  # NOTE: the weight initialization in `linear_init_` would cause NaN when training with custom datasets.
ultralytics/nn/modules/transformer.py CHANGED
@@ -1,7 +1,5 @@
  # Ultralytics YOLO 🚀, AGPL-3.0 license
- """
- Transformer modules
- """
+ """Transformer modules."""

  import math

@@ -18,9 +16,10 @@ __all__ = ('TransformerEncoderLayer', 'TransformerLayer', 'TransformerBlock', 'M


  class TransformerEncoderLayer(nn.Module):
- """Transformer Encoder."""
+ """Defines a single layer of the transformer encoder."""

  def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
+ """Initialize the TransformerEncoderLayer with specified parameters."""
  super().__init__()
  from ...utils.torch_utils import TORCH_1_9
  if not TORCH_1_9:
@@ -41,10 +40,11 @@ class TransformerEncoderLayer(nn.Module):
  self.normalize_before = normalize_before

  def with_pos_embed(self, tensor, pos=None):
- """Add position embeddings if given."""
+ """Add position embeddings to the tensor if provided."""
  return tensor if pos is None else tensor + pos

  def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
+ """Performs forward pass with post-normalization."""
  q = k = self.with_pos_embed(src, pos)
  src2 = self.ma(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
  src = src + self.dropout1(src2)
@@ -54,6 +54,7 @@ class TransformerEncoderLayer(nn.Module):
  return self.norm2(src)

  def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
+ """Performs forward pass with pre-normalization."""
  src2 = self.norm1(src)
  q = k = self.with_pos_embed(src2, pos)
  src2 = self.ma(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
@@ -70,11 +71,14 @@ class TransformerEncoderLayer(nn.Module):


  class AIFI(TransformerEncoderLayer):
+ """Defines the AIFI transformer layer."""

  def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
+ """Initialize the AIFI instance with specified parameters."""
  super().__init__(c1, cm, num_heads, dropout, act, normalize_before)

  def forward(self, x):
+ """Forward pass for the AIFI transformer layer."""
  c, h, w = x.shape[1:]
  pos_embed = self.build_2d_sincos_position_embedding(w, h, c)
  # flatten [B, C, H, W] to [B, HxW, C]
@@ -82,7 +86,8 @@ class AIFI(TransformerEncoderLayer):
  return x.permute(0, 2, 1).view([-1, c, h, w]).contiguous()

  @staticmethod
- def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.):
+ def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
+ """Builds 2D sine-cosine position embedding."""
  grid_w = torch.arange(int(w), dtype=torch.float32)
  grid_h = torch.arange(int(h), dtype=torch.float32)
  grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
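For readers unfamiliar with the positional encoding touched above: `build_2d_sincos_position_embedding` gives each (x, y) location of the flattened feature map a fixed sine/cosine code that AIFI adds before attention. A minimal standalone sketch of the same construction (not the library's exact code; shapes are illustrative):

    import torch

    def sincos_pos_embed_2d(w, h, embed_dim=256, temperature=10000.0):
        """Sketch: 2D sine-cosine position embedding of shape [1, w*h, embed_dim]."""
        assert embed_dim % 4 == 0, 'embed_dim must be divisible by 4 (sin/cos per axis)'
        grid_w = torch.arange(int(w), dtype=torch.float32)
        grid_h = torch.arange(int(h), dtype=torch.float32)
        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
        pos_dim = embed_dim // 4
        omega = 1.0 / temperature ** (torch.arange(pos_dim, dtype=torch.float32) / pos_dim)
        out_w = grid_w.flatten()[..., None] @ omega[None]  # [w*h, pos_dim]
        out_h = grid_h.flatten()[..., None] @ omega[None]
        return torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], 1)[None]

    pe = sincos_pos_embed_2d(20, 20, 256)  # e.g. a 20x20 feature map with 256 channels -> [1, 400, 256]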
@@ -140,27 +145,32 @@ class TransformerBlock(nn.Module):


  class MLPBlock(nn.Module):
+ """Implements a single block of a multi-layer perceptron."""

  def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
+ """Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function."""
  super().__init__()
  self.lin1 = nn.Linear(embedding_dim, mlp_dim)
  self.lin2 = nn.Linear(mlp_dim, embedding_dim)
  self.act = act()

  def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Forward pass for the MLPBlock."""
  return self.lin2(self.act(self.lin1(x)))


  class MLP(nn.Module):
- """ Very simple multi-layer perceptron (also called FFN)"""
+ """Implements a simple multi-layer perceptron (also called FFN)."""

  def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+ """Initialize the MLP with specified input, hidden, output dimensions and number of layers."""
  super().__init__()
  self.num_layers = num_layers
  h = [hidden_dim] * (num_layers - 1)
  self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

  def forward(self, x):
+ """Forward pass for the entire MLP."""
  for i, layer in enumerate(self.layers):
  x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
  return x
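The MLP above chains `num_layers` linear layers and applies ReLU between all but the last, the pattern DETR-style heads use to regress box coordinates. A hedged usage sketch (dimensions chosen only for illustration; the import path follows the file shown in this diff):

    import torch
    from ultralytics.nn.modules.transformer import MLP

    # Three linear layers 256 -> 256 -> 256 -> 4, with ReLU after the first two only.
    bbox_head = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
    out = bbox_head(torch.randn(2, 300, 256))  # -> torch.Size([2, 300, 4])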
@@ -168,17 +178,22 @@ class MLP(nn.Module):

  class LayerNorm2d(nn.Module):
  """
- LayerNorm2d module from https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
+ 2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
+
+ Original implementation at
+ https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
  https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
  """

  def __init__(self, num_channels, eps=1e-6):
+ """Initialize LayerNorm2d with the given parameters."""
  super().__init__()
  self.weight = nn.Parameter(torch.ones(num_channels))
  self.bias = nn.Parameter(torch.zeros(num_channels))
  self.eps = eps

  def forward(self, x):
+ """Perform forward pass for 2D layer normalization."""
  u = x.mean(1, keepdim=True)
  s = (x - u).pow(2).mean(1, keepdim=True)
  x = (x - u) / torch.sqrt(s + self.eps)
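The forward pass above normalizes each pixel's channel vector of an NCHW tensor (mean and variance over dim 1). A small hedged check that this matches torch's built-in layer norm applied channel-last, with shapes chosen only for illustration:

    import torch
    import torch.nn.functional as F

    x = torch.randn(2, 8, 4, 4)                      # [N, C, H, W]
    u = x.mean(1, keepdim=True)                      # per-pixel mean over channels
    s = (x - u).pow(2).mean(1, keepdim=True)         # per-pixel (biased) variance
    manual = (x - u) / torch.sqrt(s + 1e-6)
    reference = F.layer_norm(x.permute(0, 2, 3, 1), (8,), eps=1e-6).permute(0, 3, 1, 2)
    assert torch.allclose(manual, reference, atol=1e-5)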
@@ -187,11 +202,13 @@ class LayerNorm2d(nn.Module):

  class MSDeformAttn(nn.Module):
  """
- Original Multi-Scale Deformable Attention Module.
+ Multi-Scale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
+
  https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
  """

  def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
+ """Initialize MSDeformAttn with the given parameters."""
  super().__init__()
  if d_model % n_heads != 0:
  raise ValueError(f'd_model must be divisible by n_heads, but got {d_model} and {n_heads}')
@@ -214,6 +231,7 @@ class MSDeformAttn(nn.Module):
  self._reset_parameters()

  def _reset_parameters(self):
+ """Reset module parameters."""
  constant_(self.sampling_offsets.weight.data, 0.)
  thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
  grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
@@ -232,7 +250,10 @@ class MSDeformAttn(nn.Module):

  def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
  """
+ Perform forward pass for multi-scale deformable attention.
+
  https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+
  Args:
  query (torch.Tensor): [bs, query_length, C]
  refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
@@ -272,24 +293,27 @@ class MSDeformAttn(nn.Module):

  class DeformableTransformerDecoderLayer(nn.Module):
  """
+ Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
+
  https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
  https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
  """

  def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0., act=nn.ReLU(), n_levels=4, n_points=4):
+ """Initialize the DeformableTransformerDecoderLayer with the given parameters."""
  super().__init__()

- # self attention
+ # Self attention
  self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
  self.dropout1 = nn.Dropout(dropout)
  self.norm1 = nn.LayerNorm(d_model)

- # cross attention
+ # Cross attention
  self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
  self.dropout2 = nn.Dropout(dropout)
  self.norm2 = nn.LayerNorm(d_model)

- # ffn
+ # FFN
  self.linear1 = nn.Linear(d_model, d_ffn)
  self.act = act
  self.dropout3 = nn.Dropout(dropout)
@@ -299,37 +323,44 @@ class DeformableTransformerDecoderLayer(nn.Module):

  @staticmethod
  def with_pos_embed(tensor, pos):
+ """Add positional embeddings to the input tensor, if provided."""
  return tensor if pos is None else tensor + pos

  def forward_ffn(self, tgt):
+ """Perform forward pass through the Feed-Forward Network part of the layer."""
  tgt2 = self.linear2(self.dropout3(self.act(self.linear1(tgt))))
  tgt = tgt + self.dropout4(tgt2)
  return self.norm3(tgt)

  def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
- # self attention
+ """Perform the forward pass through the entire decoder layer."""
+
+ # Self attention
  q = k = self.with_pos_embed(embed, query_pos)
  tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
  attn_mask=attn_mask)[0].transpose(0, 1)
  embed = embed + self.dropout1(tgt)
  embed = self.norm1(embed)

- # cross attention
+ # Cross attention
  tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
  padding_mask)
  embed = embed + self.dropout2(tgt)
  embed = self.norm2(embed)

- # ffn
+ # FFN
  return self.forward_ffn(embed)


  class DeformableTransformerDecoder(nn.Module):
  """
+ Implementation of Deformable Transformer Decoder based on PaddleDetection.
+
  https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
  """

  def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
+ """Initialize the DeformableTransformerDecoder with the given parameters."""
  super().__init__()
  self.layers = _get_clones(decoder_layer, num_layers)
  self.num_layers = num_layers
@@ -347,6 +378,7 @@ class DeformableTransformerDecoder(nn.Module):
  pos_mlp,
  attn_mask=None,
  padding_mask=None):
+ """Perform the forward pass through the entire decoder."""
  output = embed
  dec_bboxes = []
  dec_cls = []
ultralytics/nn/modules/utils.py CHANGED
@@ -1,7 +1,5 @@
  # Ultralytics YOLO 🚀, AGPL-3.0 license
- """
- Module utils
- """
+ """Module utils."""

  import copy
  import math
@@ -16,15 +14,17 @@ __all__ = 'multi_scale_deformable_attn_pytorch', 'inverse_sigmoid'


  def _get_clones(module, n):
+ """Create a list of cloned modules from the given module."""
  return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])


  def bias_init_with_prob(prior_prob=0.01):
- """initialize conv/fc bias value according to a given probability value."""
+ """Initialize conv/fc bias value according to a given probability value."""
  return float(-np.log((1 - prior_prob) / prior_prob)) # return bias_init


  def linear_init_(module):
+ """Initialize the weights and biases of a linear module."""
  bound = 1 / math.sqrt(module.weight.shape[0])
  uniform_(module.weight, -bound, bound)
  if hasattr(module, 'bias') and module.bias is not None:
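For context on `bias_init_with_prob` above: it solves sigmoid(b) = prior_prob for the bias b, so a freshly initialized classification head starts out predicting the desired prior probability. A quick check of that identity:

    import numpy as np

    prior_prob = 0.01
    b = -np.log((1 - prior_prob) / prior_prob)           # value returned by bias_init_with_prob
    assert np.isclose(1 / (1 + np.exp(-b)), prior_prob)  # sigmoid(b) recovers the prior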
@@ -32,6 +32,7 @@ def linear_init_(module):


  def inverse_sigmoid(x, eps=1e-5):
+ """Calculate the inverse sigmoid function for a tensor."""
  x = x.clamp(min=0, max=1)
  x1 = x.clamp(min=eps)
  x2 = (1 - x).clamp(min=eps)
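`inverse_sigmoid` is the logit function log(x / (1 - x)) with clamping for numerical stability; DETR-style decoders use it to refine reference boxes in logit space. A hedged round-trip sketch, assuming the function returns the log-ratio of the two clamped terms shown above:

    import torch

    x = torch.tensor([0.1, 0.5, 0.9])
    eps = 1e-5
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    logit = torch.log(x1 / x2)
    assert torch.allclose(torch.sigmoid(logit), x)  # sigmoid undoes it away from the clamp bounds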
@@ -43,6 +44,7 @@ def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shape
  attention_weights: torch.Tensor) -> torch.Tensor:
  """
  Multi-scale deformable attention.
+
  https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py
  """

ultralytics/nn/tasks.py CHANGED
@@ -25,14 +25,11 @@ except ImportError:


  class BaseModel(nn.Module):
- """
- The BaseModel class serves as a base class for all the models in the Ultralytics YOLO family.
- """
+ """The BaseModel class serves as a base class for all the models in the Ultralytics YOLO family."""

  def forward(self, x, *args, **kwargs):
  """
- Forward pass of the model on a single scale.
- Wrapper for `_forward_once` method.
+ Forward pass of the model on a single scale. Wrapper for `_forward_once` method.

  Args:
  x (torch.Tensor | dict): The input image tensor or a dict including image tensor and gt labels.
@@ -93,8 +90,8 @@ class BaseModel(nn.Module):

  def _profile_one_layer(self, m, x, dt):
  """
- Profile the computation time and FLOPs of a single layer of the model on a given input.
- Appends the results to the provided list.
+ Profile the computation time and FLOPs of a single layer of the model on a given input. Appends the results to
+ the provided list.

  Args:
  m (nn.Module): The layer to be profiled.
@@ -158,7 +155,7 @@ class BaseModel(nn.Module):

  def info(self, detailed=False, verbose=True, imgsz=640):
  """
- Prints model information
+ Prints model information.

  Args:
  detailed (bool): if True, prints out detailed information about the model. Defaults to False
@@ -175,7 +172,7 @@ class BaseModel(nn.Module):
  fn (function): the function to apply to the model

  Returns:
- A model that is a Detect() object.
+ (BaseModel): An updated BaseModel object.
  """
  self = super()._apply(fn)
  m = self.model[-1] # Detect()
@@ -202,7 +199,7 @@ class BaseModel(nn.Module):

  def loss(self, batch, preds=None):
  """
- Compute loss
+ Compute loss.

  Args:
  batch (dict): Batch to compute loss on
@@ -215,6 +212,7 @@ class BaseModel(nn.Module):
  return self.criterion(preds, batch)

  def init_criterion(self):
+ """Initialize the loss criterion for the BaseModel."""
  raise NotImplementedError('compute_loss() needs to be implemented by task heads')


@@ -222,6 +220,7 @@ class DetectionModel(BaseModel):
  """YOLOv8 detection model."""

  def __init__(self, cfg='yolov8n.yaml', ch=3, nc=None, verbose=True): # model, input channels, number of classes
+ """Initialize the YOLOv8 detection model with the given config and parameters."""
  super().__init__()
  self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg) # cfg dict

@@ -289,6 +288,7 @@ class DetectionModel(BaseModel):
  return y

  def init_criterion(self):
+ """Initialize the loss criterion for the DetectionModel."""
  return v8DetectionLoss(self)


@@ -300,6 +300,7 @@ class SegmentationModel(DetectionModel):
  super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)

  def init_criterion(self):
+ """Initialize the loss criterion for the SegmentationModel."""
  return v8SegmentationLoss(self)


@@ -316,6 +317,7 @@ class PoseModel(DetectionModel):
  super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)

  def init_criterion(self):
+ """Initialize the loss criterion for the PoseModel."""
  return v8PoseLoss(self)


@@ -365,22 +367,59 @@ class ClassificationModel(BaseModel):
  m[i] = nn.Conv2d(m[i].in_channels, nc, m[i].kernel_size, m[i].stride, bias=m[i].bias is not None)

  def init_criterion(self):
- """Compute the classification loss between predictions and true labels."""
+ """Initialize the loss criterion for the ClassificationModel."""
  return v8ClassificationLoss()


  class RTDETRDetectionModel(DetectionModel):
+ """
+ RTDETR (Real-time DEtection and Tracking using Transformers) Detection Model class.
+
+ This class is responsible for constructing the RTDETR architecture, defining loss functions, and
+ facilitating both the training and inference processes. RTDETR is an object detection and tracking model
+ that extends from the DetectionModel base class.
+
+ Attributes:
+ cfg (str): The configuration file path or preset string. Default is 'rtdetr-l.yaml'.
+ ch (int): Number of input channels. Default is 3 (RGB).
+ nc (int, optional): Number of classes for object detection. Default is None.
+ verbose (bool): Specifies if summary statistics are shown during initialization. Default is True.
+
+ Methods:
+ init_criterion: Initializes the criterion used for loss calculation.
+ loss: Computes and returns the loss during training.
+ predict: Performs a forward pass through the network and returns the output.
+ """

  def __init__(self, cfg='rtdetr-l.yaml', ch=3, nc=None, verbose=True):
+ """
+ Initialize the RTDETRDetectionModel.
+
+ Args:
+ cfg (str): Configuration file name or path.
+ ch (int): Number of input channels.
+ nc (int, optional): Number of classes. Defaults to None.
+ verbose (bool, optional): Print additional information during initialization. Defaults to True.
+ """
  super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)

  def init_criterion(self):
- """Compute the classification loss between predictions and true labels."""
+ """Initialize the loss criterion for the RTDETRDetectionModel."""
  from ultralytics.models.utils.loss import RTDETRDetectionLoss

  return RTDETRDetectionLoss(nc=self.nc, use_vfl=True)

  def loss(self, batch, preds=None):
+ """
+ Compute the loss for the given batch of data.
+
+ Args:
+ batch (dict): Dictionary containing image and label data.
+ preds (torch.Tensor, optional): Precomputed model predictions. Defaults to None.
+
+ Returns:
+ tuple: A tuple containing the total loss and main three losses in a tensor.
+ """
  if not hasattr(self, 'criterion'):
  self.criterion = self.init_criterion()

@@ -417,16 +456,17 @@ class RTDETRDetectionModel(DetectionModel):

  def predict(self, x, profile=False, visualize=False, batch=None, augment=False):
  """
- Perform a forward pass through the network.
+ Perform a forward pass through the model.

  Args:
- x (torch.Tensor): The input tensor to the model
- profile (bool): Print the computation time of each layer if True, defaults to False.
- visualize (bool): Save the feature maps of the model if True, defaults to False
- batch (dict): A dict including gt boxes and labels from dataloader.
+ x (torch.Tensor): The input tensor.
+ profile (bool, optional): If True, profile the computation time for each layer. Defaults to False.
+ visualize (bool, optional): If True, save feature maps for visualization. Defaults to False.
+ batch (dict, optional): Ground truth data for evaluation. Defaults to None.
+ augment (bool, optional): If True, perform data augmentation during inference. Defaults to False.

  Returns:
- (torch.Tensor): The last output of the model.
+ torch.Tensor: Model's output tensor.
  """
  y, dt = [], [] # outputs
  for m in self.model[:-1]: # except the head part
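The expanded RTDETRDetectionModel docstrings above describe the task-level model; end users normally reach it through the RTDETR wrapper class exported by the package. A hedged usage sketch (weights and image names are illustrative only):

    from ultralytics import RTDETR

    model = RTDETR('rtdetr-l.pt')          # the wrapper builds an RTDETRDetectionModel internally
    model.info()                           # prints model information
    results = model.predict('bus.jpg', imgsz=640)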
@@ -708,9 +748,9 @@ def yaml_model_load(path):

  def guess_model_scale(model_path):
  """
- Takes a path to a YOLO model's YAML file as input and extracts the size character of the model's scale.
- The function uses regular expression matching to find the pattern of the model scale in the YAML file name,
- which is denoted by n, s, m, l, or x. The function returns the size character of the model scale as a string.
+ Takes a path to a YOLO model's YAML file as input and extracts the size character of the model's scale. The function
+ uses regular expression matching to find the pattern of the model scale in the YAML file name, which is denoted by
+ n, s, m, l, or x. The function returns the size character of the model scale as a string.

  Args:
  model_path (str | Path): The path to the YOLO model's YAML file.
ultralytics/trackers/bot_sort.py CHANGED
@@ -12,6 +12,33 @@ from .utils.kalman_filter import KalmanFilterXYWH


  class BOTrack(STrack):
+ """
+ An extended version of the STrack class for YOLOv8, adding object tracking features.
+
+ Attributes:
+ shared_kalman (KalmanFilterXYWH): A shared Kalman filter for all instances of BOTrack.
+ smooth_feat (np.ndarray): Smoothed feature vector.
+ curr_feat (np.ndarray): Current feature vector.
+ features (deque): A deque to store feature vectors with a maximum length defined by `feat_history`.
+ alpha (float): Smoothing factor for the exponential moving average of features.
+ mean (np.ndarray): The mean state of the Kalman filter.
+ covariance (np.ndarray): The covariance matrix of the Kalman filter.
+
+ Methods:
+ update_features(feat): Update features vector and smooth it using exponential moving average.
+ predict(): Predicts the mean and covariance using Kalman filter.
+ re_activate(new_track, frame_id, new_id): Reactivates a track with updated features and optionally new ID.
+ update(new_track, frame_id): Update the YOLOv8 instance with new track and frame ID.
+ tlwh: Property that gets the current position in tlwh format `(top left x, top left y, width, height)`.
+ multi_predict(stracks): Predicts the mean and covariance of multiple object tracks using shared Kalman filter.
+ convert_coords(tlwh): Converts tlwh bounding box coordinates to xywh format.
+ tlwh_to_xywh(tlwh): Convert bounding box to xywh format `(center x, center y, width, height)`.
+
+ Usage:
+ bo_track = BOTrack(tlwh, score, cls, feat)
+ bo_track.predict()
+ bo_track.update(new_track, frame_id)
+ """
  shared_kalman = KalmanFilterXYWH()

  def __init__(self, tlwh, score, cls, feat=None, feat_history=50):
@@ -59,9 +86,7 @@ class BOTrack(STrack):

  @property
  def tlwh(self):
- """Get current position in bounding box format `(top left x, top left y,
- width, height)`.
- """
+ """Get current position in bounding box format `(top left x, top left y, width, height)`."""
  if self.mean is None:
  return self._tlwh.copy()
  ret = self.mean[:4].copy()
@@ -90,15 +115,37 @@ class BOTrack(STrack):

  @staticmethod
  def tlwh_to_xywh(tlwh):
- """Convert bounding box to format `(center x, center y, width,
- height)`.
- """
+ """Convert bounding box to format `(center x, center y, width, height)`."""
  ret = np.asarray(tlwh).copy()
  ret[:2] += ret[2:] / 2
  return ret


  class BOTSORT(BYTETracker):
+ """
+ An extended version of the BYTETracker class for YOLOv8, designed for object tracking with ReID and GMC algorithm.
+
+ Attributes:
+ proximity_thresh (float): Threshold for spatial proximity (IoU) between tracks and detections.
+ appearance_thresh (float): Threshold for appearance similarity (ReID embeddings) between tracks and detections.
+ encoder (object): Object to handle ReID embeddings, set to None if ReID is not enabled.
+ gmc (GMC): An instance of the GMC algorithm for data association.
+ args (object): Parsed command-line arguments containing tracking parameters.
+
+ Methods:
+ get_kalmanfilter(): Returns an instance of KalmanFilterXYWH for object tracking.
+ init_track(dets, scores, cls, img): Initialize track with detections, scores, and classes.
+ get_dists(tracks, detections): Get distances between tracks and detections using IoU and (optionally) ReID.
+ multi_predict(tracks): Predict and track multiple objects with YOLOv8 model.
+
+ Usage:
+ bot_sort = BOTSORT(args, frame_rate)
+ bot_sort.init_track(dets, scores, cls, img)
+ bot_sort.multi_predict(tracks)
+
+ Note:
+ The class is designed to work with the YOLOv8 object detection model and supports ReID only if enabled via args.
+ """

  def __init__(self, args, frame_rate=30):
  """Initialize YOLOv8 object with ReID module and GMC algorithm."""