ultralytics 8.3.101__py3-none-any.whl → 8.3.103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. tests/test_exports.py +14 -5
  2. tests/test_solutions.py +140 -76
  3. ultralytics/__init__.py +1 -1
  4. ultralytics/cfg/__init__.py +1 -1
  5. ultralytics/engine/exporter.py +23 -8
  6. ultralytics/engine/tuner.py +8 -2
  7. ultralytics/hub/__init__.py +29 -2
  8. ultralytics/hub/google/__init__.py +18 -1
  9. ultralytics/models/fastsam/predict.py +12 -1
  10. ultralytics/models/nas/predict.py +21 -3
  11. ultralytics/models/rtdetr/val.py +26 -2
  12. ultralytics/models/sam/amg.py +22 -1
  13. ultralytics/models/sam/modules/encoders.py +85 -4
  14. ultralytics/models/sam/modules/memory_attention.py +61 -3
  15. ultralytics/models/sam/modules/utils.py +108 -5
  16. ultralytics/models/utils/loss.py +38 -2
  17. ultralytics/models/utils/ops.py +15 -1
  18. ultralytics/models/yolo/classify/predict.py +11 -1
  19. ultralytics/models/yolo/classify/train.py +17 -1
  20. ultralytics/models/yolo/classify/val.py +82 -6
  21. ultralytics/models/yolo/detect/predict.py +20 -1
  22. ultralytics/models/yolo/model.py +55 -4
  23. ultralytics/models/yolo/obb/predict.py +16 -1
  24. ultralytics/models/yolo/obb/train.py +35 -2
  25. ultralytics/models/yolo/obb/val.py +87 -6
  26. ultralytics/models/yolo/pose/predict.py +18 -1
  27. ultralytics/models/yolo/pose/train.py +48 -3
  28. ultralytics/models/yolo/pose/val.py +113 -8
  29. ultralytics/models/yolo/segment/predict.py +27 -2
  30. ultralytics/models/yolo/segment/train.py +61 -3
  31. ultralytics/models/yolo/segment/val.py +10 -1
  32. ultralytics/models/yolo/world/train_world.py +29 -1
  33. ultralytics/models/yolo/yoloe/train.py +47 -3
  34. ultralytics/nn/autobackend.py +9 -8
  35. ultralytics/nn/modules/activation.py +26 -3
  36. ultralytics/nn/modules/block.py +89 -0
  37. ultralytics/nn/modules/head.py +3 -92
  38. ultralytics/nn/modules/utils.py +70 -4
  39. ultralytics/nn/tasks.py +3 -0
  40. ultralytics/nn/text_model.py +93 -17
  41. ultralytics/solutions/instance_segmentation.py +15 -7
  42. ultralytics/solutions/solutions.py +2 -47
  43. ultralytics/utils/benchmarks.py +1 -1
  44. ultralytics/utils/callbacks/base.py +22 -5
  45. ultralytics/utils/callbacks/comet.py +93 -5
  46. ultralytics/utils/callbacks/dvc.py +64 -5
  47. ultralytics/utils/callbacks/neptune.py +25 -2
  48. ultralytics/utils/callbacks/tensorboard.py +30 -2
  49. ultralytics/utils/callbacks/wb.py +16 -1
  50. ultralytics/utils/dist.py +35 -2
  51. ultralytics/utils/errors.py +27 -6
  52. ultralytics/utils/metrics.py +1 -1
  53. ultralytics/utils/patches.py +33 -5
  54. ultralytics/utils/torch_utils.py +14 -6
  55. ultralytics/utils/triton.py +16 -3
  56. ultralytics/utils/tuner.py +17 -9
  57. {ultralytics-8.3.101.dist-info → ultralytics-8.3.103.dist-info}/METADATA +3 -4
  58. {ultralytics-8.3.101.dist-info → ultralytics-8.3.103.dist-info}/RECORD +62 -62
  59. {ultralytics-8.3.101.dist-info → ultralytics-8.3.103.dist-info}/WHEEL +0 -0
  60. {ultralytics-8.3.101.dist-info → ultralytics-8.3.103.dist-info}/entry_points.txt +0 -0
  61. {ultralytics-8.3.101.dist-info → ultralytics-8.3.103.dist-info}/licenses/LICENSE +0 -0
  62. {ultralytics-8.3.101.dist-info → ultralytics-8.3.103.dist-info}/top_level.txt +0 -0
ultralytics/models/yolo/segment/train.py CHANGED
@@ -26,14 +26,45 @@ class SegmentationTrainer(yolo.detect.DetectionTrainer):
     """

     def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
-        """Initialize a SegmentationTrainer object with given arguments."""
+        """
+        Initialize a SegmentationTrainer object.
+
+        This initializes a trainer for segmentation tasks, extending the detection trainer with segmentation-specific
+        functionality. It sets the task to 'segment' and prepares the trainer for training segmentation models.
+
+        Args:
+            cfg (dict): Configuration dictionary with default training settings. Defaults to DEFAULT_CFG.
+            overrides (dict, optional): Dictionary of parameter overrides for the default configuration.
+            _callbacks (list, optional): List of callback functions to be executed during training.
+
+        Examples:
+            >>> from ultralytics.models.yolo.segment import SegmentationTrainer
+            >>> args = dict(model="yolov8n-seg.pt", data="coco8-seg.yaml", epochs=3)
+            >>> trainer = SegmentationTrainer(overrides=args)
+            >>> trainer.train()
+        """
         if overrides is None:
             overrides = {}
         overrides["task"] = "segment"
         super().__init__(cfg, overrides, _callbacks)

     def get_model(self, cfg=None, weights=None, verbose=True):
-        """Return SegmentationModel initialized with specified config and weights."""
+        """
+        Initialize and return a SegmentationModel with specified configuration and weights.
+
+        Args:
+            cfg (dict | str | None): Model configuration. Can be a dictionary, a path to a YAML file, or None.
+            weights (str | Path | None): Path to pretrained weights file.
+            verbose (bool): Whether to display model information during initialization.
+
+        Returns:
+            (SegmentationModel): Initialized segmentation model with loaded weights if specified.
+
+        Examples:
+            >>> trainer = SegmentationTrainer()
+            >>> model = trainer.get_model(cfg="yolov8n-seg.yaml")
+            >>> model = trainer.get_model(weights="yolov8n-seg.pt", verbose=False)
+        """
         model = SegmentationModel(cfg, ch=3, nc=self.data["nc"], verbose=verbose and RANK == -1)
         if weights:
             model.load(weights)
@@ -48,7 +79,34 @@ class SegmentationTrainer(yolo.detect.DetectionTrainer):
         )

     def plot_training_samples(self, batch, ni):
-        """Creates a plot of training sample images with labels and box coordinates."""
+        """
+        Plot training sample images with labels, bounding boxes, and masks.
+
+        This method creates a visualization of training batch images with their corresponding labels, bounding boxes,
+        and segmentation masks, saving the result to a file for inspection and debugging.
+
+        Args:
+            batch (dict): Dictionary containing batch data with the following keys:
+                'img': Images tensor
+                'batch_idx': Batch indices for each box
+                'cls': Class labels tensor (squeezed to remove last dimension)
+                'bboxes': Bounding box coordinates tensor
+                'masks': Segmentation masks tensor
+                'im_file': List of image file paths
+            ni (int): Current training iteration number, used for naming the output file.
+
+        Examples:
+            >>> trainer = SegmentationTrainer()
+            >>> batch = {
+            ...     "img": torch.rand(16, 3, 640, 640),
+            ...     "batch_idx": torch.zeros(16),
+            ...     "cls": torch.randint(0, 80, (16, 1)),
+            ...     "bboxes": torch.rand(16, 4),
+            ...     "masks": torch.rand(16, 640, 640),
+            ...     "im_file": ["image1.jpg", "image2.jpg"],
+            ... }
+            >>> trainer.plot_training_samples(batch, ni=5)
+        """
         plot_images(
             batch["img"],
             batch["batch_idx"],
ultralytics/models/yolo/segment/val.py CHANGED
@@ -215,7 +215,16 @@ class SegmentationValidator(DetectionValidator):
         )

     def finalize_metrics(self, *args, **kwargs):
-        """Set speed and confusion matrix for evaluation metrics."""
+        """
+        Finalize evaluation metrics by setting the speed attribute in the metrics object.
+
+        This method is called at the end of validation to set the processing speed for the metrics calculations.
+        It transfers the validator's speed measurement to the metrics object for reporting.
+
+        Args:
+            *args (Any): Variable length argument list.
+            **kwargs (Any): Arbitrary keyword arguments.
+        """
         self.metrics.speed = self.speed
         self.metrics.confusion_matrix = self.confusion_matrix

ultralytics/models/yolo/world/train_world.py CHANGED
@@ -43,7 +43,35 @@ class WorldTrainerFromScratch(WorldTrainer):
     """

     def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
-        """Initialize a WorldTrainerFromScratch object with given configuration and callbacks."""
+        """
+        Initialize a WorldTrainerFromScratch object.
+
+        This initializes a trainer for YOLO-World models from scratch, supporting mixed datasets including both
+        object detection and grounding datasets for vision-language capabilities.
+
+        Args:
+            cfg (dict): Configuration dictionary with default parameters for model training.
+            overrides (dict, optional): Dictionary of parameter overrides to customize the configuration.
+            _callbacks (list, optional): List of callback functions to be executed during different stages of training.
+
+        Examples:
+            >>> from ultralytics.models.yolo.world.train_world import WorldTrainerFromScratch
+            >>> from ultralytics import YOLOWorld
+            >>> data = dict(
+            ...     train=dict(
+            ...         yolo_data=["Objects365.yaml"],
+            ...         grounding_data=[
+            ...             dict(
+            ...                 img_path="../datasets/flickr30k/images",
+            ...                 json_file="../datasets/flickr30k/final_flickr_separateGT_train.json",
+            ...             ),
+            ...         ],
+            ...     ),
+            ...     val=dict(yolo_data=["lvis.yaml"]),
+            ... )
+            >>> model = YOLOWorld("yolov8s-worldv2.yaml")
+            >>> model.train(data=data, trainer=WorldTrainerFromScratch)
+        """
         if overrides is None:
             overrides = {}
         super().__init__(cfg, overrides, _callbacks)
ultralytics/models/yolo/yoloe/train.py CHANGED
@@ -38,7 +38,23 @@ class YOLOETrainer(DetectionTrainer):
         super().__init__(cfg, overrides, _callbacks)

     def get_model(self, cfg=None, weights=None, verbose=True):
-        """Return YOLOEModel initialized with specified config and weights."""
+        """
+        Return a YOLOEModel initialized with the specified configuration and weights.
+
+        Args:
+            cfg (dict | str | None): Model configuration. Can be a dictionary containing a 'yaml_file' key,
+                a direct path to a YAML file, or None to use default configuration.
+            weights (str | Path | None): Path to pretrained weights file to load into the model.
+            verbose (bool): Whether to display model information during initialization.
+
+        Returns:
+            (YOLOEModel): The initialized YOLOE model.
+
+        Notes:
+            - The number of classes (nc) is hard-coded to a maximum of 80 following the official configuration.
+            - The nc parameter here represents the maximum number of different text samples in one image,
+              rather than the actual number of classes.
+        """
         # NOTE: This `nc` here is the max number of different text samples in one image, rather than the actual `nc`.
         # NOTE: Following the official config, nc hard-coded to 80 for now.
         model = YOLOEModel(
@@ -180,7 +196,20 @@ class YOLOETrainerFromScratch(YOLOETrainer):
         return YOLOConcatDataset(datasets) if len(datasets) > 1 else datasets[0]

     def set_text_embeddings(self, datasets, batch):
-        """Set text embeddings for datasets to accelerate training by caching category names."""
+        """
+        Set text embeddings for datasets to accelerate training by caching category names.
+
+        This method collects unique category names from all datasets, then generates and caches text embeddings
+        for these categories to improve training efficiency.
+
+        Args:
+            datasets (List[Dataset]): List of datasets from which to extract category names.
+            batch (int | None): Batch size used for processing.
+
+        Notes:
+            This method collects category names from datasets that have the 'category_names' attribute,
+            then uses the first dataset's image path to determine where to cache the generated text embeddings.
+        """
        # TODO: open up an interface to determine whether to do cache
        category_names = set()
        for dataset in datasets:
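The collect-and-cache flow this docstring describes can be sketched in isolation. A minimal sketch under assumed names — embed_fn and the cache filename below are hypothetical stand-ins, not the trainer's actual internals:

    # Minimal sketch of the collect-and-cache pattern; embed_fn and the cache
    # filename are hypothetical stand-ins for the trainer's text model.
    from pathlib import Path

    import torch

    def cache_text_embeddings(datasets, embed_fn, cache_name="text_embeddings.pt"):
        category_names = set()
        for dataset in datasets:
            if not hasattr(dataset, "category_names"):
                continue
            category_names |= set(dataset.category_names)
        # Cache next to the first dataset's images, as the docstring describes
        cache_path = Path(datasets[0].img_path).parent / cache_name
        if cache_path.exists():
            return torch.load(cache_path)
        embeddings = {name: embed_fn(name) for name in sorted(category_names)}
        torch.save(embeddings, cache_path)
        return embeddings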
@@ -312,7 +341,22 @@ class YOLOEPEFreeTrainer(YOLOEPETrainer, YOLOETrainerFromScratch):
         return batch

     def set_text_embeddings(self, datasets, batch):
-        """No need to set text embeddings for prompt-free fine-tuning."""
+        """
+        Set text embeddings for datasets to accelerate training by caching category names.
+
+        This method collects unique category names from all datasets, generates text embeddings for them,
+        and caches these embeddings to improve training efficiency. The embeddings are stored in a file
+        in the parent directory of the first dataset's image path.
+
+        Args:
+            datasets (List[Dataset]): List of datasets containing category names to process.
+            batch (int): Batch size for processing text embeddings.
+
+        Notes:
+            The method creates a dictionary mapping text samples to their embeddings and stores it
+            at the path specified by 'cache_path'. If the cache file already exists, it will be loaded
+            instead of regenerating the embeddings.
+        """
         pass


ultralytics/nn/autobackend.py CHANGED
@@ -6,6 +6,7 @@ import platform
 import zipfile
 from collections import OrderedDict, namedtuple
 from pathlib import Path
+from typing import List, Optional, Union

 import cv2
 import numpy as np
@@ -96,14 +97,14 @@ class AutoBackend(nn.Module):
     @torch.no_grad()
     def __init__(
         self,
-        weights="yolo11n.pt",
-        device=torch.device("cpu"),
-        dnn=False,
-        data=None,
-        fp16=False,
-        batch=1,
-        fuse=True,
-        verbose=True,
+        weights: Union[str, List[str], torch.nn.Module] = "yolo11n.pt",
+        device: torch.device = torch.device("cpu"),
+        dnn: bool = False,
+        data: Optional[Union[str, Path]] = None,
+        fp16: bool = False,
+        batch: int = 1,
+        fuse: bool = True,
+        verbose: bool = True,
     ):
         """
         Initialize the AutoBackend for inference.
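The new annotations document the types the constructor already accepted rather than changing behavior. A minimal construction sketch, assuming the yolo11n.pt weights are available locally or downloadable:

    # Construction sketch for the newly annotated signature; downloads
    # yolo11n.pt on first use if it is not present locally.
    import torch

    from ultralytics.nn.autobackend import AutoBackend

    backend = AutoBackend(
        weights="yolo11n.pt",        # Union[str, List[str], torch.nn.Module]
        device=torch.device("cpu"),  # torch.device
        fp16=False,                  # bool
        batch=1,                     # int
    )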
ultralytics/nn/modules/activation.py CHANGED
@@ -7,14 +7,26 @@ import torch.nn as nn

 class AGLU(nn.Module):
     """
-    Unified activation function module from https://github.com/kostas1515/AGLU.
+    Unified activation function module from AGLU.

-    This class implements a parameterized activation function with learnable parameters lambda and kappa.
+    This class implements a parameterized activation function with learnable parameters lambda and kappa, based on the
+    AGLU (Adaptive Gated Linear Unit) approach (https://github.com/kostas1515/AGLU).

     Attributes:
         act (nn.Softplus): Softplus activation function with negative beta.
         lambd (nn.Parameter): Learnable lambda parameter initialized with uniform distribution.
         kappa (nn.Parameter): Learnable kappa parameter initialized with uniform distribution.
+
+    Methods:
+        forward: Compute the forward pass of the Unified activation function.
+
+    Examples:
+        >>> import torch
+        >>> m = AGLU()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+        >>> print(output.shape)
+        torch.Size([2])
     """

     def __init__(self, device=None, dtype=None) -> None:
@@ -25,6 +37,17 @@ class AGLU(nn.Module):
         self.kappa = nn.Parameter(nn.init.uniform_(torch.empty(1, device=device, dtype=dtype)))  # kappa parameter

     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Compute the forward pass of the Unified activation function."""
+        """
+        Apply the Adaptive Gated Linear Unit (AGLU) activation function.
+
+        This forward method implements the AGLU activation function with learnable parameters lambda and kappa.
+        The function applies a transformation that adaptively combines linear and non-linear components.
+
+        Args:
+            x (torch.Tensor): Input tensor to apply the activation function to.
+
+        Returns:
+            (torch.Tensor): Output tensor after applying the AGLU activation function, with the same shape as the input.
+        """
         lam = torch.clamp(self.lambd, min=0.0001)  # Clamp lambda to avoid division by zero
         return torch.exp((1 / lam) * self.act((self.kappa * x) - torch.log(lam)))
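The forward pass evaluates exp((1 / lam) * softplus(kappa * x - log(lam))) elementwise, where the Softplus uses beta=-1. A standalone sketch with fixed, non-learnable parameters, useful for checking that shapes are preserved:

    # Standalone restatement of the AGLU formula with fixed parameters;
    # the module itself learns lambd and kappa during training.
    import torch

    lambd, kappa = torch.tensor(1.0), torch.tensor(1.0)
    softplus = torch.nn.Softplus(beta=-1.0)  # same negative-beta Softplus as the module

    def aglu(x):
        lam = torch.clamp(lambd, min=0.0001)  # same clamp as the module's forward
        return torch.exp((1 / lam) * softplus(kappa * x - torch.log(lam)))

    x = torch.randn(2, 3)
    assert aglu(x).shape == x.shape  # activation is applied elementwise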
ultralytics/nn/modules/block.py CHANGED
@@ -1875,3 +1875,92 @@ class A2C2f(nn.Module):
         if self.gamma is not None:
             return x + self.gamma.view(-1, len(self.gamma), 1, 1) * y
         return y
+
+
+class SwiGLUFFN(nn.Module):
+    """SwiGLU Feed-Forward Network for transformer-based architectures."""
+
+    def __init__(self, gc, ec, e=4) -> None:
+        """Initialize SwiGLU FFN with input dimension, output dimension, and expansion factor."""
+        super().__init__()
+        self.w12 = nn.Linear(gc, e * ec)
+        self.w3 = nn.Linear(e * ec // 2, ec)
+
+    def forward(self, x):
+        """Apply SwiGLU transformation to input features."""
+        x12 = self.w12(x)
+        x1, x2 = x12.chunk(2, dim=-1)
+        hidden = F.silu(x1) * x2
+        return self.w3(hidden)
+
+
+class Residual(nn.Module):
+    """Residual connection wrapper for neural network modules."""
+
+    def __init__(self, m) -> None:
+        """Initialize residual module with the wrapped module."""
+        super().__init__()
+        self.m = m
+        nn.init.zeros_(self.m.w3.bias)
+        # For models with l scale, please change the initialization to
+        # nn.init.constant_(self.m.w3.weight, 1e-6)
+        nn.init.zeros_(self.m.w3.weight)
+
+    def forward(self, x):
+        """Apply residual connection to input features."""
+        return x + self.m(x)
+
+
+class SAVPE(nn.Module):
+    """Spatial-Aware Visual Prompt Embedding module for feature enhancement."""
+
+    def __init__(self, ch, c3, embed):
+        """Initialize SAVPE module with channels, intermediate channels, and embedding dimension."""
+        super().__init__()
+        self.cv1 = nn.ModuleList(
+            nn.Sequential(
+                Conv(x, c3, 3), Conv(c3, c3, 3), nn.Upsample(scale_factor=i * 2) if i in {1, 2} else nn.Identity()
+            )
+            for i, x in enumerate(ch)
+        )
+
+        self.cv2 = nn.ModuleList(
+            nn.Sequential(Conv(x, c3, 1), nn.Upsample(scale_factor=i * 2) if i in {1, 2} else nn.Identity())
+            for i, x in enumerate(ch)
+        )
+
+        self.c = 16
+        self.cv3 = nn.Conv2d(3 * c3, embed, 1)
+        self.cv4 = nn.Conv2d(3 * c3, self.c, 3, padding=1)
+        self.cv5 = nn.Conv2d(1, self.c, 3, padding=1)
+        self.cv6 = nn.Sequential(Conv(2 * self.c, self.c, 3), nn.Conv2d(self.c, self.c, 3, padding=1))
+
+    def forward(self, x, vp):
+        """Process input features and visual prompts to generate enhanced embeddings."""
+        y = [self.cv2[i](xi) for i, xi in enumerate(x)]
+        y = self.cv4(torch.cat(y, dim=1))
+
+        x = [self.cv1[i](xi) for i, xi in enumerate(x)]
+        x = self.cv3(torch.cat(x, dim=1))
+
+        B, C, H, W = x.shape
+
+        Q = vp.shape[1]
+
+        x = x.view(B, C, -1)
+
+        y = y.reshape(B, 1, self.c, H, W).expand(-1, Q, -1, -1, -1).reshape(B * Q, self.c, H, W)
+        vp = vp.reshape(B, Q, 1, H, W).reshape(B * Q, 1, H, W)
+
+        y = self.cv6(torch.cat((y, self.cv5(vp)), dim=1))
+
+        y = y.reshape(B, Q, self.c, -1)
+        vp = vp.reshape(B, Q, 1, -1)
+
+        score = y * vp + torch.logical_not(vp) * torch.finfo(y.dtype).min
+
+        score = F.softmax(score, dim=-1, dtype=torch.float).to(score.dtype)
+
+        aggregated = score.transpose(-2, -3) @ x.reshape(B, self.c, C // self.c, -1).transpose(-1, -2)
+
+        return F.normalize(aggregated.transpose(-2, -3).reshape(B, Q, -1), dim=-1, p=2)
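With these blocks relocated, they are importable from ultralytics.nn.modules.block. A short shape check (dimensions arbitrary) showing the SwiGLU wiring and the identity-at-initialization behavior of Residual:

    # Shape sketch for the relocated blocks; assumes ultralytics 8.3.103+.
    import torch

    from ultralytics.nn.modules.block import Residual, SwiGLUFFN

    ffn = SwiGLUFFN(gc=256, ec=256, e=4)  # w12: 256 -> 1024, w3: 512 -> 256
    x = torch.randn(2, 10, 256)
    assert ffn(x).shape == (2, 10, 256)

    res = Residual(SwiGLUFFN(gc=256, ec=256))  # w3 weight/bias zeroed at init
    assert torch.allclose(res(x), x)  # wrapper starts out as the identity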
ultralytics/nn/modules/head.py CHANGED
@@ -12,7 +12,7 @@ from torch.nn.init import constant_, xavier_uniform_
 from ultralytics.utils.tal import TORCH_1_10, dist2bbox, dist2rbox, make_anchors
 from ultralytics.utils.torch_utils import fuse_conv_and_bn, smart_inference_mode

-from .block import DFL, BNContrastiveHead, ContrastiveHead, Proto
+from .block import DFL, SAVPE, BNContrastiveHead, ContrastiveHead, Proto, Residual, SwiGLUFFN
 from .conv import Conv, DWConv
 from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
 from .utils import bias_init_with_prob, linear_init
@@ -345,61 +345,6 @@ class WorldDetect(Detect):
         # b[-1].bias.data[:] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)


-class SAVPE(nn.Module):
-    """Spatial-Aware Visual Prompt Embedding module for feature enhancement."""
-
-    def __init__(self, ch, c3, embed):
-        """Initialize SAVPE module with channels, intermediate channels, and embedding dimension."""
-        super().__init__()
-        self.cv1 = nn.ModuleList(
-            nn.Sequential(
-                Conv(x, c3, 3), Conv(c3, c3, 3), nn.Upsample(scale_factor=i * 2) if i in {1, 2} else nn.Identity()
-            )
-            for i, x in enumerate(ch)
-        )
-
-        self.cv2 = nn.ModuleList(
-            nn.Sequential(Conv(x, c3, 1), nn.Upsample(scale_factor=i * 2) if i in {1, 2} else nn.Identity())
-            for i, x in enumerate(ch)
-        )
-
-        self.c = 16
-        self.cv3 = nn.Conv2d(3 * c3, embed, 1)
-        self.cv4 = nn.Conv2d(3 * c3, self.c, 3, padding=1)
-        self.cv5 = nn.Conv2d(1, self.c, 3, padding=1)
-        self.cv6 = nn.Sequential(Conv(2 * self.c, self.c, 3), nn.Conv2d(self.c, self.c, 3, padding=1))
-
-    def forward(self, x, vp):
-        """Process input features and visual prompts to generate enhanced embeddings."""
-        y = [self.cv2[i](xi) for i, xi in enumerate(x)]
-        y = self.cv4(torch.cat(y, dim=1))
-
-        x = [self.cv1[i](xi) for i, xi in enumerate(x)]
-        x = self.cv3(torch.cat(x, dim=1))
-
-        B, C, H, W = x.shape
-
-        Q = vp.shape[1]
-
-        x = x.view(B, C, -1)
-
-        y = y.reshape(B, 1, self.c, H, W).expand(-1, Q, -1, -1, -1).reshape(B * Q, self.c, H, W)
-        vp = vp.reshape(B, Q, 1, H, W).reshape(B * Q, 1, H, W)
-
-        y = self.cv6(torch.cat((y, self.cv5(vp)), dim=1))
-
-        y = y.reshape(B, Q, self.c, -1)
-        vp = vp.reshape(B, Q, 1, -1)
-
-        score = y * vp + torch.logical_not(vp) * torch.finfo(y.dtype).min
-
-        score = F.softmax(score, dim=-1, dtype=torch.float).to(score.dtype)
-
-        aggregated = score.transpose(-2, -3) @ x.reshape(B, self.c, C // self.c, -1).transpose(-1, -2)
-
-        return F.normalize(aggregated.transpose(-2, -3).reshape(B, Q, -1), dim=-1, p=2)
-
-
 class LRPCHead(nn.Module):
     """Lightweight Region Proposal and Classification Head for efficient object detection."""

@@ -419,7 +364,7 @@ class LRPCHead(nn.Module):
         linear.bias.data = conv.bias.data
         return linear

-    def forward(self, cls_feat, loc_feat, conf, max_det):
+    def forward(self, cls_feat, loc_feat, conf):
         """Process classification and localization features to generate detection proposals."""
         if self.enabled:
             pf_score = self.pf(cls_feat)[0, 0].flatten(0)
@@ -533,7 +478,7 @@ class YOLOEDetect(Detect):
                 cls_feat = self.cv3[i](x[i])
                 loc_feat = self.cv2[i](x[i])
                 assert isinstance(self.lrpc[i], LRPCHead)
-                x[i], mask = self.lrpc[i](cls_feat, loc_feat, self.conf, self.max_det)
+                x[i], mask = self.lrpc[i](cls_feat, loc_feat, getattr(self, "conf", 0.001))
                 masks.append(mask)
         shape = x[0][0].shape
         if self.dynamic or self.shape != shape:
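The call-site change swaps the hard attribute access self.conf for a guarded lookup, so heads that never had conf set fall back to 0.001 instead of raising AttributeError. The pattern in isolation, with Head as a stand-in class:

    # The getattr fallback in isolation; Head is a stand-in, not the real class.
    class Head:
        pass

    head = Head()
    assert getattr(head, "conf", 0.001) == 0.001  # missing attribute -> default
    head.conf = 0.25
    assert getattr(head, "conf", 0.001) == 0.25   # explicit value wins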
@@ -585,40 +530,6 @@ class YOLOEDetect(Detect):
             c.bias.data[:] = math.log(5 / m.nc / (640 / s) ** 2)


-class SwiGLUFFN(nn.Module):
-    """SwiGLU Feed-Forward Network for transformer-based architectures."""
-
-    def __init__(self, gc, ec, e=4) -> None:
-        """Initialize SwiGLU FFN with input dimension, output dimension, and expansion factor."""
-        super().__init__()
-        self.w12 = nn.Linear(gc, e * ec)
-        self.w3 = nn.Linear(e * ec // 2, ec)
-
-    def forward(self, x):
-        """Apply SwiGLU transformation to input features."""
-        x12 = self.w12(x)
-        x1, x2 = x12.chunk(2, dim=-1)
-        hidden = F.silu(x1) * x2
-        return self.w3(hidden)
-
-
-class Residual(nn.Module):
-    """Residual connection wrapper for neural network modules."""
-
-    def __init__(self, m) -> None:
-        """Initialize residual module with the wrapped module."""
-        super().__init__()
-        self.m = m
-        nn.init.zeros_(self.m.w3.bias)
-        # For models with l scale, please change the initialization to
-        # nn.init.constant_(self.m.w3.weight, 1e-6)
-        nn.init.zeros_(self.m.w3.weight)
-
-    def forward(self, x):
-        """Apply residual connection to input features."""
-        return x + self.m(x)
-
-
 class YOLOESegment(YOLOEDetect):
     """YOLO segmentation head with text embedding capabilities."""

ultralytics/nn/modules/utils.py CHANGED
@@ -13,17 +13,66 @@ __all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"


 def _get_clones(module, n):
-    """Create a list of cloned modules from the given module."""
+    """
+    Create a list of cloned modules from the given module.
+
+    Args:
+        module (nn.Module): The module to be cloned.
+        n (int): Number of clones to create.
+
+    Returns:
+        (nn.ModuleList): A ModuleList containing n clones of the input module.
+
+    Examples:
+        >>> import torch.nn as nn
+        >>> layer = nn.Linear(10, 10)
+        >>> clones = _get_clones(layer, 3)
+        >>> len(clones)
+        3
+    """
     return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])


 def bias_init_with_prob(prior_prob=0.01):
-    """Initialize conv/fc bias value according to a given probability value."""
+    """
+    Initialize conv/fc bias value according to a given probability value.
+
+    This function calculates the bias initialization value from a prior probability using the inverse sigmoid
+    (log-odds) function. It's commonly used in object detection models to initialize classification layers with
+    a specific positive prediction probability.
+
+    Args:
+        prior_prob (float, optional): Prior probability for bias initialization.
+
+    Returns:
+        (float): Bias initialization value calculated from the prior probability.
+
+    Examples:
+        >>> bias = bias_init_with_prob(0.01)
+        >>> print(f"Bias initialization value: {bias:.4f}")
+        Bias initialization value: -4.5951
+    """
     return float(-np.log((1 - prior_prob) / prior_prob))  # return bias_init


 def linear_init(module):
-    """Initialize the weights and biases of a linear module."""
+    """
+    Initialize the weights and biases of a linear module.
+
+    This function initializes the weights of a linear module using a uniform distribution within bounds calculated
+    from the input dimension. If the module has a bias, it is also initialized.
+
+    Args:
+        module (nn.Module): Linear module to initialize.
+
+    Returns:
+        (nn.Module): The initialized module.
+
+    Examples:
+        >>> import torch.nn as nn
+        >>> linear = nn.Linear(10, 5)
+        >>> initialized_linear = linear_init(linear)
+    """
     bound = 1 / math.sqrt(module.weight.shape[0])
     uniform_(module.weight, -bound, bound)
     if hasattr(module, "bias") and module.bias is not None:
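The documented -4.5951 is straightforward to verify: the bias is the log-odds of the prior probability, so a sigmoid recovers it:

    # Verify the docstring example: bias = -log((1 - p) / p) is the logit of p,
    # so sigmoid(bias) recovers the prior probability.
    import math

    p = 0.01
    bias = float(-math.log((1 - p) / p))
    assert abs(bias - -4.5951) < 1e-4
    assert abs(1 / (1 + math.exp(-bias)) - p) < 1e-9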
@@ -31,7 +80,24 @@ def linear_init(module):


 def inverse_sigmoid(x, eps=1e-5):
-    """Calculate the inverse sigmoid function for a tensor."""
+    """
+    Calculate the inverse sigmoid function for a tensor.
+
+    This function applies the inverse of the sigmoid function to a tensor, which is useful in various neural network
+    operations, particularly in attention mechanisms and coordinate transformations.
+
+    Args:
+        x (torch.Tensor): Input tensor with values in range [0, 1].
+        eps (float, optional): Small epsilon value to prevent numerical instability.
+
+    Returns:
+        (torch.Tensor): Tensor after applying the inverse sigmoid function.
+
+    Examples:
+        >>> x = torch.tensor([0.2, 0.5, 0.8])
+        >>> inverse_sigmoid(x)
+        tensor([-1.3863, 0.0000, 1.3863])
+    """
     x = x.clamp(min=0, max=1)
     x1 = x.clamp(min=eps)
     x2 = (1 - x).clamp(min=eps)
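A quick round-trip check, restating the function body from the hunk above, confirms that inverse_sigmoid undoes torch.sigmoid away from the eps-clamped boundaries:

    # Round-trip check: inverse_sigmoid is the inverse of torch.sigmoid away
    # from the eps-clamped boundaries at 0 and 1.
    import torch

    def inverse_sigmoid(x, eps=1e-5):
        x = x.clamp(min=0, max=1)
        x1 = x.clamp(min=eps)
        x2 = (1 - x).clamp(min=eps)
        return torch.log(x1 / x2)

    x = torch.tensor([0.2, 0.5, 0.8])
    assert torch.allclose(torch.sigmoid(inverse_sigmoid(x)), x, atol=1e-6)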
ultralytics/nn/tasks.py CHANGED
@@ -912,6 +912,9 @@ class YOLOEModel(DetectionModel):
             names (List[str]): List of class names.
             embeddings (torch.Tensor): Embeddings tensor.
         """
+        assert not hasattr(self.model[-1], "lrpc"), (
+            "Prompt-free model does not support setting classes. Please try with Text/Visual prompt models."
+        )
         assert embeddings.ndim == 3
         self.pe = embeddings
         self.model[-1].nc = len(names)
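The new assertion makes prompt-free YOLOE checkpoints fail fast in set_classes. A rough sketch of how the failure surfaces in user code (the prompt-free checkpoint name below is illustrative):

    # Sketch of the new guard from a user's perspective; assumes a prompt-free
    # YOLOE checkpoint, which carries an lrpc head and so trips the assert.
    from ultralytics import YOLOE

    model = YOLOE("yoloe-11s-seg-pf.pt")  # illustrative prompt-free variant
    names = ["person", "bus"]
    try:
        model.set_classes(names, model.get_text_pe(names))
    except AssertionError as err:
        print(err)  # Prompt-free model does not support setting classes. ...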