ultralytics 8.3.101__py3-none-any.whl → 8.3.102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_solutions.py +140 -76
- ultralytics/__init__.py +1 -1
- ultralytics/engine/exporter.py +20 -5
- ultralytics/hub/__init__.py +29 -2
- ultralytics/hub/google/__init__.py +18 -1
- ultralytics/models/fastsam/predict.py +12 -1
- ultralytics/models/nas/predict.py +21 -3
- ultralytics/models/rtdetr/val.py +26 -2
- ultralytics/models/sam/amg.py +22 -1
- ultralytics/models/sam/modules/encoders.py +85 -4
- ultralytics/models/sam/modules/memory_attention.py +61 -3
- ultralytics/models/sam/modules/utils.py +108 -5
- ultralytics/models/utils/loss.py +38 -2
- ultralytics/models/utils/ops.py +15 -1
- ultralytics/models/yolo/classify/predict.py +11 -1
- ultralytics/models/yolo/classify/train.py +17 -1
- ultralytics/models/yolo/classify/val.py +82 -6
- ultralytics/models/yolo/detect/predict.py +20 -1
- ultralytics/models/yolo/model.py +55 -4
- ultralytics/models/yolo/obb/predict.py +16 -1
- ultralytics/models/yolo/obb/train.py +35 -2
- ultralytics/models/yolo/obb/val.py +87 -6
- ultralytics/models/yolo/pose/predict.py +18 -1
- ultralytics/models/yolo/pose/train.py +48 -3
- ultralytics/models/yolo/pose/val.py +113 -8
- ultralytics/models/yolo/segment/predict.py +27 -2
- ultralytics/models/yolo/segment/train.py +61 -3
- ultralytics/models/yolo/segment/val.py +10 -1
- ultralytics/models/yolo/world/train_world.py +29 -1
- ultralytics/models/yolo/yoloe/train.py +47 -3
- ultralytics/nn/modules/activation.py +26 -3
- ultralytics/nn/modules/block.py +89 -0
- ultralytics/nn/modules/head.py +3 -92
- ultralytics/nn/modules/utils.py +70 -4
- ultralytics/nn/text_model.py +93 -17
- ultralytics/utils/benchmarks.py +1 -1
- ultralytics/utils/callbacks/base.py +22 -5
- ultralytics/utils/callbacks/comet.py +93 -5
- ultralytics/utils/callbacks/dvc.py +64 -5
- ultralytics/utils/callbacks/neptune.py +25 -2
- ultralytics/utils/callbacks/tensorboard.py +30 -2
- ultralytics/utils/callbacks/wb.py +16 -1
- ultralytics/utils/dist.py +35 -2
- ultralytics/utils/errors.py +27 -6
- ultralytics/utils/patches.py +33 -5
- ultralytics/utils/triton.py +16 -3
- {ultralytics-8.3.101.dist-info → ultralytics-8.3.102.dist-info}/METADATA +1 -2
- {ultralytics-8.3.101.dist-info → ultralytics-8.3.102.dist-info}/RECORD +52 -52
- {ultralytics-8.3.101.dist-info → ultralytics-8.3.102.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.101.dist-info → ultralytics-8.3.102.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.101.dist-info → ultralytics-8.3.102.dist-info}/licenses/LICENSE +0 -0
- {ultralytics-8.3.101.dist-info → ultralytics-8.3.102.dist-info}/top_level.txt +0 -0
@@ -38,7 +38,23 @@ class YOLOETrainer(DetectionTrainer):
         super().__init__(cfg, overrides, _callbacks)
 
     def get_model(self, cfg=None, weights=None, verbose=True):
-        """
+        """
+        Return a YOLOEModel initialized with the specified configuration and weights.
+
+        Args:
+            cfg (dict | str | None): Model configuration. Can be a dictionary containing a 'yaml_file' key,
+                a direct path to a YAML file, or None to use default configuration.
+            weights (str | Path | None): Path to pretrained weights file to load into the model.
+            verbose (bool): Whether to display model information during initialization.
+
+        Returns:
+            (YOLOEModel): The initialized YOLOE model.
+
+        Notes:
+            - The number of classes (nc) is hard-coded to a maximum of 80 following the official configuration.
+            - The nc parameter here represents the maximum number of different text samples in one image,
+              rather than the actual number of classes.
+        """
         # NOTE: This `nc` here is the max number of different text samples in one image, rather than the actual `nc`.
         # NOTE: Following the official config, nc hard-coded to 80 for now.
         model = YOLOEModel(
@@ -180,7 +196,20 @@ class YOLOETrainerFromScratch(YOLOETrainer):
         return YOLOConcatDataset(datasets) if len(datasets) > 1 else datasets[0]
 
     def set_text_embeddings(self, datasets, batch):
-        """
+        """
+        Set text embeddings for datasets to accelerate training by caching category names.
+
+        This method collects unique category names from all datasets, then generates and caches text embeddings
+        for these categories to improve training efficiency.
+
+        Args:
+            datasets (List[Dataset]): List of datasets from which to extract category names.
+            batch (int | None): Batch size used for processing.
+
+        Notes:
+            This method collects category names from datasets that have the 'category_names' attribute,
+            then uses the first dataset's image path to determine where to cache the generated text embeddings.
+        """
         # TODO: open up an interface to determine whether to do cache
         category_names = set()
         for dataset in datasets:
@@ -312,7 +341,22 @@ class YOLOEPEFreeTrainer(YOLOEPETrainer, YOLOETrainerFromScratch):
         return batch
 
     def set_text_embeddings(self, datasets, batch):
-        """
+        """
+        Set text embeddings for datasets to accelerate training by caching category names.
+
+        This method collects unique category names from all datasets, generates text embeddings for them,
+        and caches these embeddings to improve training efficiency. The embeddings are stored in a file
+        in the parent directory of the first dataset's image path.
+
+        Args:
+            datasets (List[Dataset]): List of datasets containing category names to process.
+            batch (int): Batch size for processing text embeddings.
+
+        Notes:
+            The method creates a dictionary mapping text samples to their embeddings and stores it
+            at the path specified by 'cache_path'. If the cache file already exists, it will be loaded
+            instead of regenerating the embeddings.
+        """
         pass
 
 
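The get_model docstring above allows cfg to be either a dict carrying a 'yaml_file' key or a plain YAML path. A minimal sketch of the two accepted forms, with a placeholder YAML name that is not taken from this diff:

    # Two equivalent ways to pass `cfg` per the docstring (placeholder file name):
    cfg_as_path = "yoloe-model.yaml"                 # direct path to a model YAML
    cfg_as_dict = {"yaml_file": "yoloe-model.yaml"}  # dict containing a 'yaml_file' key
    # Either form can be passed to YOLOETrainer.get_model(cfg=..., weights=None, verbose=False),
    # which returns a YOLOEModel whose nc is capped at 80 text samples per image.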
ultralytics/nn/modules/activation.py
CHANGED
@@ -7,14 +7,26 @@ import torch.nn as nn
 
 class AGLU(nn.Module):
     """
-    Unified activation function module from
+    Unified activation function module from AGLU.
 
-    This class implements a parameterized activation function with learnable parameters lambda and kappa
+    This class implements a parameterized activation function with learnable parameters lambda and kappa, based on the
+    AGLU (Adaptive Gated Linear Unit) approach (https://github.com/kostas1515/AGLU).
 
     Attributes:
         act (nn.Softplus): Softplus activation function with negative beta.
         lambd (nn.Parameter): Learnable lambda parameter initialized with uniform distribution.
         kappa (nn.Parameter): Learnable kappa parameter initialized with uniform distribution.
+
+    Methods:
+        forward: Compute the forward pass of the Unified activation function.
+
+    Examples:
+        >>> import torch
+        >>> m = AGLU()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+        >>> print(output.shape)
+        torch.Size([2])
     """
 
     def __init__(self, device=None, dtype=None) -> None:
@@ -25,6 +37,17 @@ class AGLU(nn.Module):
         self.kappa = nn.Parameter(nn.init.uniform_(torch.empty(1, device=device, dtype=dtype))) # kappa parameter
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
+        """
+        Apply the Adaptive Gated Linear Unit (AGLU) activation function.
+
+        This forward method implements the AGLU activation function with learnable parameters lambda and kappa.
+        The function applies a transformation that adaptively combines linear and non-linear components.
+
+        Args:
+            x (torch.Tensor): Input tensor to apply the activation function to.
+
+        Returns:
+            (torch.Tensor): Output tensor after applying the AGLU activation function, with the same shape as the input.
+        """
         lam = torch.clamp(self.lambd, min=0.0001) # Clamp lambda to avoid division by zero
         return torch.exp((1 / lam) * self.act((self.kappa * x) - torch.log(lam)))
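Reading the forward code above, the activation evaluates exp((1 / lambd) * softplus_{beta=-1}(kappa * x - log(lambd))), with lambd clamped to at least 1e-4. A small sketch that reproduces the computation outside the module, using arbitrary illustrative parameter values rather than learned ones:

    import torch

    lambd, kappa = torch.tensor(0.5), torch.tensor(1.2)  # illustrative values, not learned parameters
    x = torch.randn(2)
    act = torch.nn.Softplus(beta=-1.0)  # the negative-beta Softplus noted in the Attributes section
    lam = torch.clamp(lambd, min=0.0001)
    out = torch.exp((1 / lam) * act((kappa * x) - torch.log(lam)))
    print(out.shape)  # torch.Size([2]), same shape as the input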
ultralytics/nn/modules/block.py
CHANGED
@@ -1875,3 +1875,92 @@ class A2C2f(nn.Module):
         if self.gamma is not None:
             return x + self.gamma.view(-1, len(self.gamma), 1, 1) * y
         return y
+
+
+class SwiGLUFFN(nn.Module):
+    """SwiGLU Feed-Forward Network for transformer-based architectures."""
+
+    def __init__(self, gc, ec, e=4) -> None:
+        """Initialize SwiGLU FFN with input dimension, output dimension, and expansion factor."""
+        super().__init__()
+        self.w12 = nn.Linear(gc, e * ec)
+        self.w3 = nn.Linear(e * ec // 2, ec)
+
+    def forward(self, x):
+        """Apply SwiGLU transformation to input features."""
+        x12 = self.w12(x)
+        x1, x2 = x12.chunk(2, dim=-1)
+        hidden = F.silu(x1) * x2
+        return self.w3(hidden)
+
+
+class Residual(nn.Module):
+    """Residual connection wrapper for neural network modules."""
+
+    def __init__(self, m) -> None:
+        """Initialize residual module with the wrapped module."""
+        super().__init__()
+        self.m = m
+        nn.init.zeros_(self.m.w3.bias)
+        # For models with l scale, please change the initialization to
+        # nn.init.constant_(self.m.w3.weight, 1e-6)
+        nn.init.zeros_(self.m.w3.weight)
+
+    def forward(self, x):
+        """Apply residual connection to input features."""
+        return x + self.m(x)
+
+
+class SAVPE(nn.Module):
+    """Spatial-Aware Visual Prompt Embedding module for feature enhancement."""
+
+    def __init__(self, ch, c3, embed):
+        """Initialize SAVPE module with channels, intermediate channels, and embedding dimension."""
+        super().__init__()
+        self.cv1 = nn.ModuleList(
+            nn.Sequential(
+                Conv(x, c3, 3), Conv(c3, c3, 3), nn.Upsample(scale_factor=i * 2) if i in {1, 2} else nn.Identity()
+            )
+            for i, x in enumerate(ch)
+        )
+
+        self.cv2 = nn.ModuleList(
+            nn.Sequential(Conv(x, c3, 1), nn.Upsample(scale_factor=i * 2) if i in {1, 2} else nn.Identity())
+            for i, x in enumerate(ch)
+        )
+
+        self.c = 16
+        self.cv3 = nn.Conv2d(3 * c3, embed, 1)
+        self.cv4 = nn.Conv2d(3 * c3, self.c, 3, padding=1)
+        self.cv5 = nn.Conv2d(1, self.c, 3, padding=1)
+        self.cv6 = nn.Sequential(Conv(2 * self.c, self.c, 3), nn.Conv2d(self.c, self.c, 3, padding=1))
+
+    def forward(self, x, vp):
+        """Process input features and visual prompts to generate enhanced embeddings."""
+        y = [self.cv2[i](xi) for i, xi in enumerate(x)]
+        y = self.cv4(torch.cat(y, dim=1))
+
+        x = [self.cv1[i](xi) for i, xi in enumerate(x)]
+        x = self.cv3(torch.cat(x, dim=1))
+
+        B, C, H, W = x.shape
+
+        Q = vp.shape[1]
+
+        x = x.view(B, C, -1)
+
+        y = y.reshape(B, 1, self.c, H, W).expand(-1, Q, -1, -1, -1).reshape(B * Q, self.c, H, W)
+        vp = vp.reshape(B, Q, 1, H, W).reshape(B * Q, 1, H, W)
+
+        y = self.cv6(torch.cat((y, self.cv5(vp)), dim=1))
+
+        y = y.reshape(B, Q, self.c, -1)
+        vp = vp.reshape(B, Q, 1, -1)
+
+        score = y * vp + torch.logical_not(vp) * torch.finfo(y.dtype).min
+
+        score = F.softmax(score, dim=-1, dtype=torch.float).to(score.dtype)
+
+        aggregated = score.transpose(-2, -3) @ x.reshape(B, self.c, C // self.c, -1).transpose(-1, -2)
+
+        return F.normalize(aggregated.transpose(-2, -3).reshape(B, Q, -1), dim=-1, p=2)
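Since Residual zeroes the wrapped module's w3 weight and bias, a Residual-wrapped SwiGLUFFN is an identity map at initialization. A minimal shape check under that observation, assuming ultralytics 8.3.102 is installed (dimensions chosen for illustration):

    import torch
    from ultralytics.nn.modules.block import Residual, SwiGLUFFN

    ffn = Residual(SwiGLUFFN(gc=256, ec=256, e=4))  # w12: 256 -> 1024, w3: 512 -> 256
    x = torch.randn(2, 10, 256)
    y = ffn(x)
    print(y.shape)               # torch.Size([2, 10, 256])
    print(torch.allclose(x, y))  # True at init, because w3 starts zeroed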
ultralytics/nn/modules/head.py
CHANGED
@@ -12,7 +12,7 @@ from torch.nn.init import constant_, xavier_uniform_
 from ultralytics.utils.tal import TORCH_1_10, dist2bbox, dist2rbox, make_anchors
 from ultralytics.utils.torch_utils import fuse_conv_and_bn, smart_inference_mode
 
-from .block import DFL, BNContrastiveHead, ContrastiveHead, Proto
+from .block import DFL, SAVPE, BNContrastiveHead, ContrastiveHead, Proto, Residual, SwiGLUFFN
 from .conv import Conv, DWConv
 from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
 from .utils import bias_init_with_prob, linear_init
@@ -345,61 +345,6 @@ class WorldDetect(Detect):
             # b[-1].bias.data[:] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)
 
 
-class SAVPE(nn.Module):
-    """Spatial-Aware Visual Prompt Embedding module for feature enhancement."""
-
-    def __init__(self, ch, c3, embed):
-        """Initialize SAVPE module with channels, intermediate channels, and embedding dimension."""
-        super().__init__()
-        self.cv1 = nn.ModuleList(
-            nn.Sequential(
-                Conv(x, c3, 3), Conv(c3, c3, 3), nn.Upsample(scale_factor=i * 2) if i in {1, 2} else nn.Identity()
-            )
-            for i, x in enumerate(ch)
-        )
-
-        self.cv2 = nn.ModuleList(
-            nn.Sequential(Conv(x, c3, 1), nn.Upsample(scale_factor=i * 2) if i in {1, 2} else nn.Identity())
-            for i, x in enumerate(ch)
-        )
-
-        self.c = 16
-        self.cv3 = nn.Conv2d(3 * c3, embed, 1)
-        self.cv4 = nn.Conv2d(3 * c3, self.c, 3, padding=1)
-        self.cv5 = nn.Conv2d(1, self.c, 3, padding=1)
-        self.cv6 = nn.Sequential(Conv(2 * self.c, self.c, 3), nn.Conv2d(self.c, self.c, 3, padding=1))
-
-    def forward(self, x, vp):
-        """Process input features and visual prompts to generate enhanced embeddings."""
-        y = [self.cv2[i](xi) for i, xi in enumerate(x)]
-        y = self.cv4(torch.cat(y, dim=1))
-
-        x = [self.cv1[i](xi) for i, xi in enumerate(x)]
-        x = self.cv3(torch.cat(x, dim=1))
-
-        B, C, H, W = x.shape
-
-        Q = vp.shape[1]
-
-        x = x.view(B, C, -1)
-
-        y = y.reshape(B, 1, self.c, H, W).expand(-1, Q, -1, -1, -1).reshape(B * Q, self.c, H, W)
-        vp = vp.reshape(B, Q, 1, H, W).reshape(B * Q, 1, H, W)
-
-        y = self.cv6(torch.cat((y, self.cv5(vp)), dim=1))
-
-        y = y.reshape(B, Q, self.c, -1)
-        vp = vp.reshape(B, Q, 1, -1)
-
-        score = y * vp + torch.logical_not(vp) * torch.finfo(y.dtype).min
-
-        score = F.softmax(score, dim=-1, dtype=torch.float).to(score.dtype)
-
-        aggregated = score.transpose(-2, -3) @ x.reshape(B, self.c, C // self.c, -1).transpose(-1, -2)
-
-        return F.normalize(aggregated.transpose(-2, -3).reshape(B, Q, -1), dim=-1, p=2)
-
-
 class LRPCHead(nn.Module):
     """Lightweight Region Proposal and Classification Head for efficient object detection."""
 
@@ -419,7 +364,7 @@ class LRPCHead(nn.Module):
         linear.bias.data = conv.bias.data
         return linear
 
-    def forward(self, cls_feat, loc_feat, conf
+    def forward(self, cls_feat, loc_feat, conf):
         """Process classification and localization features to generate detection proposals."""
         if self.enabled:
             pf_score = self.pf(cls_feat)[0, 0].flatten(0)
@@ -533,7 +478,7 @@ class YOLOEDetect(Detect):
             cls_feat = self.cv3[i](x[i])
             loc_feat = self.cv2[i](x[i])
             assert isinstance(self.lrpc[i], LRPCHead)
-            x[i], mask = self.lrpc[i](cls_feat, loc_feat, self
+            x[i], mask = self.lrpc[i](cls_feat, loc_feat, getattr(self, "conf", 0.001))
             masks.append(mask)
         shape = x[0][0].shape
         if self.dynamic or self.shape != shape:
@@ -585,40 +530,6 @@ class YOLOEDetect(Detect):
             c.bias.data[:] = math.log(5 / m.nc / (640 / s) ** 2)
 
 
-class SwiGLUFFN(nn.Module):
-    """SwiGLU Feed-Forward Network for transformer-based architectures."""
-
-    def __init__(self, gc, ec, e=4) -> None:
-        """Initialize SwiGLU FFN with input dimension, output dimension, and expansion factor."""
-        super().__init__()
-        self.w12 = nn.Linear(gc, e * ec)
-        self.w3 = nn.Linear(e * ec // 2, ec)
-
-    def forward(self, x):
-        """Apply SwiGLU transformation to input features."""
-        x12 = self.w12(x)
-        x1, x2 = x12.chunk(2, dim=-1)
-        hidden = F.silu(x1) * x2
-        return self.w3(hidden)
-
-
-class Residual(nn.Module):
-    """Residual connection wrapper for neural network modules."""
-
-    def __init__(self, m) -> None:
-        """Initialize residual module with the wrapped module."""
-        super().__init__()
-        self.m = m
-        nn.init.zeros_(self.m.w3.bias)
-        # For models with l scale, please change the initialization to
-        # nn.init.constant_(self.m.w3.weight, 1e-6)
-        nn.init.zeros_(self.m.w3.weight)
-
-    def forward(self, x):
-        """Apply residual connection to input features."""
-        return x + self.m(x)
-
-
 class YOLOESegment(YOLOEDetect):
     """YOLO segmentation head with text embedding capabilities."""
 
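With the YOLOEDetect fix above, the confidence threshold passed to LRPCHead.forward now falls back to 0.001 whenever no conf attribute has been set on the head. A stand-in sketch of that getattr fallback (the class below is illustrative, not the real detection head):

    class _Head:  # stand-in object, used only to show the fallback
        pass

    head = _Head()
    print(getattr(head, "conf", 0.001))  # 0.001 when no threshold is attached
    head.conf = 0.25
    print(getattr(head, "conf", 0.001))  # 0.25 once a threshold is set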
ultralytics/nn/modules/utils.py
CHANGED
@@ -13,17 +13,66 @@ __all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"
 
 
 def _get_clones(module, n):
-    """
+    """
+    Create a list of cloned modules from the given module.
+
+    Args:
+        module (nn.Module): The module to be cloned.
+        n (int): Number of clones to create.
+
+    Returns:
+        (nn.ModuleList): A ModuleList containing n clones of the input module.
+
+    Examples:
+        >>> import torch.nn as nn
+        >>> layer = nn.Linear(10, 10)
+        >>> clones = _get_clones(layer, 3)
+        >>> len(clones)
+        3
+    """
     return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])
 
 
 def bias_init_with_prob(prior_prob=0.01):
-    """
+    """
+    Initialize conv/fc bias value according to a given probability value.
+
+    This function calculates the bias initialization value based on a prior probability using the inverse error function.
+    It's commonly used in object detection models to initialize classification layers with a specific positive prediction
+    probability.
+
+    Args:
+        prior_prob (float, optional): Prior probability for bias initialization.
+
+    Returns:
+        (float): Bias initialization value calculated from the prior probability.
+
+    Examples:
+        >>> bias = bias_init_with_prob(0.01)
+        >>> print(f"Bias initialization value: {bias:.4f}")
+        Bias initialization value: -4.5951
+    """
     return float(-np.log((1 - prior_prob) / prior_prob)) # return bias_init
 
 
 def linear_init(module):
-    """
+    """
+    Initialize the weights and biases of a linear module.
+
+    This function initializes the weights of a linear module using a uniform distribution within bounds calculated
+    from the input dimension. If the module has a bias, it is also initialized.
+
+    Args:
+        module (nn.Module): Linear module to initialize.
+
+    Returns:
+        (nn.Module): The initialized module.
+
+    Examples:
+        >>> import torch.nn as nn
+        >>> linear = nn.Linear(10, 5)
+        >>> initialized_linear = linear_init(linear)
+    """
     bound = 1 / math.sqrt(module.weight.shape[0])
     uniform_(module.weight, -bound, bound)
     if hasattr(module, "bias") and module.bias is not None:
@@ -31,7 +80,24 @@ def linear_init(module):
 
 
 def inverse_sigmoid(x, eps=1e-5):
-    """
+    """
+    Calculate the inverse sigmoid function for a tensor.
+
+    This function applies the inverse of the sigmoid function to a tensor, which is useful in various neural network
+    operations, particularly in attention mechanisms and coordinate transformations.
+
+    Args:
+        x (torch.Tensor): Input tensor with values in range [0, 1].
+        eps (float, optional): Small epsilon value to prevent numerical instability.
+
+    Returns:
+        (torch.Tensor): Tensor after applying the inverse sigmoid function.
+
+    Examples:
+        >>> x = torch.tensor([0.2, 0.5, 0.8])
+        >>> inverse_sigmoid(x)
+        tensor([-1.3863, 0.0000, 1.3863])
+    """
     x = x.clamp(min=0, max=1)
     x1 = x.clamp(min=eps)
     x2 = (1 - x).clamp(min=eps)
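The inverse_sigmoid docstring above can be sanity-checked by noting that sigmoid undoes it away from the eps-clamped endpoints; a short sketch assuming ultralytics 8.3.102 is installed:

    import torch
    from ultralytics.nn.modules.utils import inverse_sigmoid

    x = torch.tensor([0.2, 0.5, 0.8])
    logits = inverse_sigmoid(x)  # tensor([-1.3863, 0.0000, 1.3863])
    print(torch.allclose(torch.sigmoid(logits), x, atol=1e-4))  # True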
ultralytics/nn/text_model.py
CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
 import torch
 import torch.nn as nn
 
-from ultralytics.utils import
+from ultralytics.utils import checks
 from ultralytics.utils.torch_utils import smart_inference_mode
 
 try:
@@ -59,9 +59,10 @@ class TextModel(nn.Module):
 
 class CLIP(TextModel):
     """
-    OpenAI CLIP text encoder
+    Implements OpenAI's CLIP (Contrastive Language-Image Pre-training) text encoder.
 
-    This class
+    This class provides a text encoder based on OpenAI's CLIP model, which can convert text into feature vectors
+    that are aligned with corresponding image features in a shared embedding space.
 
     Attributes:
         model (clip.model.CLIP): The loaded CLIP model.
@@ -70,15 +71,33 @@ class CLIP(TextModel):
     Methods:
         tokenize: Convert input texts to CLIP tokens.
         encode_text: Encode tokenized texts into normalized feature vectors.
+
+    Examples:
+        >>> from ultralytics.models.sam import CLIP
+        >>> import torch
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        >>> clip_model = CLIP(size="ViT-B/32", device=device)
+        >>> tokens = clip_model.tokenize(["a photo of a cat", "a photo of a dog"])
+        >>> text_features = clip_model.encode_text(tokens)
+        >>> print(text_features.shape)
     """
 
     def __init__(self, size, device):
         """
         Initialize the CLIP text encoder.
 
+        This class implements the TextModel interface using OpenAI's CLIP model for text encoding. It loads
+        a pre-trained CLIP model of the specified size and prepares it for text encoding tasks.
+
         Args:
             size (str): Model size identifier (e.g., 'ViT-B/32').
             device (torch.device): Device to load the model on.
+
+        Examples:
+            >>> import torch
+            >>> from ultralytics.models.sam.modules.clip import CLIP
+            >>> clip_model = CLIP("ViT-B/32", device=torch.device("cuda:0"))
+            >>> text_features = clip_model.encode_text(["a photo of a cat", "a photo of a dog"])
         """
         super().__init__()
         self.model = clip.load(size, device=device)[0]
@@ -87,7 +106,20 @@ class CLIP(TextModel):
         self.eval()
 
     def tokenize(self, texts):
-        """
+        """
+        Convert input texts to CLIP tokens.
+
+        Args:
+            texts (str | List[str]): Input text or list of texts to tokenize.
+
+        Returns:
+            (torch.Tensor): Tokenized text tensor with shape (batch_size, context_length) ready for model processing.
+
+        Examples:
+            >>> model = CLIP("ViT-B/32", device="cpu")
+            >>> tokens = model.tokenize("a photo of a cat")
+            >>> print(tokens.shape) # torch.Size([1, 77])
+        """
         return clip.tokenize(texts).to(self.device)
 
     @smart_inference_mode()
@@ -95,12 +127,22 @@ class CLIP(TextModel):
         """
         Encode tokenized texts into normalized feature vectors.
 
+        This method processes tokenized text inputs through the CLIP model to generate feature vectors, which are then
+        normalized to unit length. These normalized vectors can be used for text-image similarity comparisons.
+
         Args:
-            texts (torch.Tensor): Tokenized text inputs.
-            dtype (torch.dtype): Data type for output features.
+            texts (torch.Tensor): Tokenized text inputs, typically created using the tokenize() method.
+            dtype (torch.dtype, optional): Data type for output features. Default is torch.float32.
 
         Returns:
-            (torch.Tensor): Normalized text feature vectors.
+            (torch.Tensor): Normalized text feature vectors with unit length (L2 norm = 1).
+
+        Examples:
+            >>> clip_model = CLIP("ViT-B/32", device="cuda")
+            >>> tokens = clip_model.tokenize(["a photo of a cat", "a photo of a dog"])
+            >>> features = clip_model.encode_text(tokens)
+            >>> features.shape
+            torch.Size([2, 512])
         """
         txt_feats = self.model.encode_text(texts).to(dtype)
         txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
@@ -109,9 +151,10 @@ class CLIP(TextModel):
 
 class MobileCLIP(TextModel):
     """
-    Apple MobileCLIP text encoder
+    Implement Apple's MobileCLIP text encoder for efficient text encoding.
 
-    This class implements the TextModel interface using Apple's MobileCLIP model
+    This class implements the TextModel interface using Apple's MobileCLIP model, providing efficient text encoding
+    capabilities for vision-language tasks.
 
     Attributes:
         model (mobileclip.model.MobileCLIP): The loaded MobileCLIP model.
@@ -122,6 +165,12 @@ class MobileCLIP(TextModel):
     Methods:
         tokenize: Convert input texts to MobileCLIP tokens.
         encode_text: Encode tokenized texts into normalized feature vectors.
+
+    Examples:
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        >>> text_encoder = MobileCLIP(size="s0", device=device)
+        >>> tokens = text_encoder.tokenize(["a photo of a cat", "a photo of a dog"])
+        >>> features = text_encoder.encode_text(tokens)
     """
 
     config_size_map = {"s0": "s0", "s1": "s1", "s2": "s2", "b": "b", "blt": "b"}
@@ -130,9 +179,18 @@ class MobileCLIP(TextModel):
         """
         Initialize the MobileCLIP text encoder.
 
+        This class implements the TextModel interface using Apple's MobileCLIP model for efficient text encoding.
+
         Args:
             size (str): Model size identifier (e.g., 's0', 's1', 's2', 'b', 'blt').
             device (torch.device): Device to load the model on.
+
+        Examples:
+            >>> from ultralytics.nn.modules import MobileCLIP
+            >>> import torch
+            >>> model = MobileCLIP("s0", device=torch.device("cpu"))
+            >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
+            >>> features = model.encode_text(tokens)
         """
         super().__init__()
         config = self.config_size_map[size]
@@ -148,7 +206,19 @@ class MobileCLIP(TextModel):
         self.eval()
 
     def tokenize(self, texts):
-        """
+        """
+        Convert input texts to MobileCLIP tokens.
+
+        Args:
+            texts (list[str]): List of text strings to tokenize.
+
+        Returns:
+            (torch.Tensor): Tokenized text inputs with shape (batch_size, sequence_length).
+
+        Examples:
+            >>> model = MobileCLIP("s0", "cpu")
+            >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
+        """
        return self.tokenizer(texts).to(self.device)
 
     @smart_inference_mode()
@@ -158,10 +228,17 @@ class MobileCLIP(TextModel):
 
         Args:
             texts (torch.Tensor): Tokenized text inputs.
-            dtype (torch.dtype): Data type for output features.
+            dtype (torch.dtype, optional): Data type for output features.
 
         Returns:
-            (torch.Tensor): Normalized text feature vectors.
+            (torch.Tensor): Normalized text feature vectors with L2 normalization applied.
+
+        Examples:
+            >>> model = MobileCLIP("s0", device="cpu")
+            >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
+            >>> features = model.encode_text(tokens)
+            >>> features.shape
+            torch.Size([2, 512]) # Actual dimension depends on model size
         """
         text_features = self.model.encode_text(texts).to(dtype)
         text_features /= text_features.norm(p=2, dim=-1, keepdim=True)
@@ -179,15 +256,14 @@ def build_text_model(variant, device=None):
     Returns:
         (TextModel): Instantiated text encoding model.
 
-
-
+    Examples:
+        >>> model = build_text_model("clip:ViT-B/32", device=torch.device("cuda"))
+        >>> model = build_text_model("mobileclip:s0", device=torch.device("cpu"))
     """
-    LOGGER.info(f"Build text model {variant}")
     base, size = variant.split(":")
     if base == "clip":
         return CLIP(size, device)
     elif base == "mobileclip":
         return MobileCLIP(size, device)
     else:
-
-        assert False
+        raise ValueError(f"Unrecognized base model: '{base}'. Supported base models: 'clip', 'mobileclip'.")
ultralytics/utils/benchmarks.py
CHANGED
@@ -126,7 +126,7 @@ def benchmark(
                 assert not isinstance(model, YOLOWorld), "YOLOWorldv2 TensorFlow exports not supported by onnx2tf yet"
             if i == 11: # Paddle
                 assert not isinstance(model, YOLOWorld), "YOLOWorldv2 Paddle exports not supported yet"
-                assert
+                assert model.task != "obb", "Paddle OBB bug https://github.com/PaddlePaddle/Paddle/issues/72024"
                 assert not is_end2end, "End-to-end models not supported by PaddlePaddle yet"
                 assert LINUX or MACOS, "Windows Paddle exports not supported yet"
             if i == 12: # MNN