ultralytics 8.3.98__py3-none-any.whl → 8.3.100__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_python.py +56 -0
- ultralytics/__init__.py +3 -2
- ultralytics/cfg/models/11/yoloe-11-seg.yaml +48 -0
- ultralytics/cfg/models/11/yoloe-11.yaml +48 -0
- ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +45 -0
- ultralytics/cfg/models/v8/yoloe-v8.yaml +45 -0
- ultralytics/data/augment.py +101 -5
- ultralytics/data/dataset.py +165 -12
- ultralytics/engine/exporter.py +5 -4
- ultralytics/engine/trainer.py +16 -7
- ultralytics/models/__init__.py +2 -2
- ultralytics/models/yolo/__init__.py +3 -3
- ultralytics/models/yolo/detect/val.py +6 -1
- ultralytics/models/yolo/model.py +183 -3
- ultralytics/models/yolo/segment/val.py +43 -16
- ultralytics/models/yolo/yoloe/__init__.py +21 -0
- ultralytics/models/yolo/yoloe/predict.py +170 -0
- ultralytics/models/yolo/yoloe/train.py +355 -0
- ultralytics/models/yolo/yoloe/train_seg.py +141 -0
- ultralytics/models/yolo/yoloe/val.py +187 -0
- ultralytics/nn/autobackend.py +17 -7
- ultralytics/nn/modules/__init__.py +18 -1
- ultralytics/nn/modules/block.py +17 -1
- ultralytics/nn/modules/head.py +359 -22
- ultralytics/nn/tasks.py +276 -10
- ultralytics/nn/text_model.py +193 -0
- ultralytics/utils/benchmarks.py +1 -0
- ultralytics/utils/callbacks/comet.py +3 -6
- ultralytics/utils/downloads.py +6 -2
- ultralytics/utils/loss.py +67 -6
- ultralytics/utils/plotting.py +1 -1
- ultralytics/utils/tal.py +1 -1
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/METADATA +10 -10
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/RECORD +38 -28
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/licenses/LICENSE +0 -0
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/top_level.txt +0 -0
ultralytics/nn/tasks.py
CHANGED
@@ -8,7 +8,9 @@ from copy import deepcopy
 from pathlib import Path
 
 import torch
+import torch.nn as nn
 
+from ultralytics.nn.autobackend import check_class_names
 from ultralytics.nn.modules import (
     AIFI,
     C1,
@@ -51,6 +53,7 @@ from ultralytics.nn.modules import (
     HGStem,
     ImagePoolingAttn,
     Index,
+    LRPCHead,
     Pose,
     RepC3,
     RepConv,
@@ -62,6 +65,8 @@ from ultralytics.nn.modules import (
     Segment,
     TorchVision,
     WorldDetect,
+    YOLOEDetect,
+    YOLOESegment,
     v10Detect,
 )
 from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load
@@ -83,6 +88,7 @@ from ultralytics.utils.torch_utils import (
     intersect_dicts,
     model_info,
     scale_img,
+    smart_inference_mode,
     time_sync,
 )
 
@@ -255,7 +261,9 @@ class BaseModel(torch.nn.Module):
         """
         self = super()._apply(fn)
         m = self.model[-1]  # Detect()
-        if isinstance(
+        if isinstance(
+            m, Detect
+        ):  # includes all Detect subclasses like Segment, Pose, OBB, WorldDetect, YOLOEDetect, YOLOESegment
             m.stride = fn(m.stride)
             m.anchors = fn(m.anchors)
             m.strides = fn(m.strides)
@@ -329,7 +337,7 @@ class DetectionModel(BaseModel):
 
         # Build strides
         m = self.model[-1]  # Detect()
-        if isinstance(m, Detect):  # includes all Detect subclasses like Segment, Pose, OBB,
+        if isinstance(m, Detect):  # includes all Detect subclasses like Segment, Pose, OBB, YOLOEDetect, YOLOESegment
             s = 256  # 2x min stride
             m.inplace = self.inplace
 
@@ -337,7 +345,7 @@ class DetectionModel(BaseModel):
                 """Perform a forward pass through the model, handling different Detect subclass types accordingly."""
                 if self.end2end:
                     return self.forward(x)["one2many"]
-                return self.forward(x)[0] if isinstance(m, (Segment, Pose, OBB)) else self.forward(x)
+                return self.forward(x)[0] if isinstance(m, (Segment, YOLOESegment, Pose, OBB)) else self.forward(x)
 
             m.stride = torch.tensor([s / x.shape[-2] for x in _forward(torch.zeros(1, ch, s, s))])  # forward
             self.stride = m.stride
@@ -778,6 +786,260 @@ class WorldModel(DetectionModel):
         return self.criterion(preds, batch)
 
 
+class YOLOEModel(DetectionModel):
+    """YOLOE detection model."""
+
+    def __init__(self, cfg="yoloe-v8s.yaml", ch=3, nc=None, verbose=True):
+        """
+        Initialize YOLOE model with given config and parameters.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            verbose (bool): Whether to display model information.
+        """
+        super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
+
+    @smart_inference_mode()
+    def get_text_pe(self, text, batch=80, cache_clip_model=False, without_reprta=False):
+        """
+        Set classes in advance so that model could do offline-inference without clip model.
+
+        Args:
+            text (List[str]): List of class names.
+            batch (int): Batch size for processing text tokens.
+            cache_clip_model (bool): Whether to cache the CLIP model.
+            without_reprta (bool): Whether to return text embeddings cooperated with reprta module.
+
+        Returns:
+            (torch.Tensor): Text positional embeddings.
+        """
+        from ultralytics.nn.text_model import build_text_model
+
+        device = next(self.model.parameters()).device
+        if not getattr(self, "clip_model", None) and cache_clip_model:
+            # For backwards compatibility of models lacking clip_model attribute
+            self.clip_model = build_text_model("mobileclip:blt", device=device)
+
+        model = self.clip_model if cache_clip_model else build_text_model("mobileclip:blt", device=device)
+        text_token = model.tokenize(text)
+        txt_feats = [model.encode_text(token).detach() for token in text_token.split(batch)]
+        txt_feats = txt_feats[0] if len(txt_feats) == 1 else torch.cat(txt_feats, dim=0)
+        txt_feats = txt_feats.reshape(-1, len(text), txt_feats.shape[-1])
+        if without_reprta:
+            return txt_feats
+
+        assert not self.training
+        head = self.model[-1]
+        assert isinstance(head, YOLOEDetect)
+        return head.get_tpe(txt_feats)  # run axuiliary text head
+
+    @smart_inference_mode()
+    def get_visual_pe(self, img, visual):
+        """
+        Get visual embeddings.
+
+        Args:
+            img (torch.Tensor): Input image tensor.
+            visual (torch.Tensor): Visual features.
+
+        Returns:
+            (torch.Tensor): Visual positional embeddings.
+        """
+        return self(img, vpe=visual, return_vpe=True)
+
+    def set_vocab(self, vocab, names):
+        """
+        Set vocabulary for the prompt-free model.
+
+        Args:
+            vocab (nn.ModuleList): List of vocabulary items.
+            names (List[str]): List of class names.
+        """
+        assert not self.training
+        head = self.model[-1]
+        assert isinstance(head, YOLOEDetect)
+
+        # Cache anchors for head
+        device = next(self.parameters()).device
+        self(torch.empty(1, 3, self.args["imgsz"], self.args["imgsz"]).to(device))  # warmup
+
+        # re-parameterization for prompt-free model
+        self.model[-1].lrpc = nn.ModuleList(
+            LRPCHead(cls, pf[-1], loc[-1], enabled=i != 2)
+            for i, (cls, pf, loc) in enumerate(zip(vocab, head.cv3, head.cv2))
+        )
+        for loc_head, cls_head in zip(head.cv2, head.cv3):
+            assert isinstance(loc_head, nn.Sequential)
+            assert isinstance(cls_head, nn.Sequential)
+            del loc_head[-1]
+            del cls_head[-1]
+        self.model[-1].nc = len(names)
+        self.names = check_class_names(names)
+
+    def get_vocab(self, names):
+        """
+        Get fused vocabulary layer from the model.
+
+        Args:
+            names (list): List of class names.
+
+        Returns:
+            (nn.ModuleList): List of vocabulary modules.
+        """
+        assert not self.training
+        head = self.model[-1]
+        assert isinstance(head, YOLOEDetect)
+        assert not head.is_fused
+
+        tpe = self.get_text_pe(names)
+        self.set_classes(names, tpe)
+        device = next(self.model.parameters()).device
+        head.fuse(self.pe.to(device))  # fuse prompt embeddings to classify head
+
+        vocab = nn.ModuleList()
+        for cls_head in head.cv3:
+            assert isinstance(cls_head, nn.Sequential)
+            vocab.append(cls_head[-1])
+        return vocab
+
+    def set_classes(self, names, embeddings):
+        """
+        Set classes in advance so that model could do offline-inference without clip model.
+
+        Args:
+            names (List[str]): List of class names.
+            embeddings (torch.Tensor): Embeddings tensor.
+        """
+        assert embeddings.ndim == 3
+        self.pe = embeddings
+        self.model[-1].nc = len(names)
+        self.names = check_class_names(names)
+
+    def get_cls_pe(self, tpe, vpe):
+        """
+        Get class positional embeddings.
+
+        Args:
+            tpe (torch.Tensor, optional): Text positional embeddings.
+            vpe (torch.Tensor, optional): Visual positional embeddings.
+
+        Returns:
+            (torch.Tensor): Class positional embeddings.
+        """
+        all_pe = []
+        if tpe is not None:
+            assert tpe.ndim == 3
+            all_pe.append(tpe)
+        if vpe is not None:
+            assert vpe.ndim == 3
+            all_pe.append(vpe)
+        if not all_pe:
+            all_pe.append(getattr(self, "pe", torch.zeros(1, 80, 512)))
+        return torch.cat(all_pe, dim=1)
+
+    def predict(
+        self, x, profile=False, visualize=False, tpe=None, augment=False, embed=None, vpe=None, return_vpe=False
+    ):
+        """
+        Perform a forward pass through the model.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+            profile (bool): If True, profile the computation time for each layer.
+            visualize (bool): If True, save feature maps for visualization.
+            tpe (torch.Tensor, optional): Text positional embeddings.
+            augment (bool): If True, perform data augmentation during inference.
+            embed (list, optional): A list of feature vectors/embeddings to return.
+            vpe (torch.Tensor, optional): Visual positional embeddings.
+            return_vpe (bool): If True, return visual positional embeddings.
+
+        Returns:
+            (torch.Tensor): Model's output tensor.
+        """
+        y, dt, embeddings = [], [], []  # outputs
+        b = x.shape[0]
+        for m in self.model:  # except the head part
+            if m.f != -1:  # if not from previous layer
+                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
+            if profile:
+                self._profile_one_layer(m, x, dt)
+            if isinstance(m, YOLOEDetect):
+                vpe = m.get_vpe(x, vpe) if vpe is not None else None
+                if return_vpe:
+                    assert vpe is not None
+                    assert not self.training
+                    return vpe
+                cls_pe = self.get_cls_pe(m.get_tpe(tpe), vpe).to(device=x[0].device, dtype=x[0].dtype)
+                if len(cls_pe) != b:
+                    cls_pe = cls_pe.repeat(b, 1, 1)
+                x = m(x, cls_pe)
+            else:
+                x = m(x)  # run
+
+            y.append(x if m.i in self.save else None)  # save output
+            if visualize:
+                feature_visualization(x, m.type, m.i, save_dir=visualize)
+            if embed and m.i in embed:
+                embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
+                if m.i == max(embed):
+                    return torch.unbind(torch.cat(embeddings, 1), dim=0)
+        return x
+
+    def loss(self, batch, preds=None):
+        """
+        Compute loss.
+
+        Args:
+            batch (dict): Batch to compute loss on.
+            preds (torch.Tensor | List[torch.Tensor], optional): Predictions.
+        """
+        if not hasattr(self, "criterion"):
+            from ultralytics.utils.loss import TVPDetectLoss
+
+            visual_prompt = batch.get("visuals", None) is not None  # TODO
+            self.criterion = TVPDetectLoss(self) if visual_prompt else self.init_criterion()
+
+        if preds is None:
+            preds = self.forward(batch["img"], tpe=batch.get("txt_feats", None), vpe=batch.get("visuals", None))
+        return self.criterion(preds, batch)
+
+
+class YOLOESegModel(YOLOEModel, SegmentationModel):
+    """YOLOE segmentation model."""
+
+    def __init__(self, cfg="yoloe-v8s-seg.yaml", ch=3, nc=None, verbose=True):
+        """
+        Initialize YOLOE segmentation model with given config and parameters.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            verbose (bool): Whether to display model information.
+        """
+        super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
+
+    def loss(self, batch, preds=None):
+        """
+        Compute loss.
+
+        Args:
+            batch (dict): Batch to compute loss on.
+            preds (torch.Tensor | List[torch.Tensor], optional): Predictions.
+        """
+        if not hasattr(self, "criterion"):
+            from ultralytics.utils.loss import TVPSegmentLoss
+
+            visual_prompt = batch.get("visuals", None) is not None  # TODO
+            self.criterion = TVPSegmentLoss(self) if visual_prompt else self.init_criterion()
+
+        if preds is None:
+            preds = self.forward(batch["img"], tpe=batch.get("txt_feats", None), vpe=batch.get("visuals", None))
+        return self.criterion(preds, batch)
+
+
 class Ensemble(torch.nn.ModuleList):
     """Ensemble of models."""
 
@@ -1185,6 +1447,8 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
                 legacy = False
                 if scale in "lx":  # for L/X sizes
                     args.extend((True, 1.2))
+            if m is C2fCIB:
+                legacy = False
         elif m is AIFI:
             args = [ch[f], *args]
         elif m in frozenset({HGStem, HGBlock}):
@@ -1199,11 +1463,13 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
             args = [ch[f]]
         elif m is Concat:
            c2 = sum(ch[x] for x in f)
-        elif m in frozenset(
+        elif m in frozenset(
+            {Detect, WorldDetect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB, ImagePoolingAttn, v10Detect}
+        ):
             args.append([ch[x] for x in f])
-            if m is Segment:
+            if m is Segment or m is YOLOESegment:
                 args[2] = make_divisible(min(args[2], max_channels) * width, 8)
-            if m in {Detect, Segment, Pose, OBB}:
+            if m in {Detect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB}:
                 m.legacy = legacy
         elif m is RTDETRDecoder:  # special case, channels arg must be passed in index 1
             args.insert(1, [ch[x] for x in f])
@@ -1269,7 +1535,7 @@ def guess_model_scale(model_path):
         (str): The size character of the model's scale (n, s, m, l, or x).
     """
     try:
-        return re.search(r"yolo[v]?\d+([nslmx])", Path(model_path).stem).group(
+        return re.search(r"yolo(e-)?[v]?\d+([nslmx])", Path(model_path).stem).group(2)  # noqa
     except AttributeError:
         return ""
 
@@ -1292,7 +1558,7 @@ def guess_model_task(model):
            return "classify"
        if "detect" in m:
            return "detect"
-       if
+       if "segment" in m:
            return "segment"
        if m == "pose":
            return "pose"
@@ -1312,7 +1578,7 @@ def guess_model_task(model):
            with contextlib.suppress(Exception):
                return cfg2task(eval(x))
        for m in model.modules():
-           if isinstance(m, Segment):
+           if isinstance(m, (Segment, YOLOESegment)):
                return "segment"
            elif isinstance(m, Classify):
                return "classify"
@@ -1320,7 +1586,7 @@ def guess_model_task(model):
                return "pose"
            elif isinstance(m, OBB):
                return "obb"
-           elif isinstance(m, (Detect, WorldDetect, v10Detect)):
+           elif isinstance(m, (Detect, WorldDetect, YOLOEDetect, v10Detect)):
                return "detect"
 
    # Guess from model filename
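Taken together, the tasks.py additions let a YOLOE model be primed once with text (or visual) prompt embeddings and then run like a fixed-class detector, with no text encoder in the loop afterwards. A minimal sketch of that flow, assuming the high-level `YOLOE` wrapper exported by this release and the `yoloe-11s-seg.pt` asset listed in the downloads diff further down (the image path and class names are illustrative only):

    from ultralytics import YOLOE  # wrapper class added alongside these changes

    model = YOLOE("yoloe-11s-seg.pt")  # builds a YOLOESegModel under the hood
    names = ["person", "traffic light"]
    # get_text_pe() runs the CLIP/MobileCLIP text encoder; set_classes() caches the
    # resulting embeddings on the model so later predictions are prompt-free.
    model.set_classes(names, model.get_text_pe(names))
    results = model.predict("bus.jpg")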
ultralytics/nn/text_model.py
ADDED
@@ -0,0 +1,193 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+from abc import abstractmethod
+from pathlib import Path
+
+import torch
+import torch.nn as nn
+
+from ultralytics.utils import LOGGER, checks
+from ultralytics.utils.torch_utils import smart_inference_mode
+
+try:
+    import clip
+except ImportError:
+    checks.check_requirements("git+https://github.com/ultralytics/CLIP.git")
+    import clip
+
+try:
+    import warnings
+
+    # Suppress 'timm.models.layers is deprecated, please import via timm.layers' warning from mobileclip usage
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=FutureWarning)
+        import mobileclip
+except ImportError:
+    # MobileCLIP repo has an incorrect version of torchvision as dependency
+    # Manually install other dependencies first and install mobileclip with "--no-deps" flag
+    checks.check_requirements(["open-clip-torch>=2.20.0", "timm>=0.9.5"])
+    checks.check_requirements("git+https://github.com/apple/ml-mobileclip.git", cmds="--no-deps")
+    import mobileclip
+
+
+class TextModel(nn.Module):
+    """
+    Abstract base class for text encoding models.
+
+    This class defines the interface for text encoding models used in vision-language tasks. Subclasses must implement
+    the tokenize and encode_text methods.
+
+    Methods:
+        tokenize: Convert input texts to tokens.
+        encode_text: Encode tokenized texts into feature vectors.
+    """
+
+    def __init__(self):
+        """Initialize the TextModel base class."""
+        super().__init__()
+
+    @abstractmethod
+    def tokenize(texts):
+        """Convert input texts to tokens for model processing."""
+        pass
+
+    @abstractmethod
+    def encode_text(texts, dtype):
+        """Encode tokenized texts into normalized feature vectors."""
+        pass
+
+
+class CLIP(TextModel):
+    """
+    OpenAI CLIP text encoder implementation.
+
+    This class implements the TextModel interface using OpenAI's CLIP model for text encoding.
+
+    Attributes:
+        model (clip.model.CLIP): The loaded CLIP model.
+        device (torch.device): Device where the model is loaded.
+
+    Methods:
+        tokenize: Convert input texts to CLIP tokens.
+        encode_text: Encode tokenized texts into normalized feature vectors.
+    """
+
+    def __init__(self, size, device):
+        """
+        Initialize the CLIP text encoder.
+
+        Args:
+            size (str): Model size identifier (e.g., 'ViT-B/32').
+            device (torch.device): Device to load the model on.
+        """
+        super().__init__()
+        self.model = clip.load(size, device=device)[0]
+        self.to(device)
+        self.device = device
+        self.eval()
+
+    def tokenize(self, texts):
+        """Convert input texts to CLIP tokens."""
+        return clip.tokenize(texts).to(self.device)
+
+    @smart_inference_mode()
+    def encode_text(self, texts, dtype=torch.float32):
+        """
+        Encode tokenized texts into normalized feature vectors.
+
+        Args:
+            texts (torch.Tensor): Tokenized text inputs.
+            dtype (torch.dtype): Data type for output features.
+
+        Returns:
+            (torch.Tensor): Normalized text feature vectors.
+        """
+        txt_feats = self.model.encode_text(texts).to(dtype)
+        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
+        return txt_feats
+
+
+class MobileCLIP(TextModel):
+    """
+    Apple MobileCLIP text encoder implementation.
+
+    This class implements the TextModel interface using Apple's MobileCLIP model for efficient text encoding.
+
+    Attributes:
+        model (mobileclip.model.MobileCLIP): The loaded MobileCLIP model.
+        tokenizer (callable): Tokenizer function for processing text inputs.
+        device (torch.device): Device where the model is loaded.
+        config_size_map (dict): Mapping from size identifiers to model configuration names.
+
+    Methods:
+        tokenize: Convert input texts to MobileCLIP tokens.
+        encode_text: Encode tokenized texts into normalized feature vectors.
+    """
+
+    config_size_map = {"s0": "s0", "s1": "s1", "s2": "s2", "b": "b", "blt": "b"}
+
+    def __init__(self, size, device):
+        """
+        Initialize the MobileCLIP text encoder.
+
+        Args:
+            size (str): Model size identifier (e.g., 's0', 's1', 's2', 'b', 'blt').
+            device (torch.device): Device to load the model on.
+        """
+        super().__init__()
+        config = self.config_size_map[size]
+        file = f"mobileclip_{size}.pt"
+        if not Path(file).is_file():
+            from ultralytics import download
+
+            download(f"https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/{file}")
+        self.model = mobileclip.create_model_and_transforms(f"mobileclip_{config}", pretrained=file, device=device)[0]
+        self.tokenizer = mobileclip.get_tokenizer(f"mobileclip_{config}")
+        self.to(device)
+        self.device = device
+        self.eval()
+
+    def tokenize(self, texts):
+        """Convert input texts to MobileCLIP tokens."""
+        return self.tokenizer(texts).to(self.device)
+
+    @smart_inference_mode()
+    def encode_text(self, texts, dtype=torch.float32):
+        """
+        Encode tokenized texts into normalized feature vectors.
+
+        Args:
+            texts (torch.Tensor): Tokenized text inputs.
+            dtype (torch.dtype): Data type for output features.
+
+        Returns:
+            (torch.Tensor): Normalized text feature vectors.
+        """
+        text_features = self.model.encode_text(texts).to(dtype)
+        text_features /= text_features.norm(p=2, dim=-1, keepdim=True)
+        return text_features
+
+
+def build_text_model(variant, device=None):
+    """
+    Build a text encoding model based on the specified variant.
+
+    Args:
+        variant (str): Model variant in format "base:size" (e.g., "clip:ViT-B/32" or "mobileclip:s0").
+        device (torch.device, optional): Device to load the model on.
+
+    Returns:
+        (TextModel): Instantiated text encoding model.
+
+    Raises:
+        AssertionError: If the specified variant is not supported.
+    """
+    LOGGER.info(f"Build text model {variant}")
+    base, size = variant.split(":")
+    if base == "clip":
+        return CLIP(size, device)
+    elif base == "mobileclip":
+        return MobileCLIP(size, device)
+    else:
+        print("Variant not found")
+        assert False
ultralytics/utils/benchmarks.py
CHANGED
@@ -126,6 +126,7 @@ def benchmark(
             assert not isinstance(model, YOLOWorld), "YOLOWorldv2 TensorFlow exports not supported by onnx2tf yet"
         if i == 11:  # Paddle
             assert not isinstance(model, YOLOWorld), "YOLOWorldv2 Paddle exports not supported yet"
+            assert not model.task == "obb", "Paddle OBB bug https://github.com/PaddlePaddle/Paddle/issues/72024"
             assert not is_end2end, "End-to-end models not supported by PaddlePaddle yet"
             assert LINUX or MACOS, "Windows Paddle exports not supported yet"
         if i == 12:  # MNN
ultralytics/utils/callbacks/comet.py
CHANGED
@@ -194,12 +194,9 @@ def _format_prediction_annotations(image_path, metadata, class_label_map=None, c
         LOGGER.debug(f"COMET WARNING: Image: {image_path} has no bounding boxes predictions")
         return None
 
-    label_index_offset = 0
-    if class_map is not None:
     # offset to align indices of class labels (starting from zero)
     # with prediction's category ID indices (can start from one)
-
-
+    label_index_offset = sorted(class_map)[0] if class_map is not None else 0
     try:
         # import pycotools utilities to decompress annotations for various tasks, e.g. segmentation
         from pycocotools.mask import decode  # noqa
@@ -221,8 +218,8 @@ def _format_prediction_annotations(image_path, metadata, class_label_map=None, c
         segments = prediction.get("segmentation", None)
         if segments is not None:
             segments = _extract_segmentation_annotation(segments, decode)
-
-
+            if segments is not None:
+                annotation_data["points"] = segments
 
         data.append(annotation_data)
 
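The net effect of the first hunk is that the offset is now derived directly from the smallest key of class_map rather than tracked in a separately assigned variable. A quick illustration with a made-up mapping (the dictionary below is hypothetical, not from the package):

    class_map = {1: "person", 2: "bicycle", 3: "car"}  # category IDs starting from one
    label_index_offset = sorted(class_map)[0] if class_map is not None else 0  # -> 1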
ultralytics/utils/downloads.py
CHANGED
@@ -15,7 +15,7 @@ from ultralytics.utils import LOGGER, TQDM, checks, clean_url, emojis, is_online
 
 # Define Ultralytics GitHub assets maintained at https://github.com/ultralytics/assets
 GITHUB_ASSETS_REPO = "ultralytics/assets"
-GITHUB_ASSETS_NAMES = (
+GITHUB_ASSETS_NAMES = frozenset(
     [f"yolov8{k}{suffix}.pt" for k in "nsmlx" for suffix in ("", "-cls", "-seg", "-pose", "-obb", "-oiv7")]
     + [f"yolo11{k}{suffix}.pt" for k in "nsmlx" for suffix in ("", "-cls", "-seg", "-pose", "-obb")]
     + [f"yolo12{k}{suffix}.pt" for k in "nsmlx" for suffix in ("",)]  # detect models only currently
@@ -23,16 +23,20 @@ GITHUB_ASSETS_NAMES = (
     + [f"yolov3{k}u.pt" for k in ("", "-spp", "-tiny")]
     + [f"yolov8{k}-world.pt" for k in "smlx"]
     + [f"yolov8{k}-worldv2.pt" for k in "smlx"]
+    + [f"yoloe-v8{k}{suffix}.pt" for k in "sml" for suffix in ("-seg", "-seg-pf")]
+    + [f"yoloe-11{k}{suffix}.pt" for k in "sml" for suffix in ("-seg", "-seg-pf")]
     + [f"yolov9{k}.pt" for k in "tsmce"]
     + [f"yolov10{k}.pt" for k in "nsmblx"]
     + [f"yolo_nas_{k}.pt" for k in "sml"]
     + [f"sam_{k}.pt" for k in "bl"]
+    + [f"sam2_{k}.pt" for k in "blst"]
+    + [f"sam2.1_{k}.pt" for k in "blst"]
     + [f"FastSAM-{k}.pt" for k in "sx"]
     + [f"rtdetr-{k}.pt" for k in "lx"]
     + ["mobile_sam.pt"]
     + ["calibration_image_sample_data_20x128x128x3_float32.npy.zip"]
 )
-GITHUB_ASSETS_STEMS =
+GITHUB_ASSETS_STEMS = frozenset(k.rsplit(".", 1)[0] for k in GITHUB_ASSETS_NAMES)
 
 
 def is_url(url, check=False):