ultralytics 8.3.98__py3-none-any.whl → 8.3.100__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. tests/test_python.py +56 -0
  2. ultralytics/__init__.py +3 -2
  3. ultralytics/cfg/models/11/yoloe-11-seg.yaml +48 -0
  4. ultralytics/cfg/models/11/yoloe-11.yaml +48 -0
  5. ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +45 -0
  6. ultralytics/cfg/models/v8/yoloe-v8.yaml +45 -0
  7. ultralytics/data/augment.py +101 -5
  8. ultralytics/data/dataset.py +165 -12
  9. ultralytics/engine/exporter.py +5 -4
  10. ultralytics/engine/trainer.py +16 -7
  11. ultralytics/models/__init__.py +2 -2
  12. ultralytics/models/yolo/__init__.py +3 -3
  13. ultralytics/models/yolo/detect/val.py +6 -1
  14. ultralytics/models/yolo/model.py +183 -3
  15. ultralytics/models/yolo/segment/val.py +43 -16
  16. ultralytics/models/yolo/yoloe/__init__.py +21 -0
  17. ultralytics/models/yolo/yoloe/predict.py +170 -0
  18. ultralytics/models/yolo/yoloe/train.py +355 -0
  19. ultralytics/models/yolo/yoloe/train_seg.py +141 -0
  20. ultralytics/models/yolo/yoloe/val.py +187 -0
  21. ultralytics/nn/autobackend.py +17 -7
  22. ultralytics/nn/modules/__init__.py +18 -1
  23. ultralytics/nn/modules/block.py +17 -1
  24. ultralytics/nn/modules/head.py +359 -22
  25. ultralytics/nn/tasks.py +276 -10
  26. ultralytics/nn/text_model.py +193 -0
  27. ultralytics/utils/benchmarks.py +1 -0
  28. ultralytics/utils/callbacks/comet.py +3 -6
  29. ultralytics/utils/downloads.py +6 -2
  30. ultralytics/utils/loss.py +67 -6
  31. ultralytics/utils/plotting.py +1 -1
  32. ultralytics/utils/tal.py +1 -1
  33. {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/METADATA +10 -10
  34. {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/RECORD +38 -28
  35. {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/WHEEL +0 -0
  36. {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/entry_points.txt +0 -0
  37. {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/licenses/LICENSE +0 -0
  38. {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/top_level.txt +0 -0
ultralytics/nn/tasks.py CHANGED
@@ -8,7 +8,9 @@ from copy import deepcopy
 from pathlib import Path
 
 import torch
+import torch.nn as nn
 
+from ultralytics.nn.autobackend import check_class_names
 from ultralytics.nn.modules import (
     AIFI,
     C1,
@@ -51,6 +53,7 @@ from ultralytics.nn.modules import (
     HGStem,
     ImagePoolingAttn,
     Index,
+    LRPCHead,
     Pose,
     RepC3,
     RepConv,
@@ -62,6 +65,8 @@ from ultralytics.nn.modules import (
     Segment,
     TorchVision,
     WorldDetect,
+    YOLOEDetect,
+    YOLOESegment,
     v10Detect,
 )
 from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load
@@ -83,6 +88,7 @@ from ultralytics.utils.torch_utils import (
     intersect_dicts,
     model_info,
     scale_img,
+    smart_inference_mode,
     time_sync,
 )
 
@@ -255,7 +261,9 @@ class BaseModel(torch.nn.Module):
         """
         self = super()._apply(fn)
         m = self.model[-1]  # Detect()
-        if isinstance(m, Detect):  # includes all Detect subclasses like Segment, Pose, OBB, WorldDetect
+        if isinstance(
+            m, Detect
+        ):  # includes all Detect subclasses like Segment, Pose, OBB, WorldDetect, YOLOEDetect, YOLOESegment
             m.stride = fn(m.stride)
             m.anchors = fn(m.anchors)
             m.strides = fn(m.strides)
@@ -329,7 +337,7 @@ class DetectionModel(BaseModel):
 
         # Build strides
         m = self.model[-1]  # Detect()
-        if isinstance(m, Detect):  # includes all Detect subclasses like Segment, Pose, OBB, WorldDetect
+        if isinstance(m, Detect):  # includes all Detect subclasses like Segment, Pose, OBB, YOLOEDetect, YOLOESegment
             s = 256  # 2x min stride
             m.inplace = self.inplace
 
@@ -337,7 +345,7 @@
                 """Perform a forward pass through the model, handling different Detect subclass types accordingly."""
                 if self.end2end:
                     return self.forward(x)["one2many"]
-                return self.forward(x)[0] if isinstance(m, (Segment, Pose, OBB)) else self.forward(x)
+                return self.forward(x)[0] if isinstance(m, (Segment, YOLOESegment, Pose, OBB)) else self.forward(x)
 
             m.stride = torch.tensor([s / x.shape[-2] for x in _forward(torch.zeros(1, ch, s, s))])  # forward
             self.stride = m.stride
@@ -778,6 +786,260 @@ class WorldModel(DetectionModel):
         return self.criterion(preds, batch)
 
 
+class YOLOEModel(DetectionModel):
+    """YOLOE detection model."""
+
+    def __init__(self, cfg="yoloe-v8s.yaml", ch=3, nc=None, verbose=True):
+        """
+        Initialize YOLOE model with given config and parameters.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            verbose (bool): Whether to display model information.
+        """
+        super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
+
+    @smart_inference_mode()
+    def get_text_pe(self, text, batch=80, cache_clip_model=False, without_reprta=False):
+        """
+        Set classes in advance so that model could do offline-inference without clip model.
+
+        Args:
+            text (List[str]): List of class names.
+            batch (int): Batch size for processing text tokens.
+            cache_clip_model (bool): Whether to cache the CLIP model.
+            without_reprta (bool): Whether to return text embeddings cooperated with reprta module.
+
+        Returns:
+            (torch.Tensor): Text positional embeddings.
+        """
+        from ultralytics.nn.text_model import build_text_model
+
+        device = next(self.model.parameters()).device
+        if not getattr(self, "clip_model", None) and cache_clip_model:
+            # For backwards compatibility of models lacking clip_model attribute
+            self.clip_model = build_text_model("mobileclip:blt", device=device)
+
+        model = self.clip_model if cache_clip_model else build_text_model("mobileclip:blt", device=device)
+        text_token = model.tokenize(text)
+        txt_feats = [model.encode_text(token).detach() for token in text_token.split(batch)]
+        txt_feats = txt_feats[0] if len(txt_feats) == 1 else torch.cat(txt_feats, dim=0)
+        txt_feats = txt_feats.reshape(-1, len(text), txt_feats.shape[-1])
+        if without_reprta:
+            return txt_feats
+
+        assert not self.training
+        head = self.model[-1]
+        assert isinstance(head, YOLOEDetect)
+        return head.get_tpe(txt_feats)  # run axuiliary text head
+
+    @smart_inference_mode()
+    def get_visual_pe(self, img, visual):
+        """
+        Get visual embeddings.
+
+        Args:
+            img (torch.Tensor): Input image tensor.
+            visual (torch.Tensor): Visual features.
+
+        Returns:
+            (torch.Tensor): Visual positional embeddings.
+        """
+        return self(img, vpe=visual, return_vpe=True)
+
+    def set_vocab(self, vocab, names):
+        """
+        Set vocabulary for the prompt-free model.
+
+        Args:
+            vocab (nn.ModuleList): List of vocabulary items.
+            names (List[str]): List of class names.
+        """
+        assert not self.training
+        head = self.model[-1]
+        assert isinstance(head, YOLOEDetect)
+
+        # Cache anchors for head
+        device = next(self.parameters()).device
+        self(torch.empty(1, 3, self.args["imgsz"], self.args["imgsz"]).to(device))  # warmup
+
+        # re-parameterization for prompt-free model
+        self.model[-1].lrpc = nn.ModuleList(
+            LRPCHead(cls, pf[-1], loc[-1], enabled=i != 2)
+            for i, (cls, pf, loc) in enumerate(zip(vocab, head.cv3, head.cv2))
+        )
+        for loc_head, cls_head in zip(head.cv2, head.cv3):
+            assert isinstance(loc_head, nn.Sequential)
+            assert isinstance(cls_head, nn.Sequential)
+            del loc_head[-1]
+            del cls_head[-1]
+        self.model[-1].nc = len(names)
+        self.names = check_class_names(names)
+
+    def get_vocab(self, names):
+        """
+        Get fused vocabulary layer from the model.
+
+        Args:
+            names (list): List of class names.
+
+        Returns:
+            (nn.ModuleList): List of vocabulary modules.
+        """
+        assert not self.training
+        head = self.model[-1]
+        assert isinstance(head, YOLOEDetect)
+        assert not head.is_fused
+
+        tpe = self.get_text_pe(names)
+        self.set_classes(names, tpe)
+        device = next(self.model.parameters()).device
+        head.fuse(self.pe.to(device))  # fuse prompt embeddings to classify head
+
+        vocab = nn.ModuleList()
+        for cls_head in head.cv3:
+            assert isinstance(cls_head, nn.Sequential)
+            vocab.append(cls_head[-1])
+        return vocab
+
+    def set_classes(self, names, embeddings):
+        """
+        Set classes in advance so that model could do offline-inference without clip model.
+
+        Args:
+            names (List[str]): List of class names.
+            embeddings (torch.Tensor): Embeddings tensor.
+        """
+        assert embeddings.ndim == 3
+        self.pe = embeddings
+        self.model[-1].nc = len(names)
+        self.names = check_class_names(names)
+
+    def get_cls_pe(self, tpe, vpe):
+        """
+        Get class positional embeddings.
+
+        Args:
+            tpe (torch.Tensor, optional): Text positional embeddings.
+            vpe (torch.Tensor, optional): Visual positional embeddings.
+
+        Returns:
+            (torch.Tensor): Class positional embeddings.
+        """
+        all_pe = []
+        if tpe is not None:
+            assert tpe.ndim == 3
+            all_pe.append(tpe)
+        if vpe is not None:
+            assert vpe.ndim == 3
+            all_pe.append(vpe)
+        if not all_pe:
+            all_pe.append(getattr(self, "pe", torch.zeros(1, 80, 512)))
+        return torch.cat(all_pe, dim=1)
+
+    def predict(
+        self, x, profile=False, visualize=False, tpe=None, augment=False, embed=None, vpe=None, return_vpe=False
+    ):
+        """
+        Perform a forward pass through the model.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+            profile (bool): If True, profile the computation time for each layer.
+            visualize (bool): If True, save feature maps for visualization.
+            tpe (torch.Tensor, optional): Text positional embeddings.
+            augment (bool): If True, perform data augmentation during inference.
+            embed (list, optional): A list of feature vectors/embeddings to return.
+            vpe (torch.Tensor, optional): Visual positional embeddings.
+            return_vpe (bool): If True, return visual positional embeddings.
+
+        Returns:
+            (torch.Tensor): Model's output tensor.
+        """
+        y, dt, embeddings = [], [], []  # outputs
+        b = x.shape[0]
+        for m in self.model:  # except the head part
+            if m.f != -1:  # if not from previous layer
+                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
+            if profile:
+                self._profile_one_layer(m, x, dt)
+            if isinstance(m, YOLOEDetect):
+                vpe = m.get_vpe(x, vpe) if vpe is not None else None
+                if return_vpe:
+                    assert vpe is not None
+                    assert not self.training
+                    return vpe
+                cls_pe = self.get_cls_pe(m.get_tpe(tpe), vpe).to(device=x[0].device, dtype=x[0].dtype)
+                if len(cls_pe) != b:
+                    cls_pe = cls_pe.repeat(b, 1, 1)
+                x = m(x, cls_pe)
+            else:
+                x = m(x)  # run
+
+            y.append(x if m.i in self.save else None)  # save output
+            if visualize:
+                feature_visualization(x, m.type, m.i, save_dir=visualize)
+            if embed and m.i in embed:
+                embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
+                if m.i == max(embed):
+                    return torch.unbind(torch.cat(embeddings, 1), dim=0)
+        return x
+
+    def loss(self, batch, preds=None):
+        """
+        Compute loss.
+
+        Args:
+            batch (dict): Batch to compute loss on.
+            preds (torch.Tensor | List[torch.Tensor], optional): Predictions.
+        """
+        if not hasattr(self, "criterion"):
+            from ultralytics.utils.loss import TVPDetectLoss
+
+            visual_prompt = batch.get("visuals", None) is not None  # TODO
+            self.criterion = TVPDetectLoss(self) if visual_prompt else self.init_criterion()
+
+        if preds is None:
+            preds = self.forward(batch["img"], tpe=batch.get("txt_feats", None), vpe=batch.get("visuals", None))
+        return self.criterion(preds, batch)
+
+
+class YOLOESegModel(YOLOEModel, SegmentationModel):
+    """YOLOE segmentation model."""
+
+    def __init__(self, cfg="yoloe-v8s-seg.yaml", ch=3, nc=None, verbose=True):
+        """
+        Initialize YOLOE segmentation model with given config and parameters.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            verbose (bool): Whether to display model information.
+        """
+        super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
+
+    def loss(self, batch, preds=None):
+        """
+        Compute loss.
+
+        Args:
+            batch (dict): Batch to compute loss on.
+            preds (torch.Tensor | List[torch.Tensor], optional): Predictions.
+        """
+        if not hasattr(self, "criterion"):
+            from ultralytics.utils.loss import TVPSegmentLoss
+
+            visual_prompt = batch.get("visuals", None) is not None  # TODO
+            self.criterion = TVPSegmentLoss(self) if visual_prompt else self.init_criterion()
+
+        if preds is None:
+            preds = self.forward(batch["img"], tpe=batch.get("txt_feats", None), vpe=batch.get("visuals", None))
+        return self.criterion(preds, batch)
+
+
 class Ensemble(torch.nn.ModuleList):
     """Ensemble of models."""
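To make the tensor flow in the new get_cls_pe method concrete, here is a minimal sketch (not part of the diff) using toy shapes; the 512 width matches the default fallback embedding shown above:

import torch

tpe = torch.randn(1, 3, 512)  # text prompt embeddings for 3 class names (toy values)
vpe = torch.randn(1, 2, 512)  # visual prompt embeddings for 2 exemplars (toy values)
cls_pe = torch.cat([tpe, vpe], dim=1)  # mirrors get_cls_pe when both prompts are given
assert cls_pe.shape == (1, 5, 512)
# With neither prompt given, get_cls_pe falls back to getattr(self, "pe", torch.zeros(1, 80, 512)).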
 
@@ -1185,6 +1447,8 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
                 legacy = False
                 if scale in "lx":  # for L/X sizes
                     args.extend((True, 1.2))
+            if m is C2fCIB:
+                legacy = False
         elif m is AIFI:
             args = [ch[f], *args]
         elif m in frozenset({HGStem, HGBlock}):
@@ -1199,11 +1463,13 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
             args = [ch[f]]
         elif m is Concat:
             c2 = sum(ch[x] for x in f)
-        elif m in frozenset({Detect, WorldDetect, Segment, Pose, OBB, ImagePoolingAttn, v10Detect}):
+        elif m in frozenset(
+            {Detect, WorldDetect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB, ImagePoolingAttn, v10Detect}
+        ):
             args.append([ch[x] for x in f])
-            if m is Segment:
+            if m is Segment or m is YOLOESegment:
                 args[2] = make_divisible(min(args[2], max_channels) * width, 8)
-            if m in {Detect, Segment, Pose, OBB}:
+            if m in {Detect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB}:
                 m.legacy = legacy
         elif m is RTDETRDecoder:  # special case, channels arg must be passed in index 1
             args.insert(1, [ch[x] for x in f])
@@ -1269,7 +1535,7 @@ def guess_model_scale(model_path):
         (str): The size character of the model's scale (n, s, m, l, or x).
     """
     try:
-        return re.search(r"yolo[v]?\d+([nslmx])", Path(model_path).stem).group(1)  # returns n, s, m, l, or x
+        return re.search(r"yolo(e-)?[v]?\d+([nslmx])", Path(model_path).stem).group(2)  # noqa
     except AttributeError:
         return ""
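A small illustration (not part of the diff) of why the new pattern returns group(2): the optional "(e-)" group becomes group 1, so the scale letter moves to group 2 for both classic and YOLOE-style stems:

import re

for stem in ("yolov8n", "yolo11s", "yoloe-11s-seg", "yoloe-v8m-seg-pf"):
    print(stem, "->", re.search(r"yolo(e-)?[v]?\d+([nslmx])", stem).group(2))  # n, s, s, m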
 
@@ -1292,7 +1558,7 @@ def guess_model_task(model):
            return "classify"
        if "detect" in m:
            return "detect"
-        if m == "segment":
+        if "segment" in m:
            return "segment"
        if m == "pose":
            return "pose"
@@ -1312,7 +1578,7 @@ def guess_model_task(model):
            with contextlib.suppress(Exception):
                return cfg2task(eval(x))
        for m in model.modules():
-            if isinstance(m, Segment):
+            if isinstance(m, (Segment, YOLOESegment)):
                return "segment"
            elif isinstance(m, Classify):
                return "classify"
@@ -1320,7 +1586,7 @@
                return "pose"
            elif isinstance(m, OBB):
                return "obb"
-            elif isinstance(m, (Detect, WorldDetect, v10Detect)):
+            elif isinstance(m, (Detect, WorldDetect, YOLOEDetect, v10Detect)):
                return "detect"
 
    # Guess from model filename
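Taken together, the new YOLOEModel methods above enable offline text-prompt inference. A rough, untested sketch of that flow (the YAML name is the constructor default; config and weight availability are assumed, not verified against the release):

import torch
from ultralytics.nn.tasks import YOLOEModel

model = YOLOEModel("yoloe-v8s.yaml", ch=3, nc=80, verbose=False).eval()  # get_text_pe asserts eval mode
names = ["person", "bus"]
tpe = model.get_text_pe(names)  # tokenizes and encodes the names with MobileCLIP, then runs the text head
model.set_classes(names, tpe)   # caches embeddings in self.pe so later inference needs no CLIP model
out = model.predict(torch.zeros(1, 3, 640, 640))  # forward pass picks up the cached self.pe via get_cls_pe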
ultralytics/nn/text_model.py ADDED
@@ -0,0 +1,193 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+from abc import abstractmethod
+from pathlib import Path
+
+import torch
+import torch.nn as nn
+
+from ultralytics.utils import LOGGER, checks
+from ultralytics.utils.torch_utils import smart_inference_mode
+
+try:
+    import clip
+except ImportError:
+    checks.check_requirements("git+https://github.com/ultralytics/CLIP.git")
+    import clip
+
+try:
+    import warnings
+
+    # Suppress 'timm.models.layers is deprecated, please import via timm.layers' warning from mobileclip usage
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=FutureWarning)
+        import mobileclip
+except ImportError:
+    # MobileCLIP repo has an incorrect version of torchvision as dependency
+    # Manually install other dependencies first and install mobileclip with "--no-deps" flag
+    checks.check_requirements(["open-clip-torch>=2.20.0", "timm>=0.9.5"])
+    checks.check_requirements("git+https://github.com/apple/ml-mobileclip.git", cmds="--no-deps")
+    import mobileclip
+
+
+class TextModel(nn.Module):
+    """
+    Abstract base class for text encoding models.
+
+    This class defines the interface for text encoding models used in vision-language tasks. Subclasses must implement
+    the tokenize and encode_text methods.
+
+    Methods:
+        tokenize: Convert input texts to tokens.
+        encode_text: Encode tokenized texts into feature vectors.
+    """
+
+    def __init__(self):
+        """Initialize the TextModel base class."""
+        super().__init__()
+
+    @abstractmethod
+    def tokenize(texts):
+        """Convert input texts to tokens for model processing."""
+        pass
+
+    @abstractmethod
+    def encode_text(texts, dtype):
+        """Encode tokenized texts into normalized feature vectors."""
+        pass
+
+
+class CLIP(TextModel):
+    """
+    OpenAI CLIP text encoder implementation.
+
+    This class implements the TextModel interface using OpenAI's CLIP model for text encoding.
+
+    Attributes:
+        model (clip.model.CLIP): The loaded CLIP model.
+        device (torch.device): Device where the model is loaded.
+
+    Methods:
+        tokenize: Convert input texts to CLIP tokens.
+        encode_text: Encode tokenized texts into normalized feature vectors.
+    """
+
+    def __init__(self, size, device):
+        """
+        Initialize the CLIP text encoder.
+
+        Args:
+            size (str): Model size identifier (e.g., 'ViT-B/32').
+            device (torch.device): Device to load the model on.
+        """
+        super().__init__()
+        self.model = clip.load(size, device=device)[0]
+        self.to(device)
+        self.device = device
+        self.eval()
+
+    def tokenize(self, texts):
+        """Convert input texts to CLIP tokens."""
+        return clip.tokenize(texts).to(self.device)
+
+    @smart_inference_mode()
+    def encode_text(self, texts, dtype=torch.float32):
+        """
+        Encode tokenized texts into normalized feature vectors.
+
+        Args:
+            texts (torch.Tensor): Tokenized text inputs.
+            dtype (torch.dtype): Data type for output features.
+
+        Returns:
+            (torch.Tensor): Normalized text feature vectors.
+        """
+        txt_feats = self.model.encode_text(texts).to(dtype)
+        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
+        return txt_feats
+
+
+class MobileCLIP(TextModel):
+    """
+    Apple MobileCLIP text encoder implementation.
+
+    This class implements the TextModel interface using Apple's MobileCLIP model for efficient text encoding.
+
+    Attributes:
+        model (mobileclip.model.MobileCLIP): The loaded MobileCLIP model.
+        tokenizer (callable): Tokenizer function for processing text inputs.
+        device (torch.device): Device where the model is loaded.
+        config_size_map (dict): Mapping from size identifiers to model configuration names.
+
+    Methods:
+        tokenize: Convert input texts to MobileCLIP tokens.
+        encode_text: Encode tokenized texts into normalized feature vectors.
+    """
+
+    config_size_map = {"s0": "s0", "s1": "s1", "s2": "s2", "b": "b", "blt": "b"}
+
+    def __init__(self, size, device):
+        """
+        Initialize the MobileCLIP text encoder.
+
+        Args:
+            size (str): Model size identifier (e.g., 's0', 's1', 's2', 'b', 'blt').
+            device (torch.device): Device to load the model on.
+        """
+        super().__init__()
+        config = self.config_size_map[size]
+        file = f"mobileclip_{size}.pt"
+        if not Path(file).is_file():
+            from ultralytics import download
+
+            download(f"https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/{file}")
+        self.model = mobileclip.create_model_and_transforms(f"mobileclip_{config}", pretrained=file, device=device)[0]
+        self.tokenizer = mobileclip.get_tokenizer(f"mobileclip_{config}")
+        self.to(device)
+        self.device = device
+        self.eval()
+
+    def tokenize(self, texts):
+        """Convert input texts to MobileCLIP tokens."""
+        return self.tokenizer(texts).to(self.device)
+
+    @smart_inference_mode()
+    def encode_text(self, texts, dtype=torch.float32):
+        """
+        Encode tokenized texts into normalized feature vectors.
+
+        Args:
+            texts (torch.Tensor): Tokenized text inputs.
+            dtype (torch.dtype): Data type for output features.
+
+        Returns:
+            (torch.Tensor): Normalized text feature vectors.
+        """
+        text_features = self.model.encode_text(texts).to(dtype)
+        text_features /= text_features.norm(p=2, dim=-1, keepdim=True)
+        return text_features
+
+
+def build_text_model(variant, device=None):
+    """
+    Build a text encoding model based on the specified variant.
+
+    Args:
+        variant (str): Model variant in format "base:size" (e.g., "clip:ViT-B/32" or "mobileclip:s0").
+        device (torch.device, optional): Device to load the model on.
+
+    Returns:
+        (TextModel): Instantiated text encoding model.
+
+    Raises:
+        AssertionError: If the specified variant is not supported.
+    """
+    LOGGER.info(f"Build text model {variant}")
+    base, size = variant.split(":")
+    if base == "clip":
+        return CLIP(size, device)
+    elif base == "mobileclip":
+        return MobileCLIP(size, device)
+    else:
+        print("Variant not found")
+        assert False
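A minimal usage sketch for the new text_model.py module (illustrative only; "mobileclip:blt" is the same variant YOLOEModel.get_text_pe uses above, and the checkpoint is downloaded on first use):

import torch
from ultralytics.nn.text_model import build_text_model

tm = build_text_model("mobileclip:blt", device=torch.device("cpu"))
tokens = tm.tokenize(["a photo of a person", "a photo of a bus"])
feats = tm.encode_text(tokens)          # L2-normalized, shape (2, embed_dim)
print(feats.shape, feats.norm(dim=-1))  # norms are ~1.0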
ultralytics/utils/benchmarks.py CHANGED
@@ -126,6 +126,7 @@ def benchmark(
             assert not isinstance(model, YOLOWorld), "YOLOWorldv2 TensorFlow exports not supported by onnx2tf yet"
         if i == 11:  # Paddle
             assert not isinstance(model, YOLOWorld), "YOLOWorldv2 Paddle exports not supported yet"
+            assert not model.task == "obb", "Paddle OBB bug https://github.com/PaddlePaddle/Paddle/issues/72024"
             assert not is_end2end, "End-to-end models not supported by PaddlePaddle yet"
             assert LINUX or MACOS, "Windows Paddle exports not supported yet"
         if i == 12:  # MNN
ultralytics/utils/callbacks/comet.py CHANGED
@@ -194,12 +194,9 @@ def _format_prediction_annotations(image_path, metadata, class_label_map=None, c
         LOGGER.debug(f"COMET WARNING: Image: {image_path} has no bounding boxes predictions")
         return None
 
-    label_index_offset = 0
-    if class_map is not None:
     # offset to align indices of class labels (starting from zero)
     # with prediction's category ID indices (can start from one)
-        label_index_offset = sorted(class_map)[0]
-
+    label_index_offset = sorted(class_map)[0] if class_map is not None else 0
     try:
         # import pycotools utilities to decompress annotations for various tasks, e.g. segmentation
         from pycocotools.mask import decode  # noqa
@@ -221,8 +218,8 @@ def _format_prediction_annotations(image_path, metadata, class_label_map=None, c
         segments = prediction.get("segmentation", None)
         if segments is not None:
             segments = _extract_segmentation_annotation(segments, decode)
-        if segments is not None:
-            annotation_data["points"] = segments
+            if segments is not None:
+                annotation_data["points"] = segments
 
         data.append(annotation_data)
 
ultralytics/utils/downloads.py CHANGED
@@ -15,7 +15,7 @@ from ultralytics.utils import LOGGER, TQDM, checks, clean_url, emojis, is_online
 
 # Define Ultralytics GitHub assets maintained at https://github.com/ultralytics/assets
 GITHUB_ASSETS_REPO = "ultralytics/assets"
-GITHUB_ASSETS_NAMES = (
+GITHUB_ASSETS_NAMES = frozenset(
     [f"yolov8{k}{suffix}.pt" for k in "nsmlx" for suffix in ("", "-cls", "-seg", "-pose", "-obb", "-oiv7")]
     + [f"yolo11{k}{suffix}.pt" for k in "nsmlx" for suffix in ("", "-cls", "-seg", "-pose", "-obb")]
     + [f"yolo12{k}{suffix}.pt" for k in "nsmlx" for suffix in ("",)]  # detect models only currently
@@ -23,16 +23,20 @@ GITHUB_ASSETS_NAMES = (
     + [f"yolov3{k}u.pt" for k in ("", "-spp", "-tiny")]
     + [f"yolov8{k}-world.pt" for k in "smlx"]
     + [f"yolov8{k}-worldv2.pt" for k in "smlx"]
+    + [f"yoloe-v8{k}{suffix}.pt" for k in "sml" for suffix in ("-seg", "-seg-pf")]
+    + [f"yoloe-11{k}{suffix}.pt" for k in "sml" for suffix in ("-seg", "-seg-pf")]
     + [f"yolov9{k}.pt" for k in "tsmce"]
     + [f"yolov10{k}.pt" for k in "nsmblx"]
     + [f"yolo_nas_{k}.pt" for k in "sml"]
     + [f"sam_{k}.pt" for k in "bl"]
+    + [f"sam2_{k}.pt" for k in "blst"]
+    + [f"sam2.1_{k}.pt" for k in "blst"]
     + [f"FastSAM-{k}.pt" for k in "sx"]
     + [f"rtdetr-{k}.pt" for k in "lx"]
     + ["mobile_sam.pt"]
     + ["calibration_image_sample_data_20x128x128x3_float32.npy.zip"]
 )
-GITHUB_ASSETS_STEMS = [Path(k).stem for k in GITHUB_ASSETS_NAMES]
+GITHUB_ASSETS_STEMS = frozenset(k.rsplit(".", 1)[0] for k in GITHUB_ASSETS_NAMES)
 
 
 def is_url(url, check=False):
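A small check (not part of the diff) showing that the frozenset-based GITHUB_ASSETS_STEMS keeps the same stem strings as the old Path-based list while giving O(1) membership tests:

from pathlib import Path

names = ["yolo11n.pt", "yoloe-11s-seg.pt", "sam2.1_b.pt"]
stems = frozenset(k.rsplit(".", 1)[0] for k in names)
assert stems == {Path(k).stem for k in names}  # same stems for these filenames
print("yoloe-11s-seg" in stems)  # True -- hashed lookup instead of a list scan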