ultralytics 8.3.136__py3-none-any.whl → 8.3.138__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- tests/test_cuda.py +2 -7
- tests/test_exports.py +1 -6
- tests/test_solutions.py +181 -8
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +1 -1
- ultralytics/data/base.py +1 -1
- ultralytics/data/build.py +4 -3
- ultralytics/data/loaders.py +2 -2
- ultralytics/engine/exporter.py +6 -7
- ultralytics/engine/model.py +2 -2
- ultralytics/engine/predictor.py +3 -10
- ultralytics/engine/trainer.py +1 -1
- ultralytics/engine/validator.py +1 -1
- ultralytics/hub/auth.py +2 -2
- ultralytics/hub/utils.py +8 -3
- ultralytics/models/yolo/classify/predict.py +11 -0
- ultralytics/models/yolo/obb/val.py +1 -1
- ultralytics/models/yolo/world/train.py +66 -20
- ultralytics/models/yolo/world/train_world.py +1 -0
- ultralytics/models/yolo/yoloe/train.py +10 -39
- ultralytics/models/yolo/yoloe/val.py +3 -3
- ultralytics/nn/tasks.py +41 -24
- ultralytics/nn/text_model.py +1 -0
- ultralytics/solutions/similarity_search.py +3 -6
- ultralytics/solutions/streamlit_inference.py +1 -1
- ultralytics/utils/__init__.py +1 -1
- ultralytics/utils/callbacks/hub.py +5 -4
- ultralytics/utils/checks.py +13 -13
- ultralytics/utils/downloads.py +7 -5
- ultralytics/utils/export.py +1 -1
- ultralytics/utils/plotting.py +1 -1
- ultralytics/utils/torch_utils.py +3 -0
- ultralytics/utils/triton.py +1 -1
- {ultralytics-8.3.136.dist-info → ultralytics-8.3.138.dist-info}/METADATA +1 -1
- {ultralytics-8.3.136.dist-info → ultralytics-8.3.138.dist-info}/RECORD +39 -39
- {ultralytics-8.3.136.dist-info → ultralytics-8.3.138.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.136.dist-info → ultralytics-8.3.138.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.136.dist-info → ultralytics-8.3.138.dist-info}/licenses/LICENSE +0 -0
- {ultralytics-8.3.136.dist-info → ultralytics-8.3.138.dist-info}/top_level.txt +0 -0
ultralytics/models/yolo/world/train.py
CHANGED
@@ -1,11 +1,14 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

 import itertools
+from pathlib import Path
+
+import torch

 from ultralytics.data import build_yolo_dataset
-from ultralytics.models import
+from ultralytics.models.yolo.detect import DetectionTrainer
 from ultralytics.nn.tasks import WorldModel
-from ultralytics.utils import DEFAULT_CFG,
+from ultralytics.utils import DEFAULT_CFG, LOGGER, RANK
 from ultralytics.utils.torch_utils import de_parallel


@@ -13,15 +16,11 @@ def on_pretrain_routine_end(trainer):
     """Callback to set up model classes and text encoder at the end of the pretrain routine."""
     if RANK in {-1, 0}:
         # Set class names for evaluation
-        names = [name.split("/")[0] for name in list(trainer.test_loader.dataset.data["names"].values())]
+        names = [name.split("/", 1)[0] for name in list(trainer.test_loader.dataset.data["names"].values())]
         de_parallel(trainer.ema.ema).set_classes(names, cache_clip_model=False)
-        device = next(trainer.model.parameters()).device
-        trainer.text_model, _ = trainer.clip.load("ViT-B/32", device=device)
-        for p in trainer.text_model.parameters():
-            p.requires_grad_(False)


-class WorldTrainer(
+class WorldTrainer(DetectionTrainer):
     """
     A class to fine-tune a world model on a close-set dataset.

@@ -54,14 +53,7 @@ class WorldTrainer(yolo.detect.DetectionTrainer):
         if overrides is None:
             overrides = {}
         super().__init__(cfg, overrides, _callbacks)
-
-        # Import and assign clip
-        try:
-            import clip
-        except ImportError:
-            checks.check_requirements("git+https://github.com/ultralytics/CLIP.git")
-            import clip
-        self.clip = clip
+        self.text_embeddings = None

     def get_model(self, cfg=None, weights=None, verbose=True):
         """
@@ -102,18 +94,72 @@ class WorldTrainer(yolo.detect.DetectionTrainer):
             (Dataset): YOLO dataset configured for training or validation.
         """
         gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
-
+        dataset = build_yolo_dataset(
             self.args, img_path, batch, self.data, mode=mode, rect=mode == "val", stride=gs, multi_modal=mode == "train"
         )
+        if mode == "train":
+            self.set_text_embeddings([dataset], batch)  # cache text embeddings to accelerate training
+        return dataset
+
+    def set_text_embeddings(self, datasets, batch):
+        """
+        Set text embeddings for datasets to accelerate training by caching category names.
+
+        This method collects unique category names from all datasets, then generates and caches text embeddings
+        for these categories to improve training efficiency.
+
+        Args:
+            datasets (List[Dataset]): List of datasets from which to extract category names.
+            batch (int | None): Batch size used for processing.
+
+        Notes:
+            This method collects category names from datasets that have the 'category_names' attribute,
+            then uses the first dataset's image path to determine where to cache the generated text embeddings.
+        """
+        text_embeddings = {}
+        for dataset in datasets:
+            if not hasattr(dataset, "category_names"):
+                continue
+            text_embeddings.update(
+                self.generate_text_embeddings(
+                    list(dataset.category_names), batch, cache_dir=Path(dataset.img_path).parent
+                )
+            )
+        self.text_embeddings = text_embeddings
+
+    def generate_text_embeddings(self, texts, batch, cache_dir):
+        """
+        Generate text embeddings for a list of text samples.
+
+        Args:
+            texts (List[str]): List of text samples to encode.
+            batch (int): Batch size for processing.
+            cache_dir (Path): Directory to save/load cached embeddings.
+
+        Returns:
+            (dict): Dictionary mapping text samples to their embeddings.
+        """
+        model = "clip:ViT-B/32"
+        cache_path = cache_dir / f"text_embeddings_{model.replace(':', '_').replace('/', '_')}.pt"
+        if cache_path.exists():
+            LOGGER.info(f"Reading existed cache from '{cache_path}'")
+            txt_map = torch.load(cache_path)
+            if sorted(txt_map.keys()) == sorted(texts):
+                return txt_map
+        LOGGER.info(f"Caching text embeddings to '{cache_path}'")
+        assert self.model is not None
+        txt_feats = self.model.get_text_pe(texts, batch, cache_clip_model=False)
+        txt_map = dict(zip(texts, txt_feats.squeeze(0)))
+        torch.save(txt_map, cache_path)
+        return txt_map

     def preprocess_batch(self, batch):
         """Preprocess a batch of images and text for YOLOWorld training."""
-        batch =
+        batch = DetectionTrainer.preprocess_batch(self, batch)

         # Add text features
         texts = list(itertools.chain(*batch["texts"]))
-
-        txt_feats = self.text_model.encode_text(text_token).to(dtype=batch["img"].dtype)  # torch.float32
+        txt_feats = torch.stack([self.text_embeddings[text] for text in texts]).to(self.device)
         txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
         batch["txt_feats"] = txt_feats.reshape(len(batch["texts"]), -1, txt_feats.shape[-1])
         return batch
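The new `WorldTrainer.generate_text_embeddings()` above caches text features on disk and only reuses the cache when its keys match the requested class names exactly. Below is a minimal, self-contained sketch of that caching pattern; the `encode` callable and file name are illustrative stand-ins, not the Ultralytics API.

```python
from pathlib import Path

import torch


def cached_text_embeddings(texts, cache_path: Path, encode):
    """Return a {text: embedding} dict, reusing cache_path only if it covers the same texts."""
    if cache_path.exists():
        cached = torch.load(cache_path)
        if sorted(cached) == sorted(texts):  # cache hit: identical class names
            return cached
    feats = encode(texts)  # any callable returning a (len(texts), dim) tensor
    mapping = dict(zip(texts, feats))
    torch.save(mapping, cache_path)  # write (or overwrite a stale) cache
    return mapping


# Example with a stand-in encoder producing random 512-d features
emb = cached_text_embeddings(["person", "bus"], Path("demo_text_embeddings.pt"), lambda t: torch.randn(len(t), 512))
print(emb["person"].shape)  # torch.Size([512])
```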
ultralytics/models/yolo/world/train_world.py
CHANGED
@@ -100,6 +100,7 @@ class WorldTrainerFromScratch(WorldTrainer):
             else build_grounding(self.args, im_path["img_path"], im_path["json_file"], batch, stride=gs)
             for im_path in img_path
         ]
+        self.set_text_embeddings(datasets, batch)  # cache text embeddings to accelerate training
         return YOLOConcatDataset(datasets) if len(datasets) > 1 else datasets[0]

     def get_dataset(self):
ultralytics/models/yolo/yoloe/train.py
CHANGED
@@ -2,7 +2,6 @@

 import itertools
 from copy import copy, deepcopy
-from pathlib import Path

 import torch

@@ -157,40 +156,7 @@ class YOLOETrainerFromScratch(YOLOETrainer, WorldTrainerFromScratch):
         Returns:
             (YOLOConcatDataset | Dataset): The constructed dataset for training or validation.
         """
-
-        if mode == "train":
-            self.set_text_embeddings(
-                datasets.datasets if hasattr(datasets, "datasets") else [datasets], batch
-            )  # cache text embeddings to accelerate training
-        return datasets
-
-    def set_text_embeddings(self, datasets, batch):
-        """
-        Set text embeddings for datasets to accelerate training by caching category names.
-
-        This method collects unique category names from all datasets, then generates and caches text embeddings
-        for these categories to improve training efficiency.
-
-        Args:
-            datasets (List[Dataset]): List of datasets from which to extract category names.
-            batch (int | None): Batch size used for processing.
-
-        Notes:
-            This method collects category names from datasets that have the 'category_names' attribute,
-            then uses the first dataset's image path to determine where to cache the generated text embeddings.
-        """
-        # TODO: open up an interface to determine whether to do cache
-        category_names = set()
-        for dataset in datasets:
-            if not hasattr(dataset, "category_names"):
-                continue
-            category_names |= dataset.category_names
-
-        # TODO: enable to update the path or use a more general way to get the path
-        img_path = datasets[0].img_path
-        self.text_embeddings = self.generate_text_embeddings(
-            category_names, batch, cache_path=Path(img_path).parent / "text_embeddings.pt"
-        )
+        return WorldTrainerFromScratch.build_dataset(self, img_path, mode, batch)

     def preprocess_batch(self, batch):
         """Process batch for training, moving text features to the appropriate device."""
@@ -202,23 +168,28 @@ class YOLOETrainerFromScratch(YOLOETrainer, WorldTrainerFromScratch):
         batch["txt_feats"] = txt_feats
         return batch

-    def generate_text_embeddings(self, texts, batch,
+    def generate_text_embeddings(self, texts, batch, cache_dir):
         """
         Generate text embeddings for a list of text samples.

         Args:
             texts (List[str]): List of text samples to encode.
             batch (int): Batch size for processing.
-
+            cache_dir (Path): Directory to save/load cached embeddings.

         Returns:
             (dict): Dictionary mapping text samples to their embeddings.
         """
+        model = "mobileclip:blt"
+        cache_path = cache_dir / f"text_embeddings_{model.replace(':', '_').replace('/', '_')}.pt"
         if cache_path.exists():
             LOGGER.info(f"Reading existed cache from '{cache_path}'")
-
+            txt_map = torch.load(cache_path)
+            if sorted(txt_map.keys()) == sorted(texts):
+                return txt_map
+        LOGGER.info(f"Caching text embeddings to '{cache_path}'")
         assert self.model is not None
-        txt_feats = self.model.get_text_pe(texts, batch, without_reprta=True)
+        txt_feats = self.model.get_text_pe(texts, batch, without_reprta=True, cache_clip_model=False)
         txt_map = dict(zip(texts, txt_feats.squeeze(0)))
         torch.save(txt_map, cache_path)
         return txt_map
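Both trainers now derive the cache file name from the text-model identifier (`"clip:ViT-B/32"` in WorldTrainer, `"mobileclip:blt"` here), so embeddings produced by different encoders land in different files. A small illustration of that name mangling, using hypothetical cache directories:

```python
from pathlib import Path


def embeddings_cache_path(cache_dir: Path, model: str) -> Path:
    """Build a per-model cache file name, mirroring the string sanitization in the diff."""
    return cache_dir / f"text_embeddings_{model.replace(':', '_').replace('/', '_')}.pt"


print(embeddings_cache_path(Path("datasets/coco"), "mobileclip:blt"))  # datasets/coco/text_embeddings_mobileclip_blt.pt
print(embeddings_cache_path(Path("datasets/coco"), "clip:ViT-B/32"))   # datasets/coco/text_embeddings_clip_ViT-B_32.pt
```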
ultralytics/models/yolo/yoloe/val.py
CHANGED
@@ -47,7 +47,7 @@ class YOLOEDetectValidator(DetectionValidator):
             (torch.Tensor): Visual prompt embeddings with shape (1, num_classes, embed_dim).
         """
         assert isinstance(model, YOLOEModel)
-        names = [name.split("/")[0] for name in list(dataloader.dataset.data["names"].values())]
+        names = [name.split("/", 1)[0] for name in list(dataloader.dataset.data["names"].values())]
         visual_pe = torch.zeros(len(names), model.model[-1].embed, device=self.device)
         cls_visual_num = torch.zeros(len(names))

@@ -140,7 +140,7 @@ class YOLOEDetectValidator(DetectionValidator):
         if trainer is not None:
             self.device = trainer.device
             model = trainer.ema.ema
-            names = [name.split("/")[0] for name in list(self.dataloader.dataset.data["names"].values())]
+            names = [name.split("/", 1)[0] for name in list(self.dataloader.dataset.data["names"].values())]

             if load_vp:
                 LOGGER.info("Validate using the visual prompt.")
@@ -164,7 +164,7 @@ class YOLOEDetectValidator(DetectionValidator):
                 model = attempt_load_weights(model, device=self.device, inplace=True)
             model.eval().to(self.device)
             data = check_det_dataset(refer_data or self.args.data)
-            names = [name.split("/")[0] for name in list(data["names"].values())]
+            names = [name.split("/", 1)[0] for name in list(data["names"].values())]

             if load_vp:
                 LOGGER.info("Validate using the visual prompt.")
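The validator changes swap `split("/")[0]` for `split("/", 1)[0]`: with `maxsplit=1` the string is split at most once, which is all that is needed to take the text before the first slash. A quick check of the equivalence:

```python
name = "person/human being"
assert name.split("/")[0] == name.split("/", 1)[0] == "person"
# maxsplit=1 stops after the first separator, leaving the remainder intact
print(name.split("/", 1))  # ['person', 'human being']
```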
ultralytics/nn/tasks.py
CHANGED
@@ -146,6 +146,8 @@ class BaseModel(torch.nn.Module):
             (torch.Tensor): The last output of the model.
         """
         y, dt, embeddings = [], [], []  # outputs
+        embed = frozenset(embed) if embed is not None else {-1}
+        max_idx = max(embed)
         for m in self.model:
             if m.f != -1:  # if not from previous layer
                 x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
@@ -155,9 +157,9 @@
             y.append(x if m.i in self.save else None)  # save output
             if visualize:
                 feature_visualization(x, m.type, m.i, save_dir=visualize)
-            if
+            if m.i in embed:
                 embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
-                if m.i ==
+                if m.i == max_idx:
                     return torch.unbind(torch.cat(embeddings, 1), dim=0)
         return x

@@ -677,6 +679,8 @@ class RTDETRDetectionModel(DetectionModel):
             (torch.Tensor): Model's output tensor.
         """
         y, dt, embeddings = [], [], []  # outputs
+        embed = frozenset(embed) if embed is not None else {-1}
+        max_idx = max(embed)
         for m in self.model[:-1]:  # except the head part
             if m.f != -1:  # if not from previous layer
                 x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
@@ -686,9 +690,9 @@
             y.append(x if m.i in self.save else None)  # save output
             if visualize:
                 feature_visualization(x, m.type, m.i, save_dir=visualize)
-            if
+            if m.i in embed:
                 embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
-                if m.i ==
+                if m.i == max_idx:
                     return torch.unbind(torch.cat(embeddings, 1), dim=0)
         head = self.model[-1]
         x = head([y[j] for j in head.f], batch)  # head inference
@@ -721,24 +725,33 @@ class WorldModel(DetectionModel):
             batch (int): Batch size for processing text tokens.
             cache_clip_model (bool): Whether to cache the CLIP model.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self.txt_feats = self.get_text_pe(text, batch=batch, cache_clip_model=cache_clip_model)
+        self.model[-1].nc = len(text)
+
+    @smart_inference_mode()
+    def get_text_pe(self, text, batch=80, cache_clip_model=True):
+        """
+        Set classes in advance so that model could do offline-inference without clip model.
+
+        Args:
+            text (List[str]): List of class names.
+            batch (int): Batch size for processing text tokens.
+            cache_clip_model (bool): Whether to cache the CLIP model.
+
+        Returns:
+            (torch.Tensor): Text positional embeddings.
+        """
+        from ultralytics.nn.text_model import build_text_model
+
+        device = next(self.model.parameters()).device
+        if not getattr(self, "clip_model", None) and cache_clip_model:
+            # For backwards compatibility of models lacking clip_model attribute
+            self.clip_model = build_text_model("clip:ViT-B/32", device=device)
+        model = self.clip_model if cache_clip_model else build_text_model("clip:ViT-B/32", device=device)
+        text_token = model.tokenize(text)
         txt_feats = [model.encode_text(token).detach() for token in text_token.split(batch)]
         txt_feats = txt_feats[0] if len(txt_feats) == 1 else torch.cat(txt_feats, dim=0)
-
-        self.txt_feats = txt_feats.reshape(-1, len(text), txt_feats.shape[-1])
-        self.model[-1].nc = len(text)
+        return txt_feats.reshape(-1, len(text), txt_feats.shape[-1])

     def predict(self, x, profile=False, visualize=False, txt_feats=None, augment=False, embed=None):
         """
@@ -760,6 +773,8 @@ class WorldModel(DetectionModel):
         txt_feats = txt_feats.expand(x.shape[0], -1, -1)
         ori_txt_feats = txt_feats.clone()
         y, dt, embeddings = [], [], []  # outputs
+        embed = frozenset(embed) if embed is not None else {-1}
+        max_idx = max(embed)
         for m in self.model:  # except the head part
             if m.f != -1:  # if not from previous layer
                 x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
@@ -777,9 +792,9 @@
             y.append(x if m.i in self.save else None)  # save output
             if visualize:
                 feature_visualization(x, m.type, m.i, save_dir=visualize)
-            if
+            if m.i in embed:
                 embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
-                if m.i ==
+                if m.i == max_idx:
                     return torch.unbind(torch.cat(embeddings, 1), dim=0)
         return x

@@ -976,6 +991,8 @@ class YOLOEModel(DetectionModel):
         """
         y, dt, embeddings = [], [], []  # outputs
         b = x.shape[0]
+        embed = frozenset(embed) if embed is not None else {-1}
+        max_idx = max(embed)
         for m in self.model:  # except the head part
             if m.f != -1:  # if not from previous layer
                 x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
@@ -997,9 +1014,9 @@
             y.append(x if m.i in self.save else None)  # save output
             if visualize:
                 feature_visualization(x, m.type, m.i, save_dir=visualize)
-            if
+            if m.i in embed:
                 embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
-                if m.i ==
+                if m.i == max_idx:
                     return torch.unbind(torch.cat(embeddings, 1), dim=0)
         return x

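The recurring change in tasks.py normalizes the requested `embed` indices once (defaulting to `{-1}` so `max()` is always well defined) and hoists `max(embed)` out of the per-layer loop as `max_idx`. A reduced sketch of that control flow, using a plain sequential stack of layers rather than the real YOLO graph:

```python
import torch
import torch.nn as nn


def forward_with_embeddings(layers, x, embed=None):
    """Run layers in order; if embed indices are given, return pooled features from those layers."""
    embed = frozenset(embed) if embed is not None else {-1}  # {-1} keeps max() well-defined
    max_idx = max(embed)
    embeddings = []
    for i, layer in enumerate(layers):
        x = layer(x)
        if i in embed:
            # global-average-pool to a flat (batch, channels) vector, as in the diff
            embeddings.append(nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))
            if i == max_idx:  # stop early once the deepest requested layer is reached
                return torch.unbind(torch.cat(embeddings, 1), dim=0)
    return x


layers = nn.ModuleList([nn.Conv2d(3, 8, 3, padding=1), nn.Conv2d(8, 16, 3, padding=1)])
out = forward_with_embeddings(layers, torch.randn(2, 3, 32, 32), embed=[0, 1])
print(len(out), out[0].shape)  # 2 torch.Size([24])  -> 8 + 16 pooled channels per sample
```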
ultralytics/nn/text_model.py
CHANGED
ultralytics/solutions/similarity_search.py
CHANGED
@@ -30,12 +30,9 @@ class VisualAISearch(BaseSolution):
         """Initializes the VisualAISearch class with the FAISS index file and CLIP model."""
         super().__init__(**kwargs)
         check_requirements(["git+https://github.com/ultralytics/CLIP.git", "faiss-cpu"])
-        import clip
-        import faiss
-
-        self.faiss = faiss
-        self.clip = clip

+        self.faiss = __import__("faiss")
+        self.clip = __import__("clip")
         self.faiss_index = "faiss.index"
         self.data_path_npy = "paths.npy"
         self.model_name = "ViT-B/32"
@@ -51,7 +48,7 @@ class VisualAISearch(BaseSolution):
             safe_download(url=f"{ASSETS_URL}/images.zip", unzip=True, retry=3)
             self.data_dir = Path("images")

-        self.model, self.preprocess = clip.load(self.model_name, device=self.device)
+        self.model, self.preprocess = self.clip.load(self.model_name, device=self.device)

         self.index = None
         self.image_paths = []
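The solutions module now binds its optional dependencies with the `__import__` builtin, which is the function the `import` statement itself calls and which returns the imported module object, so the import and the attribute assignment fit on one line. Rough equivalence, demonstrated with a standard-library module:

```python
import json

j = __import__("json")  # same top-level module object the import statement binds
assert j is json
print(j.dumps({"ok": True}))  # {"ok": true}
```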
ultralytics/solutions/streamlit_inference.py
CHANGED
@@ -130,7 +130,7 @@ class Inference:
         # Add dropdown menu for model selection
         available_models = [x.replace("yolo", "YOLO") for x in GITHUB_ASSETS_STEMS if x.startswith("yolo11")]
         if self.model_path:  # If user provided the custom model, insert model without suffix as *.pt is added later
-            available_models.insert(0, self.model_path.split(".pt")[0])
+            available_models.insert(0, self.model_path.split(".pt", 1)[0])
         selected_model = self.st.sidebar.selectbox("Model", available_models)

         with self.st.spinner("Model is downloading..."):
ultralytics/utils/__init__.py
CHANGED
@@ -1387,7 +1387,7 @@ def deprecation_warn(arg, new_arg=None):
 def clean_url(url):
     """Strip auth from URL, i.e. https://url.com/file.txt?auth -> https://url.com/file.txt."""
     url = Path(url).as_posix().replace(":/", "://")  # Pathlib turns :// -> :/, as_posix() for Windows
-    return unquote(url).split("?")[0]  # '%2F' to '/', split https://url.com/file.txt?auth
+    return unquote(url).split("?", 1)[0]  # '%2F' to '/', split https://url.com/file.txt?auth


 def url2file(url):
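For `clean_url()`, the only change is limiting the split to the first `?`. Reproducing the function stand-alone (standard library only) shows the documented behavior:

```python
from pathlib import Path
from urllib.parse import unquote


def clean_url(url):
    """Strip auth from URL, i.e. https://url.com/file.txt?auth -> https://url.com/file.txt."""
    url = Path(url).as_posix().replace(":/", "://")  # Pathlib turns :// -> :/
    return unquote(url).split("?", 1)[0]


print(clean_url("https://url.com/file%20name.txt?auth=abc?def"))  # https://url.com/file name.txt
```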
ultralytics/utils/callbacks/hub.py
CHANGED
@@ -73,22 +73,23 @@ def on_train_end(trainer):

 def on_train_start(trainer):
     """Run events on train start."""
-    events(trainer.args)
+    events(trainer.args, trainer.device)


 def on_val_start(validator):
     """Run events on validation start."""
-
+    if not validator.training:
+        events(validator.args, validator.device)


 def on_predict_start(predictor):
     """Run events on predict start."""
-    events(predictor.args)
+    events(predictor.args, predictor.device)


 def on_export_start(exporter):
     """Run events on export start."""
-    events(exporter.args)
+    events(exporter.args, exporter.device)


 callbacks = (
ultralytics/utils/checks.py
CHANGED
@@ -73,7 +73,7 @@ def parse_requirements(file_path=ROOT.parent / "requirements.txt", package=""):
     for line in requires:
         line = line.strip()
         if line and not line.startswith("#"):
-            line = line.
+            line = line.partition("#")[0].strip()  # ignore inline comments
             if match := re.match(r"([a-zA-Z0-9-_]+)\s*([<>!=~]+.*)?", line):
                 requirements.append(SimpleNamespace(name=match[1], specifier=match[2].strip() if match[2] else ""))

@@ -379,7 +379,7 @@ def check_requirements(requirements=ROOT.parent / "requirements.txt", exclude=()

     pkgs = []
     for r in requirements:
-        r_stripped = r.
+        r_stripped = r.rpartition("/")[-1].replace(".git", "")  # replace git+https://org/repo.git -> 'repo'
         match = re.match(r"([a-zA-Z0-9-_]+)([<>!=~]+.*)?", r_stripped)
         name, required = match[1], match[2].strip() if match[2] else ""
         try:
@@ -423,6 +423,7 @@ def check_torchvision():
     to the compatibility table based on: https://github.com/pytorch/vision#installation.
     """
     compatibility_table = {
+        "2.7": ["0.22"],
         "2.6": ["0.21"],
         "2.5": ["0.20"],
         "2.4": ["0.19"],
@@ -435,10 +436,10 @@ def check_torchvision():
     }

     # Check major and minor versions
-    v_torch = ".".join(torch.__version__.split("+")[0].split(".")[:2])
+    v_torch = ".".join(torch.__version__.split("+", 1)[0].split(".")[:2])
     if v_torch in compatibility_table:
         compatible_versions = compatibility_table[v_torch]
-        v_torchvision = ".".join(TORCHVISION_VERSION.split("+")[0].split(".")[:2])
+        v_torchvision = ".".join(TORCHVISION_VERSION.split("+", 1)[0].split(".")[:2])
         if all(v_torchvision != v for v in compatible_versions):
             LOGGER.warning(
                 f"torchvision=={v_torchvision} is incompatible with torch=={v_torch}.\n"
@@ -461,9 +462,8 @@ def check_suffix(file="yolo11n.pt", suffix=".pt", msg=""):
     if isinstance(suffix, str):
         suffix = {suffix}
     for f in file if isinstance(file, (list, tuple)) else [file]:
-        s
-
-        assert s in suffix, f"{msg}{f} acceptable suffix is {suffix}, not {s}"
+        if s := str(f).rpartition(".")[-1].lower().strip():  # file suffix
+            assert f".{s}" in suffix, f"{msg}{f} acceptable suffix is {suffix}, not .{s}"


 def check_yolov5u_filename(file: str, verbose: bool = True):
@@ -504,10 +504,10 @@ def check_model_file_from_stem(model="yolo11n"):
     Returns:
         (str | Path): Model filename with appropriate suffix.
     """
-
-
-
-
+    path = Path(model)
+    if not path.suffix and path.stem in downloads.GITHUB_ASSETS_STEMS:
+        return path.with_suffix(".pt")  # add suffix, i.e. yolo11n -> yolo11n.pt
+    return model


 def check_file(file, suffix="", download=True, download_dir=".", hard=True):
@@ -655,7 +655,7 @@ def collect_system_info():
     from ultralytics.utils.torch_utils import get_cpu_info, get_gpu_info

     gib = 1 << 30  # bytes per GiB
-    cuda = torch
+    cuda = torch.cuda.is_available()
     check_yolo()
     total, used, free = shutil.disk_usage("/")

@@ -837,7 +837,7 @@ def cuda_device_count() -> int:
         )

         # Take the first line and strip any leading/trailing white space
-        first_line = output.strip().split("\n")[0]
+        first_line = output.strip().split("\n", 1)[0]

         return int(first_line)
     except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
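Several checks.py fixes replace chained `split(...)` indexing with `str.partition`/`str.rpartition`, which always return a `(before, separator, after)` tuple, so the wanted piece can be indexed directly even when the separator is absent. The two idioms used above, shown on hypothetical inputs:

```python
line = "torch>=1.8  # inline comment"
print(line.partition("#")[0].strip())  # 'torch>=1.8' -> text before the first '#', even when '#' is missing

req = "git+https://github.com/ultralytics/CLIP.git"
print(req.rpartition("/")[-1].replace(".git", ""))  # 'CLIP' -> text after the last '/'
```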
ultralytics/utils/downloads.py
CHANGED
@@ -32,11 +32,13 @@ GITHUB_ASSETS_NAMES = frozenset(
     + [f"sam2.1_{k}.pt" for k in "blst"]
     + [f"FastSAM-{k}.pt" for k in "sx"]
     + [f"rtdetr-{k}.pt" for k in "lx"]
-    + [
-
-
+    + [
+        "mobile_sam.pt",
+        "mobileclip_blt.ts",
+        "calibration_image_sample_data_20x128x128x3_float32.npy.zip",
+    ]
 )
-GITHUB_ASSETS_STEMS = frozenset(k.
+GITHUB_ASSETS_STEMS = frozenset(k.rpartition(".")[0] for k in GITHUB_ASSETS_NAMES)


 def is_url(url, check=False):
@@ -247,7 +249,7 @@ def get_google_drive_file_info(link):
     """
     import requests  # slow import

-    file_id = link.split("/d/")[1].split("/view")[0]
+    file_id = link.split("/d/")[1].split("/view", 1)[0]
     drive_url = f"https://drive.google.com/uc?export=download&id={file_id}"
     filename = None

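`GITHUB_ASSETS_STEMS` is now built with `rpartition(".")[0]`, i.e. everything before the last dot, which keeps dotted model names such as `sam2.1_b.pt` intact. For example:

```python
names = ["yolo11n.pt", "sam2.1_b.pt", "mobileclip_blt.ts"]
stems = frozenset(k.rpartition(".")[0] for k in names)
print(sorted(stems))  # ['mobileclip_blt', 'sam2.1_b', 'yolo11n']
# by contrast, "sam2.1_b.pt".split(".")[0] would give just "sam2"
```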
ultralytics/utils/export.py
CHANGED
@@ -97,7 +97,7 @@ def export_engine(
     builder = trt.Builder(logger)
     config = builder.create_builder_config()
     workspace = int((workspace or 0) * (1 << 30))
-    is_trt10 = int(trt.__version__.split(".")[0]) >= 10  # is TensorRT >= 10
+    is_trt10 = int(trt.__version__.split(".", 1)[0]) >= 10  # is TensorRT >= 10
     if is_trt10 and workspace > 0:
         config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace)
     elif workspace > 0:  # TensorRT versions 7, 8
ultralytics/utils/plotting.py
CHANGED
@@ -1000,7 +1000,7 @@ def feature_visualization(x, module_type, stage, n=32, save_dir=Path("runs/detec
     if isinstance(x, torch.Tensor):
         _, channels, height, width = x.shape  # batch, channels, height, width
         if height > 1 and width > 1:
-            f = save_dir / f"stage{stage}_{module_type.
+            f = save_dir / f"stage{stage}_{module_type.rsplit('.', 1)[-1]}_features.png"  # filename

             blocks = torch.chunk(x[0].cpu(), channels, dim=0)  # select batch index 0, block by channels
             n = min(n, channels)  # number of plots
ultralytics/utils/torch_utils.py
CHANGED
@@ -1,5 +1,6 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+import functools
 import gc
 import math
 import os
@@ -101,6 +102,7 @@ def autocast(enabled: bool, device: str = "cuda"):
         return torch.cuda.amp.autocast(enabled)


+@functools.lru_cache
 def get_cpu_info():
     """Return a string with system CPU information, i.e. 'Apple M2'."""
     from ultralytics.utils import PERSISTENT_CACHE  # avoid circular import error
@@ -118,6 +120,7 @@ def get_cpu_info():
     return PERSISTENT_CACHE.get("cpu_info", "unknown")


+@functools.lru_cache
 def get_gpu_info(index):
     """Return a string with system GPU information, i.e. 'Tesla T4, 15102MiB'."""
     properties = torch.cuda.get_device_properties(index)
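`get_cpu_info()` and `get_gpu_info()` are now decorated with `functools.lru_cache`, so the underlying lookup runs once per distinct argument tuple and subsequent calls return the memoized string. A minimal illustration of the decorator's effect, with the expensive lookup simulated:

```python
import functools

calls = 0


@functools.lru_cache
def get_info(index: int) -> str:
    """Pretend to query slow hardware metadata."""
    global calls
    calls += 1
    return f"device-{index}"


get_info(0)
get_info(0)
get_info(1)
print(calls)  # 2 -> the repeated get_info(0) call was served from the cache
```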
ultralytics/utils/triton.py
CHANGED
@@ -53,7 +53,7 @@ class TritonRemoteModel:
         """
         if not endpoint and not scheme:  # Parse all args from URL string
             splits = urlsplit(url)
-            endpoint = splits.path.strip("/").split("/")[0]
+            endpoint = splits.path.strip("/").split("/", 1)[0]
             scheme = splits.scheme
             url = splits.netloc

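The Triton client keeps parsing `endpoint` and `scheme` from a single URL with `urlsplit`; the change only caps the path split at one separator. A quick look at the pieces for a hypothetical Triton address:

```python
from urllib.parse import urlsplit

splits = urlsplit("http://localhost:8000/yolo11n/extra")
endpoint = splits.path.strip("/").split("/", 1)[0]
print(splits.scheme, splits.netloc, endpoint)  # http localhost:8000 yolo11n
```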
{ultralytics-8.3.136.dist-info → ultralytics-8.3.138.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ultralytics
-Version: 8.3.
+Version: 8.3.138
 Summary: Ultralytics YOLO 🚀 for SOTA object detection, multi-object tracking, instance segmentation, pose estimation and image classification.
 Author-email: Glenn Jocher <glenn.jocher@ultralytics.com>, Jing Qiu <jing.qiu@ultralytics.com>
 Maintainer-email: Ultralytics <hello@ultralytics.com>