dgenerate_ultralytics_headless-8.3.195-py3-none-any.whl → dgenerate_ultralytics_headless-8.3.197-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/METADATA +1 -1
- {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/RECORD +37 -36
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +1 -0
- ultralytics/cfg/datasets/construction-ppe.yaml +32 -0
- ultralytics/cfg/default.yaml +1 -0
- ultralytics/data/augment.py +1 -1
- ultralytics/data/build.py +5 -1
- ultralytics/engine/exporter.py +20 -31
- ultralytics/engine/model.py +1 -2
- ultralytics/engine/predictor.py +3 -1
- ultralytics/engine/trainer.py +17 -8
- ultralytics/engine/validator.py +6 -2
- ultralytics/models/yolo/classify/train.py +1 -11
- ultralytics/models/yolo/detect/train.py +27 -6
- ultralytics/models/yolo/detect/val.py +6 -5
- ultralytics/models/yolo/obb/train.py +0 -9
- ultralytics/models/yolo/pose/train.py +0 -9
- ultralytics/models/yolo/pose/val.py +1 -1
- ultralytics/models/yolo/segment/train.py +0 -9
- ultralytics/models/yolo/segment/val.py +5 -5
- ultralytics/models/yolo/world/train.py +4 -4
- ultralytics/models/yolo/world/train_world.py +2 -2
- ultralytics/models/yolo/yoloe/train.py +3 -12
- ultralytics/models/yolo/yoloe/val.py +0 -7
- ultralytics/nn/tasks.py +4 -2
- ultralytics/utils/__init__.py +30 -19
- ultralytics/utils/callbacks/tensorboard.py +2 -2
- ultralytics/utils/checks.py +2 -0
- ultralytics/utils/loss.py +12 -7
- ultralytics/utils/nms.py +3 -1
- ultralytics/utils/plotting.py +1 -0
- ultralytics/utils/torch_utils.py +89 -9
- {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/top_level.txt +0 -0
ultralytics/utils/torch_utils.py
CHANGED
@@ -429,7 +429,7 @@ def get_flops(model, imgsz=640):
         return 0.0  # if not installed return 0.0 GFLOPs
 
     try:
-        model = de_parallel(model)
+        model = unwrap_model(model)
         p = next(model.parameters())
         if not isinstance(imgsz, list):
             imgsz = [imgsz, imgsz]  # expand if int/float
@@ -460,7 +460,7 @@ def get_flops_with_torch_profiler(model, imgsz=640):
     """
     if not TORCH_2_0:  # torch profiler implemented in torch>=2.0
         return 0.0
-    model = de_parallel(model)
+    model = unwrap_model(model)
     p = next(model.parameters())
     if not isinstance(imgsz, list):
         imgsz = [imgsz, imgsz]  # expand if int/float
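
Note: both FLOPs helpers above now route through unwrap_model rather than only stripping DP/DDP. A minimal plain-PyTorch sketch (not package code) of what the unwrap loop recovers, i.e. the bare module that the FLOPs profilers are then handed:

    import torch.nn as nn

    # Minimal sketch (plain PyTorch, no ultralytics imports assumed).
    base = nn.Conv2d(3, 16, 3)
    wrapped = nn.DataParallel(base)  # hides the network behind .module

    # unwrap_model-style loop: peel ._orig_mod (torch.compile) and .module (DP/DDP)
    m = wrapped
    while True:
        if hasattr(m, "_orig_mod") and isinstance(m._orig_mod, nn.Module):
            m = m._orig_mod
        elif hasattr(m, "module") and isinstance(m.module, nn.Module):
            m = m.module
        else:
            break
    assert m is base  # the bare module the profilers expect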
@@ -577,17 +577,24 @@ def is_parallel(model):
     return isinstance(model, (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel))
 
 
-def de_parallel(model):
+def unwrap_model(m: nn.Module) -> nn.Module:
     """
-    De-parallelize a model: returns single-GPU model if model is of type DP or DDP.
+    Unwrap compiled and parallel models to get the base model.
 
     Args:
-        model (nn.Module): Model to de-parallelize.
+        m (nn.Module): A model that may be wrapped by torch.compile (._orig_mod) or parallel wrappers such as
+            DataParallel/DistributedDataParallel (.module).
 
     Returns:
-        (nn.Module): De-parallelized model.
+        m (nn.Module): The unwrapped base model without compile or parallel wrappers.
     """
-    return model.module if is_parallel(model) else model
+    while True:
+        if hasattr(m, "_orig_mod") and isinstance(m._orig_mod, nn.Module):
+            m = m._orig_mod
+        elif hasattr(m, "module") and isinstance(m.module, nn.Module):
+            m = m.module
+        else:
+            return m
 
 
 def one_cycle(y1=0.0, y2=1.0, steps=100):
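
Note: the new while-loop matters when wrappers nest, e.g. torch.compile applied on top of a DP/DDP model, where the old single-step .module unwrap would stop short. A sketch assuming a PyTorch >= 2.0 build (torch.compile returns an OptimizedModule exposing ._orig_mod):

    import torch
    import torch.nn as nn

    base = nn.Linear(8, 8)
    stacked = torch.compile(nn.DataParallel(base))  # two wrapper layers

    m = stacked
    while True:
        if hasattr(m, "_orig_mod") and isinstance(m._orig_mod, nn.Module):
            m = m._orig_mod  # peel the torch.compile wrapper
        elif hasattr(m, "module") and isinstance(m.module, nn.Module):
            m = m.module  # peel the DP/DDP wrapper
        else:
            break
    # The old one-step de_parallel would have returned the OptimizedModule
    # unchanged here, since is_parallel() is False for the compile wrapper.
    assert m is base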
@@ -669,7 +676,7 @@ class ModelEMA:
             tau (int, optional): EMA decay time constant.
             updates (int, optional): Initial number of updates.
         """
-        self.ema = deepcopy(de_parallel(model)).eval()  # FP32 EMA
+        self.ema = deepcopy(unwrap_model(model)).eval()  # FP32 EMA
         self.updates = updates  # number of EMA updates
         self.decay = lambda x: decay * (1 - math.exp(-x / tau))  # decay exponential ramp (to help early epochs)
         for p in self.ema.parameters():
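
Note: the surrounding context keeps the exponential decay ramp decay * (1 - exp(-x / tau)). A worked example of the ramp, assuming the ModelEMA defaults of decay=0.9999 and tau=2000 (not shown in this hunk, so worth verifying against the source):

    import math

    # Ramp values: early updates track the raw model closely, then the
    # factor approaches the nominal decay as updates accumulate.
    decay, tau = 0.9999, 2000
    for x in (1, 100, 2000, 10000):
        print(x, round(decay * (1 - math.exp(-x / tau)), 6))
    # 1 0.0005, 100 0.048766, 2000 0.632057, 10000 0.993163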
@@ -687,7 +694,7 @@ class ModelEMA:
         self.updates += 1
         d = self.decay(self.updates)
 
-        msd = de_parallel(model).state_dict()  # model state_dict
+        msd = unwrap_model(model).state_dict()  # model state_dict
         for k, v in self.ema.state_dict().items():
             if v.dtype.is_floating_point:  # true for FP16 and FP32
                 v *= d
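
Note: the hunk's context ends at v *= d; the (1 - d) * msd[k] half of the blend lies outside the shown lines, so the standalone sketch below assumes the standard EMA update ema <- d * ema + (1 - d) * model:

    import torch

    d = 0.9  # illustrative decay, not a training value
    ema_v = torch.tensor([1.0, 2.0])    # EMA state entry
    model_v = torch.tensor([3.0, 4.0])  # matching model state entry

    # In-place blend, as in the loop above: scale by d, add the model share
    ema_v.mul_(d).add_(model_v, alpha=1 - d)
    print(ema_v)  # tensor([1.2000, 2.2000])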
@@ -997,3 +1004,76 @@ class FXModel(nn.Module):
             x = m(x)  # run
             y.append(x)  # save output
         return x
+
+
+def attempt_compile(
+    model: torch.nn.Module,
+    device: torch.device,
+    imgsz: int = 640,
+    use_autocast: bool = False,
+    warmup: bool = False,
+    prefix: str = colorstr("compile:"),
+) -> torch.nn.Module:
+    """
+    Compile a model with torch.compile and optionally warm up the graph to reduce first-iteration latency.
+
+    This utility attempts to compile the provided model using the inductor backend with dynamic shapes enabled and an
+    autotuning mode. If compilation is unavailable or fails, the original model is returned unchanged. An optional
+    warmup performs a single forward pass on a dummy input to prime the compiled graph and measure compile/warmup time.
+
+    Args:
+        model (torch.nn.Module): Model to compile.
+        device (torch.device): Inference device used for warmup and autocast decisions.
+        imgsz (int, optional): Square input size to create a dummy tensor with shape (1, 3, imgsz, imgsz) for warmup.
+        use_autocast (bool, optional): Whether to run warmup under autocast on CUDA or MPS devices.
+        warmup (bool, optional): Whether to execute a single dummy forward pass to warm up the compiled model.
+        prefix (str, optional): Message prefix for logger output.
+
+    Returns:
+        model (torch.nn.Module): Compiled model if compilation succeeds, otherwise the original unmodified model.
+
+    Notes:
+        - If the current PyTorch build does not provide torch.compile, the function returns the input model immediately.
+        - Warmup runs under torch.inference_mode and may use torch.autocast for CUDA/MPS to align compute precision.
+        - CUDA devices are synchronized after warmup to account for asynchronous kernel execution.
+
+    Examples:
+        >>> device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        >>> # Try to compile and warm up a model with a 640x640 input
+        >>> model = attempt_compile(model, device=device, imgsz=640, use_autocast=True, warmup=True)
+    """
+    if not hasattr(torch, "compile"):
+        return model
+
+    LOGGER.info(f"{prefix} starting torch.compile...")
+    t0 = time.perf_counter()
+    try:
+        model = torch.compile(model, mode="max-autotune", backend="inductor")
+    except Exception as e:
+        LOGGER.warning(f"{prefix} torch.compile failed, continuing uncompiled: {e}")
+        return model
+    t_compile = time.perf_counter() - t0
+
+    t_warm = 0.0
+    if warmup:
+        # Use a single dummy tensor to build the graph shape state and reduce first-iteration latency
+        dummy = torch.zeros(1, 3, imgsz, imgsz, device=device)
+        if use_autocast and device.type == "cuda":
+            dummy = dummy.half()
+        t1 = time.perf_counter()
+        with torch.inference_mode():
+            if use_autocast and device.type in {"cuda", "mps"}:
+                with torch.autocast(device.type):
+                    _ = model(dummy)
+            else:
+                _ = model(dummy)
+        if device.type == "cuda":
+            torch.cuda.synchronize(device)
+        t_warm = time.perf_counter() - t1
+
+    total = t_compile + t_warm
+    if warmup:
+        LOGGER.info(f"{prefix} complete in {total:.1f}s (compile {t_compile:.1f}s + warmup {t_warm:.1f}s)")
+    else:
+        LOGGER.info(f"{prefix} compile complete in {t_compile:.1f}s (no warmup)")
+    return model
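
Note: a compiled model keeps its original module reachable via ._orig_mod, which is why the EMA and FLOPs changes above route through unwrap_model. A usage sketch, assuming PyTorch >= 2.0 and the two new helpers imported from this module:

    import torch
    import torch.nn as nn

    from ultralytics.utils.torch_utils import attempt_compile, unwrap_model

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU()).to(device)  # stand-in model

    compiled = attempt_compile(model, device=device, imgsz=640, warmup=False)
    # If compilation failed or torch.compile is missing, compiled is model itself;
    # otherwise it is an OptimizedModule whose ._orig_mod is model, so EMA and
    # state_dict code that calls unwrap_model(compiled) still reaches the weights.
    assert unwrap_model(compiled) is model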