dgenerate-ultralytics-headless 8.3.195__py3-none-any.whl → 8.3.197__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/METADATA +1 -1
  2. {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/RECORD +37 -36
  3. ultralytics/__init__.py +1 -1
  4. ultralytics/cfg/__init__.py +1 -0
  5. ultralytics/cfg/datasets/construction-ppe.yaml +32 -0
  6. ultralytics/cfg/default.yaml +1 -0
  7. ultralytics/data/augment.py +1 -1
  8. ultralytics/data/build.py +5 -1
  9. ultralytics/engine/exporter.py +20 -31
  10. ultralytics/engine/model.py +1 -2
  11. ultralytics/engine/predictor.py +3 -1
  12. ultralytics/engine/trainer.py +17 -8
  13. ultralytics/engine/validator.py +6 -2
  14. ultralytics/models/yolo/classify/train.py +1 -11
  15. ultralytics/models/yolo/detect/train.py +27 -6
  16. ultralytics/models/yolo/detect/val.py +6 -5
  17. ultralytics/models/yolo/obb/train.py +0 -9
  18. ultralytics/models/yolo/pose/train.py +0 -9
  19. ultralytics/models/yolo/pose/val.py +1 -1
  20. ultralytics/models/yolo/segment/train.py +0 -9
  21. ultralytics/models/yolo/segment/val.py +5 -5
  22. ultralytics/models/yolo/world/train.py +4 -4
  23. ultralytics/models/yolo/world/train_world.py +2 -2
  24. ultralytics/models/yolo/yoloe/train.py +3 -12
  25. ultralytics/models/yolo/yoloe/val.py +0 -7
  26. ultralytics/nn/tasks.py +4 -2
  27. ultralytics/utils/__init__.py +30 -19
  28. ultralytics/utils/callbacks/tensorboard.py +2 -2
  29. ultralytics/utils/checks.py +2 -0
  30. ultralytics/utils/loss.py +12 -7
  31. ultralytics/utils/nms.py +3 -1
  32. ultralytics/utils/plotting.py +1 -0
  33. ultralytics/utils/torch_utils.py +89 -9
  34. {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/WHEEL +0 -0
  35. {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/entry_points.txt +0 -0
  36. {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/licenses/LICENSE +0 -0
  37. {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/top_level.txt +0 -0
@@ -429,7 +429,7 @@ def get_flops(model, imgsz=640):
         return 0.0  # if not installed return 0.0 GFLOPs
 
     try:
-        model = de_parallel(model)
+        model = unwrap_model(model)
         p = next(model.parameters())
         if not isinstance(imgsz, list):
             imgsz = [imgsz, imgsz]  # expand if int/float
@@ -460,7 +460,7 @@ def get_flops_with_torch_profiler(model, imgsz=640):
     """
    if not TORCH_2_0:  # torch profiler implemented in torch>=2.0
        return 0.0
-    model = de_parallel(model)
+    model = unwrap_model(model)
    p = next(model.parameters())
    if not isinstance(imgsz, list):
        imgsz = [imgsz, imgsz]  # expand if int/float
@@ -577,17 +577,24 @@ def is_parallel(model):
     return isinstance(model, (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel))
 
 
-def de_parallel(model):
+def unwrap_model(m: nn.Module) -> nn.Module:
     """
-    De-parallelize a model: return single-GPU model if model is of type DP or DDP.
+    Unwrap compiled and parallel models to get the base model.
 
     Args:
-        model (nn.Module): Model to de-parallelize.
+        m (nn.Module): A model that may be wrapped by torch.compile (._orig_mod) or parallel wrappers such as
+            DataParallel/DistributedDataParallel (.module).
 
     Returns:
-        (nn.Module): De-parallelized model.
+        m (nn.Module): The unwrapped base model without compile or parallel wrappers.
     """
-    return model.module if is_parallel(model) else model
+    while True:
+        if hasattr(m, "_orig_mod") and isinstance(m._orig_mod, nn.Module):
+            m = m._orig_mod
+        elif hasattr(m, "module") and isinstance(m.module, nn.Module):
+            m = m.module
+        else:
+            return m
 
 
 def one_cycle(y1=0.0, y2=1.0, steps=100):
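For readers skimming the diff: the new unwrap_model is a strict superset of the old de_parallel, since it also peels the wrapper that torch.compile adds. A minimal sketch of the difference, assuming the 8.3.197 wheel and torch>=2.0 are installed (the toy Linear module below is illustrative only, not from the package):

import torch
import torch.nn as nn
from ultralytics.utils.torch_utils import unwrap_model  # replaces de_parallel in this release

base = nn.Linear(8, 4)
wrapped = nn.DataParallel(base)       # parallel wrapper: base model lives at .module
if hasattr(torch, "compile"):         # torch>=2.0 only
    wrapped = torch.compile(wrapped)  # compile wrapper: base model lives at ._orig_mod

# unwrap_model loops over both attributes, so the original module is recovered even
# when the wrappers are nested; de_parallel only handled .module.
assert unwrap_model(wrapped) is base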
@@ -669,7 +676,7 @@ class ModelEMA:
             tau (int, optional): EMA decay time constant.
             updates (int, optional): Initial number of updates.
         """
-        self.ema = deepcopy(de_parallel(model)).eval()  # FP32 EMA
+        self.ema = deepcopy(unwrap_model(model)).eval()  # FP32 EMA
         self.updates = updates  # number of EMA updates
         self.decay = lambda x: decay * (1 - math.exp(-x / tau))  # decay exponential ramp (to help early epochs)
         for p in self.ema.parameters():
@@ -687,7 +694,7 @@ class ModelEMA:
         self.updates += 1
         d = self.decay(self.updates)
 
-        msd = de_parallel(model).state_dict()  # model state_dict
+        msd = unwrap_model(model).state_dict()  # model state_dict
         for k, v in self.ema.state_dict().items():
             if v.dtype.is_floating_point:  # true for FP16 and FP32
                 v *= d
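The decay lambda visible in the ModelEMA hunks ramps the EMA weight up over the first few thousand updates, which is what the "(to help early epochs)" comment refers to. A quick worked example; decay=0.9999 and tau=2000 are assumed defaults for illustration, not values shown in this diff:

import math

decay, tau = 0.9999, 2000                           # assumed ModelEMA defaults
ramp = lambda x: decay * (1 - math.exp(-x / tau))   # same ramp as in the diff above

print(round(ramp(1), 6))      # ~0.0005: early on, the EMA copies the raw model almost directly
print(round(ramp(2000), 4))   # ~0.6321: after tau updates, past values start to dominate
print(round(ramp(20000), 4))  # ~0.9999: the ramp converges to the nominal decay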
@@ -997,3 +1004,76 @@ class FXModel(nn.Module):
             x = m(x)  # run
             y.append(x)  # save output
         return x
+
+
+def attempt_compile(
+    model: torch.nn.Module,
+    device: torch.device,
+    imgsz: int = 640,
+    use_autocast: bool = False,
+    warmup: bool = False,
+    prefix: str = colorstr("compile:"),
+) -> torch.nn.Module:
+    """
+    Compile a model with torch.compile and optionally warm up the graph to reduce first-iteration latency.
+
+    This utility attempts to compile the provided model using the inductor backend with dynamic shapes enabled and an
+    autotuning mode. If compilation is unavailable or fails, the original model is returned unchanged. An optional
+    warmup performs a single forward pass on a dummy input to prime the compiled graph and measure compile/warmup time.
+
+    Args:
+        model (torch.nn.Module): Model to compile.
+        device (torch.device): Inference device used for warmup and autocast decisions.
+        imgsz (int, optional): Square input size to create a dummy tensor with shape (1, 3, imgsz, imgsz) for warmup.
+        use_autocast (bool, optional): Whether to run warmup under autocast on CUDA or MPS devices.
+        warmup (bool, optional): Whether to execute a single dummy forward pass to warm up the compiled model.
+        prefix (str, optional): Message prefix for logger output.
+
+    Returns:
+        model (torch.nn.Module): Compiled model if compilation succeeds, otherwise the original unmodified model.
+
+    Notes:
+        - If the current PyTorch build does not provide torch.compile, the function returns the input model immediately.
+        - Warmup runs under torch.inference_mode and may use torch.autocast for CUDA/MPS to align compute precision.
+        - CUDA devices are synchronized after warmup to account for asynchronous kernel execution.
+
+    Examples:
+        >>> device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        >>> # Try to compile and warm up a model with a 640x640 input
+        >>> model = attempt_compile(model, device=device, imgsz=640, use_autocast=True, warmup=True)
+    """
+    if not hasattr(torch, "compile"):
+        return model
+
+    LOGGER.info(f"{prefix} starting torch.compile...")
+    t0 = time.perf_counter()
+    try:
+        model = torch.compile(model, mode="max-autotune", backend="inductor")
+    except Exception as e:
+        LOGGER.warning(f"{prefix} torch.compile failed, continuing uncompiled: {e}")
+        return model
+    t_compile = time.perf_counter() - t0
+
+    t_warm = 0.0
+    if warmup:
+        # Use a single dummy tensor to build the graph shape state and reduce first-iteration latency
+        dummy = torch.zeros(1, 3, imgsz, imgsz, device=device)
+        if use_autocast and device.type == "cuda":
+            dummy = dummy.half()
+        t1 = time.perf_counter()
+        with torch.inference_mode():
+            if use_autocast and device.type in {"cuda", "mps"}:
+                with torch.autocast(device.type):
+                    _ = model(dummy)
+            else:
+                _ = model(dummy)
+        if device.type == "cuda":
+            torch.cuda.synchronize(device)
+        t_warm = time.perf_counter() - t1
+
+    total = t_compile + t_warm
+    if warmup:
+        LOGGER.info(f"{prefix} complete in {total:.1f}s (compile {t_compile:.1f}s + warmup {t_warm:.1f}s)")
+    else:
+        LOGGER.info(f"{prefix} compile complete in {t_compile:.1f}s (no warmup)")
+    return model
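The new helper is designed to fail soft, so callers can attempt compilation unconditionally. A hypothetical end-to-end sketch tying it together with unwrap_model; this diff does not show the package's own call sites, so the wiring and the toy Sequential model below are illustrative assumptions:

import torch
import torch.nn as nn
from ultralytics.utils.torch_utils import attempt_compile, unwrap_model

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.SiLU()).to(device)

# Returns the original model untouched if torch.compile is missing or raises,
# so no try/except is needed at the call site.
model = attempt_compile(model, device=device, imgsz=64, warmup=device.type == "cuda")

# Anything that needs the plain module (EMA copies, FLOPs counting, state_dict
# saving in this diff) goes through unwrap_model, which also strips the compile wrapper.
plain = unwrap_model(model)
print(type(plain).__name__)  # Sequential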