dgenerate-ultralytics-headless 8.3.195__py3-none-any.whl → 8.3.197__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/METADATA +1 -1
  2. {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/RECORD +37 -36
  3. ultralytics/__init__.py +1 -1
  4. ultralytics/cfg/__init__.py +1 -0
  5. ultralytics/cfg/datasets/construction-ppe.yaml +32 -0
  6. ultralytics/cfg/default.yaml +1 -0
  7. ultralytics/data/augment.py +1 -1
  8. ultralytics/data/build.py +5 -1
  9. ultralytics/engine/exporter.py +20 -31
  10. ultralytics/engine/model.py +1 -2
  11. ultralytics/engine/predictor.py +3 -1
  12. ultralytics/engine/trainer.py +17 -8
  13. ultralytics/engine/validator.py +6 -2
  14. ultralytics/models/yolo/classify/train.py +1 -11
  15. ultralytics/models/yolo/detect/train.py +27 -6
  16. ultralytics/models/yolo/detect/val.py +6 -5
  17. ultralytics/models/yolo/obb/train.py +0 -9
  18. ultralytics/models/yolo/pose/train.py +0 -9
  19. ultralytics/models/yolo/pose/val.py +1 -1
  20. ultralytics/models/yolo/segment/train.py +0 -9
  21. ultralytics/models/yolo/segment/val.py +5 -5
  22. ultralytics/models/yolo/world/train.py +4 -4
  23. ultralytics/models/yolo/world/train_world.py +2 -2
  24. ultralytics/models/yolo/yoloe/train.py +3 -12
  25. ultralytics/models/yolo/yoloe/val.py +0 -7
  26. ultralytics/nn/tasks.py +4 -2
  27. ultralytics/utils/__init__.py +30 -19
  28. ultralytics/utils/callbacks/tensorboard.py +2 -2
  29. ultralytics/utils/checks.py +2 -0
  30. ultralytics/utils/loss.py +12 -7
  31. ultralytics/utils/nms.py +3 -1
  32. ultralytics/utils/plotting.py +1 -0
  33. ultralytics/utils/torch_utils.py +89 -9
  34. {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/WHEEL +0 -0
  35. {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/entry_points.txt +0 -0
  36. {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/licenses/LICENSE +0 -0
  37. {dgenerate_ultralytics_headless-8.3.195.dist-info → dgenerate_ultralytics_headless-8.3.197.dist-info}/top_level.txt +0 -0
@@ -429,7 +429,7 @@ def get_flops(model, imgsz=640):
         return 0.0  # if not installed return 0.0 GFLOPs
 
     try:
-        model = de_parallel(model)
+        model = unwrap_model(model)
         p = next(model.parameters())
         if not isinstance(imgsz, list):
             imgsz = [imgsz, imgsz]  # expand if int/float
@@ -460,7 +460,7 @@ def get_flops_with_torch_profiler(model, imgsz=640):
     """
    if not TORCH_2_0:  # torch profiler implemented in torch>=2.0
        return 0.0
-    model = de_parallel(model)
+    model = unwrap_model(model)
    p = next(model.parameters())
    if not isinstance(imgsz, list):
        imgsz = [imgsz, imgsz]  # expand if int/float
@@ -577,17 +577,24 @@ def is_parallel(model):
     return isinstance(model, (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel))
 
 
-def de_parallel(model):
+def unwrap_model(m: nn.Module) -> nn.Module:
     """
-    De-parallelize a model: return single-GPU model if model is of type DP or DDP.
+    Unwrap compiled and parallel models to get the base model.
 
     Args:
-        model (nn.Module): Model to de-parallelize.
+        m (nn.Module): A model that may be wrapped by torch.compile (._orig_mod) or parallel wrappers such as
+            DataParallel/DistributedDataParallel (.module).
 
     Returns:
-        (nn.Module): De-parallelized model.
+        m (nn.Module): The unwrapped base model without compile or parallel wrappers.
     """
-    return model.module if is_parallel(model) else model
+    while True:
+        if hasattr(m, "_orig_mod") and isinstance(m._orig_mod, nn.Module):
+            m = m._orig_mod
+        elif hasattr(m, "module") and isinstance(m.module, nn.Module):
+            m = m.module
+        else:
+            return m
 
 
 def one_cycle(y1=0.0, y2=1.0, steps=100):
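For readers skimming the diff: the new unwrap_model is a strict superset of the old de_parallel, since it also peels the wrapper that torch.compile adds. A minimal sketch of the difference, assuming the 8.3.197 wheel and torch>=2.0 are installed (the toy Linear module below is illustrative only, not from the package):

import torch
import torch.nn as nn
from ultralytics.utils.torch_utils import unwrap_model  # replaces de_parallel in this release

base = nn.Linear(8, 4)
wrapped = nn.DataParallel(base)       # parallel wrapper: base model lives at .module
if hasattr(torch, "compile"):         # torch>=2.0 only
    wrapped = torch.compile(wrapped)  # compile wrapper: base model lives at ._orig_mod

# unwrap_model loops over both attributes, so the original module is recovered even
# when the wrappers are nested; de_parallel only handled .module.
assert unwrap_model(wrapped) is base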
@@ -669,7 +676,7 @@ class ModelEMA:
             tau (int, optional): EMA decay time constant.
             updates (int, optional): Initial number of updates.
         """
-        self.ema = deepcopy(de_parallel(model)).eval()  # FP32 EMA
+        self.ema = deepcopy(unwrap_model(model)).eval()  # FP32 EMA
         self.updates = updates  # number of EMA updates
         self.decay = lambda x: decay * (1 - math.exp(-x / tau))  # decay exponential ramp (to help early epochs)
         for p in self.ema.parameters():
@@ -687,7 +694,7 @@ class ModelEMA:
         self.updates += 1
         d = self.decay(self.updates)
 
-        msd = de_parallel(model).state_dict()  # model state_dict
+        msd = unwrap_model(model).state_dict()  # model state_dict
         for k, v in self.ema.state_dict().items():
             if v.dtype.is_floating_point:  # true for FP16 and FP32
                 v *= d
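The decay lambda visible in the ModelEMA hunks ramps the EMA weight up over the first few thousand updates, which is what the "(to help early epochs)" comment refers to. A quick worked example; decay=0.9999 and tau=2000 are assumed defaults for illustration, not values shown in this diff:

import math

decay, tau = 0.9999, 2000                           # assumed ModelEMA defaults
ramp = lambda x: decay * (1 - math.exp(-x / tau))   # same ramp as in the diff above

print(round(ramp(1), 6))      # ~0.0005: early on, the EMA copies the raw model almost directly
print(round(ramp(2000), 4))   # ~0.6321: after tau updates, past values start to dominate
print(round(ramp(20000), 4))  # ~0.9999: the ramp converges to the nominal decay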
@@ -997,3 +1004,76 @@ class FXModel(nn.Module):
             x = m(x)  # run
             y.append(x)  # save output
         return x
+
+
+def attempt_compile(
+    model: torch.nn.Module,
+    device: torch.device,
+    imgsz: int = 640,
+    use_autocast: bool = False,
+    warmup: bool = False,
+    prefix: str = colorstr("compile:"),
+) -> torch.nn.Module:
+    """
+    Compile a model with torch.compile and optionally warm up the graph to reduce first-iteration latency.
+
+    This utility attempts to compile the provided model using the inductor backend with dynamic shapes enabled and an
+    autotuning mode. If compilation is unavailable or fails, the original model is returned unchanged. An optional
+    warmup performs a single forward pass on a dummy input to prime the compiled graph and measure compile/warmup time.
+
+    Args:
+        model (torch.nn.Module): Model to compile.
+        device (torch.device): Inference device used for warmup and autocast decisions.
+        imgsz (int, optional): Square input size to create a dummy tensor with shape (1, 3, imgsz, imgsz) for warmup.
+        use_autocast (bool, optional): Whether to run warmup under autocast on CUDA or MPS devices.
+        warmup (bool, optional): Whether to execute a single dummy forward pass to warm up the compiled model.
+        prefix (str, optional): Message prefix for logger output.
+
+    Returns:
+        model (torch.nn.Module): Compiled model if compilation succeeds, otherwise the original unmodified model.
+
+    Notes:
+        - If the current PyTorch build does not provide torch.compile, the function returns the input model immediately.
+        - Warmup runs under torch.inference_mode and may use torch.autocast for CUDA/MPS to align compute precision.
+        - CUDA devices are synchronized after warmup to account for asynchronous kernel execution.
+
+    Examples:
+        >>> device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        >>> # Try to compile and warm up a model with a 640x640 input
+        >>> model = attempt_compile(model, device=device, imgsz=640, use_autocast=True, warmup=True)
+    """
+    if not hasattr(torch, "compile"):
+        return model
+
+    LOGGER.info(f"{prefix} starting torch.compile...")
+    t0 = time.perf_counter()
+    try:
+        model = torch.compile(model, mode="max-autotune", backend="inductor")
+    except Exception as e:
+        LOGGER.warning(f"{prefix} torch.compile failed, continuing uncompiled: {e}")
+        return model
+    t_compile = time.perf_counter() - t0
+
+    t_warm = 0.0
+    if warmup:
+        # Use a single dummy tensor to build the graph shape state and reduce first-iteration latency
+        dummy = torch.zeros(1, 3, imgsz, imgsz, device=device)
+        if use_autocast and device.type == "cuda":
+            dummy = dummy.half()
+        t1 = time.perf_counter()
+        with torch.inference_mode():
+            if use_autocast and device.type in {"cuda", "mps"}:
+                with torch.autocast(device.type):
+                    _ = model(dummy)
+            else:
+                _ = model(dummy)
+        if device.type == "cuda":
+            torch.cuda.synchronize(device)
+        t_warm = time.perf_counter() - t1
+
+    total = t_compile + t_warm
+    if warmup:
+        LOGGER.info(f"{prefix} complete in {total:.1f}s (compile {t_compile:.1f}s + warmup {t_warm:.1f}s)")
+    else:
+        LOGGER.info(f"{prefix} compile complete in {t_compile:.1f}s (no warmup)")
+    return model
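The new helper is designed to fail soft, so callers can attempt compilation unconditionally. A hypothetical end-to-end sketch tying it together with unwrap_model; this diff does not show the package's own call sites, so the wiring and the toy Sequential model below are illustrative assumptions:

import torch
import torch.nn as nn
from ultralytics.utils.torch_utils import attempt_compile, unwrap_model

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.SiLU()).to(device)

# Returns the original model untouched if torch.compile is missing or raises,
# so no try/except is needed at the call site.
model = attempt_compile(model, device=device, imgsz=64, warmup=device.type == "cuda")

# Anything that needs the plain module (EMA copies, FLOPs counting, state_dict
# saving in this diff) goes through unwrap_model, which also strips the compile wrapper.
plain = unwrap_model(model)
print(type(plain).__name__)  # Sequential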