PyPI - diffcb - Versions diffs - 0.1.0__py3-none-any.whl - Mend

diffcb 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

dcb/__init__.py +22 -0
dcb/diagnostics.py +163 -0
dcb/fft_kde.py +128 -0
dcb/kde.py +394 -0
dcb/layer.py +231 -0
dcb/solver.py +604 -0
dcb/utils.py +183 -0
diffcb-0.1.0.dist-info/METADATA +148 -0
diffcb-0.1.0.dist-info/RECORD +11 -0
diffcb-0.1.0.dist-info/WHEEL +4 -0
diffcb-0.1.0.dist-info/licenses/LICENSE +21 -0

dcb/solver.py ADDED Viewed

@@ -0,0 +1,604 @@
+"""
+dcb.solver — IFT Root-Finder and Backward Pass
+This module implements the two-stage computation that produces a differentiable
+h_crit: (1) a bisection search (via scipy.optimize.brentq wrapped in a
+no-grad context) that locates the root of M̃(h) - m = 0 for target mode count
+m, and (2) the Implicit Function Theorem backward pass that computes
+∂h_crit/∂X without unrolling the bisection iterations. The stabilized
+denominator sg(u, δ) = sign(u) · max(|u|, δ) with default δ=1e-4 prevents
+division by zero when ∂M̃/∂h is near zero (e.g., for distributions with
+closely spaced modes). The public interface is `find_h_crit(X, grid, eps,
+tau, target_modes=1, ...)`, which returns `(h_crit, cond_num)` for use inside
+`DCBFunction.forward`, and `ift_gradient(X, h_crit, grid, eps, tau,
+grad_output, ...)`, which implements the IFT formula for
+`DCBFunction.backward` and returns the gradient of the loss w.r.t. X.
+Round 15a: `g_brentq` parameter adds a coarse grid (default G=128) for the
+brentq objective, giving 4× fewer KDE evaluations per iteration with negligible
+root-location error. The full main grid is still used for IFT gradient.
+Round 15b: `use_hard_bisection=True` (default) routes to `find_h_crit_hard`,
+which bisects on the hard (discrete) mode count — provably non-increasing in h
+(Silverman 1981) — eliminating the ~25% false-root rate of the brentq path.
+The IFT backward is unchanged: M̃_cross is evaluated at the confirmed h_crit.
+"""
+from __future__ import annotations
+import math
+import warnings
+import torch
+from torch import Tensor
+from dcb.kde import (
+    soft_mode_count,
+    soft_mode_count_cross,
+    soft_mode_count_cross_from_derivs,
+    kde_derivatives_chunked,
+)
+from dcb.fft_kde import fft_mode_count, adaptive_fft_G
+# Sample-size threshold associated with the automatic FFT bisection switch.
+# NOTE(review): in the code visible here, find_h_crit_hard gates the FFT path
+# on `n > brentq_n_max` and never reads this constant — confirm whether the
+# rest of the module (or another module) actually uses it.
+_AUTO_FFT_THRESHOLD = 50_000  # n above which FFT bisection activates (use_fft_effective)
+def hard_mode_count(f_prime: Tensor, grid: Tensor) -> int:
+    """Return the number of modes implied by a sampled KDE derivative.
+    A mode is registered at every adjacent pair of samples where f' is
+    strictly positive on the left and non-positive on the right.
+    Parameters
+    ----------
+    f_prime : Tensor, shape (G,)
+        First derivative of the KDE sampled along the grid.
+    grid : Tensor, shape (G,)
+        Evaluation grid. Accepted for interface symmetry; not read here.
+    Returns
+    -------
+    int
+        Number of positive-to-non-positive transitions of f'.
+    """
+    positive_left = f_prime[:-1] > 0
+    non_positive_right = f_prime[1:] <= 0
+    crossings = torch.logical_and(positive_left, non_positive_right)
+    return int(crossings.sum().item())
+def find_h_crit_hard(
+    X: Tensor,
+    grid: Tensor,
+    target_modes: int,
+    chunk_size: int,
+    brentq_n_max: int,
+    h_lo: float,
+    h_hi: float,
+    formula: str = 'cross',
+    tol: float = 1e-6,
+    eps: float = 0.1,
+    tau: float = 0.2,
+    use_fft: bool = False,
+) -> tuple[float, float]:
+    """Find h_crit via hard-mode-count bisection (monotone, no false roots).
+    Uses the hard (discrete) mode count — the number of sign changes of f'
+    from + to − — which is provably non-increasing in h (Silverman 1981).
+    Bisects h until `hard_mode_count(f'_h) <= target_modes`. This approach
+    cannot produce false roots because the objective is monotone.
+    The bracket is first adjusted: h_lo is halved (up to 30 times, stopping
+    below 1e-10) until the count exceeds target_modes, and h_hi is doubled
+    (up to 30 times) until the count is <= target_modes. If either adjustment
+    fails, bisection still runs on the unadjusted bound; no error is raised.
+    The IFT-backward condition number is still computed via the soft mode
+    count evaluated around the confirmed h_crit for diagnostic purposes.
+    Parameters
+    ----------
+    X : Tensor, shape (n,)
+    grid : Tensor, shape (G,)
+    target_modes : int
+    chunk_size : int
+        Chunk length forwarded to `kde_derivatives_chunked` (direct-KDE path).
+    brentq_n_max : int
+        Subsample X to this size for the bisection loop (direct-KDE path) and
+        for the condition-number diagnostic.
+    h_lo, h_hi : float
+        Initial search bracket.
+    formula : str
+        Used only for condition-number computation (default 'cross').
+    tol : float
+        Absolute bisection tolerance on h (default 1e-6).
+    eps, tau : float
+        Soft mode count parameters for condition-number diagnostic.
+    use_fft : bool
+        If True (Round 18b) and n > brentq_n_max, use FFT-based mode counting
+        on the full X for bisection. Otherwise use the chunked KDE approach,
+        on a random subsample of size brentq_n_max when n > brentq_n_max.
+    Returns
+    -------
+    (h_crit, cond_num) : (float, float)
+        h_crit: upper end of the final bisection bracket (smallest h found
+        with hard count <= target_modes).
+        cond_num: |∂M̃/∂h| at h_crit from a central finite difference with
+        step 1e-4 * h_crit (large = well-conditioned IFT).
+    Warns
+    -----
+    UserWarning
+        When the direct-KDE path has to subsample X (n > brentq_n_max).
+    """
+    mode_fn = _get_mode_count_fn(formula)
+    n = X.shape[0]
+    # FFT is only beneficial (and reliable) when n > brentq_n_max.
+    # For small n the histogram is too sparse (n/G < 1) and produces
+    # spurious sign changes.  Fall back to direct KDE — there is no
+    # subsampling bias to fix when n ≤ brentq_n_max anyway.
+    use_fft_effective = use_fft and (n > brentq_n_max)
+    if not use_fft_effective and n > brentq_n_max:
+        with torch.no_grad():
+            idx = torch.randperm(n, device=X.device)[:brentq_n_max]
+            X_sub = X[idx]
+        bias_factor = (brentq_n_max / n) ** (-0.2)
+        warnings.warn(
+            f"DCB: n={n} > brentq_n_max={brentq_n_max}. "
+            f"h_crit estimated on {brentq_n_max}-point subsample; "
+            f"expected upward bias ~{bias_factor:.2f}x vs full-data h_crit. "
+            "Use use_fft=True to eliminate subsampling bias.",
+            UserWarning,
+            stacklevel=4,
+        )
+    else:
+        X_sub = X
+    if use_fft_effective:
+        # Compute adaptive FFT grid size before bisection; it is fixed from
+        # the *initial* h_hi and reused for every evaluation.
+        with torch.no_grad():
+            sigma = X.std().item()
+            if sigma == 0.0:
+                sigma = 1.0
+            lo_domain = X.min().item() - 3 * sigma
+            hi_domain = X.max().item() + 3 * sigma
+            data_range = hi_domain - lo_domain
+        G_fft = adaptive_fft_G(data_range, h_hi)
+        def count_at(h: float) -> int:
+            # FFT mode count on the full data set.
+            return fft_mode_count(X, h, G=G_fft)
+    else:
+        def count_at(h: float) -> int:
+            # Direct chunked KDE on the (possibly subsampled) data.
+            _, fp, _ = kde_derivatives_chunked(X_sub, h, grid, chunk_size)
+            return hard_mode_count(fp, grid)
+    with torch.no_grad():
+        h_crit = _bisect_hard_mode_count(count_at, h_lo, h_hi, target_modes, tol)
+    # Condition number: |∂M̃/∂h| via finite difference at h_crit (for diagnostics).
+    # When the FFT path ran, X_sub is the full X (n points). The soft mode count
+    # builds an (n × G) matrix, so we cap to brentq_n_max to avoid O(n×G) OOM
+    # at large n.
+    dh = h_crit * 1e-4
+    with torch.no_grad():
+        X_cond = X_sub
+        if X_sub.shape[0] > brentq_n_max:
+            idx_cond = torch.randperm(X_sub.shape[0], device=X_sub.device)[:brentq_n_max]
+            X_cond = X_sub[idx_cond]
+        m_plus  = mode_fn(X_cond, h_crit + dh, grid, eps, tau).item()
+        m_minus = mode_fn(X_cond, h_crit - dh, grid, eps, tau).item()
+    cond_num = abs((m_plus - m_minus) / (2 * dh))
+    return h_crit, float(cond_num)
+def _bisect_hard_mode_count(count_at, h_lo, h_hi, target_modes, tol) -> float:
+    """Adjust the bracket, then bisect for the smallest h with count <= target.
+    `count_at(h)` must return the hard mode count at bandwidth h. Shared by the
+    FFT and direct-KDE paths of `find_h_crit_hard`.
+    """
+    # Verify bracket: need count > target at h_lo, count <= target at h_hi.
+    if count_at(h_lo) <= target_modes:
+        # h_lo is already in the target regime — shrink it.
+        h_lo_try = h_lo
+        for _ in range(30):
+            h_lo_try *= 0.5
+            if h_lo_try < 1e-10:
+                break
+            if count_at(h_lo_try) > target_modes:
+                h_lo = h_lo_try
+                break
+    if count_at(h_hi) > target_modes:
+        # h_hi is still in the multi-mode regime — grow it.
+        for _ in range(30):
+            h_hi *= 2.0
+            if count_at(h_hi) <= target_modes:
+                break
+    # Standard bisection: at most 50 halvings, stopping early at width < tol.
+    lo, hi = h_lo, h_hi
+    for _ in range(50):
+        mid = (lo + hi) / 2.0
+        if count_at(mid) <= target_modes:
+            hi = mid
+        else:
+            lo = mid
+        if (hi - lo) < tol:
+            break
+    return float(hi)  # smallest h with count <= target_modes
+def sg(u: Tensor, delta: float = 1e-4) -> Tensor:
+    """Sign-preserving magnitude floor used as a safe denominator.
+    Evaluates sign(u) * max(|u|, delta) elementwise; entries that are exactly
+    zero map to +delta.
+    Parameters
+    ----------
+    u : Tensor
+        Values to stabilise.
+    delta : float
+        Smallest magnitude allowed in the result, default 1e-4.
+    Returns
+    -------
+    Tensor
+        Tensor with the same shape as u.
+    """
+    floor_value = torch.full_like(u, delta)
+    floored_magnitude = torch.clamp(torch.abs(u), min=delta)
+    signed = torch.sign(u) * floored_magnitude
+    return torch.where(u == 0, floor_value, signed)
+def _get_mode_count_fn(formula: str):
+    """Select the soft mode-count implementation named by `formula`.
+    Returns `soft_mode_count_cross` when formula is exactly 'cross'; every
+    other value falls through to `soft_mode_count`.
+    """
+    return soft_mode_count_cross if formula == 'cross' else soft_mode_count
+def find_h_crit(
+    X: Tensor,
+    grid: Tensor,
+    eps: float,
+    tau: float,
+    target_modes: int = 1,
+    h_lo: float | None = None,
+    h_hi: float | None = None,
+    formula: str = 'cross',
+    brentq_n_max: int = 50_000,
+    chunk_size: int = 50_000,
+    g_brentq: int = 128,
+    use_hard_bisection: bool = True,
+    use_fft: bool = True,
+) -> tuple[float, float]:
+    """Find h_crit and return (h_crit, condition_number).
+    When use_hard_bisection=True (default, Round 15b), dispatches to
+    `find_h_crit_hard` which bisects on the hard (discrete) mode count —
+    provably non-increasing in h (Silverman 1981), no false roots.
+    When use_hard_bisection=False, locates the root of
+    G(h, X) = M̃(h; X) - threshold = 0 via Brent's method on a coarse
+    grid of g_brentq points (Round 15a: 4× fewer KDE evaluations per call),
+    where threshold is target_modes + 0.5 for 'cross' and target_modes
+    otherwise.
+    Also returns |∂M̃/∂h| at h_crit as a condition-number diagnostic:
+    values near zero indicate bifurcation instability (IFT denominator small).
+    Parameters
+    ----------
+    X : Tensor, shape (n,)
+    grid : Tensor, shape (G,)
+    eps, tau : float
+    target_modes : int
+    h_lo, h_hi : float or None
+        Search bracket. A missing bound defaults to 1e-3 * std(X) (h_lo) or
+        3.0 * std(X) (h_hi).
+    formula : str
+        'cross' (default) — use M̃_cross (crossing-count, fixes m=1 bias).
+        'integral' — use original M̃ (legacy, known m=1 failure).
+    brentq_n_max : int
+        Maximum number of points used by the root search; larger X is randomly
+        subsampled (and, in the hard path, is the n above which FFT is used).
+    chunk_size : int
+        Forwarded to `find_h_crit_hard`. Ignored when use_hard_bisection=False.
+    g_brentq : int
+        Grid resolution for the brentq objective (default 128). Ignored when
+        use_hard_bisection=True.
+    use_hard_bisection : bool
+        If True (default), use hard-mode-count bisection (no false roots).
+        If False, use legacy brentq on the soft count with coarse grid g_brentq.
+    use_fft : bool
+        Default True. Forwarded to `find_h_crit_hard`, which uses FFT-based
+        mode counting when n > brentq_n_max and direct KDE otherwise.
+        Ignored when use_hard_bisection=False.
+    Returns
+    -------
+    (h_crit, cond_num) : (float, float)
+        h_crit: the critical bandwidth.
+        cond_num: |∂M̃/∂h| at h_crit (large = well-conditioned IFT).
+        In the brentq path, (bracket endpoint, 0.0) is returned when no sign
+        change can be bracketed.
+    """
+    # Only touch the data (and force a device sync) when a bound is missing.
+    if h_lo is None or h_hi is None:
+        with torch.no_grad():
+            std_val = X.std().item()
+        if h_lo is None:
+            h_lo = 1e-3 * std_val
+        if h_hi is None:
+            h_hi = 3.0 * std_val
+    if use_hard_bisection:
+        return find_h_crit_hard(
+            X, grid, target_modes, chunk_size, brentq_n_max,
+            h_lo, h_hi, formula=formula, eps=eps, tau=tau,
+            use_fft=use_fft,
+        )
+    # scipy is only needed by the legacy path, so it is imported lazily.
+    from scipy.optimize import brentq
+    mode_fn = _get_mode_count_fn(formula)
+    with torch.no_grad():
+        n = X.shape[0]
+        if n > brentq_n_max:
+            idx = torch.randperm(n, device=X.device)[:brentq_n_max]
+            X_brentq = X[idx]
+        else:
+            X_brentq = X
+        # Build a coarse grid for brentq objective evaluation (Round 15a).
+        # g_brentq=128 gives 4× fewer KDE evaluations than G=512 with negligible
+        # root-location error — sign detection only needs coarse resolution.
+        grid_lo = grid[0].item()
+        grid_hi = grid[-1].item()
+        grid_coarse = torch.linspace(
+            grid_lo, grid_hi, g_brentq, dtype=grid.dtype, device=grid.device
+        )
+        # For 'cross': M̃_cross has a soft floor ~(target_modes + 0.001) in the
+        # target-mode regime, so a hard threshold of target_modes never cleanly
+        # separates the transition from the plateau.  Using target_modes + 0.5
+        # places the root at the center of the sharp mode-merging transition.
+        # For 'integral': the formula has no such floor; use target_modes directly.
+        threshold = target_modes + (0.5 if formula == 'cross' else 0)
+        def f(h: float) -> float:
+            return mode_fn(X_brentq, h, grid_coarse, eps, tau).item() - threshold
+        f_lo = f(h_lo)
+        f_hi = f(h_hi)
+        if f_lo <= 0:
+            # Objective is already non-positive at h_lo: scan upward
+            # geometrically for a point where it turns positive.
+            h_scan = h_lo
+            for _ in range(40):
+                h_scan *= 1.5
+                if h_scan >= h_hi:
+                    break
+                f_scan = f(h_scan)
+                if f_scan > 0:
+                    h_lo = h_scan
+                    f_lo = f_scan
+                    break
+            else:
+                # All 40 steps stayed below h_hi without a positive value.
+                return h_lo, 0.0
+            if f_lo <= 0:
+                # Scan ran past h_hi without finding a positive value.
+                return h_lo, 0.0
+        if f_hi > 0:
+            # Upper bound still above threshold: widen once, then give up.
+            h_hi *= 3
+            f_hi = f(h_hi)
+            if f_hi > 0:
+                return h_hi, 0.0
+        h_crit = brentq(f, h_lo, h_hi)
+    # Condition number: |∂M̃/∂h| via finite difference at h_crit
+    dh = h_crit * 1e-4
+    with torch.no_grad():
+        m_plus  = mode_fn(X_brentq, h_crit + dh, grid_coarse, eps, tau).item()
+        m_minus = mode_fn(X_brentq, h_crit - dh, grid_coarse, eps, tau).item()
+    cond_num = abs((m_plus - m_minus) / (2 * dh))
+    return float(h_crit), float(cond_num)
+def _analytical_dM_dX(
+    X: Tensor,
+    h: float,
+    grid: Tensor,
+    dM_dfp: Tensor,
+    dM_dfpp: Tensor,
+    chunk_size: int,
+) -> Tensor:
+    """Chunked analytical ∂M̃/∂X using the KDE chain rule.
+    Avoids materialising the full n×G autograd graph. Derived from:
+        ∂f'(grid_j)/∂X_i  = (1/n) K_ij · (1/h² − diff²_ij/h⁴)
+        ∂f''(grid_j)/∂X_i = (1/n) K_ij · diff_ij/h⁴ · (diff²_ij/h² − 3)
+    Then ∂M̃/∂X_i = Σ_j [∂M̃/∂f'_j · ∂f'_j/∂X_i + ∂M̃/∂f''_j · ∂f''_j/∂X_i],
+    with diff_ij = grid_j − X_i and K_ij the Gaussian kernel of bandwidth h.
+    Parameters
+    ----------
+    X : Tensor, shape (n,)
+        Data points.
+    h : float
+        Bandwidth at which the derivatives are evaluated.
+    grid : Tensor, shape (G,)
+        Evaluation grid; treated as constant (no gradient through the grid).
+    dM_dfp, dM_dfpp : Tensor, shape (G,)
+        ∂M̃/∂f' and ∂M̃/∂f'' on the grid.
+    chunk_size : int
+        Number of data points processed per (chunk × G) block.
+    Returns
+    -------
+    Tensor, shape (n,)
+        ∂M̃/∂X, computed without building an autograd graph.
+    """
+    n = X.shape[0]
+    h2 = h * h
+    h4 = h2 * h2
+    # The Gaussian normaliser does not depend on the chunk; compute it once.
+    kernel_norm = math.sqrt(2 * math.pi) * h
+    out = torch.zeros(n, dtype=X.dtype, device=X.device)
+    with torch.no_grad():
+        for start in range(0, n, chunk_size):
+            stop = start + chunk_size
+            diff = grid.unsqueeze(0) - X[start:stop].unsqueeze(1)   # (c, G)
+            diff_sq = diff ** 2
+            K = torch.exp(-0.5 * (diff / h) ** 2) / kernel_norm
+            # ∂f'_j/∂X_i (the 1/n factor is applied once at the end)
+            gfp = K * (1.0 / h2 - diff_sq / h4)                     # (c, G)
+            # ∂f''_j/∂X_i
+            gfpp = K * diff / h4 * (diff_sq / h2 - 3.0)             # (c, G)
+            out[start:stop] = (dM_dfp * gfp + dM_dfpp * gfpp).sum(1)
+    return out / n
+def ift_gradient(
+    X: Tensor,
+    h_crit: float,
+    grid: Tensor,
+    eps: float,
+    tau: float,
+    grad_output: Tensor,
+    delta: float = 1e-4,
+    formula: str = 'cross',
+    chunk_size: int = 50_000,
+    analytical_n_thresh: int = 10_000,
+    safe_backward: bool = False,
+) -> Tensor:
+    """Compute the IFT gradient ∂h_crit/∂X and chain with upstream grad.
+    Implements the Implicit Function Theorem formula:
+        ∂h_crit/∂X = -(∂M̃/∂h)^{-1} · (∂M̃/∂X)  evaluated at h = h_crit
+    Two code paths produce ∂M̃/∂h and ∂M̃/∂X:
+    * n > analytical_n_thresh and formula == 'cross': chunked analytical
+      chain rule for ∂M̃/∂X and a central finite difference (step
+      1e-4 * h_crit) for ∂M̃/∂h, using the passed `grid` as a constant.
+    * otherwise: autograd through the soft mode count, with the evaluation
+      grid rebuilt from X (min/max ± 3·std, same length as `grid`) so the
+      grid's dependence on X contributes to ∂M̃/∂X.
+    The denominator is stabilized by sg(·) to prevent division by zero.
+    Parameters
+    ----------
+    X : Tensor, shape (n,)
+        Observed data points (need not require grad on entry).
+    h_crit : float
+        Critical bandwidth found by find_h_crit.
+    grid : Tensor, shape (G,)
+        Uniform evaluation grid. The autograd path uses only its length.
+    eps : float
+        Width of the Gaussian delta approximation.
+    tau : float
+        Sigmoid temperature.
+        NOTE(review): for formula != 'cross' the autograd path recomputes
+        eps/tau from X with hard-coded coefficients 0.1 / 0.2 and does not
+        read these arguments — confirm the coefficients match the caller.
+    grad_output : Tensor, shape ()
+        Upstream scalar gradient from the loss w.r.t. h_crit.
+    delta : float
+        Stabilisation floor for the sg denominator. Default 1e-4. Only used
+        when safe_backward is False and |∂M̃/∂h| >= 0.01.
+    formula : str
+        'cross' selects the crossing-count soft mode count; any other value
+        selects the original soft mode count.
+    chunk_size : int
+        Chunk length for the large-n analytical path.
+    analytical_n_thresh : int
+        Sample size above which the analytical path is used (formula 'cross').
+    safe_backward : bool
+        If True, the denominator magnitude is floored at 0.1.
+    Returns
+    -------
+    Tensor, shape (n,)
+        Gradient of the loss w.r.t. X.
+    Notes
+    -----
+    Records diagnostics as attributes on the function object:
+    `last_denom_abs`, `last_denom_signed`, `last_guard_triggered`,
+    `last_grad_norm`. Emits a warning when |∂M̃/∂h| < 0.01.
+    """
+    # torch.autograd.Function.backward runs under no_grad; we need enable_grad
+    # so that the forward pass through soft_mode_count builds a graph that lets
+    # us differentiate w.r.t. both h_tensor and X_req.
+    #
+    # Total-derivative IFT: h_crit is a function of X both as KDE data and as
+    # the source of the evaluation grid Ω(X).  We must differentiate M̃ w.r.t.
+    # X in both roles to get the correct total ∂M/∂X.
+    #
+    # We do this by rebuilding a differentiable grid from X_req inside the
+    # grad context, so autograd sees both paths automatically.
+    G = grid.shape[0]
+    margin_sigma = 3.0
+    eps_coeff = 0.1
+    tau_coeff = 0.2
+    n = X.shape[0]
+    if n > analytical_n_thresh and formula == 'cross':
+        # Large-n analytical path — O(chunk×G) peak memory.
+        # Avoids building an n×G autograd graph; uses the KDE chain rule instead.
+        # Step 1: KDE derivatives at h_crit (chunked, no graph)
+        with torch.no_grad():
+            f, fp, fpp = kde_derivatives_chunked(X, h_crit, grid, chunk_size)
+        # Step 2: ∂M̃/∂fp and ∂M̃/∂fpp via G-dim autograd only
+        with torch.enable_grad():
+            fp_req  = fp.detach().requires_grad_(True)
+            fpp_req = fpp.detach().requires_grad_(True)
+            M_val = soft_mode_count_cross_from_derivs(
+                f.detach(), fp_req, fpp_req, grid, eps, tau
+            )
+            dM_dfp, dM_dfpp = torch.autograd.grad(
+                M_val, [fp_req, fpp_req], retain_graph=False
+            )
+        # Step 3: ∂M̃/∂h via central finite differences (2 chunked KDE passes)
+        dh = h_crit * 1e-4
+        with torch.no_grad():
+            f_p, fp_p, fpp_p = kde_derivatives_chunked(X, h_crit + dh, grid, chunk_size)
+            f_m, fp_m, fpp_m = kde_derivatives_chunked(X, h_crit - dh, grid, chunk_size)
+            M_plus  = soft_mode_count_cross_from_derivs(f_p, fp_p, fpp_p, grid, eps, tau)
+            M_minus = soft_mode_count_cross_from_derivs(f_m, fp_m, fpp_m, grid, eps, tau)
+        dM_dh = torch.tensor(
+            float((M_plus - M_minus) / (2 * dh)), dtype=X.dtype, device=X.device
+        )
+        # Step 4: ∂M̃/∂X analytically (chunked chain rule, O(chunk×G) memory)
+        dM_dX = _analytical_dM_dX(X, h_crit, grid, dM_dfp, dM_dfpp, chunk_size)
+    else:
+        # Small-n autograd path — total-derivative IFT through data + grid
+        # (+ eps/tau for non-'cross' formulas, where they are rebuilt from X_req).
+        with torch.enable_grad():
+            h_tensor = torch.tensor(h_crit, dtype=X.dtype, device=X.device, requires_grad=True)
+            X_req = X.detach().requires_grad_(True)
+            n_req = X_req.shape[0]
+            std_x = X_req.std()
+            lo = X_req.min() - margin_sigma * std_x
+            hi = X_req.max() + margin_sigma * std_x
+            t = torch.linspace(0.0, 1.0, G, dtype=X.dtype, device=X.device)
+            grid_diff = lo + (hi - lo) * t
+            mode_fn = _get_mode_count_fn(formula)
+            if formula == 'cross':
+                # eps/tau enter as constants: no gradient flows through them.
+                eps_diff = torch.tensor(eps, dtype=X.dtype, device=X.device)
+                tau_diff = torch.tensor(tau, dtype=X.dtype, device=X.device)
+            else:
+                # Pilot KDE at bandwidth 0.9·std·n^(-1/5) gives data-dependent
+                # eps/tau that stay attached to the graph.
+                h0_diff = 0.9 * std_x * (n_req ** -0.2)
+                u = (grid_diff.unsqueeze(0) - X_req.unsqueeze(1)) / h0_diff
+                K0 = torch.exp(-0.5 * u ** 2) / (math.sqrt(2 * math.pi) * h0_diff)
+                f_prime0 = (-u / h0_diff * K0).mean(dim=0)
+                f_dbl0 = ((u ** 2 - 1.0) / h0_diff ** 2 * K0).mean(dim=0)
+                eps_diff = eps_coeff * f_prime0.std()
+                tau_diff = tau_coeff * f_dbl0.abs().median()
+            M = mode_fn(X_req, h_tensor, grid_diff, eps_diff, tau_diff)
+            # retain_graph=True on the first call: the graph is reused for ∂M̃/∂X.
+            dM_dh = torch.autograd.grad(M, h_tensor, retain_graph=True, create_graph=False)[0]
+            dM_dX = torch.autograd.grad(M, X_req, retain_graph=False, create_graph=False)[0]
+    # IFT formula with stabilized denominator + denom guard.
+    DENOM_GUARD = 0.01
+    SAFE_GUARD  = 0.1   # 10× stricter clamp when safe_backward=True
+    denom_abs = dM_dh.abs().item()
+    denom_signed = dM_dh.item()            # raw signed value before any clamping
+    ift_gradient.last_denom_abs    = denom_abs
+    ift_gradient.last_denom_signed = denom_signed
+    ift_gradient.last_guard_triggered = denom_abs < DENOM_GUARD
+    if denom_abs < DENOM_GUARD:
+        warnings.warn(
+            f"DCB IFT denominator |∂M̃/∂h|={denom_abs:.2e} < {DENOM_GUARD}. "
+            "Gradient may be large. Use safe_backward=True to clamp.",
+            stacklevel=3,
+        )
+    if safe_backward and denom_abs == 0.0:
+        # Exact-zero edge case: sg(0, delta) always returns +delta, which can
+        # invert the gradient sign if dM_dh approached zero from the negative
+        # side.  Use copysign to preserve whatever sign the raw value had
+        # (copysign(SAFE_GUARD, 0.0) = +SAFE_GUARD by IEEE convention, but
+        # this is no worse than the original behaviour and documents intent).
+        safe_denom = torch.tensor(
+            math.copysign(SAFE_GUARD, denom_signed),
+            dtype=X.dtype, device=X.device,
+        )
+        dh_dX = -(1.0 / safe_denom) * dM_dX
+    else:
+        effective_guard = SAFE_GUARD if safe_backward else (DENOM_GUARD if denom_abs < DENOM_GUARD else delta)
+        dh_dX = -(1.0 / sg(dM_dh, effective_guard)) * dM_dX
+    # Chain with the upstream gradient once; reuse it for the diagnostic norm.
+    grad_X = grad_output * dh_dX
+    ift_gradient.last_grad_norm = float(grad_X.norm().item())
+    return grad_X