lucid-dl 2.11.0__py3-none-any.whl → 2.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,365 @@
+import functools
+from types import ModuleType
+from typing import Sequence
+
+import numpy as np
+
+from lucid._backend.core import Operation, func_op, _FuncOpReturnType, _GradType
+from lucid._backend.metal import mx
+
+from lucid._tensor import Tensor
+from lucid.types import _DeviceType
+
+
+def _norm_axes(ndim: int, normalized_shape: Sequence[int]) -> tuple[int, ...]:
+    return tuple(range(ndim - len(normalized_shape), ndim))
+
+
+def _broadcast_shape(ndim: int, normalized_shape: Sequence[int]) -> tuple[int, ...]:
+    return (1,) * (ndim - len(normalized_shape)) + tuple(normalized_shape)
+
+
+class layer_norm_kernel(Operation):
+    def __init__(
+        self,
+        normalized_shape: Sequence[int],
+        eps: float = 1e-5,
+        has_weight: bool = True,
+        has_bias: bool = True,
+    ) -> None:
+        super().__init__()
+        self.normalized_shape = tuple(int(v) for v in normalized_shape)
+        self.eps = float(eps)
+        self.has_weight = bool(has_weight)
+        self.has_bias = bool(has_bias)
+
+        self._xhat = None
+        self._rstd = None
+        self._norm_axes = None
+        self._n = None
+
+    def clear(self) -> None:
+        super().clear()
+        self._xhat = None
+        self._rstd = None
+        self._norm_axes = None
+        self._n = None
+
+    @func_op(n_in=3, n_ret=1, device="cpu")
+    def cpu(self, a: Tensor, w: Tensor, b: Tensor) -> _FuncOpReturnType:
+        return self._forward(a, w, b, lib_=np, device="cpu")
+
+    @func_op(n_in=3, n_ret=1, device="gpu")
+    def gpu(self, a: Tensor, w: Tensor, b: Tensor) -> _FuncOpReturnType:
+        return self._forward(a, w, b, lib_=mx, device="gpu")
+
+    def _forward(
+        self,
+        a: Tensor,
+        w: Tensor,
+        b: Tensor,
+        lib_: ModuleType,
+        device: _DeviceType,
+    ) -> _FuncOpReturnType:
+        norm_axes = _norm_axes(a.ndim, self.normalized_shape)
+        n = int(np.prod(self.normalized_shape))
+        mean = lib_.mean(a.data, axis=norm_axes, keepdims=True)
+        var = lib_.var(a.data, axis=norm_axes, keepdims=True)
+        rstd = 1.0 / lib_.sqrt(var + self.eps)
+        xhat = (a.data - mean) * rstd
+
+        out = xhat
+        if self.has_weight:
+            out = out * w.data.reshape(_broadcast_shape(a.ndim, self.normalized_shape))
+        if self.has_bias:
+            out = out + b.data.reshape(_broadcast_shape(a.ndim, self.normalized_shape))
+
+        self._xhat = xhat
+        self._rstd = rstd
+        self._norm_axes = norm_axes
+        self._n = n
+
+        self.result = Tensor(out, device=device)
+        return self.result, functools.partial(self.__grad__, a=a, w=w, lib_=lib_)
+
+    def __grad__(self, a: Tensor, w: Tensor, lib_: ModuleType) -> _GradType:
+        if self.result is None or self.result.grad is None:
+            raise RuntimeError("layer_norm backward called before forward.")
+
+        if self._xhat is None or self._rstd is None or self._norm_axes is None:
+            raise RuntimeError("layer_norm cached data missing.")
+
+        dy = self.result.grad
+        xhat = self._xhat
+        rstd = self._rstd
+        norm_axes = self._norm_axes
+        n = self._n if self._n is not None else int(np.prod(self.normalized_shape))
+
+        if self.has_weight:
+            w_broadcast = w.data.reshape(
+                _broadcast_shape(a.ndim, self.normalized_shape)
+            )
+            dyw = dy * w_broadcast
+        else:
+            dyw = dy
+
+        sum1 = lib_.sum(dyw, axis=norm_axes, keepdims=True)
+        sum2 = lib_.sum(dyw * xhat, axis=norm_axes, keepdims=True)
+
+        dx = (1.0 / n) * rstd * (n * dyw - sum1 - xhat * sum2)
+
+        reduce_axes = tuple(range(0, a.ndim - len(self.normalized_shape)))
+        if reduce_axes:
+            dweight = lib_.sum(dy * xhat, axis=reduce_axes)
+            dbias = lib_.sum(dy, axis=reduce_axes)
+        else:
+            dweight = dy * xhat
+            dbias = dy
+
+        return dx, dweight, dbias
+
+
+class batch_norm_kernel(Operation):
+    def __init__(
+        self,
+        eps: float = 1e-5,
+        momentum: float = 0.1,
+        training: bool = True,
+        has_running: bool = True,
+        has_weight: bool = True,
+        has_bias: bool = True,
+    ) -> None:
+        super().__init__()
+        self.eps = float(eps)
+        self.momentum = float(momentum)
+        self.training = bool(training)
+        self.has_running = bool(has_running)
+        self.has_weight = bool(has_weight)
+        self.has_bias = bool(has_bias)
+
+        self._xhat = None
+        self._rstd = None
+        self._axes = None
+        self._m = None
+        self._use_batch_stats = None
+
+    def clear(self) -> None:
+        super().clear()
+        self._xhat = None
+        self._rstd = None
+        self._axes = None
+        self._m = None
+        self._use_batch_stats = None
+
+    @func_op(n_in=5, n_ret=1, device="cpu")
+    def cpu(
+        self, a: Tensor, running_mean: Tensor, running_var: Tensor, w: Tensor, b: Tensor
+    ) -> _FuncOpReturnType:
+        return self._forward(a, running_mean, running_var, w, b, lib_=np, device="cpu")
+
+    @func_op(n_in=5, n_ret=1, device="gpu")
+    def gpu(
+        self, a: Tensor, running_mean: Tensor, running_var: Tensor, w: Tensor, b: Tensor
+    ) -> _FuncOpReturnType:
+        return self._forward(a, running_mean, running_var, w, b, lib_=mx, device="gpu")
+
+    def _forward(
+        self,
+        a: Tensor,
+        running_mean: Tensor,
+        running_var: Tensor,
+        w: Tensor,
+        b: Tensor,
+        lib_: ModuleType,
+        device: _DeviceType,
+    ) -> _FuncOpReturnType:
+        axes = (0,) + tuple(range(2, a.ndim))
+        m = int(np.prod([a.shape[i] for i in axes]))
+        use_batch_stats = self.training or not self.has_running
+
+        if use_batch_stats:
+            mean = lib_.mean(a.data, axis=axes, keepdims=True)
+            var = lib_.var(a.data, axis=axes, keepdims=True)
+
+            if self.training and self.has_running:
+                rm = (
+                    self.momentum * mean.reshape(-1)
+                    + (1 - self.momentum) * running_mean.data
+                )
+                rv = (
+                    self.momentum * var.reshape(-1)
+                    + (1 - self.momentum) * running_var.data
+                )
+                running_mean.data = rm
+                running_var.data = rv
+
+        else:
+            mean = running_mean.data.reshape(1, -1, *([1] * (a.ndim - 2)))
+            var = running_var.data.reshape(1, -1, *([1] * (a.ndim - 2)))
+
+        rstd = 1.0 / lib_.sqrt(var + self.eps)
+        xhat = (a.data - mean) * rstd
+
+        out = xhat
+        if self.has_weight:
+            out = out * w.data.reshape(1, -1, *([1] * (a.ndim - 2)))
+        if self.has_bias:
+            out = out + b.data.reshape(1, -1, *([1] * (a.ndim - 2)))
+
+        self._xhat = xhat
+        self._rstd = rstd
+        self._axes = axes
+        self._m = m
+        self._use_batch_stats = use_batch_stats
+
+        self.result = Tensor(out, device=device)
+        return self.result, functools.partial(self.__grad__, a=a, w=w, lib_=lib_)
+
+    def __grad__(self, a: Tensor, w: Tensor, lib_: ModuleType) -> _GradType:
+        if self.result is None or self.result.grad is None:
+            raise RuntimeError("batch_norm backward called before forward.")
+
+        if self._rstd is None or self._axes is None or self._m is None:
+            raise RuntimeError("batch_norm cached data missing.")
+
+        dy = self.result.grad
+        axes = self._axes
+        m = self._m
+
+        if self.has_weight:
+            w_broadcast = w.data.reshape(1, -1, *([1] * (a.ndim - 2)))
+            dyw = dy * w_broadcast
+        else:
+            dyw = dy
+
+        if self._use_batch_stats:
+            xhat = self._xhat
+            rstd = self._rstd
+            sum1 = lib_.sum(dyw, axis=axes, keepdims=True)
+            sum2 = lib_.sum(dyw * xhat, axis=axes, keepdims=True)
+            dx = (1.0 / m) * rstd * (m * dyw - sum1 - xhat * sum2)
+        else:
+            rstd = self._rstd
+            dx = dyw * rstd
+
+        reduce_axes = (0,) + tuple(range(2, a.ndim))
+        dweight = lib_.sum(
+            dy * (self._xhat if self._xhat is not None else 1.0), axis=reduce_axes
+        )
+        dbias = lib_.sum(dy, axis=reduce_axes)
+
+        return dx, None, None, dweight, dbias
+
+
+class group_norm_kernel(Operation):
+    def __init__(
+        self,
+        num_groups: int,
+        eps: float = 1e-5,
+        has_weight: bool = True,
+        has_bias: bool = True,
+    ) -> None:
+        super().__init__()
+        self.num_groups = int(num_groups)
+        self.eps = float(eps)
+        self.has_weight = bool(has_weight)
+        self.has_bias = bool(has_bias)
+
+        self._xhat = None
+        self._rstd = None
+        self._group_shape = None
+        self._reduce_axes = None
+        self._m = None
+
+    def clear(self) -> None:
+        super().clear()
+        self._xhat = None
+        self._rstd = None
+        self._group_shape = None
+        self._reduce_axes = None
+        self._m = None
+
+    @func_op(n_in=3, n_ret=1, device="cpu")
+    def cpu(self, a: Tensor, w: Tensor, b: Tensor) -> _FuncOpReturnType:
+        return self._forward(a, w, b, lib_=np, device="cpu")
+
+    @func_op(n_in=3, n_ret=1, device="gpu")
+    def gpu(self, a: Tensor, w: Tensor, b: Tensor) -> _FuncOpReturnType:
+        return self._forward(a, w, b, lib_=mx, device="gpu")
+
+    def _forward(
+        self,
+        a: Tensor,
+        w: Tensor,
+        b: Tensor,
+        lib_: ModuleType,
+        device: _DeviceType,
+    ) -> _FuncOpReturnType:
+        N, C, *spatial = a.shape
+        if C % self.num_groups != 0:
+            raise ValueError("num_groups must divide channels.")
+
+        group_size = C // self.num_groups
+        group_shape = (N, self.num_groups, group_size, *spatial)
+        x = a.data.reshape(group_shape)
+        reduce_axes = (2,) + tuple(range(3, x.ndim))
+        m = int(np.prod([x.shape[i] for i in reduce_axes]))
+
+        mean = lib_.mean(x, axis=reduce_axes, keepdims=True)
+        var = lib_.var(x, axis=reduce_axes, keepdims=True)
+        rstd = 1.0 / lib_.sqrt(var + self.eps)
+        xhat = (x - mean) * rstd
+
+        out = xhat.reshape(a.shape)
+        if self.has_weight:
+            out = out * w.data.reshape(1, C, *([1] * len(spatial)))
+        if self.has_bias:
+            out = out + b.data.reshape(1, C, *([1] * len(spatial)))
+
+        self._xhat = xhat
+        self._rstd = rstd
+        self._group_shape = group_shape
+        self._reduce_axes = reduce_axes
+        self._m = m
+
+        self.result = Tensor(out, device=device)
+        return self.result, functools.partial(self.__grad__, a=a, w=w, b=b, lib_=lib_)
+
+    def __grad__(self, a: Tensor, w: Tensor, b: Tensor, lib_: ModuleType) -> _GradType:
+        if self.result is None or self.result.grad is None:
+            raise RuntimeError("group_norm backward called before forward.")
+
+        if (
+            self._xhat is None
+            or self._rstd is None
+            or self._group_shape is None
+            or self._reduce_axes is None
+            or self._m is None
+        ):
+            raise RuntimeError("group_norm cached data missing.")
+
+        dy = self.result.grad
+        N, C, *spatial = a.shape
+        dy_g = dy.reshape(self._group_shape)
+        xhat = self._xhat
+        rstd = self._rstd
+        axes = self._reduce_axes
+        m = self._m
+
+        if self.has_weight:
+            w_broadcast = w.data.reshape(1, C, *([1] * len(spatial)))
+            dyw = dy * w_broadcast
+            dyw_g = dyw.reshape(self._group_shape)
+        else:
+            dyw_g = dy_g
+
+        sum1 = lib_.sum(dyw_g, axis=axes, keepdims=True)
+        sum2 = lib_.sum(dyw_g * xhat, axis=axes, keepdims=True)
+        dx_g = (1.0 / m) * rstd * (m * dyw_g - sum1 - xhat * sum2)
+        dx = dx_g.reshape(a.shape)
+
+        reduce_axes = (0,) + tuple(range(2, a.ndim))
+        dweight = lib_.sum(dy * xhat.reshape(a.shape), axis=reduce_axes)
+        dbias = lib_.sum(dy, axis=reduce_axes)
+
+        return dx, dweight, dbias
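Note on the added file: the backward pass in layer_norm_kernel.__grad__ uses the standard closed-form layer-norm input gradient, dx = (rstd / n) * (n * dyw - sum(dyw) - xhat * sum(dyw * xhat)). The following standalone NumPy sketch is not part of the package; it simply checks that closed form against a finite-difference estimate, with illustrative names and shapes.

# Standalone NumPy sketch: verify the closed-form layer-norm gradient
# against central finite differences for a small (4, 8) input.
import numpy as np

def layer_norm_ref(x, w, b, eps=1e-5):
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    xhat = (x - mean) / np.sqrt(var + eps)
    return xhat * w + b

def layer_norm_dx(x, w, dy, eps=1e-5):
    n = x.shape[-1]
    mean = x.mean(axis=-1, keepdims=True)
    rstd = 1.0 / np.sqrt(x.var(axis=-1, keepdims=True) + eps)
    xhat = (x - mean) * rstd
    dyw = dy * w
    sum1 = dyw.sum(axis=-1, keepdims=True)
    sum2 = (dyw * xhat).sum(axis=-1, keepdims=True)
    return (1.0 / n) * rstd * (n * dyw - sum1 - xhat * sum2)

rng = np.random.default_rng(0)
x = rng.normal(size=(4, 8))
w = rng.normal(size=(8,))
b = rng.normal(size=(8,))
dy = rng.normal(size=(4, 8))

dx = layer_norm_dx(x, w, dy)
num = np.zeros_like(x)
h = 1e-6
for idx in np.ndindex(*x.shape):
    xp = x.copy(); xp[idx] += h
    xm = x.copy(); xm[idx] -= h
    # scalar loss L = sum(out * dy); numerical dL/dx[idx]
    num[idx] = ((layer_norm_ref(xp, w, b) * dy).sum()
                - (layer_norm_ref(xm, w, b) * dy).sum()) / (2 * h)
assert np.allclose(dx, num, atol=1e-5)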
@@ -8,7 +8,7 @@ import numpy as np
 from lucid._tensor import Tensor
 from lucid._backend.core import (
     Operation,
-    unary_func_op,
+    func_op,
     _FuncOpReturnType,
     _GradType,
 )
@@ -92,11 +92,7 @@ def _where(lib_: ModuleType, cond: _Array, x: _Array, y: _Array) -> _Array:


 def _pool_forward_sum(
-    lib_: ModuleType,
-    x_pad: _Array,
-    out_dims: _Shape,
-    kernel_size: _Shape,
-    stride: _Shape,
+    x_pad: _Array, out_dims: _Shape, kernel_size: _Shape, stride: _Shape
 ) -> _Array:
     out = None
     for k_idx in itertools.product(*[range(k) for k in kernel_size]):
@@ -211,7 +207,7 @@ def _pool_backward_max(
     return _crop_padding(grad_input_pad, padding)


-class pool_nd(Operation):
+class pool_nd_kernel(Operation):
     def __init__(
         self,
         kernel_size: int | tuple[int, ...] | list[int],
@@ -259,7 +255,7 @@ class pool_nd(Operation):

         return kernel, stride, padding

-    @unary_func_op()
+    @func_op(n_in=1, n_ret=1)
     def cpu(self, a: Tensor) -> _FuncOpReturnType:
         kernel, stride, padding = self._normalize(a)
         out_dims = _pool_out_dims(a.shape[2:], kernel, stride, padding)
@@ -268,7 +264,7 @@ class pool_nd(Operation):

         x_pad = _pad_input(np, a.data, padding)
         if self.mode == "avg":
-            out_sum = _pool_forward_sum(np, x_pad, out_dims, kernel, stride)
+            out_sum = _pool_forward_sum(x_pad, out_dims, kernel, stride)
             out = out_sum / _prod(kernel)
         else:
             out, max_idx = _pool_forward_max(np, x_pad, out_dims, kernel, stride)
@@ -277,7 +273,7 @@ class pool_nd(Operation):
         self.result = Tensor(out)
         return self.result, partial(self.__grad__, lib_=np)

-    @unary_func_op(device="gpu")
+    @func_op(n_in=1, n_ret=1, device="gpu")
     def gpu(self, a: Tensor) -> _FuncOpReturnType:
         kernel, stride, padding = self._normalize(a)
         out_dims = _pool_out_dims(a.shape[2:], kernel, stride, padding)
@@ -286,7 +282,7 @@ class pool_nd(Operation):

         x_pad = _pad_input(mx, a.data, padding)
         if self.mode == "avg":
-            out_sum = _pool_forward_sum(mx, x_pad, out_dims, kernel, stride)
+            out_sum = _pool_forward_sum(x_pad, out_dims, kernel, stride)
             out = out_sum / _prod(kernel)
         else:
             out, max_idx = _pool_forward_max(mx, x_pad, out_dims, kernel, stride)
@@ -350,19 +346,3 @@ class pool_nd(Operation):
         if self.mode == "avg":
             return out_elems * kernel_elems
         return out_elems * max(kernel_elems - 1, 0)
-
-
-def avg_pool_nd_op(
-    kernel_size: int | tuple[int, ...] | list[int],
-    stride: int | tuple[int, ...] | list[int],
-    padding: int | tuple[int, ...] | list[int],
-) -> pool_nd:
-    return pool_nd(kernel_size, stride, padding, mode="avg")
-
-
-def max_pool_nd_op(
-    kernel_size: int | tuple[int, ...] | list[int],
-    stride: int | tuple[int, ...] | list[int],
-    padding: int | tuple[int, ...] | list[int],
-) -> pool_nd:
-    return pool_nd(kernel_size, stride, padding, mode="max")
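Aside: the signature change above drops the unused lib_ argument from _pool_forward_sum, which accumulates one strided slice of the padded input per kernel offset. The following standalone NumPy sketch illustrates that shifted-slice summation for 2D average pooling; it is illustrative only and not the package implementation.

# Illustrative NumPy sketch of shifted-slice summation for 2D average pooling.
import itertools
import numpy as np

def avg_pool2d_ref(x, kernel=(2, 2), stride=(2, 2)):
    N, C, H, W = x.shape
    out_h = (H - kernel[0]) // stride[0] + 1
    out_w = (W - kernel[1]) // stride[1] + 1
    out = np.zeros((N, C, out_h, out_w), dtype=x.dtype)
    # One strided slice per kernel offset, accumulated into the output.
    for ki, kj in itertools.product(range(kernel[0]), range(kernel[1])):
        out += x[
            :, :,
            ki : ki + out_h * stride[0] : stride[0],
            kj : kj + out_w * stride[1] : stride[1],
        ]
    return out / (kernel[0] * kernel[1])

x = np.arange(2 * 3 * 4 * 4, dtype=np.float64).reshape(2, 3, 4, 4)
pooled = avg_pool2d_ref(x)
assert pooled.shape == (2, 3, 2, 2)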
@@ -54,6 +54,10 @@ def tanh(input_: Tensor) -> Tensor:
     return _activation.tanh(input_)


+def silu(input_: Tensor) -> Tensor:
+    return _activation.silu(input_)
+
+
 def softmax(input_: Tensor, axis: int = -1) -> Tensor:
     return _activation.softmax(input_, axis)

@@ -1,6 +1,12 @@
 import lucid

 from lucid._tensor import Tensor
+from lucid.nn._kernel.activation import (
+    softmax_kernel,
+    sigmoid_kernel,
+    gelu_kernel,
+    silu_kernel,
+)


 def relu(input_: Tensor) -> Tensor:
@@ -9,14 +15,14 @@ def relu(input_: Tensor) -> Tensor:

 def leaky_relu(input_: Tensor, negative_slope: float = 0.01) -> Tensor:
     mask = input_ > 0
-    out = input_ * mask + input_ * negative_slope * (1 - mask)
+    out = input_ * mask + input_ * negative_slope * (~mask)
     return out


 def elu(input_: Tensor, alpha: float = 1.0) -> Tensor:
     mask = input_ >= 0
     pos = input_ * mask
-    neg = alpha * (lucid.exp(input_) - 1) * (1 - mask)
+    neg = alpha * (lucid.exp(input_) - 1) * (~mask)
     return pos + neg


@@ -26,29 +32,29 @@ def selu(input_: Tensor) -> Tensor:

     mask = input_ >= 0
     pos = _scale * input_ * mask
-    neg = _scale * _alpha * (lucid.exp(input_) - 1) * (1 - mask)
+    neg = _scale * _alpha * (lucid.exp(input_) - 1) * (~mask)
     return pos + neg


 def gelu(input_: Tensor) -> Tensor:
-    c = lucid.sqrt(2 / lucid.pi).free()
-    return 0.5 * input_ * (1 + lucid.tanh(c * (input_ + 0.044715 * input_**3)))
+    op = gelu_kernel()
+    return op(input_)


 def sigmoid(input_: Tensor) -> Tensor:
-    return 1 / (1 + lucid.exp(-input_))
+    op = sigmoid_kernel()
+    return op(input_)


 def tanh(input_: Tensor) -> Tensor:
     return lucid.tanh(input_)


-def softmax(input_: Tensor, axis: int = -1) -> Tensor:
-    input_max = lucid.max(input_, axis=axis, keepdims=True)
-    input_stable = input_ - input_max
+def silu(input_: Tensor) -> Tensor:
+    op = silu_kernel()
+    return op(input_)

-    e_input = lucid.exp(input_stable)
-    sum_e_input = e_input.sum(axis=axis, keepdims=True)

-    output = e_input / sum_e_input
-    return output
+def softmax(input_: Tensor, axis: int = -1) -> Tensor:
+    op = softmax_kernel(axis=axis)
+    return op(input_)
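Context: the functional gelu, sigmoid, silu, and softmax now dispatch to dedicated kernel Operations instead of composing elementary tensor ops. For reference, the removed softmax body computed the standard numerically stable form (subtract the per-axis max before exponentiating). A standalone NumPy restatement, not the package code, is shown below; the new softmax_kernel is presumably expected to produce the same values.

# Reference-only NumPy sketch of the numerically stable softmax that the
# removed pure-tensor implementation computed.
import numpy as np

def softmax_ref(x: np.ndarray, axis: int = -1) -> np.ndarray:
    x_stable = x - x.max(axis=axis, keepdims=True)  # guard against overflow in exp
    e = np.exp(x_stable)
    return e / e.sum(axis=axis, keepdims=True)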
@@ -1,7 +1,10 @@
 import math
+
 import lucid
 import lucid.nn.functional as F

+from lucid.nn._kernel.attention import scaled_dot_product_attention_kernel
+
 from lucid._tensor import Tensor


@@ -14,6 +17,12 @@ def scaled_dot_product_attention(
     is_causal: bool = False,
     scale: float | None = None,
 ) -> Tensor:
+    if dropout_p == 0.0:
+        op = scaled_dot_product_attention_kernel(
+            attn_mask=attn_mask, is_causal=is_causal, scale=scale
+        )
+        return op(query, key, value)
+
     L, S = query.shape[-2], key.shape[-2]
     scale_factor = 1 / math.sqrt(query.shape[-1]) if scale is None else scale
     attn_bias = lucid.zeros(L, S, dtype=query.dtype).free()
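Context: the dropout-free path now routes through scaled_dot_product_attention_kernel, while the existing composite implementation remains as the fallback. The math of that path is the usual softmax(Q K^T * scale) V with scale defaulting to 1/sqrt(d), as the fallback code shows. The sketch below is an illustrative NumPy reference only (attn_mask handling is omitted and causal masking is simplified); it is not the kernel itself.

# Illustrative NumPy reference for the dropout-free attention path.
import math
import numpy as np

def sdpa_ref(q, k, v, scale=None, is_causal=False):
    d = q.shape[-1]
    scale = 1.0 / math.sqrt(d) if scale is None else scale
    scores = q @ np.swapaxes(k, -1, -2) * scale
    if is_causal:
        L, S = scores.shape[-2], scores.shape[-1]
        scores = np.where(np.tril(np.ones((L, S), dtype=bool)), scores, -np.inf)
    scores -= scores.max(axis=-1, keepdims=True)  # numerically stable softmax
    w = np.exp(scores)
    w /= w.sum(axis=-1, keepdims=True)
    return w @ v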
@@ -3,7 +3,7 @@ from typing import Tuple, Optional

 import lucid
 from lucid._tensor import Tensor
-from lucid._backend.conv import conv_nd_op
+from lucid.nn._kernel.conv import conv_nd_kernel


 def unfold(
@@ -66,17 +66,6 @@ def unfold(
     return col.reshape((N_out, C_filt))


-def _conv_tensor(
-    input_: Tensor,
-    weight: Tensor,
-    stride: Tuple[int, ...],
-    padding: Tuple[int, ...],
-    dilation: Tuple[int, ...],
-    groups: int,
-) -> Tensor:
-    return conv_nd_op(stride, padding, dilation, groups)(input_, weight)
-
-
 def conv(
     input_: Tensor,
     weight: Tensor,
@@ -92,7 +81,7 @@ def conv(
     if len(stride) != len(padding) or len(stride) != len(dilation):
         raise ValueError("Stride, padding, and dilation must have the same length.")

-    out = _conv_tensor(input_, weight, stride, padding, dilation, groups)
+    out = conv_nd_kernel(stride, padding, dilation, groups)(input_, weight)

     if bias is not None:
         bias_sh = [1, weight.shape[0]] + [1] * (input_.ndim - 2)
@@ -181,9 +170,9 @@ def conv_transpose(
         zeros = lucid.zeros(*zero_shape, dtype=ups.dtype, device=ups.device)
         ups = lucid.concatenate([ups, zeros], axis=axis)

-        out_g = _conv_tensor(
-            ups, w_t, stride=(1,) * D, padding=pad_, dilation=dilation, groups=1
-        )
+        out_g = conv_nd_kernel(
+            stride=(1,) * D, padding=pad_, dilation=dilation, groups=1
+        )(ups, w_t)
         outputs.append(out_g)

     output = lucid.concatenate(outputs, axis=1)
@@ -3,6 +3,12 @@ from typing import Literal
 import lucid
 from lucid._tensor import Tensor

+from lucid.nn._kernel.loss import (
+    cross_entropy_kernel,
+    binary_cross_entropy_kernel,
+    binary_cross_entropy_with_logits_kernel,
+)
+
 _ReductionType = Literal["mean", "sum"]


@@ -55,13 +61,14 @@ def binary_cross_entropy(
     reduction: _ReductionType | None = "mean",
     eps: float = 1e-7,
 ) -> Tensor:
-    input_ = lucid.clip(input_, eps, 1 - eps)
-    loss = -target * lucid.log(input_) - (1 - target) * lucid.log(1 - input_)
+    has_weight = weight is not None
+    if weight is None:
+        weight = lucid.ones_like(input_, device=input_.device)

-    if weight is not None:
-        loss *= weight
-
-    return _loss_reduction(loss, reduction)
+    op = binary_cross_entropy_kernel(
+        reduction=reduction, eps=eps, has_weight=has_weight
+    )
+    return op(input_, target, weight)


 def binary_cross_entropy_with_logits(
@@ -71,19 +78,17 @@ def binary_cross_entropy_with_logits(
     pos_weight: Tensor | None = None,
     reduction: _ReductionType | None = "mean",
 ) -> Tensor:
-    max_val = lucid.maximum(-input_, 0)
-    sp = max_val + lucid.log(lucid.exp(-max_val) + lucid.exp(-input_ - max_val))
+    has_weight = weight is not None
+    has_pos_weight = pos_weight is not None
+    if weight is None:
+        weight = lucid.ones_like(input_, device=input_.device)
+    if pos_weight is None:
+        pos_weight = lucid.ones_like(input_, device=input_.device)

-    if pos_weight is not None:
-        coeff = 1 + (pos_weight - 1) * target
-        loss = (1 - target) * input_ + coeff * sp
-    else:
-        loss = lucid.maximum(input_, 0) - input_ * target + sp
-
-    if weight is not None:
-        loss *= weight
-
-    return _loss_reduction(loss, reduction)
+    op = binary_cross_entropy_with_logits_kernel(
+        reduction=reduction, has_weight=has_weight, has_pos_weight=has_pos_weight
+    )
+    return op(input_, target, weight, pos_weight)


 def cross_entropy(
@@ -94,20 +99,14 @@ def cross_entropy(
     eps: float = 1e-7,
     ignore_index: int | None = None,
 ) -> Tensor:
-    exp_logits = lucid.exp(input_ - lucid.max(input_, axis=1, keepdims=True))
-    prob = exp_logits / lucid.sum(exp_logits, axis=1, keepdims=True)
-
-    indices = lucid.arange(input_.shape[0], device=input_.device).astype(lucid.Int)
-    target_int = target.astype(lucid.Int)
-
-    loss = -lucid.log(prob[indices, target_int] + eps)
-    if weight is not None:
-        loss *= weight[target_int]
-
-    if ignore_index is not None:
-        return _ignore_index_loss(loss, target_int, ignore_index, reduction)
-
-    return _loss_reduction(loss, reduction)
+    has_weight = weight is not None
+    if weight is None:
+        weight = lucid.ones((input_.shape[1],), device=input_.device)
+
+    op = cross_entropy_kernel(
+        reduction=reduction, eps=eps, ignore_index=ignore_index, has_weight=has_weight
+    )
+    return op(input_, target, weight)


 def nll_loss(
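Context: the loss functions now pass an all-ones placeholder weight tensor and delegate to the fused loss kernels. For reference, the removed cross_entropy body computed a numerically stable softmax over axis 1 followed by the negative log-likelihood of the target class, with optional per-class weights. The NumPy restatement below is illustrative only; the new cross_entropy_kernel is presumably expected to match it up to the eps term and reduction handling.

# Reference-only NumPy restatement of the removed cross_entropy body
# ("mean" reduction, no ignore_index handling).
import numpy as np

def cross_entropy_ref(logits, target, weight=None, eps=1e-7):
    e = np.exp(logits - logits.max(axis=1, keepdims=True))
    prob = e / e.sum(axis=1, keepdims=True)
    idx = np.arange(logits.shape[0])
    loss = -np.log(prob[idx, target.astype(int)] + eps)
    if weight is not None:
        loss = loss * weight[target.astype(int)]
    return loss.mean()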