RubyGems - ignis - Versions diffs - 0.0.1 - Mend

ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

checksums.yaml +7 -0
data/README.md +15 -0
data/lib/ignis.rb +94 -0
data/lib/nnw/platform.rb +304 -0
data/lib/nnw/shared/event_bus.rb +240 -0
data/lib/nnw/shared/ffi_loader.rb +63 -0
data/lib/nnw/shared/memory_contract.rb +204 -0
data/lib/nnw/shared/nv_array.rb +710 -0
data/lib/nnw/shared/recovery_protocol.rb +307 -0
data/lib/nvruby/configuration.rb +217 -0
data/lib/nvruby/cuda/device.rb +275 -0
data/lib/nvruby/cuda/device_props.rb +202 -0
data/lib/nvruby/cuda/graph.rb +265 -0
data/lib/nvruby/cuda/graph_bindings.rb +119 -0
data/lib/nvruby/cuda/library_loader.rb +285 -0
data/lib/nvruby/cuda/memory.rb +410 -0
data/lib/nvruby/cuda/runtime_api.rb +804 -0
data/lib/nvruby/cuda/stream.rb +234 -0
data/lib/nvruby/dtype.rb +139 -0
data/lib/nvruby/epilogues.rb +438 -0
data/lib/nvruby/errors.rb +303 -0
data/lib/nvruby/half.rb +97 -0
data/lib/nvruby/jit/compiled_kernel.rb +80 -0
data/lib/nvruby/jit/compiler.rb +231 -0
data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
data/lib/nvruby/jit/kernel.rb +240 -0
data/lib/nvruby/jit/kernel_module.rb +133 -0
data/lib/nvruby/jit/kernels/activations.rb +179 -0
data/lib/nvruby/jit/kernels/attention.rb +504 -0
data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
data/lib/nvruby/jit/kernels/loss.rb +213 -0
data/lib/nvruby/jit/kernels/normalization.rb +200 -0
data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
data/lib/nvruby/linalg/epilog.rb +67 -0
data/lib/nvruby/linalg/matmul.rb +247 -0
data/lib/nvruby/linalg/matmul_plan.rb +229 -0
data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
data/lib/nvruby/memory/device_memory_resource.rb +106 -0
data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
data/lib/nvruby/memory/stats.rb +107 -0
data/lib/nvruby/memory.rb +124 -0
data/lib/nvruby/version.rb +5 -0
metadata +108 -0

data/lib/nvruby/jit/kernels/elementwise.rb ADDED Viewed

@@ -0,0 +1,488 @@
+# frozen_string_literal: true
+module Ignis
+  module JIT
+    module Kernels
+      # Elementwise CUDA kernels for AI tensor operations.
+      # Includes arithmetic ops, initialization, and embedding ops.
+      module Elementwise
+        class << self
+          # Elementwise addition forward: c = a + b
+          # @return [Ignis::JIT::Kernel]
+          def add_forward
+            source = <<~CUDA
+              extern "C" __global__
+              void add_forward(const float* __restrict__ a,
+                               const float* __restrict__ b,
+                               float* __restrict__ c,
+                               const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  c[idx] = a[idx] + b[idx];
+                }
+              }
+            CUDA
+            compile_cached(source, "add_forward")
+          end
+          # Elementwise addition backward: grad passes through to both inputs
+          # (identity for add — no separate kernel needed, but useful for scalar broadcast)
+          # @return [Ignis::JIT::Kernel]
+          def add_backward_broadcast
+            source = <<~CUDA
+              extern "C" __global__
+              void add_backward_broadcast(const float* __restrict__ grad_output,
+                                          float* __restrict__ grad_bias,
+                                          const int batch_size,
+                                          const int features) {
+                int f = blockIdx.x * blockDim.x + threadIdx.x;
+                if (f < features) {
+                  float sum = 0.0f;
+                  for (int b = 0; b < batch_size; b++) {
+                    sum += grad_output[b * features + f];
+                  }
+                  grad_bias[f] = sum;
+                }
+              }
+            CUDA
+            compile_cached(source, "add_backward_broadcast")
+          end
+          # Elementwise subtraction forward: c = a - b
+          # @return [Ignis::JIT::Kernel]
+          def sub_forward
+            source = <<~CUDA
+              extern "C" __global__
+              void sub_forward(const float* __restrict__ a,
+                               const float* __restrict__ b,
+                               float* __restrict__ c,
+                               const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  c[idx] = a[idx] - b[idx];
+                }
+              }
+            CUDA
+            compile_cached(source, "sub_forward")
+          end
+          # Elementwise multiplication forward: c = a * b (Hadamard product)
+          # @return [Ignis::JIT::Kernel]
+          def mul_forward
+            source = <<~CUDA
+              extern "C" __global__
+              void mul_forward(const float* __restrict__ a,
+                               const float* __restrict__ b,
+                               float* __restrict__ c,
+                               const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  c[idx] = a[idx] * b[idx];
+                }
+              }
+            CUDA
+            compile_cached(source, "mul_forward")
+          end
+          # Elementwise multiply backward for first operand: grad_a = grad * b
+          # @return [Ignis::JIT::Kernel]
+          def mul_backward
+            source = <<~CUDA
+              extern "C" __global__
+              void mul_backward(const float* __restrict__ grad_output,
+                                const float* __restrict__ other,
+                                float* __restrict__ grad_input,
+                                const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  grad_input[idx] = grad_output[idx] * other[idx];
+                }
+              }
+            CUDA
+            compile_cached(source, "mul_backward")
+          end
+          # Elementwise minimum: c = min(a, b)  (used by collective reductions)
+          # @return [Ignis::JIT::Kernel]
+          def min_forward
+            source = <<~CUDA
+              extern "C" __global__
+              void min_forward(const float* __restrict__ a,
+                               const float* __restrict__ b,
+                               float* __restrict__ c,
+                               const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  c[idx] = fminf(a[idx], b[idx]);
+                }
+              }
+            CUDA
+            compile_cached(source, "min_forward")
+          end
+          # Elementwise maximum: c = max(a, b)  (used by collective reductions)
+          # @return [Ignis::JIT::Kernel]
+          def max_forward
+            source = <<~CUDA
+              extern "C" __global__
+              void max_forward(const float* __restrict__ a,
+                               const float* __restrict__ b,
+                               float* __restrict__ c,
+                               const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  c[idx] = fmaxf(a[idx], b[idx]);
+                }
+              }
+            CUDA
+            compile_cached(source, "max_forward")
+          end
+          # Scalar multiplication: output = input * scalar
+          # @return [Ignis::JIT::Kernel]
+          def scale_forward
+            source = <<~CUDA
+              extern "C" __global__
+              void scale_forward(const float* __restrict__ input,
+                                 float* __restrict__ output,
+                                 const float scalar,
+                                 const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  output[idx] = input[idx] * scalar;
+                }
+              }
+            CUDA
+            compile_cached(source, "scale_forward")
+          end
+          # Fill tensor with a constant value
+          # @return [Ignis::JIT::Kernel]
+          def fill
+            source = <<~CUDA
+              extern "C" __global__
+              void fill(float* __restrict__ output,
+                        const float value,
+                        const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  output[idx] = value;
+                }
+              }
+            CUDA
+            compile_cached(source, "fill")
+          end
+          # Kaiming uniform initialization: U(-bound, bound)
+          # Uses cuRAND-style Philox counter-based generator for reproducibility
+          # @return [Ignis::JIT::Kernel]
+          def kaiming_uniform_init
+            source = <<~CUDA
+              extern "C" __global__
+              void kaiming_uniform_init(float* __restrict__ output,
+                                        const float bound,
+                                        const unsigned long long seed,
+                                        const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  unsigned long long state = seed + (unsigned long long)idx;
+                  state ^= state >> 33;
+                  state *= 0xff51afd7ed558ccdULL;
+                  state ^= state >> 33;
+                  state *= 0xc4ceb9fe1a85ec53ULL;
+                  state ^= state >> 33;
+                  float u = (float)(state & 0xFFFFFFFF) / 4294967296.0f;
+                  output[idx] = (2.0f * u - 1.0f) * bound;
+                }
+              }
+            CUDA
+            compile_cached(source, "kaiming_uniform_init")
+          end
+          # Gather rows for Embedding forward: output[i] = weight[indices[i]]
+          # @return [Ignis::JIT::Kernel]
+          def gather_rows
+            source = <<~CUDA
+              extern "C" __global__
+              void gather_rows(const float* __restrict__ weight,
+                               const int* __restrict__ indices,
+                               float* __restrict__ output,
+                               const int num_indices,
+                               const int embed_dim) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                int total = num_indices * embed_dim;
+                if (idx < total) {
+                  int row = idx / embed_dim;
+                  int col = idx % embed_dim;
+                  int src_row = indices[row];
+                  output[idx] = weight[src_row * embed_dim + col];
+                }
+              }
+            CUDA
+            compile_cached(source, "gather_rows")
+          end
+          # Scatter add for Embedding backward: weight_grad[indices[i]] += grad[i]
+          # Uses atomicAdd for thread safety
+          # @return [Ignis::JIT::Kernel]
+          def scatter_add
+            source = <<~CUDA
+              extern "C" __global__
+              void scatter_add(const float* __restrict__ grad_output,
+                               const int* __restrict__ indices,
+                               float* __restrict__ grad_weight,
+                               const int num_indices,
+                               const int embed_dim) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                int total = num_indices * embed_dim;
+                if (idx < total) {
+                  int row = idx / embed_dim;
+                  int col = idx % embed_dim;
+                  int dst_row = indices[row];
+                  atomicAdd(&grad_weight[dst_row * embed_dim + col], grad_output[idx]);
+                }
+              }
+            CUDA
+            compile_cached(source, "scatter_add")
+          end
+          # Accumulate gradients: dst += src (for gradient accumulation)
+          # @return [Ignis::JIT::Kernel]
+          def accumulate
+            source = <<~CUDA
+              extern "C" __global__
+              void accumulate(float* __restrict__ dst,
+                              const float* __restrict__ src,
+                              const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  dst[idx] += src[idx];
+                }
+              }
+            CUDA
+            compile_cached(source, "accumulate")
+          end
+          # Sum reduction along the last dimension
+          # @return [Ignis::JIT::Kernel]
+          def sum_reduce
+            source = <<~CUDA
+              extern "C" __global__
+              void sum_reduce(const float* __restrict__ input,
+                              float* __restrict__ output,
+                              const int outer_size,
+                              const int reduce_size) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < outer_size) {
+                  float sum = 0.0f;
+                  for (int j = 0; j < reduce_size; j++) {
+                    sum += input[idx * reduce_size + j];
+                  }
+                  output[idx] = sum;
+                }
+              }
+            CUDA
+            compile_cached(source, "sum_reduce")
+          end
+          # Broadcast scalar gradient back to original shape
+          # @return [Ignis::JIT::Kernel]
+          def broadcast_grad
+            source = <<~CUDA
+              extern "C" __global__
+              void broadcast_grad(const float* __restrict__ grad_output,
+                                  float* __restrict__ grad_input,
+                                  const float scale,
+                                  const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  grad_input[idx] = grad_output[0] * scale;
+                }
+              }
+            CUDA
+            compile_cached(source, "broadcast_grad")
+          end
+          # Transpose 2D matrix: output[j,i] = input[i,j]
+          # Tiled for coalesced memory access
+          # @return [Ignis::JIT::Kernel]
+          def transpose_2d
+            source = <<~CUDA
+              #define TILE_DIM 32
+              #define BLOCK_ROWS 8
+              extern "C" __global__
+              void transpose_2d(const float* __restrict__ input,
+                                float* __restrict__ output,
+                                const int rows,
+                                const int cols) {
+                __shared__ float tile[TILE_DIM][TILE_DIM + 1];
+                int x = blockIdx.x * TILE_DIM + threadIdx.x;
+                int y = blockIdx.y * TILE_DIM + threadIdx.y;
+                for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
+                  if (x < cols && (y + j) < rows) {
+                    tile[threadIdx.y + j][threadIdx.x] = input[(y + j) * cols + x];
+                  }
+                }
+                __syncthreads();
+                x = blockIdx.y * TILE_DIM + threadIdx.x;
+                y = blockIdx.x * TILE_DIM + threadIdx.y;
+                for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
+                  if (x < rows && (y + j) < cols) {
+                    output[(y + j) * rows + x] = tile[threadIdx.x][threadIdx.y + j];
+                  }
+                }
+              }
+            CUDA
+            compile_cached(source, "transpose_2d")
+          end
+          # Copy a contiguous column range [col_off, col_off+len) from each row.
+          # dst[r, c] = src[r, col_off + c]  (dst is [rows, len], src is [rows, total_cols]).
+          # Used to split [seq, embed] projections into per-head [seq, head_dim] slices.
+          # @return [Ignis::JIT::Kernel]
+          def slice_cols
+            source = <<~CUDA
+              extern "C" __global__
+              void slice_cols(const float* __restrict__ src,
+                              float* __restrict__ dst,
+                              const int rows, const int total_cols,
+                              const int col_off, const int len) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                int total = rows * len;
+                if (idx < total) {
+                  int r = idx / len;
+                  int c = idx % len;
+                  dst[idx] = src[r * total_cols + col_off + c];
+                }
+              }
+            CUDA
+            compile_cached(source, "slice_cols")
+          end
+          # Inverse of slice_cols: dst[r, col_off + c] = src[r, c].
+          # Used to scatter per-head [seq, head_dim] results back into [seq, embed].
+          # @return [Ignis::JIT::Kernel]
+          def scatter_cols
+            source = <<~CUDA
+              extern "C" __global__
+              void scatter_cols(const float* __restrict__ src,
+                                float* __restrict__ dst,
+                                const int rows, const int total_cols,
+                                const int col_off, const int len) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                int total = rows * len;
+                if (idx < total) {
+                  int r = idx / len;
+                  int c = idx % len;
+                  dst[r * total_cols + col_off + c] = src[idx];
+                }
+              }
+            CUDA
+            compile_cached(source, "scatter_cols")
+          end
+          # Accumulating scatter: dst[r, col_off + c] += src[r, c].
+          # Used for GQA backward, where the group_size query heads sharing one KV
+          # head each contribute to the same dK/dV columns — their gradients must
+          # SUM, not overwrite. (Columns are disjoint across rows, so no atomics
+          # are needed: each (r, col_off+c) is written by exactly one thread here;
+          # accumulation across heads happens via separate launches into the buffer.)
+          # @return [Ignis::JIT::Kernel]
+          def scatter_cols_add
+            source = <<~CUDA
+              extern "C" __global__
+              void scatter_cols_add(const float* __restrict__ src,
+                                    float* __restrict__ dst,
+                                    const int rows, const int total_cols,
+                                    const int col_off, const int len) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                int total = rows * len;
+                if (idx < total) {
+                  int r = idx / len;
+                  int c = idx % len;
+                  dst[r * total_cols + col_off + c] += src[idx];
+                }
+              }
+            CUDA
+            compile_cached(source, "scatter_cols_add")
+          end
+          # Dequantize bfloat16 → float32 on-device. bf16 is exactly the top 16 bits
+          # of an fp32 value (same sign/exponent, truncated mantissa), so widening is
+          # lossless: float32_bits = uint16(bf16) << 16. Lets us load bf16 checkpoints
+          # (e.g. Llama) into fp32 weights without materializing a giant host array.
+          # @return [Ignis::JIT::Kernel]
+          def bf16_to_f32
+            source = <<~CUDA
+              extern "C" __global__
+              void bf16_to_f32(const unsigned short* __restrict__ src,
+                               float* __restrict__ dst,
+                               const int n) {
+                int i = blockIdx.x * blockDim.x + threadIdx.x;
+                if (i < n) {
+                  unsigned int bits = ((unsigned int)src[i]) << 16;
+                  dst[i] = __uint_as_float(bits);
+                }
+              }
+            CUDA
+            compile_cached(source, "bf16_to_f32")
+          end
+          # Affine transform: output = input * scale + shift (fp32).
+          # Used e.g. to map cuRAND U[0,1) into U[low, high).
+          # @return [Ignis::JIT::Kernel]
+          def affine_forward
+            source = <<~CUDA
+              extern "C" __global__
+              void affine_forward(const float* __restrict__ input,
+                                  float* __restrict__ output,
+                                  const float scale, const float shift,
+                                  const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  output[idx] = input[idx] * scale + shift;
+                }
+              }
+            CUDA
+            compile_cached(source, "affine_forward")
+          end
+          # Row-broadcast bias add: out[r, c] = a[r, c] + bias[c]
+          # (a is [rows, cols], bias is [cols]). Linear layer bias.
+          # @return [Ignis::JIT::Kernel]
+          def add_bias_rows
+            source = <<~CUDA
+              extern "C" __global__
+              void add_bias_rows(const float* __restrict__ a,
+                                 const float* __restrict__ bias,
+                                 float* __restrict__ out,
+                                 const int rows, const int cols) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                int total = rows * cols;
+                if (idx < total) {
+                  out[idx] = a[idx] + bias[idx % cols];
+                }
+              }
+            CUDA
+            compile_cached(source, "add_bias_rows")
+          end
+          private
+          # Compiles a CUDA source string into a kernel by delegating to
+          # Ignis::JIT::Compiler.compile. This method keeps no cache of its own;
+          # any reuse of previously built kernels presumably happens inside the
+          # compiler — NOTE(review): confirm, since every public builder in this
+          # module calls through here on each invocation.
+          # @param source [String] CUDA source code
+          # @param name [String] kernel function name
+          # @param device_id [Integer] forwarded unchanged to the compiler (default 0)
+          # @return [Ignis::JIT::Kernel]
+          def compile_cached(source, name, device_id: 0)
+            Ignis::JIT::Compiler.compile(source, name, device_id: device_id)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/nvruby/jit/kernels/loss.rb ADDED Viewed

@@ -0,0 +1,213 @@
+# frozen_string_literal: true
+module Ignis
+  module JIT
+    module Kernels
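+      # Loss function CUDA kernels for training.
+      # Cross-entropy fuses log-softmax with NLL, and BCE-with-logits uses a
+      # numerically stable formulation; the MSE and mean-reduction kernels are
+      # plain elementwise / single-thread implementations.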
+      module Loss
+        class << self
+          # Fused cross-entropy forward: log_softmax + NLL in a single kernel
+          # Avoids materializing full log-softmax output
+          # @return [Ignis::JIT::Kernel]
+          def cross_entropy_forward
+            source = <<~CUDA
+              extern "C" __global__
+              void cross_entropy_forward(const float* __restrict__ logits,
+                                          const int* __restrict__ targets,
+                                          float* __restrict__ losses,
+                                          float* __restrict__ log_softmax_out,
+                                          const int batch_size,
+                                          const int vocab_size,
+                                          const float label_smoothing) {
+                int row = blockIdx.x * blockDim.x + threadIdx.x;
+                if (row < batch_size) {
+                  const float* row_logits = logits + row * vocab_size;
+                  float* row_lsm = log_softmax_out + row * vocab_size;
+                  int target = targets[row];
+                  // Find max for numerical stability
+                  float max_val = row_logits[0];
+                  for (int j = 1; j < vocab_size; j++) {
+                    max_val = fmaxf(max_val, row_logits[j]);
+                  }
+                  // log_softmax = x - max - log(sum(exp(x - max)))
+                  float log_sum_exp = 0.0f;
+                  for (int j = 0; j < vocab_size; j++) {
+                    log_sum_exp += expf(row_logits[j] - max_val);
+                  }
+                  log_sum_exp = logf(log_sum_exp);
+                  // Compute log_softmax and store
+                  for (int j = 0; j < vocab_size; j++) {
+                    row_lsm[j] = row_logits[j] - max_val - log_sum_exp;
+                  }
+                  // NLL loss with optional label smoothing
+                  if (label_smoothing > 0.0f) {
+                    float smooth_loss = 0.0f;
+                    for (int j = 0; j < vocab_size; j++) {
+                      smooth_loss -= row_lsm[j];
+                    }
+                    smooth_loss /= (float)vocab_size;
+                    float nll = -row_lsm[target];
+                    losses[row] = (1.0f - label_smoothing) * nll + label_smoothing * smooth_loss;
+                  } else {
+                    losses[row] = -row_lsm[target];
+                  }
+                }
+              }
+            CUDA
+            compile_cached(source, "cross_entropy_forward")
+          end
+          # Cross-entropy backward: softmax(logits) - one_hot(target)
+          # Combined softmax + gradient in one kernel
+          # @return [Ignis::JIT::Kernel]
+          def cross_entropy_backward
+            source = <<~CUDA
+              extern "C" __global__
+              void cross_entropy_backward(const float* __restrict__ log_softmax,
+                                           const int* __restrict__ targets,
+                                           const float* __restrict__ grad_output,
+                                           float* __restrict__ grad_logits,
+                                           const int batch_size,
+                                           const int vocab_size,
+                                           const float label_smoothing) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                int total = batch_size * vocab_size;
+                if (idx < total) {
+                  int row = idx / vocab_size;
+                  int col = idx % vocab_size;
+                  int target = targets[row];
+                  float softmax_val = expf(log_softmax[idx]);
+                  float grad_scale = grad_output[row];
+                  if (label_smoothing > 0.0f) {
+                    float smooth_target = label_smoothing / (float)vocab_size;
+                    float hard_target = (col == target) ? (1.0f - label_smoothing + smooth_target) : smooth_target;
+                    grad_logits[idx] = grad_scale * (softmax_val - hard_target);
+                  } else {
+                    float indicator = (col == target) ? 1.0f : 0.0f;
+                    grad_logits[idx] = grad_scale * (softmax_val - indicator);
+                  }
+                }
+              }
+            CUDA
+            compile_cached(source, "cross_entropy_backward")
+          end
+          # MSE forward: (pred - target)^2, per element
+          # @return [Ignis::JIT::Kernel]
+          def mse_forward
+            source = <<~CUDA
+              extern "C" __global__
+              void mse_forward(const float* __restrict__ predictions,
+                               const float* __restrict__ targets,
+                               float* __restrict__ losses,
+                               const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  float diff = predictions[idx] - targets[idx];
+                  losses[idx] = diff * diff;
+                }
+              }
+            CUDA
+            compile_cached(source, "mse_forward")
+          end
+          # MSE backward: 2 * (pred - target) / n
+          # @return [Ignis::JIT::Kernel]
+          def mse_backward
+            source = <<~CUDA
+              extern "C" __global__
+              void mse_backward(const float* __restrict__ predictions,
+                                const float* __restrict__ targets,
+                                const float* __restrict__ grad_output,
+                                float* __restrict__ grad_input,
+                                const int n,
+                                const float scale) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  grad_input[idx] = grad_output[idx] * 2.0f * (predictions[idx] - targets[idx]) * scale;
+                }
+              }
+            CUDA
+            compile_cached(source, "mse_backward")
+          end
+          # Binary cross-entropy with logits: -[y*log(σ(x)) + (1-y)*log(1-σ(x))]
+          # @return [Ignis::JIT::Kernel]
+          def bce_forward
+            source = <<~CUDA
+              extern "C" __global__
+              void bce_forward(const float* __restrict__ logits,
+                               const float* __restrict__ targets,
+                               float* __restrict__ losses,
+                               const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  float x = logits[idx];
+                  float y = targets[idx];
+                  // Numerically stable: max(x,0) - x*y + log(1+exp(-|x|))
+                  float max_val = fmaxf(x, 0.0f);
+                  losses[idx] = max_val - x * y + logf(1.0f + expf(-fabsf(x)));
+                }
+              }
+            CUDA
+            compile_cached(source, "bce_forward")
+          end
+          # BCE backward: σ(x) - y
+          # @return [Ignis::JIT::Kernel]
+          def bce_backward
+            source = <<~CUDA
+              extern "C" __global__
+              void bce_backward(const float* __restrict__ logits,
+                                const float* __restrict__ targets,
+                                const float* __restrict__ grad_output,
+                                float* __restrict__ grad_input,
+                                const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  float sig = 1.0f / (1.0f + expf(-logits[idx]));
+                  grad_input[idx] = grad_output[idx] * (sig - targets[idx]);
+                }
+              }
+            CUDA
+            compile_cached(source, "bce_backward")
+          end
+          # Mean reduction: compute mean of array
+          # @return [Ignis::JIT::Kernel]
+          def mean_reduce
+            source = <<~CUDA
+              extern "C" __global__
+              void mean_reduce(const float* __restrict__ input,
+                               float* __restrict__ output,
+                               const int n) {
+                // Single-thread simple reduction (for loss scalar)
+                if (blockIdx.x == 0 && threadIdx.x == 0) {
+                  float sum = 0.0f;
+                  for (int i = 0; i < n; i++) {
+                    sum += input[i];
+                  }
+                  output[0] = sum / (float)n;
+                }
+              }
+            CUDA
+            compile_cached(source, "mean_reduce")
+          end
+          private
+          # Compiles a CUDA source string into a kernel by delegating to
+          # Ignis::JIT::Compiler.compile. No cache is kept here; any kernel reuse
+          # presumably happens inside the compiler — NOTE(review): confirm.
+          # @param source [String] CUDA source code
+          # @param name [String] kernel function name
+          # @param device_id [Integer] forwarded unchanged to the compiler (default 0)
+          # @return [Ignis::JIT::Kernel]
+          def compile_cached(source, name, device_id: 0)
+            Ignis::JIT::Compiler.compile(source, name, device_id: device_id)
+          end
+        end
+      end
+    end
+  end
+end