RubyGems - ignis - Versions diffs - 0.0.1 - Mend

ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

checksums.yaml +7 -0
data/README.md +15 -0
data/lib/ignis.rb +94 -0
data/lib/nnw/platform.rb +304 -0
data/lib/nnw/shared/event_bus.rb +240 -0
data/lib/nnw/shared/ffi_loader.rb +63 -0
data/lib/nnw/shared/memory_contract.rb +204 -0
data/lib/nnw/shared/nv_array.rb +710 -0
data/lib/nnw/shared/recovery_protocol.rb +307 -0
data/lib/nvruby/configuration.rb +217 -0
data/lib/nvruby/cuda/device.rb +275 -0
data/lib/nvruby/cuda/device_props.rb +202 -0
data/lib/nvruby/cuda/graph.rb +265 -0
data/lib/nvruby/cuda/graph_bindings.rb +119 -0
data/lib/nvruby/cuda/library_loader.rb +285 -0
data/lib/nvruby/cuda/memory.rb +410 -0
data/lib/nvruby/cuda/runtime_api.rb +804 -0
data/lib/nvruby/cuda/stream.rb +234 -0
data/lib/nvruby/dtype.rb +139 -0
data/lib/nvruby/epilogues.rb +438 -0
data/lib/nvruby/errors.rb +303 -0
data/lib/nvruby/half.rb +97 -0
data/lib/nvruby/jit/compiled_kernel.rb +80 -0
data/lib/nvruby/jit/compiler.rb +231 -0
data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
data/lib/nvruby/jit/kernel.rb +240 -0
data/lib/nvruby/jit/kernel_module.rb +133 -0
data/lib/nvruby/jit/kernels/activations.rb +179 -0
data/lib/nvruby/jit/kernels/attention.rb +504 -0
data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
data/lib/nvruby/jit/kernels/loss.rb +213 -0
data/lib/nvruby/jit/kernels/normalization.rb +200 -0
data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
data/lib/nvruby/linalg/epilog.rb +67 -0
data/lib/nvruby/linalg/matmul.rb +247 -0
data/lib/nvruby/linalg/matmul_plan.rb +229 -0
data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
data/lib/nvruby/memory/device_memory_resource.rb +106 -0
data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
data/lib/nvruby/memory/stats.rb +107 -0
data/lib/nvruby/memory.rb +124 -0
data/lib/nvruby/version.rb +5 -0
metadata +108 -0

data/lib/nvruby/jit/kernels/normalization.rb ADDED Viewed

@@ -0,0 +1,200 @@
+# frozen_string_literal: true
+module Ignis
+  module JIT
+    module Kernels
+      # Layer normalization CUDA kernels.
+      # Forward computes mean, variance, normalizes, scales, and shifts.
+      # Backward computes gradients for input, weight (gamma), and bias (beta).
+      module Normalization
+        class << self
+          # LayerNorm forward: y = gamma * (x - mean) / sqrt(var + eps) + beta
+          # Each row (last dim) is normalized independently.
+          # @return [Ignis::JIT::Kernel]
+          def layer_norm_forward
+            source = <<~CUDA
+              extern "C" __global__
+              void layer_norm_forward(const float* __restrict__ input,
+                                      const float* __restrict__ gamma,
+                                      const float* __restrict__ beta,
+                                      float* __restrict__ output,
+                                      float* __restrict__ mean_out,
+                                      float* __restrict__ rstd_out,
+                                      const int outer_size,
+                                      const int norm_size,
+                                      const float eps) {
+                int row = blockIdx.x * blockDim.x + threadIdx.x;
+                if (row < outer_size) {
+                  const float* in_row = input + row * norm_size;
+                  float* out_row = output + row * norm_size;
+                  // Compute mean
+                  float mean = 0.0f;
+                  for (int j = 0; j < norm_size; j++) {
+                    mean += in_row[j];
+                  }
+                  mean /= (float)norm_size;
+                  // Compute variance
+                  float var = 0.0f;
+                  for (int j = 0; j < norm_size; j++) {
+                    float diff = in_row[j] - mean;
+                    var += diff * diff;
+                  }
+                  var /= (float)norm_size;
+                  float rstd = rsqrtf(var + eps);
+                  // Save for backward pass
+                  if (mean_out) mean_out[row] = mean;
+                  if (rstd_out) rstd_out[row] = rstd;
+                  // Normalize, scale, shift
+                  for (int j = 0; j < norm_size; j++) {
+                    float normalized = (in_row[j] - mean) * rstd;
+                    out_row[j] = gamma[j] * normalized + beta[j];
+                  }
+                }
+              }
+            CUDA
+            compile_cached(source, "layer_norm_forward")
+          end
+          # LayerNorm backward: computes dL/dx, dL/dgamma, dL/dbeta
+          # @return [Ignis::JIT::Kernel]
+          def layer_norm_backward
+            source = <<~CUDA
+              extern "C" __global__
+              void layer_norm_backward(const float* __restrict__ grad_output,
+                                       const float* __restrict__ input,
+                                       const float* __restrict__ gamma,
+                                       const float* __restrict__ mean,
+                                       const float* __restrict__ rstd,
+                                       float* __restrict__ grad_input,
+                                       float* __restrict__ grad_gamma,
+                                       float* __restrict__ grad_beta,
+                                       const int outer_size,
+                                       const int norm_size) {
+                int row = blockIdx.x * blockDim.x + threadIdx.x;
+                if (row < outer_size) {
+                  const float* go = grad_output + row * norm_size;
+                  const float* in_row = input + row * norm_size;
+                  float* gi = grad_input + row * norm_size;
+                  float m = mean[row];
+                  float rs = rstd[row];
+                  // Compute intermediate sums for efficient backward
+                  float sum_go_x = 0.0f;
+                  float sum_go = 0.0f;
+                  for (int j = 0; j < norm_size; j++) {
+                    float x_hat = (in_row[j] - m) * rs;
+                    sum_go_x += go[j] * gamma[j] * x_hat;
+                    sum_go += go[j] * gamma[j];
+                  }
+                  float inv_n = 1.0f / (float)norm_size;
+                  // Compute grad_input
+                  for (int j = 0; j < norm_size; j++) {
+                    float x_hat = (in_row[j] - m) * rs;
+                    gi[j] = rs * (go[j] * gamma[j] - inv_n * (sum_go + x_hat * sum_go_x));
+                  }
+                  // Accumulate grad_gamma and grad_beta (needs atomicAdd for multi-row)
+                  for (int j = 0; j < norm_size; j++) {
+                    float x_hat = (in_row[j] - m) * rs;
+                    atomicAdd(&grad_gamma[j], go[j] * x_hat);
+                    atomicAdd(&grad_beta[j], go[j]);
+                  }
+                }
+              }
+            CUDA
+            compile_cached(source, "layer_norm_backward")
+          end
+          # RMSNorm forward: y = gamma * x / sqrt(mean(x^2) + eps)
+          # Used in LLaMA/Mistral architectures
+          # @return [Ignis::JIT::Kernel]
+          def rms_norm_forward
+            source = <<~CUDA
+              extern "C" __global__
+              void rms_norm_forward(const float* __restrict__ input,
+                                    const float* __restrict__ gamma,
+                                    float* __restrict__ output,
+                                    float* __restrict__ rstd_out,
+                                    const int outer_size,
+                                    const int norm_size,
+                                    const float eps) {
+                int row = blockIdx.x * blockDim.x + threadIdx.x;
+                if (row < outer_size) {
+                  const float* in_row = input + row * norm_size;
+                  float* out_row = output + row * norm_size;
+                  float ss = 0.0f;
+                  for (int j = 0; j < norm_size; j++) {
+                    ss += in_row[j] * in_row[j];
+                  }
+                  float rstd = rsqrtf(ss / (float)norm_size + eps);
+                  if (rstd_out) rstd_out[row] = rstd;
+                  for (int j = 0; j < norm_size; j++) {
+                    out_row[j] = gamma[j] * in_row[j] * rstd;
+                  }
+                }
+              }
+            CUDA
+            compile_cached(source, "rms_norm_forward")
+          end
+          # RMSNorm backward: dL/dx and dL/dgamma (no bias in RMSNorm).
+          # With x_hat_j = x_j * rstd and y_j = gamma_j * x_hat_j:
+          #   dL/dx_i     = rstd * (go_i*gamma_i - x_hat_i * S / n),  S = sum_j go_j*gamma_j*x_hat_j
+          #   dL/dgamma_j = sum_rows go_j * x_hat_j
+          # @return [Ignis::JIT::Kernel]
+          def rms_norm_backward
+            source = <<~CUDA
+              extern "C" __global__
+              void rms_norm_backward(const float* __restrict__ grad_output,
+                                     const float* __restrict__ input,
+                                     const float* __restrict__ gamma,
+                                     const float* __restrict__ rstd,
+                                     float* __restrict__ grad_input,
+                                     float* __restrict__ grad_gamma,
+                                     const int outer_size,
+                                     const int norm_size) {
+                int row = blockIdx.x * blockDim.x + threadIdx.x;
+                if (row < outer_size) {
+                  const float* go = grad_output + row * norm_size;
+                  const float* in_row = input + row * norm_size;
+                  float* gi = grad_input + row * norm_size;
+                  float r = rstd[row];
+                  // S = sum_j go_j * gamma_j * x_hat_j   (x_hat_j = x_j * r)
+                  float s = 0.0f;
+                  for (int j = 0; j < norm_size; j++) {
+                    s += go[j] * gamma[j] * (in_row[j] * r);
+                  }
+                  float inv_n = 1.0f / (float)norm_size;
+                  for (int j = 0; j < norm_size; j++) {
+                    float x_hat = in_row[j] * r;
+                    gi[j] = r * (go[j] * gamma[j] - x_hat * s * inv_n);
+                    atomicAdd(&grad_gamma[j], go[j] * x_hat);
+                  }
+                }
+              }
+            CUDA
+            compile_cached(source, "rms_norm_backward")
+          end
+          private
+          # Hands a kernel's CUDA source to Ignis::JIT::Compiler.compile.
+          # NOTE(review): despite the name, no caching happens in this method; any
+          # caching would have to live inside Compiler.compile — confirm.
+          # @param source [String] CUDA C source text
+          # @param name [String] kernel entry-point name, forwarded unchanged
+          # @param device_id [Integer] forwarded to the compiler; the kernel builders in
+          #   this module never pass it, so they always use the default 0
+          # @return [Object] whatever Compiler.compile returns
+          def compile_cached(source, name, device_id: 0)
+            Ignis::JIT::Compiler.compile(source, name, device_id: device_id)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/nvruby/jit/kernels/optimizer.rb ADDED Viewed

@@ -0,0 +1,193 @@
+# frozen_string_literal: true
+module Ignis
+  module JIT
+    module Kernels
+      # Optimizer CUDA kernels.
+      # Each optimizer step is a single fused kernel per parameter
+      # (update moments + param in one pass, avoiding multiple kernel launches).
+      module Optimizer
+        class << self
+          # Fused Adam step: update m, v, and param in one kernel launch
+          # @return [Ignis::JIT::Kernel]
+          def adam_step
+            source = <<~CUDA
+              extern "C" __global__
+              void adam_step(float* __restrict__ param,
+                             const float* __restrict__ grad,
+                             float* __restrict__ m,
+                             float* __restrict__ v,
+                             const float lr,
+                             const float beta1,
+                             const float beta2,
+                             const float eps,
+                             const float weight_decay,
+                             const float bias_correction1,
+                             const float bias_correction2,
+                             const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  float g = grad[idx];
+                  // L2 regularization (Adam-style, not decoupled)
+                  if (weight_decay > 0.0f) {
+                    g += weight_decay * param[idx];
+                  }
+                  // Update biased first moment estimate
+                  float m_new = beta1 * m[idx] + (1.0f - beta1) * g;
+                  m[idx] = m_new;
+                  // Update biased second moment estimate
+                  float v_new = beta2 * v[idx] + (1.0f - beta2) * g * g;
+                  v[idx] = v_new;
+                  // Bias correction
+                  float m_hat = m_new / bias_correction1;
+                  float v_hat = v_new / bias_correction2;
+                  // Update parameter
+                  param[idx] -= lr * m_hat / (sqrtf(v_hat) + eps);
+                }
+              }
+            CUDA
+            compile_cached(source, "adam_step")
+          end
+          # Fused AdamW step: Adam with decoupled weight decay
+          # Weight decay applied directly to param, not through gradient
+          # @return [Ignis::JIT::Kernel]
+          def adamw_step
+            source = <<~CUDA
+              extern "C" __global__
+              void adamw_step(float* __restrict__ param,
+                              const float* __restrict__ grad,
+                              float* __restrict__ m,
+                              float* __restrict__ v,
+                              const float lr,
+                              const float beta1,
+                              const float beta2,
+                              const float eps,
+                              const float weight_decay,
+                              const float bias_correction1,
+                              const float bias_correction2,
+                              const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  float g = grad[idx];
+                  // Update biased first moment
+                  float m_new = beta1 * m[idx] + (1.0f - beta1) * g;
+                  m[idx] = m_new;
+                  // Update biased second moment
+                  float v_new = beta2 * v[idx] + (1.0f - beta2) * g * g;
+                  v[idx] = v_new;
+                  // Bias correction
+                  float m_hat = m_new / bias_correction1;
+                  float v_hat = v_new / bias_correction2;
+                  // Decoupled weight decay + Adam update
+                  param[idx] -= lr * (m_hat / (sqrtf(v_hat) + eps) + weight_decay * param[idx]);
+                }
+              }
+            CUDA
+            compile_cached(source, "adamw_step")
+          end
+          # SGD step with momentum and weight decay
+          # @return [Ignis::JIT::Kernel]
+          def sgd_step
+            source = <<~CUDA
+              extern "C" __global__
+              void sgd_step(float* __restrict__ param,
+                            const float* __restrict__ grad,
+                            float* __restrict__ velocity,
+                            const float lr,
+                            const float momentum,
+                            const float weight_decay,
+                            const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  float g = grad[idx];
+                  if (weight_decay > 0.0f) {
+                    g += weight_decay * param[idx];
+                  }
+                  float v;
+                  if (momentum > 0.0f) {
+                    v = momentum * velocity[idx] + g;
+                    velocity[idx] = v;
+                  } else {
+                    v = g;
+                  }
+                  param[idx] -= lr * v;
+                }
+              }
+            CUDA
+            compile_cached(source, "sgd_step")
+          end
+          # Gradient clipping by global norm
+          # Phase 1: compute per-parameter squared sum
+          # @return [Ignis::JIT::Kernel]
+          def grad_squared_sum
+            source = <<~CUDA
+              extern "C" __global__
+              void grad_squared_sum(const float* __restrict__ grad,
+                                    float* __restrict__ partial_sum,
+                                    const int n) {
+                extern __shared__ float sdata[];
+                int tid = threadIdx.x;
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                sdata[tid] = (idx < n) ? grad[idx] * grad[idx] : 0.0f;
+                __syncthreads();
+                // Parallel reduction in shared memory
+                for (int s = blockDim.x / 2; s > 0; s >>= 1) {
+                  if (tid < s) {
+                    sdata[tid] += sdata[tid + s];
+                  }
+                  __syncthreads();
+                }
+                if (tid == 0) {
+                  atomicAdd(partial_sum, sdata[0]);
+                }
+              }
+            CUDA
+            compile_cached(source, "grad_squared_sum")
+          end
+          # Phase 2: scale gradients by clip factor
+          # clip_factor = max_norm / (total_norm + eps)
+          # @return [Ignis::JIT::Kernel]
+          def grad_clip_scale
+            source = <<~CUDA
+              extern "C" __global__
+              void grad_clip_scale(float* __restrict__ grad,
+                                   const float clip_factor,
+                                   const int n) {
+                int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                if (idx < n) {
+                  grad[idx] *= clip_factor;
+                }
+              }
+            CUDA
+            compile_cached(source, "grad_clip_scale")
+          end
+          private
+          # Hands a kernel's CUDA source to Ignis::JIT::Compiler.compile.
+          # NOTE(review): despite the name, no caching happens in this method; any
+          # caching would have to live inside Compiler.compile — confirm.
+          # @param source [String] CUDA C source text
+          # @param name [String] kernel entry-point name, forwarded unchanged
+          # @param device_id [Integer] forwarded to the compiler; the kernel builders in
+          #   this module never pass it, so they always use the default 0
+          # @return [Object] whatever Compiler.compile returns
+          def compile_cached(source, name, device_id: 0)
+            Ignis::JIT::Compiler.compile(source, name, device_id: device_id)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/nvruby/jit/nvrtc_bindings.rb ADDED Viewed

@@ -0,0 +1,282 @@
+# frozen_string_literal: true
+require "ffi"
+module Ignis
+  module JIT
+    # NVRTC (NVIDIA Runtime Compilation) library FFI bindings
+    # Provides runtime compilation of CUDA C++ source code to PTX/CUBIN
+    module NVRTCBindings
+      extend FFI::Library
+      # NVRTC Result codes
+      NVRTC_SUCCESS = 0
+      NVRTC_ERROR_OUT_OF_MEMORY = 1
+      NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2
+      NVRTC_ERROR_INVALID_INPUT = 3
+      NVRTC_ERROR_INVALID_PROGRAM = 4
+      NVRTC_ERROR_INVALID_OPTION = 5
+      NVRTC_ERROR_COMPILATION = 6
+      NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
+      NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8
+      NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9
+      NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10
+      NVRTC_ERROR_INTERNAL_ERROR = 11
+      # @return [Boolean] Whether bindings are loaded
+      @loaded = false
+      # @return [Mutex] Thread safety lock
+      @mutex = Mutex.new
+      class << self
+        # Ensure NVRTC library is loaded
+        # @return [void]
+        # @raise [LibraryNotFoundError] If NVRTC cannot be loaded
+        def ensure_loaded!
+          @mutex.synchronize do
+            return if @loaded
+            Ignis::CUDA::LibraryLoader.load_library(:nvrtc)
+            attach_nvrtc_functions!
+            @loaded = true
+            Ignis.logger.info("NVRTC bindings initialized")
+          end
+        end
+        # Check if NVRTC is loaded
+        # @return [Boolean]
+        def loaded?
+          @mutex.synchronize { @loaded }
+        end
+        # Get NVRTC version
+        # @return [String] Version string (e.g., "12.6")
+        # @raise [NVRTCError] If version cannot be retrieved
+        def version
+          ensure_loaded!
+          major_ptr = FFI::MemoryPointer.new(:int)
+          minor_ptr = FFI::MemoryPointer.new(:int)
+          result = nvrtcVersion(major_ptr, minor_ptr)
+          check_result!(result, "nvrtcVersion")
+          "#{major_ptr.read_int}.#{minor_ptr.read_int}"
+        end
+        # Create an NVRTC program from source code
+        # @param source [String] CUDA C++ source code
+        # @param name [String] Program name (for error messages)
+        # @param headers [Array<String>] Header contents (optional)
+        # @param header_names [Array<String>] Header names (optional)
+        # @return [FFI::Pointer] Program handle
+        # @raise [NVRTCError] If program creation fails
+        def create_program(source, name: "kernel.cu", headers: [], header_names: [])
+          ensure_loaded!
+          prog_ptr = FFI::MemoryPointer.new(:pointer)
+          source_ptr = FFI::MemoryPointer.from_string(source)
+          name_ptr = FFI::MemoryPointer.from_string(name)
+          num_headers = headers.size
+          if num_headers.positive?
+            headers_ptr = FFI::MemoryPointer.new(:pointer, num_headers)
+            header_names_ptr = FFI::MemoryPointer.new(:pointer, num_headers)
+            headers.each_with_index do |header, i|
+              headers_ptr.put_pointer(i * FFI::Pointer.size, FFI::MemoryPointer.from_string(header))
+            end
+            header_names.each_with_index do |header_name, i|
+              header_names_ptr.put_pointer(i * FFI::Pointer.size, FFI::MemoryPointer.from_string(header_name))
+            end
+          else
+            headers_ptr = nil
+            header_names_ptr = nil
+          end
+          result = nvrtcCreateProgram(prog_ptr, source_ptr, name_ptr, num_headers, headers_ptr, header_names_ptr)
+          check_result!(result, "nvrtcCreateProgram")
+          prog_ptr.read_pointer
+        end
+        # Compile an NVRTC program
+        # @param program [FFI::Pointer] Program handle
+        # @param options [Array<String>] Compilation options
+        # @return [void]
+        # @raise [NVRTCError] If compilation fails (includes error log)
+        def compile_program(program, options: [])
+          ensure_loaded!
+          if options.any?
+            options_array = FFI::MemoryPointer.new(:pointer, options.size)
+            options.each_with_index do |opt, i|
+              options_array.put_pointer(i * FFI::Pointer.size, FFI::MemoryPointer.from_string(opt))
+            end
+          else
+            options_array = nil
+          end
+          result = nvrtcCompileProgram(program, options.size, options_array)
+          if result != NVRTC_SUCCESS
+            log = get_program_log(program)
+            raise NVRTCError.new(result, compilation_log: log)
+          end
+          nil
+        end
+        # Get the program compilation log
+        # @param program [FFI::Pointer] Program handle
+        # @return [String] Compilation log
+        def get_program_log(program)
+          ensure_loaded!
+          size_ptr = FFI::MemoryPointer.new(:size_t)
+          result = nvrtcGetProgramLogSize(program, size_ptr)
+          return "" unless result == NVRTC_SUCCESS
+          log_size = size_ptr.read(:size_t)
+          return "" if log_size.zero?
+          log_ptr = FFI::MemoryPointer.new(:char, log_size)
+          result = nvrtcGetProgramLog(program, log_ptr)
+          return "" unless result == NVRTC_SUCCESS
+          log_ptr.read_string(log_size - 1)
+        end
+        # Get compiled CUBIN size
+        # @param program [FFI::Pointer] Program handle
+        # @return [Integer] CUBIN size in bytes
+        # @raise [NVRTCError] If size cannot be retrieved
+        def get_cubin_size(program)
+          ensure_loaded!
+          size_ptr = FFI::MemoryPointer.new(:size_t)
+          result = nvrtcGetCUBINSize(program, size_ptr)
+          check_result!(result, "nvrtcGetCUBINSize")
+          size_ptr.read(:size_t)
+        end
+        # Get compiled CUBIN binary
+        # @param program [FFI::Pointer] Program handle
+        # @return [String] CUBIN binary data
+        # @raise [NVRTCError] If CUBIN cannot be retrieved
+        def get_cubin(program)
+          ensure_loaded!
+          cubin_size = get_cubin_size(program)
+          cubin_ptr = FFI::MemoryPointer.new(:char, cubin_size)
+          result = nvrtcGetCUBIN(program, cubin_ptr)
+          check_result!(result, "nvrtcGetCUBIN")
+          cubin_ptr.read_bytes(cubin_size)
+        end
+        # Get compiled PTX size
+        # @param program [FFI::Pointer] Program handle
+        # @return [Integer] PTX size in bytes
+        # @raise [NVRTCError] If size cannot be retrieved
+        def get_ptx_size(program)
+          ensure_loaded!
+          size_ptr = FFI::MemoryPointer.new(:size_t)
+          result = nvrtcGetPTXSize(program, size_ptr)
+          check_result!(result, "nvrtcGetPTXSize")
+          size_ptr.read(:size_t)
+        end
+        # Get compiled PTX code
+        # @param program [FFI::Pointer] Program handle
+        # @return [String] PTX code
+        # @raise [NVRTCError] If PTX cannot be retrieved
+        def get_ptx(program)
+          ensure_loaded!
+          ptx_size = get_ptx_size(program)
+          ptx_ptr = FFI::MemoryPointer.new(:char, ptx_size)
+          result = nvrtcGetPTX(program, ptx_ptr)
+          check_result!(result, "nvrtcGetPTX")
+          ptx_ptr.read_string(ptx_size - 1)
+        end
+        # Destroy an NVRTC program
+        # @param program [FFI::Pointer] Program handle
+        # @return [void]
+        def destroy_program(program)
+          return if program.nil? || program.null?
+          ensure_loaded!
+          prog_ptr = FFI::MemoryPointer.new(:pointer)
+          prog_ptr.write_pointer(program)
+          nvrtcDestroyProgram(prog_ptr)
+        end
+        # Check NVRTC result and raise on error
+        # @param result [Integer] NVRTC result code
+        # @param context [String] Context for error message
+        # @return [void]
+        # @raise [NVRTCError] If result is not success
+        def check_result!(result, context)
+          return if result == NVRTC_SUCCESS
+          raise NVRTCError.new(result, context: context)
+        end
+        private
+        # Attach all NVRTC FFI functions
+        # @return [void]
+        def attach_nvrtc_functions!
+          handle = Ignis::CUDA::LibraryLoader.load_library(:nvrtc)
+          define_nvrtc_function(handle, :nvrtcVersion, [:pointer, :pointer], :int)
+          define_nvrtc_function(handle, :nvrtcGetErrorString, [:int], :string)
+          define_nvrtc_function(handle, :nvrtcCreateProgram, [:pointer, :pointer, :pointer, :int, :pointer, :pointer], :int)
+          define_nvrtc_function(handle, :nvrtcDestroyProgram, [:pointer], :int)
+          define_nvrtc_function(handle, :nvrtcCompileProgram, [:pointer, :int, :pointer], :int)
+          define_nvrtc_function(handle, :nvrtcGetPTXSize, [:pointer, :pointer], :int)
+          define_nvrtc_function(handle, :nvrtcGetPTX, [:pointer, :pointer], :int)
+          define_nvrtc_function(handle, :nvrtcGetCUBINSize, [:pointer, :pointer], :int)
+          define_nvrtc_function(handle, :nvrtcGetCUBIN, [:pointer, :pointer], :int)
+          define_nvrtc_function(handle, :nvrtcGetProgramLogSize, [:pointer, :pointer], :int)
+          define_nvrtc_function(handle, :nvrtcGetProgramLog, [:pointer, :pointer], :int)
+          define_nvrtc_function(handle, :nvrtcAddNameExpression, [:pointer, :pointer], :int)
+          define_nvrtc_function(handle, :nvrtcGetLoweredName, [:pointer, :pointer, :pointer], :int)
+        end
+        # Define an NVRTC function from the loaded library
+        # @param handle [FFI::DynamicLibrary] Library handle
+        # @param name [Symbol] Function name
+        # @param args [Array] Argument types
+        # @param ret [Symbol] Return type
+        # @return [void]
+        def define_nvrtc_function(handle, name, args, ret)
+          # LibraryLoader returns a Fiddle::Handle; resolve the symbol address with
+          # Fiddle::Handle#[] and build an FFI::Function from it (Fiddle::Handle has
+          # no #find_function method).
+          func_ptr = begin
+            handle[name.to_s]
+          rescue Fiddle::DLError
+            nil
+          end
+          raise NVRTCError.new(NVRTC_ERROR_INTERNAL_ERROR, context: "Function #{name} not found") unless func_ptr
+          func = FFI::Function.new(ret, args, FFI::Pointer.new(func_ptr))
+          define_singleton_method(name) { |*call_args| func.call(*call_args) }
+        end
+      end
+    end
+  end
+end