RubyGems - ignis - Versions diffs - 0.0.1 - Mend

ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

checksums.yaml +7 -0
data/README.md +15 -0
data/lib/ignis.rb +94 -0
data/lib/nnw/platform.rb +304 -0
data/lib/nnw/shared/event_bus.rb +240 -0
data/lib/nnw/shared/ffi_loader.rb +63 -0
data/lib/nnw/shared/memory_contract.rb +204 -0
data/lib/nnw/shared/nv_array.rb +710 -0
data/lib/nnw/shared/recovery_protocol.rb +307 -0
data/lib/nvruby/configuration.rb +217 -0
data/lib/nvruby/cuda/device.rb +275 -0
data/lib/nvruby/cuda/device_props.rb +202 -0
data/lib/nvruby/cuda/graph.rb +265 -0
data/lib/nvruby/cuda/graph_bindings.rb +119 -0
data/lib/nvruby/cuda/library_loader.rb +285 -0
data/lib/nvruby/cuda/memory.rb +410 -0
data/lib/nvruby/cuda/runtime_api.rb +804 -0
data/lib/nvruby/cuda/stream.rb +234 -0
data/lib/nvruby/dtype.rb +139 -0
data/lib/nvruby/epilogues.rb +438 -0
data/lib/nvruby/errors.rb +303 -0
data/lib/nvruby/half.rb +97 -0
data/lib/nvruby/jit/compiled_kernel.rb +80 -0
data/lib/nvruby/jit/compiler.rb +231 -0
data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
data/lib/nvruby/jit/kernel.rb +240 -0
data/lib/nvruby/jit/kernel_module.rb +133 -0
data/lib/nvruby/jit/kernels/activations.rb +179 -0
data/lib/nvruby/jit/kernels/attention.rb +504 -0
data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
data/lib/nvruby/jit/kernels/loss.rb +213 -0
data/lib/nvruby/jit/kernels/normalization.rb +200 -0
data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
data/lib/nvruby/linalg/epilog.rb +67 -0
data/lib/nvruby/linalg/matmul.rb +247 -0
data/lib/nvruby/linalg/matmul_plan.rb +229 -0
data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
data/lib/nvruby/memory/device_memory_resource.rb +106 -0
data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
data/lib/nvruby/memory/stats.rb +107 -0
data/lib/nvruby/memory.rb +124 -0
data/lib/nvruby/version.rb +5 -0
metadata +108 -0

data/lib/nnw/shared/nv_array.rb ADDED Viewed

@@ -0,0 +1,710 @@
+# frozen_string_literal: true
+require 'fiddle'
+require 'fiddle/import'
+require_relative '../../nvruby/half'
+module Ignis
+  module Shared
+    # Raised when an NvArray ownership rule is broken: NvArray#transfer_ownership with ref_count > 1, or NvArray#free! with ref_count > 0.
+    class MemoryContractViolation < StandardError; end
+    # NvArray — The canonical GPU tensor type for the entire Ignis system.
+    #
+    # Ignis creates them. NvCCL moves them. WNAIS serializes them to NOVA.
+    # There is exactly ONE definition of NvArray in the codebase.
+    #
+    # Thread safety: owner transitions, ref_count changes, and free operations
+    # are protected by a per-instance Mutex.
+    class NvArray
+      # @return [Hash{Symbol => Integer}] bytes per element for every supported dtype (VALID_DTYPES is derived from its keys)
+      DTYPE_SIZES = {
+        float16:  2,
+        float32:  4,
+        float64:  8,
+        int32:    4,
+        int64:    8,
+        uint8:    1,
+        bfloat16: 2
+      }.freeze
+      # @return [Array<Symbol>] dtype symbols accepted by validate_dtype!
+      VALID_DTYPES = DTYPE_SIZES.keys.freeze
+      # @return [Array<Symbol>] owner symbols accepted by validate_owner!
+      VALID_OWNERS = %i[nvruby nvccl wnais].freeze
+      # Values passed as the `kind` argument of cudaMemcpy
+      MEMCPY_HOST_TO_DEVICE   = 1
+      MEMCPY_DEVICE_TO_HOST   = 2
+      MEMCPY_DEVICE_TO_DEVICE = 3
+      # Value passed as the `flags` argument of cudaHostAlloc by #pin!
+      CUDA_HOST_ALLOC_DEFAULT = 0
+      # @return [Array<Integer>] tensor shape dimensions (a frozen copy of the constructor argument)
+      attr_reader :shape
+      # @return [Symbol] data type (:float16, :float32, :float64, :int32, :int64, :uint8, :bfloat16)
+      attr_reader :dtype
+      # @return [Integer] GPU device index
+      attr_reader :device_id
+      # @return [Fiddle::Pointer, nil] device memory pointer (nil until allocated, and again after free!)
+      attr_reader :ptr
+      # @return [Fiddle::Pointer, nil] CUDA stream pointer (nullable)
+      attr_reader :stream
+      # @return [Fiddle::Pointer, nil] pinned host memory pointer (nil unless #pin! has been called)
+      attr_reader :pinned_host_ptr
+      # @return [Symbol] current memory owner (:nvruby, :nvccl, or :wnais)
+      attr_reader :owner
+      # @return [Integer] reference count; updates are mutex-protected, this reader returns the value without locking
+      attr_reader :ref_count
+      # @return [Integer] identifier taken from the class-wide @@next_id counter
+      attr_reader :id
+      # @return [Time] creation timestamp
+      attr_reader :created_at
+      @@next_id = 0
+      @@id_mutex = Mutex.new
+      # Initialize a new NvArray.
+      #
+      # @param shape [Array<Integer>] tensor dimensions
+      # @param dtype [Symbol] data type
+      # @param device_id [Integer] GPU device index
+      # @param ptr [Fiddle::Pointer, nil] pre-allocated device memory pointer
+      # @param stream [Fiddle::Pointer, nil] CUDA stream pointer
+      # @param owner [Symbol] initial memory owner
+      # @raise [ArgumentError] if shape, dtype, or owner are invalid
+      def initialize(shape:, dtype:, device_id: 0, ptr: nil, stream: nil, owner: :nvruby, parent: nil)
+        validate_shape!(shape)
+        validate_dtype!(dtype)
+        validate_owner!(owner)
+        @shape = shape.dup.freeze
+        @dtype = dtype
+        @device_id = device_id
+        @ptr = ptr
+        @stream = stream
+        @pinned_host_ptr = nil
+        @owner = owner
+        @ref_count = 0
+        @mutex = Mutex.new
+        @freed = false
+        @created_at = Time.now
+        # Memory ownership: we own (and must free) the device buffer only if we
+        # allocate it ourselves. Arrays constructed with an external `ptr:` (slice,
+        # reshape, from_device_ptr) are VIEWS — they must never free it.
+        @owns_memory = ptr.nil?
+        # Views retain a reference to their parent so the parent (and its memory)
+        # stays alive for at least as long as the view does.
+        @parent = parent
+        @@id_mutex.synchronize do
+          @id = @@next_id
+          @@next_id += 1
+        end
+      end
+      # Total number of elements in the tensor.
+      # @return [Integer]
+      def numel
+        @shape.reduce(1, :*)
+      end
+      # Size in bytes of the tensor data on device.
+      # @return [Integer]
+      def size_bytes
+        numel * dtype_size
+      end
+      # Bytes per element for the current dtype.
+      # @return [Integer]
+      def dtype_size
+        DTYPE_SIZES.fetch(@dtype)
+      end
+      # ----------------------------------------------------------------
+      # Compatibility shims
+      #
+      # The Ignis kernel launcher and CUDA-X (cuBLAS/etc.) bindings were written
+      # against Ignis::NvArray. These accessors let those code paths consume a
+      # Shared::NvArray unchanged (duck typing on device_ffi_ptr/ndim/etc.),
+      # which is what lets the AI stack actually reach the GPU.
+      # ----------------------------------------------------------------
+      # @return [Integer] number of dimensions
+      def ndim
+        @shape.length
+      end
+      # @return [Integer] device index (Ignis::NvArray naming)
+      def device_index
+        @device_id
+      end
+      # @return [Boolean] whether device memory is allocated
+      def on_device?
+        !@ptr.nil?
+      end
+      # Ensure device memory is allocated. Shared arrays are device-resident, so
+      # this just allocates on first use; it exists for API parity with
+      # Ignis::NvArray#to_device.
+      # @return [self]
+      def to_device(*)
+        @mutex.synchronize do
+          raise "NvArray##{@id} has been freed" if @freed
+          @ptr = allocate_device_memory(size_bytes) if @ptr.nil?
+        end
+        self
+      end
+      # Device pointer wrapped as an FFI::Pointer for FFI-bound library calls
+      # (cuBLAS/cuSOLVER/cuFFT/cuRAND/cuSPARSE and the JIT kernel launcher).
+      # @return [FFI::Pointer]
+      def device_ffi_ptr
+        to_device if @ptr.nil?
+        ::FFI::Pointer.new(@ptr.to_i)
+      end
+      # Zero the device buffer with cudaMemset (device-side). This is ~20x faster
+      # than the old `from_host(Array.new(numel, 0.0))` idiom, which allocated a
+      # huge Ruby array, packed it, and H2D-copied it on every op (0.5ms+/op, and
+      # seconds for the 38M-element LM-head weight transpose).
+      # @return [self]
+      def zero!
+        to_device if @ptr.nil?
+        status = cuda_rt.cudaMemset(@ptr, 0, size_bytes)
+        raise "cudaMemset failed with status #{status}" unless status.zero?
+        self
+      end
+      # Whether this array has been freed.
+      # @return [Boolean]
+      def freed?
+        @mutex.synchronize { @freed }
+      end
+      # Copy device memory to host and return as a flat Ruby Array.
+      #
+      # Uses cudaMemcpy with DtoH direction. The returned array contains
+      # numeric values decoded according to the dtype.
+      #
+      # @return [Array<Numeric>] flat array of host-side values
+      # @raise [RuntimeError] if array has been freed or cudaMemcpy fails
+      def to_host
+        @mutex.synchronize do
+          raise "NvArray##{@id} has been freed" if @freed
+          raise "NvArray##{@id} has no device pointer" if @ptr.nil?
+        end
+        host_buf = Fiddle::Pointer.malloc(size_bytes)
+        status = cuda_rt.cudaMemcpy(host_buf, @ptr, size_bytes, MEMCPY_DEVICE_TO_HOST)
+        raise "cudaMemcpy DtoH failed with status #{status}" unless status.zero?
+        unpack_host_buffer(host_buf)
+      end
+      # Copy data from a Ruby Array to device memory.
+      #
+      # @param data [Array<Numeric>] flat array of values to copy
+      # @return [self]
+      # @raise [ArgumentError] if data size doesn't match tensor element count
+      # @raise [RuntimeError] if array has been freed or cudaMemcpy fails
+      def from_host(data)
+        unless data.is_a?(Array) && data.length == numel
+          raise ArgumentError, "Expected #{numel} elements, got #{data.length}"
+        end
+        @mutex.synchronize do
+          raise "NvArray##{@id} has been freed" if @freed
+          if @ptr.nil?
+            @ptr = allocate_device_memory(size_bytes)
+          end
+        end
+        host_buf = pack_host_buffer(data)
+        status = cuda_rt.cudaMemcpy(@ptr, host_buf, size_bytes, MEMCPY_HOST_TO_DEVICE)
+        raise "cudaMemcpy HtoD failed with status #{status}" unless status.zero?
+        self
+      end
+      # Copy a raw little-endian binary string straight to device memory.
+      #
+      # The bytes must already be in the device dtype's native layout (this is how
+      # safetensors / NOVA data is stored), so no per-element conversion is done —
+      # this avoids the lossy float<->half round trip that #from_host would incur.
+      #
+      # @param bytes [String] binary string of exactly size_bytes length
+      # @return [self]
+      # @raise [ArgumentError] if the byte count doesn't match size_bytes
+      # @raise [RuntimeError] if array has been freed or cudaMemcpy fails
+      def from_host_raw(bytes)
+        unless bytes.bytesize == size_bytes
+          raise ArgumentError, "Expected #{size_bytes} bytes, got #{bytes.bytesize}"
+        end
+        @mutex.synchronize do
+          raise "NvArray##{@id} has been freed" if @freed
+          @ptr = allocate_device_memory(size_bytes) if @ptr.nil?
+        end
+        host_buf = Fiddle::Pointer.malloc(bytes.bytesize)
+        host_buf[0, bytes.bytesize] = bytes
+        status = cuda_rt.cudaMemcpy(@ptr, host_buf, size_bytes, MEMCPY_HOST_TO_DEVICE)
+        raise "cudaMemcpy HtoD failed with status #{status}" unless status.zero?
+        self
+      end
+      # Deep-copy into a fresh, independently-owned device buffer (device→device
+      # cudaMemcpy). Unlike #slice, the returned array shares NO storage with self
+      # and owns its memory (registered for finalization, freed on GC).
+      #
+      # The autograd tape relies on this: it accumulates gradients in place, so it
+      # must guarantee accumulator buffers never alias. Backward closures are free
+      # to return shared buffers (e.g. `+` returns [grad, grad]); the tape clones
+      # to restore exclusive ownership. DtoD copies raw bytes, so it is exact and
+      # dtype-agnostic (no float↔half round trip).
+      #
+      # @return [NvArray] independent owned copy with identical shape/dtype/values
+      # @raise [RuntimeError] if this array has been freed or has no device pointer
+      def clone
+        @mutex.synchronize do
+          raise "NvArray##{@id} has been freed" if @freed
+          raise "NvArray##{@id} has no device pointer" if @ptr.nil?
+        end
+        copy = NvArray.new(shape: @shape, dtype: @dtype, device_id: @device_id,
+                           stream: @stream, owner: @owner)
+        # allocate_into the copy so the finalizer is registered on `copy` (owns_memory).
+        dst = copy.send(:allocate_device_memory, size_bytes)
+        copy.instance_variable_set(:@ptr, dst)
+        status = cuda_rt.cudaMemcpy(dst, @ptr, size_bytes, MEMCPY_DEVICE_TO_DEVICE)
+        raise "cudaMemcpy DtoD failed with status #{status}" unless status.zero?
+        copy
+      end
+      # Copy a contiguous source array into this buffer starting at row +start_row+
+      # (device→device). Used to append K/V rows into a preallocated KV cache in
+      # O(row) instead of reallocating + recopying the whole cache each step.
+      #
+      # @param src [NvArray] contiguous source ([r, cols] matching this array's cols)
+      # @param start_row [Integer] destination row offset (0-based)
+      # @return [self]
+      # @raise [RuntimeError] if freed/unallocated, or the write would overflow
+      def write_rows!(src, start_row)
+        @mutex.synchronize do
+          raise "NvArray##{@id} has been freed" if @freed
+          raise "NvArray##{@id} has no device pointer" if @ptr.nil?
+        end
+        row_bytes = (numel / @shape[0]) * dtype_size
+        offset = start_row * row_bytes
+        if offset + src.size_bytes > size_bytes
+          raise "write_rows! overflow: writing #{src.size_bytes} bytes at row #{start_row} " \
+                "(offset #{offset}) exceeds #{size_bytes}-byte buffer"
+        end
+        dst = Fiddle::Pointer.new(@ptr.to_i + offset)
+        status = cuda_rt.cudaMemcpy(dst, src.ptr, src.size_bytes, MEMCPY_DEVICE_TO_DEVICE)
+        raise "cudaMemcpy DtoD (write_rows!) failed with status #{status}" unless status.zero?
+        self
+      end
+      # Create a zero-copy slice along a dimension.
+      #
+      # Returns a new NvArray that shares the same device memory but with
+      # an offset pointer and adjusted shape. No data is copied.
+      #
+      # @param dim [Integer] dimension to slice along
+      # @param start [Integer] starting index in the dimension
+      # @param len [Integer] number of elements to include
+      # @return [NvArray] new array sharing device memory (no copy)
+      # @raise [ArgumentError] if dim, start, or len are out of bounds
+      def slice(dim, start, len)
+        raise ArgumentError, "Dimension #{dim} out of range for shape #{@shape}" unless dim >= 0 && dim < @shape.length
+        raise ArgumentError, "Slice range [#{start}, #{start + len}) exceeds dim size #{@shape[dim]}" unless start >= 0 && (start + len) <= @shape[dim]
+        # A pointer-offset view is only CONTIGUOUS — and thus correct for a plain
+        # numel-length read (to_host, kernels) — when nothing varies in the
+        # dimensions BEFORE `dim`. For dim>0 with non-unit leading dims the slice
+        # is strided (scattered across memory), which this zero-copy view cannot
+        # represent: a consumer would read the wrong, contiguous elements with no
+        # error. Fail loud instead of returning silently-wrong data.
+        leading = @shape[0...dim].reduce(1, :*)
+        if leading > 1
+          raise ArgumentError,
+                "slice(dim=#{dim}, ...) on shape #{@shape} is a strided (non-contiguous) view, " \
+                "which NvArray#slice cannot represent; only contiguous slices are supported " \
+                "(dim 0, or leading dims of size 1). Use a gather/copy kernel for strided slices."
+        end
+        @mutex.synchronize do
+          raise "NvArray##{@id} has been freed" if @freed
+        end
+        new_shape = @shape.dup
+        new_shape[dim] = len
+        # Compute byte offset: product of trailing dimensions * start * dtype_size
+        trailing = @shape[(dim + 1)..].reduce(1, :*)
+        offset_bytes = start * trailing * dtype_size
+        sliced_ptr = @ptr.nil? ? nil : Fiddle::Pointer.new(@ptr.to_i + offset_bytes, size_bytes - offset_bytes)
+        sliced = NvArray.new(
+          shape: new_shape,
+          dtype: @dtype,
+          device_id: @device_id,
+          ptr: sliced_ptr,
+          stream: @stream,
+          owner: @owner,
+          parent: self # view: non-owning, keeps parent alive (no leak / no double-free)
+        )
+        sliced
+      end
+      # Atomically transfer memory ownership to a new owner.
+      #
+      # @param new_owner [Symbol] the new owner (:nvruby, :nvccl, or :wnais)
+      # @return [Symbol] the new owner
+      # @raise [MemoryContractViolation] if ref_count > 1 during transfer
+      # @raise [ArgumentError] if new_owner is invalid
+      def transfer_ownership(new_owner)
+        validate_owner!(new_owner)
+        @mutex.synchronize do
+          raise "NvArray##{@id} has been freed" if @freed
+          if @ref_count > 1
+            raise MemoryContractViolation,
+                  "Cannot transfer ownership of NvArray##{@id} while ref_count=#{@ref_count} > 1"
+          end
+          @owner = new_owner
+        end
+        new_owner
+      end
+      # Allocate pinned host memory for P2P staging.
+      #
+      # @return [Fiddle::Pointer] the pinned host pointer
+      # @raise [RuntimeError] if cudaHostAlloc fails or already pinned
+      def pin!
+        @mutex.synchronize do
+          raise "NvArray##{@id} has been freed" if @freed
+          raise "NvArray##{@id} is already pinned" unless @pinned_host_ptr.nil?
+          ptr_buf = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
+          status = cuda_rt.cudaHostAlloc(ptr_buf, size_bytes, CUDA_HOST_ALLOC_DEFAULT)
+          raise "cudaHostAlloc failed with status #{status}" unless status.zero?
+          @pinned_host_ptr = Fiddle::Pointer.new(ptr_buf[0, Fiddle::SIZEOF_VOIDP].unpack1('Q'))
+        end
+        @pinned_host_ptr
+      end
+      # Free pinned host memory.
+      #
+      # @return [void]
+      # @raise [RuntimeError] if no pinned memory exists or cudaFreeHost fails
+      def unpin!
+        @mutex.synchronize do
+          raise "NvArray##{@id} has been freed" if @freed
+          raise "NvArray##{@id} has no pinned memory" if @pinned_host_ptr.nil?
+          status = cuda_rt.cudaFreeHost(@pinned_host_ptr)
+          raise "cudaFreeHost failed with status #{status}" unless status.zero?
+          @pinned_host_ptr = nil
+        end
+      end
+      # Release this array: frees any pinned host buffer, frees the device buffer if this instance owns it, then marks it freed.
+      #
+      # @return [Object] the result of ObjectSpace.undefine_finalizer; callers should not rely on it
+      # @raise [MemoryContractViolation] if ref_count > 0
+      # @raise [RuntimeError] if already freed, or if cudaFree fails (in which case the array is NOT marked freed)
+      def free!
+        @mutex.synchronize do
+          raise "NvArray##{@id} has already been freed" if @freed
+          # Refuse to free while ref_count > 0; in this class the count is only
+          # changed by increment_ref!/decrement_ref!. GC-time reclamation goes
+          # through the finalizer, not free!, so this guard only gates EXPLICIT
+          # frees — it never blocks normal collection.
+          if @ref_count > 0
+            raise MemoryContractViolation,
+                  "Cannot free NvArray##{@id} while ref_count=#{@ref_count} > 0"
+          end
+          unless @pinned_host_ptr.nil?
+            cuda_rt.cudaFreeHost(@pinned_host_ptr)
+            @pinned_host_ptr = nil
+          end
+          # Views (built with an external ptr:, so @owns_memory is false) share
+          # another array's allocation and must not cudaFree it; only the owning
+          # array does. Note the cudaFreeHost status above is ignored, unlike in unpin!.
+          if @owns_memory && !@ptr.nil?
+            status = cuda_rt.cudaFree(@ptr)
+            raise "cudaFree failed with status #{status}" unless status.zero?
+          end
+          @ptr = nil
+          @freed = true
+        end
+        # Drop the GC finalizer so the already-released address is not passed to cudaFree again at collection time.
+        ObjectSpace.undefine_finalizer(self)
+      end
+      # Atomically increment the reference count.
+      # @return [Integer] new ref_count value
+      def increment_ref!
+        @mutex.synchronize do
+          raise "NvArray##{@id} has been freed" if @freed
+          @ref_count += 1
+        end
+      end
+      # Atomically decrement the reference count.
+      # @return [Integer] new ref_count value
+      # @raise [RuntimeError] if ref_count is already 0
+      def decrement_ref!
+        @mutex.synchronize do
+          raise "NvArray##{@id} has been freed" if @freed
+          raise "NvArray##{@id} ref_count is already 0" if @ref_count <= 0
+          @ref_count -= 1
+        end
+      end
+      # Human-readable string representation.
+      # @return [String]
+      def to_s
+        "#<Ignis::Shared::NvArray id=#{@id} shape=#{@shape} dtype=#{@dtype} " \
+          "device=#{@device_id} owner=#{@owner} ref_count=#{@ref_count} freed=#{@freed}>"
+      end
+      alias_method :inspect, :to_s
+      private
+      # Validate shape parameter.
+      # @param shape [Array<Integer>]
+      # @raise [ArgumentError]
+      def validate_shape!(shape)
+        # `shape.all?` is vacuously true for [], so an empty shape slipped through
+        # and produced a numel==1 phantom-scalar array. Require at least one dim.
+        unless shape.is_a?(Array) && !shape.empty? && shape.all? { |d| d.is_a?(Integer) && d > 0 }
+          raise ArgumentError, "Shape must be a non-empty Array of positive Integers, got: #{shape.inspect}"
+        end
+      end
+      # Validate dtype parameter.
+      # @param dtype [Symbol]
+      # @raise [ArgumentError]
+      def validate_dtype!(dtype)
+        unless VALID_DTYPES.include?(dtype)
+          raise ArgumentError, "Invalid dtype #{dtype.inspect}. Valid: #{VALID_DTYPES}"
+        end
+      end
+      # Validate owner parameter.
+      # @param owner [Symbol]
+      # @raise [ArgumentError]
+      def validate_owner!(owner)
+        unless VALID_OWNERS.include?(owner)
+          raise ArgumentError, "Invalid owner #{owner.inspect}. Valid: #{VALID_OWNERS}"
+        end
+      end
+      # Instance-side shortcut to the class-level, lazily loaded CUDA runtime bindings (NvArray.cuda_runtime).
+      # @return [Module] module exposing cudaMalloc, cudaMemset, cudaFree, cudaMemcpy, cudaHostAlloc and cudaFreeHost
+      def cuda_rt
+        NvArray.cuda_runtime
+      end
+      # Pack Ruby Array into a binary string for cudaMemcpy HtoD.
+      # @param data [Array<Numeric>]
+      # @return [Fiddle::Pointer]
+      def pack_host_buffer(data)
+        packed = case @dtype
+                 when :float32  then data.pack('e*')
+                 when :float64  then data.pack('E*')
+                 when :int32    then data.pack('l<*')
+                 when :int64    then data.pack('q<*')
+                 when :uint8    then data.pack('C*')
+                 when :float16  then data.map { |v| ::Ignis::Half.f32_to_f16(v) }.pack('v*')
+                 when :bfloat16 then data.map { |v| ::Ignis::Half.f32_to_bf16(v) }.pack('v*')
+                 end
+        buf = Fiddle::Pointer.malloc(packed.bytesize)
+        buf[0, packed.bytesize] = packed
+        buf
+      end
+      # Unpack a host buffer binary string into a Ruby Array.
+      # @param buf [Fiddle::Pointer]
+      # @return [Array<Numeric>]
+      def unpack_host_buffer(buf)
+        raw = buf[0, size_bytes]
+        case @dtype
+        when :float32  then raw.unpack('e*')
+        when :float64  then raw.unpack('E*')
+        when :int32    then raw.unpack('l<*')
+        when :int64    then raw.unpack('q<*')
+        when :uint8    then raw.unpack('C*')
+        when :float16  then raw.unpack('v*').map { |bits| half_to_float(bits) }
+        when :bfloat16 then raw.unpack('v*').map { |bits| bfloat16_to_float(bits) }
+        end
+      end
+      # Convert a 16-bit half-precision bit pattern to a Ruby Float.
+      # Pure delegation to ::Ignis::Half.f16_to_f32 (loaded from nvruby/half), so the conversion logic lives in one place.
+      # @param bits [Integer] 16-bit unsigned integer, as produced by unpack('v*')
+      # @return [Float]
+      def half_to_float(bits)
+        ::Ignis::Half.f16_to_f32(bits)
+      end
+      # Convert a bfloat16 bit pattern to a Ruby Float by delegating to ::Ignis::Half.bf16_to_f32.
+      # @param bits [Integer] 16-bit unsigned integer, as produced by unpack('v*')
+      # @return [Float]
+      def bfloat16_to_float(bits)
+        ::Ignis::Half.bf16_to_f32(bits)
+      end
+      # Allocate device memory via cudaMalloc.
+      # @param bytes [Integer]
+      # @return [Fiddle::Pointer]
+      def allocate_device_memory(bytes)
+        ptr_buf = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
+        status = cuda_rt.cudaMalloc(ptr_buf, bytes)
+        raise "cudaMalloc failed with status #{status} for #{bytes} bytes" unless status.zero?
+        ptr = Fiddle::Pointer.new(ptr_buf[0, Fiddle::SIZEOF_VOIDP].unpack1('Q'))
+        # Free this owned allocation if the object is GC'd without an explicit
+        # free! (previously there was NO finalizer, so every dropped NvArray
+        # leaked its full GPU buffer). free! undefines this to avoid a double free.
+        ObjectSpace.define_finalizer(self, self.class.release_finalizer(ptr.to_i))
+        ptr
+      end
+      class << self
+        # CUDA runtime Fiddle bindings — lazily loaded singleton.
+        # @return [Module] module with CUDA runtime functions
+        def cuda_runtime
+          @cuda_runtime ||= load_cuda_runtime
+        end
+        # Finalizer that frees an owned device allocation on GC. Captures only the
+        # raw address (not self, which would pin the object and defeat GC) and
+        # swallows errors (interpreter shutdown may have unloaded the runtime).
+        # @param addr [Integer] device pointer address
+        # @return [Proc]
+        def release_finalizer(addr)
+          proc do
+            begin
+              cuda_runtime.cudaFree(Fiddle::Pointer.new(addr))
+            rescue StandardError
+              nil
+            end
+          end
+        end
+        private
+        # Load CUDA runtime DLL and bind essential functions.
+        # @return [Module]
+        def load_cuda_runtime
+          # Resolve CUDA runtime path per platform
+          dll_path = if defined?(Ignis::Platform)
+                       Ignis::Platform.cudart_path
+                     elsif RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
+                       cuda_bin = File.join('C:', 'Program Files', 'NVIDIA GPU Computing Toolkit', 'CUDA', 'v13.0', 'bin')
+                       File.join(cuda_bin, 'cudart64_130.dll')
+                     else
+                       'libcudart.so.13'
+                     end
+          handle = Fiddle::Handle.new(dll_path)
+          mod = Module.new
+          mod.define_singleton_method(:handle) { handle }
+          # cudaMalloc(void **devPtr, size_t size) -> int
+          cuda_malloc = Fiddle::Function.new(
+            handle['cudaMalloc'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_SIZE_T],
+            Fiddle::TYPE_INT
+          )
+          mod.define_singleton_method(:cudaMalloc) { |ptr, size| cuda_malloc.call(ptr, size) }
+          # cudaMemset(void *devPtr, int value, size_t count) -> int
+          cuda_memset = Fiddle::Function.new(
+            handle['cudaMemset'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_INT, Fiddle::TYPE_SIZE_T],
+            Fiddle::TYPE_INT
+          )
+          mod.define_singleton_method(:cudaMemset) { |ptr, value, count| cuda_memset.call(ptr, value, count) }
+          # cudaFree(void *devPtr) -> int
+          cuda_free = Fiddle::Function.new(
+            handle['cudaFree'],
+            [Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+          mod.define_singleton_method(:cudaFree) { |ptr| cuda_free.call(ptr) }
+          # cudaMemcpy(void *dst, const void *src, size_t count, int kind) -> int
+          cuda_memcpy = Fiddle::Function.new(
+            handle['cudaMemcpy'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_SIZE_T, Fiddle::TYPE_INT],
+            Fiddle::TYPE_INT
+          )
+          mod.define_singleton_method(:cudaMemcpy) { |dst, src, count, kind| cuda_memcpy.call(dst, src, count, kind) }
+          # cudaHostAlloc(void **pHost, size_t size, unsigned int flags) -> int
+          cuda_host_alloc = Fiddle::Function.new(
+            handle['cudaHostAlloc'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_SIZE_T, Fiddle::TYPE_INT],
+            Fiddle::TYPE_INT
+          )
+          mod.define_singleton_method(:cudaHostAlloc) { |ptr, size, flags| cuda_host_alloc.call(ptr, size, flags) }
+          # cudaFreeHost(void *ptr) -> int
+          cuda_free_host = Fiddle::Function.new(
+            handle['cudaFreeHost'],
+            [Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+          mod.define_singleton_method(:cudaFreeHost) { |ptr| cuda_free_host.call(ptr) }
+          mod
+        end
+      end
+    end
+  end
+end
+# Public alias: Ignis::NDArray refers to the very same class object as
+# Ignis::Shared::NvArray (no subclass, no copy), so both names share constants and
+# state. Per the original note, Ignis::NvArray is a separate nvmath-style numerics
+# array and is not this class — see gems/MIGRATION.md.
+Ignis::NDArray = Ignis::Shared::NvArray