ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +15 -0
  3. data/lib/ignis.rb +94 -0
  4. data/lib/nnw/platform.rb +304 -0
  5. data/lib/nnw/shared/event_bus.rb +240 -0
  6. data/lib/nnw/shared/ffi_loader.rb +63 -0
  7. data/lib/nnw/shared/memory_contract.rb +204 -0
  8. data/lib/nnw/shared/nv_array.rb +710 -0
  9. data/lib/nnw/shared/recovery_protocol.rb +307 -0
  10. data/lib/nvruby/configuration.rb +217 -0
  11. data/lib/nvruby/cuda/device.rb +275 -0
  12. data/lib/nvruby/cuda/device_props.rb +202 -0
  13. data/lib/nvruby/cuda/graph.rb +265 -0
  14. data/lib/nvruby/cuda/graph_bindings.rb +119 -0
  15. data/lib/nvruby/cuda/library_loader.rb +285 -0
  16. data/lib/nvruby/cuda/memory.rb +410 -0
  17. data/lib/nvruby/cuda/runtime_api.rb +804 -0
  18. data/lib/nvruby/cuda/stream.rb +234 -0
  19. data/lib/nvruby/dtype.rb +139 -0
  20. data/lib/nvruby/epilogues.rb +438 -0
  21. data/lib/nvruby/errors.rb +303 -0
  22. data/lib/nvruby/half.rb +97 -0
  23. data/lib/nvruby/jit/compiled_kernel.rb +80 -0
  24. data/lib/nvruby/jit/compiler.rb +231 -0
  25. data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
  26. data/lib/nvruby/jit/kernel.rb +240 -0
  27. data/lib/nvruby/jit/kernel_module.rb +133 -0
  28. data/lib/nvruby/jit/kernels/activations.rb +179 -0
  29. data/lib/nvruby/jit/kernels/attention.rb +504 -0
  30. data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
  31. data/lib/nvruby/jit/kernels/loss.rb +213 -0
  32. data/lib/nvruby/jit/kernels/normalization.rb +200 -0
  33. data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
  34. data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
  35. data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
  36. data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
  37. data/lib/nvruby/linalg/epilog.rb +67 -0
  38. data/lib/nvruby/linalg/matmul.rb +247 -0
  39. data/lib/nvruby/linalg/matmul_plan.rb +229 -0
  40. data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
  41. data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
  42. data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
  43. data/lib/nvruby/memory/device_memory_resource.rb +106 -0
  44. data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
  45. data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
  46. data/lib/nvruby/memory/stats.rb +107 -0
  47. data/lib/nvruby/memory.rb +124 -0
  48. data/lib/nvruby/version.rb +5 -0
  49. metadata +108 -0
@@ -0,0 +1,710 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fiddle'
4
+ require 'fiddle/import'
5
+ require_relative '../../nvruby/half'
6
+
7
+ module Ignis
8
+ module Shared
9
# Error raised when an operation would break NvArray's ownership rules,
# e.g. freeing or transferring a buffer that is still reference-counted.
class MemoryContractViolation < StandardError
end
11
+
12
+ # NvArray — The canonical GPU tensor type for the entire Ignis system.
13
+ #
14
+ # Ignis creates them. NvCCL moves them. WNAIS serializes them to NOVA.
15
+ # There is exactly ONE definition of NvArray in the codebase.
16
+ #
17
+ # Thread safety: owner transitions, ref_count changes, and free operations
18
+ # are protected by a per-instance Mutex.
19
+ class NvArray
20
# Bytes occupied by a single element of each supported dtype.
# @return [Hash{Symbol => Integer}]
DTYPE_SIZES = {
  float16: 2, float32: 4, float64: 8,
  int32: 4, int64: 8,
  uint8: 1,
  bfloat16: 2
}.freeze

# Every dtype symbol the array accepts (the keys of DTYPE_SIZES, in order).
# @return [Array<Symbol>]
VALID_DTYPES = DTYPE_SIZES.keys.freeze

# Symbols that may appear as the memory owner of an array.
# @return [Array<Symbol>]
VALID_OWNERS = [:nvruby, :nvccl, :wnais].freeze

# Values passed as the `kind` argument of cudaMemcpy.
MEMCPY_HOST_TO_DEVICE = 1
MEMCPY_DEVICE_TO_HOST = 2
MEMCPY_DEVICE_TO_DEVICE = 3

# Value passed as the `flags` argument of cudaHostAlloc.
CUDA_HOST_ALLOC_DEFAULT = 0
44
+
45
# @!attribute [r] shape
#   @return [Array<Integer>] tensor shape dimensions
# @!attribute [r] dtype
#   @return [Symbol] data type (:float16, :float32, :float64, :int32, :int64, :uint8, :bfloat16)
# @!attribute [r] device_id
#   @return [Integer] GPU device index
# @!attribute [r] ptr
#   @return [Fiddle::Pointer, nil] device memory pointer
# @!attribute [r] stream
#   @return [Fiddle::Pointer, nil] CUDA stream pointer (nullable)
# @!attribute [r] pinned_host_ptr
#   @return [Fiddle::Pointer, nil] pinned host memory pointer (nullable, for P2P staging)
# @!attribute [r] owner
#   @return [Symbol] current memory owner (:nvruby, :nvccl, or :wnais)
# @!attribute [r] ref_count
#   @return [Integer] reference count for shared staging (changed under the per-instance mutex)
# @!attribute [r] id
#   @return [Integer] unique identifier for this array instance
# @!attribute [r] created_at
#   @return [Time] creation timestamp
attr_reader :shape, :dtype, :device_id, :ptr, :stream,
            :pinned_host_ptr, :owner, :ref_count, :id, :created_at

# Counter handing out instance ids, and the mutex that guards it.
@@next_id = 0
@@id_mutex = Mutex.new
77
+
78
# Build an array descriptor. No device memory is allocated here; an array
# created without `ptr:` allocates on first use and owns that allocation.
#
# @param shape [Array<Integer>] tensor dimensions (copied and frozen)
# @param dtype [Symbol] data type
# @param device_id [Integer] GPU device index
# @param ptr [Fiddle::Pointer, nil] pre-allocated device memory pointer;
#   when given, the array is treated as a non-owning view of that memory
# @param stream [Fiddle::Pointer, nil] CUDA stream pointer
# @param owner [Symbol] initial memory owner
# @param parent [NvArray, nil] array this one is a view of; only stored, so
#   the parent stays reachable for as long as the view does
# @raise [ArgumentError] if shape, dtype, or owner are invalid
def initialize(shape:, dtype:, device_id: 0, ptr: nil, stream: nil, owner: :nvruby, parent: nil)
  validate_shape!(shape)
  validate_dtype!(dtype)
  validate_owner!(owner)

  # Descriptor fields.
  @shape = shape.dup.freeze
  @dtype = dtype
  @device_id = device_id
  @stream = stream
  @owner = owner

  # Memory state. Only an array that allocates its own buffer may free it;
  # one constructed around an external pointer is a view.
  @ptr = ptr
  @owns_memory = ptr.nil?
  @parent = parent
  @pinned_host_ptr = nil

  # Bookkeeping.
  @ref_count = 0
  @mutex = Mutex.new
  @freed = false
  @created_at = Time.now

  @id = @@id_mutex.synchronize do
    assigned = @@next_id
    @@next_id += 1
    assigned
  end
end
116
+
117
# Number of elements in the tensor: the product of all shape dimensions.
# @return [Integer]
def numel
  @shape.inject(1) { |count, extent| count * extent }
end
122
+
123
# Byte size of the tensor data: element count times bytes per element.
# @return [Integer]
def size_bytes
  per_element = dtype_size
  numel * per_element
end
128
+
129
# Width in bytes of one element of this array's dtype, looked up in
# DTYPE_SIZES.
# @return [Integer]
# @raise [KeyError] if the dtype has no entry in DTYPE_SIZES
def dtype_size
  element_type = @dtype
  DTYPE_SIZES.fetch(element_type)
end
134
+
135
+ # ----------------------------------------------------------------
136
+ # Compatibility shims
137
+ #
138
+ # The Ignis kernel launcher and CUDA-X (cuBLAS/etc.) bindings were written
139
+ # against Ignis::NvArray. These accessors let those code paths consume a
140
+ # Shared::NvArray unchanged (duck typing on device_ffi_ptr/ndim/etc.),
141
+ # which is what lets the AI stack actually reach the GPU.
142
+ # ----------------------------------------------------------------
143
+
144
# Rank of the tensor, i.e. how many entries the shape has.
# @return [Integer] number of dimensions
def ndim
  @shape.size
end
148
+
149
# Same value as #device_id, exposed under the name the Ignis::NvArray
# interface uses.
# @return [Integer] device index
def device_index
  @device_id
end
153
+
154
# True once the array holds a device pointer.
# @return [Boolean] whether device memory is allocated
def on_device?
  return false if @ptr.nil?

  true
end
158
+
159
# Make sure a device buffer exists, allocating size_bytes of device memory
# the first time it is needed. Any arguments are accepted and ignored, which
# keeps the call signature compatible with Ignis::NvArray#to_device.
#
# @return [self]
# @raise [RuntimeError] if the array has been freed
def to_device(*)
  @mutex.synchronize do
    raise "NvArray##{@id} has been freed" if @freed

    if @ptr.nil?
      @ptr = allocate_device_memory(size_bytes)
    end
  end

  self
end
171
+
172
# The device address wrapped in an FFI::Pointer, for FFI-bound library calls.
# Allocates the device buffer first when none exists yet.
# @return [FFI::Pointer]
def device_ffi_ptr
  to_device if @ptr.nil?

  address = @ptr.to_i
  ::FFI::Pointer.new(address)
end
179
+
180
# Fill the whole device buffer with zero bytes through cudaMemset, so no
# host-side array has to be built and copied. Allocates the buffer first
# when none exists yet.
#
# @return [self]
# @raise [RuntimeError] if cudaMemset reports a non-zero status
def zero!
  to_device if @ptr.nil?

  rc = cuda_rt.cudaMemset(@ptr, 0, size_bytes)
  raise "cudaMemset failed with status #{rc}" unless rc.zero?

  self
end
191
+
192
# Reports, under the instance mutex, whether the array has been freed.
# @return [Boolean]
def freed?
  @mutex.synchronize do
    @freed
  end
end
197
+
198
# Copy device memory to host and return it as a flat Ruby Array.
#
# Uses cudaMemcpy in the device-to-host direction and decodes the bytes
# according to the dtype.
#
# @return [Array<Numeric>] flat array of host-side values
# @raise [RuntimeError] if array has been freed, has no device pointer, or
#   cudaMemcpy fails
def to_host
  # Snapshot the pointer while holding the lock so the copy below uses the
  # pointer that was validated.
  device_ptr = @mutex.synchronize do
    raise "NvArray##{@id} has been freed" if @freed
    raise "NvArray##{@id} has no device pointer" if @ptr.nil?

    @ptr
  end

  byte_count = size_bytes
  # Fiddle::Pointer.malloc without a free function never releases the native
  # memory; RUBY_FREE makes the staging buffer reclaimable by the GC.
  host_buf = Fiddle::Pointer.malloc(byte_count, Fiddle::RUBY_FREE)
  status = cuda_rt.cudaMemcpy(host_buf, device_ptr, byte_count, MEMCPY_DEVICE_TO_HOST)
  raise "cudaMemcpy DtoH failed with status #{status}" unless status.zero?

  unpack_host_buffer(host_buf)
end
217
+
218
# Copy data from a Ruby Array to device memory, allocating the device buffer
# first when none exists yet.
#
# @param data [Array<Numeric>] flat array of values to copy
# @return [self]
# @raise [ArgumentError] if data is not an Array or its size doesn't match
#   the tensor element count
# @raise [RuntimeError] if array has been freed or cudaMemcpy fails
def from_host(data)
  # Check the type separately: the length message below calls data.length,
  # which would itself raise NoMethodError for nil or a scalar.
  unless data.is_a?(Array)
    raise ArgumentError, "Expected an Array of #{numel} elements, got #{data.class}"
  end
  unless data.length == numel
    raise ArgumentError, "Expected #{numel} elements, got #{data.length}"
  end

  device_ptr = @mutex.synchronize do
    raise "NvArray##{@id} has been freed" if @freed

    @ptr = allocate_device_memory(size_bytes) if @ptr.nil?
    @ptr
  end

  host_buf = pack_host_buffer(data)
  status = cuda_rt.cudaMemcpy(device_ptr, host_buf, size_bytes, MEMCPY_HOST_TO_DEVICE)
  raise "cudaMemcpy HtoD failed with status #{status}" unless status.zero?

  self
end
243
+
244
# Copy a raw binary string straight to device memory.
#
# The bytes are copied verbatim with no per-element conversion, so they must
# already be in the device dtype's layout. This avoids the float<->half round
# trip that #from_host performs.
#
# @param bytes [String] binary string of exactly size_bytes length
# @return [self]
# @raise [ArgumentError] if the byte count doesn't match size_bytes
# @raise [RuntimeError] if array has been freed or cudaMemcpy fails
def from_host_raw(bytes)
  byte_count = size_bytes
  unless bytes.bytesize == byte_count
    raise ArgumentError, "Expected #{byte_count} bytes, got #{bytes.bytesize}"
  end

  device_ptr = @mutex.synchronize do
    raise "NvArray##{@id} has been freed" if @freed

    @ptr = allocate_device_memory(byte_count) if @ptr.nil?
    @ptr
  end

  # Fiddle::Pointer.malloc without a free function never releases the native
  # memory; RUBY_FREE makes the staging buffer reclaimable by the GC.
  host_buf = Fiddle::Pointer.malloc(byte_count, Fiddle::RUBY_FREE)
  host_buf[0, byte_count] = bytes
  status = cuda_rt.cudaMemcpy(device_ptr, host_buf, byte_count, MEMCPY_HOST_TO_DEVICE)
  raise "cudaMemcpy HtoD failed with status #{status}" unless status.zero?

  self
end
272
+
273
# Produce an independent copy backed by its own device buffer, filled with a
# device-to-device cudaMemcpy. Unlike #slice, the result shares no storage
# with self; its buffer is allocated through the copy's own
# allocate_device_memory, so the copy is the one that owns and finalizes it.
#
# The copy moves raw bytes, so it is exact for every dtype.
#
# @return [NvArray] independent owned copy with identical shape/dtype/values
# @raise [RuntimeError] if this array has been freed, has no device pointer,
#   or cudaMemcpy fails
def clone
  @mutex.synchronize do
    raise "NvArray##{@id} has been freed" if @freed
    raise "NvArray##{@id} has no device pointer" if @ptr.nil?
  end

  byte_count = size_bytes
  duplicate = NvArray.new(
    shape: @shape,
    dtype: @dtype,
    device_id: @device_id,
    stream: @stream,
    owner: @owner
  )
  # Allocate via the duplicate so the GC finalizer is attached to it.
  target = duplicate.send(:allocate_device_memory, byte_count)
  duplicate.instance_variable_set(:@ptr, target)

  status = cuda_rt.cudaMemcpy(target, @ptr, byte_count, MEMCPY_DEVICE_TO_DEVICE)
  raise "cudaMemcpy DtoD failed with status #{status}" unless status.zero?

  duplicate
end
302
+
303
# Copy a contiguous source array into this buffer starting at row +start_row+
# (device-to-device). The row size is derived from this array's shape as
# numel / shape[0] elements.
#
# @param src [NvArray] contiguous source; must respond to #ptr and #size_bytes
# @param start_row [Integer] destination row offset (0-based, non-negative)
# @return [self]
# @raise [ArgumentError] if start_row is not a non-negative Integer
# @raise [RuntimeError] if freed/unallocated, the source has no device
#   pointer, the write would overflow, or cudaMemcpy fails
def write_rows!(src, start_row)
  # A negative row yields a negative byte offset, which slips past the
  # overflow check below and would write before the start of the buffer.
  unless start_row.is_a?(Integer) && start_row >= 0
    raise ArgumentError,
          "write_rows! start_row must be a non-negative Integer, got #{start_row.inspect}"
  end

  base_ptr = @mutex.synchronize do
    raise "NvArray##{@id} has been freed" if @freed
    raise "NvArray##{@id} has no device pointer" if @ptr.nil?

    @ptr
  end
  raise "write_rows! source has no device pointer" if src.ptr.nil?

  row_bytes = (numel / @shape[0]) * dtype_size
  offset = start_row * row_bytes
  if offset + src.size_bytes > size_bytes
    raise "write_rows! overflow: writing #{src.size_bytes} bytes at row #{start_row} " \
          "(offset #{offset}) exceeds #{size_bytes}-byte buffer"
  end

  dst = Fiddle::Pointer.new(base_ptr.to_i + offset)
  status = cuda_rt.cudaMemcpy(dst, src.ptr, src.size_bytes, MEMCPY_DEVICE_TO_DEVICE)
  raise "cudaMemcpy DtoD (write_rows!) failed with status #{status}" unless status.zero?

  self
end
327
+
328
# Create a zero-copy view of `len` entries along `dim`, starting at `start`.
#
# The view is a new NvArray whose pointer is this array's pointer plus a byte
# offset; no data is copied and the view keeps `self` as its parent.
#
# Only contiguous slices are representable this way: every dimension before
# `dim` must have size 1 (which always holds for dim 0). Anything else would
# be a strided view, so it is rejected rather than returned as wrong data.
#
# @param dim [Integer] dimension to slice along
# @param start [Integer] starting index in the dimension
# @param len [Integer] number of elements to include
# @return [NvArray] new array sharing device memory (no copy)
# @raise [ArgumentError] if dim, start, or len are out of bounds, or the
#   slice would be non-contiguous
# @raise [RuntimeError] if this array has been freed
def slice(dim, start, len)
  if !(dim >= 0 && dim < @shape.length)
    raise ArgumentError, "Dimension #{dim} out of range for shape #{@shape}"
  end
  if !(start >= 0 && (start + len) <= @shape[dim])
    raise ArgumentError, "Slice range [#{start}, #{start + len}) exceeds dim size #{@shape[dim]}"
  end

  outer_extent = @shape[0...dim].reduce(1, :*)
  if outer_extent > 1
    raise ArgumentError,
          "slice(dim=#{dim}, ...) on shape #{@shape} is a strided (non-contiguous) view, " \
          "which NvArray#slice cannot represent; only contiguous slices are supported " \
          "(dim 0, or leading dims of size 1). Use a gather/copy kernel for strided slices."
  end

  @mutex.synchronize { raise "NvArray##{@id} has been freed" if @freed }

  view_shape = @shape.dup
  view_shape[dim] = len

  # Byte offset of the first selected element: start * (elements per index
  # of `dim`) * bytes per element.
  inner_extent = @shape[(dim + 1)..].reduce(1, :*)
  offset_bytes = start * inner_extent * dtype_size

  view_ptr = nil
  unless @ptr.nil?
    view_ptr = Fiddle::Pointer.new(@ptr.to_i + offset_bytes, size_bytes - offset_bytes)
  end

  # Passing parent: self makes the result a non-owning view that keeps this
  # array reachable.
  NvArray.new(
    shape: view_shape,
    dtype: @dtype,
    device_id: @device_id,
    ptr: view_ptr,
    stream: @stream,
    owner: @owner,
    parent: self
  )
end
381
+
382
# Hand the array over to a different owner. The check and the update happen
# together under the instance mutex.
#
# @param new_owner [Symbol] the new owner (:nvruby, :nvccl, or :wnais)
# @return [Symbol] the new owner
# @raise [ArgumentError] if new_owner is invalid
# @raise [RuntimeError] if the array has been freed
# @raise [MemoryContractViolation] if ref_count > 1 during transfer
def transfer_ownership(new_owner)
  validate_owner!(new_owner)

  @mutex.synchronize do
    raise "NvArray##{@id} has been freed" if @freed

    unless @ref_count <= 1
      raise MemoryContractViolation,
            "Cannot transfer ownership of NvArray##{@id} while ref_count=#{@ref_count} > 1"
    end

    @owner = new_owner
  end
end
404
+
405
# Allocate pinned host memory of size_bytes via cudaHostAlloc and remember it
# as the staging buffer for this array.
#
# @return [Fiddle::Pointer] the pinned host pointer
# @raise [RuntimeError] if freed, already pinned, or cudaHostAlloc fails
def pin!
  @mutex.synchronize do
    raise "NvArray##{@id} has been freed" if @freed
    raise "NvArray##{@id} is already pinned" unless @pinned_host_ptr.nil?

    # Out-parameter slot for the `void **pHost` argument. RUBY_FREE lets the
    # GC release it; without a free function Fiddle never frees it.
    slot = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP, Fiddle::RUBY_FREE)
    status = cuda_rt.cudaHostAlloc(slot, size_bytes, CUDA_HOST_ALLOC_DEFAULT)
    raise "cudaHostAlloc failed with status #{status}" unless status.zero?

    # 'J' decodes a pointer-width unsigned integer, matching SIZEOF_VOIDP on
    # every platform ('Q' is always 8 bytes).
    address = slot[0, Fiddle::SIZEOF_VOIDP].unpack1('J')
    @pinned_host_ptr = Fiddle::Pointer.new(address)
  end
end
423
+
424
# Release the pinned host staging buffer with cudaFreeHost and forget it.
#
# @return [void]
# @raise [RuntimeError] if freed, no pinned memory exists, or cudaFreeHost fails
def unpin!
  @mutex.synchronize do
    raise "NvArray##{@id} has been freed" if @freed

    staging = @pinned_host_ptr
    raise "NvArray##{@id} has no pinned memory" if staging.nil?

    rc = cuda_rt.cudaFreeHost(staging)
    raise "cudaFreeHost failed with status #{rc}" unless rc.zero?

    @pinned_host_ptr = nil
  end
end
439
+
440
# Explicitly release this array's resources and mark it freed.
#
# Any pinned host buffer is released with cudaFreeHost (its status is not
# checked). The device buffer is released with cudaFree only when this array
# owns it; a view just drops its pointer. Afterwards the GC finalizer is
# removed so the same device pointer is not freed a second time.
#
# @return [void]
# @raise [RuntimeError] if already freed or cudaFree fails
# @raise [MemoryContractViolation] if ref_count > 0
def free!
  @mutex.synchronize do
    raise "NvArray##{@id} has already been freed" if @freed

    # A positive ref_count means the buffer is still held for shared staging,
    # so an explicit free is refused.
    if @ref_count.positive?
      raise MemoryContractViolation,
            "Cannot free NvArray##{@id} while ref_count=#{@ref_count} > 0"
    end

    staging = @pinned_host_ptr
    unless staging.nil?
      cuda_rt.cudaFreeHost(staging)
      @pinned_host_ptr = nil
    end

    # Views share their parent's allocation and must leave it alone.
    owned_buffer = @owns_memory && !@ptr.nil?
    if owned_buffer
      rc = cuda_rt.cudaFree(@ptr)
      raise "cudaFree failed with status #{rc}" unless rc.zero?
    end

    @ptr = nil
    @freed = true
  end

  ObjectSpace.undefine_finalizer(self)
end
477
+
478
# Add one to the reference count while holding the instance mutex.
# @return [Integer] new ref_count value
# @raise [RuntimeError] if the array has been freed
def increment_ref!
  @mutex.synchronize do
    raise "NvArray##{@id} has been freed" if @freed

    @ref_count = @ref_count + 1
  end
end
486
+
487
# Subtract one from the reference count while holding the instance mutex.
# @return [Integer] new ref_count value
# @raise [RuntimeError] if the array has been freed or ref_count is already 0
def decrement_ref!
  @mutex.synchronize do
    raise "NvArray##{@id} has been freed" if @freed
    raise "NvArray##{@id} ref_count is already 0" unless @ref_count > 0

    @ref_count = @ref_count - 1
  end
end
497
+
498
+ # Human-readable string representation.
499
+ # @return [String]
500
+ def to_s
501
+ "#<Ignis::Shared::NvArray id=#{@id} shape=#{@shape} dtype=#{@dtype} " \
502
+ "device=#{@device_id} owner=#{@owner} ref_count=#{@ref_count} freed=#{@freed}>"
503
+ end
504
+
505
+ alias_method :inspect, :to_s
506
+
507
+ private
508
+
509
# Accept only a non-empty Array whose entries are all positive Integers.
# The emptiness check is explicit because `all?` is true for an empty array.
#
# @param shape [Array<Integer>]
# @return [nil]
# @raise [ArgumentError] if the shape is not acceptable
def validate_shape!(shape)
  well_formed = shape.is_a?(Array) &&
                !shape.empty? &&
                shape.all? { |extent| extent.is_a?(Integer) && extent > 0 }
  return if well_formed

  raise ArgumentError, "Shape must be a non-empty Array of positive Integers, got: #{shape.inspect}"
end
519
+
520
# Accept only dtypes listed in VALID_DTYPES.
#
# @param dtype [Symbol]
# @return [nil]
# @raise [ArgumentError] if the dtype is not listed
def validate_dtype!(dtype)
  return if VALID_DTYPES.include?(dtype)

  raise ArgumentError, "Invalid dtype #{dtype.inspect}. Valid: #{VALID_DTYPES}"
end
528
+
529
# Accept only owners listed in VALID_OWNERS.
#
# @param owner [Symbol]
# @return [nil]
# @raise [ArgumentError] if the owner is not listed
def validate_owner!(owner)
  return if VALID_OWNERS.include?(owner)

  raise ArgumentError, "Invalid owner #{owner.inspect}. Valid: #{VALID_OWNERS}"
end
537
+
538
# Instance-side shortcut to the class-level CUDA runtime bindings
# (NvArray.cuda_runtime).
# @return [Module]
def cuda_rt
  runtime = NvArray.cuda_runtime
  runtime
end
543
+
544
# Encode a Ruby Array into a native host buffer laid out for this array's
# dtype, ready to be used as the source of a host-to-device cudaMemcpy.
#
# float32/float64 and the integer types are packed little-endian; float16 and
# bfloat16 values are first converted to 16-bit patterns by Ignis::Half.
#
# @param data [Array<Numeric>]
# @return [Fiddle::Pointer] buffer holding the packed bytes
# @raise [ArgumentError] if the dtype has no packing rule
def pack_host_buffer(data)
  packed = case @dtype
           when :float32 then data.pack('e*')
           when :float64 then data.pack('E*')
           when :int32 then data.pack('l<*')
           when :int64 then data.pack('q<*')
           when :uint8 then data.pack('C*')
           when :float16 then data.map { |v| ::Ignis::Half.f32_to_f16(v) }.pack('v*')
           when :bfloat16 then data.map { |v| ::Ignis::Half.f32_to_bf16(v) }.pack('v*')
           else
             # Without this branch an unknown dtype fell through to nil.bytesize.
             raise ArgumentError, "Unsupported dtype #{@dtype.inspect} for host packing"
           end

  # Fiddle::Pointer.malloc without a free function never releases the native
  # memory; RUBY_FREE makes the buffer reclaimable by the GC once the caller
  # drops it.
  buf = Fiddle::Pointer.malloc(packed.bytesize, Fiddle::RUBY_FREE)
  buf[0, packed.bytesize] = packed
  buf
end
562
+
563
# Decode the first size_bytes bytes of a host buffer into Ruby numbers
# according to this array's dtype. The 16-bit float types are read as
# unsigned 16-bit words and converted one by one.
#
# @param buf [Fiddle::Pointer] host buffer (anything supporting `buf[0, n]`)
# @return [Array<Numeric>, nil] decoded values; nil for a dtype with no rule
def unpack_host_buffer(buf)
  bytes = buf[0, size_bytes]

  case @dtype
  when :float16
    bytes.unpack('v*').map { |word| half_to_float(word) }
  when :bfloat16
    bytes.unpack('v*').map { |word| bfloat16_to_float(word) }
  when :float32
    bytes.unpack('e*')
  when :float64
    bytes.unpack('E*')
  when :int32
    bytes.unpack('l<*')
  when :int64
    bytes.unpack('q<*')
  when :uint8
    bytes.unpack('C*')
  end
end
578
+
579
# Decode a 16-bit half-precision bit pattern by delegating to
# Ignis::Half.f16_to_f32, so the conversion logic lives in one place.
#
# @param bits [Integer] 16-bit unsigned integer
# @return [Float]
def half_to_float(bits)
  converter = ::Ignis::Half
  converter.f16_to_f32(bits)
end
586
+
587
# Decode a 16-bit bfloat16 bit pattern by delegating to
# Ignis::Half.bf16_to_f32.
#
# @param bits [Integer] 16-bit unsigned integer
# @return [Float]
def bfloat16_to_float(bits)
  converter = ::Ignis::Half
  converter.bf16_to_f32(bits)
end
593
+
594
# Allocate device memory via cudaMalloc and register a GC finalizer on this
# object that releases it, so an array dropped without an explicit free!
# does not leak its buffer. free! removes the finalizer again.
#
# @param bytes [Integer] number of bytes to allocate
# @return [Fiddle::Pointer] pointer to the new device allocation
# @raise [RuntimeError] if cudaMalloc reports a non-zero status
def allocate_device_memory(bytes)
  # Out-parameter slot for the `void **devPtr` argument. RUBY_FREE lets the
  # GC release it; without a free function Fiddle never frees it, leaking one
  # slot per allocation.
  slot = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP, Fiddle::RUBY_FREE)
  status = cuda_rt.cudaMalloc(slot, bytes)
  raise "cudaMalloc failed with status #{status} for #{bytes} bytes" unless status.zero?

  # 'J' decodes a pointer-width unsigned integer, matching SIZEOF_VOIDP on
  # every platform ('Q' is always 8 bytes).
  device_ptr = Fiddle::Pointer.new(slot[0, Fiddle::SIZEOF_VOIDP].unpack1('J'))

  # The finalizer receives only the raw address, never self.
  ObjectSpace.define_finalizer(self, self.class.release_finalizer(device_ptr.to_i))
  device_ptr
end
608
+
609
+ class << self
610
# CUDA runtime bindings, loaded on first use and then reused.
# @return [Module] module with CUDA runtime functions
def cuda_runtime
  return @cuda_runtime if @cuda_runtime

  @cuda_runtime = load_cuda_runtime
end
615
+
616
# Build the GC finalizer for an owned device allocation. The proc closes over
# the raw address only, not the array, so it does not keep the array alive.
# Any StandardError from the free call is swallowed and turned into nil
# (for example when the runtime is no longer usable at shutdown).
#
# @param addr [Integer] device pointer address
# @return [Proc]
def release_finalizer(addr)
  proc do
    cuda_runtime.cudaFree(Fiddle::Pointer.new(addr))
  rescue StandardError
    nil
  end
end
630
+
631
+ private
632
+
633
# Open the CUDA runtime shared library and wrap the six functions this class
# needs in an anonymous module.
#
# The library path comes from Ignis::Platform.cudart_path when that constant
# is defined; otherwise a fixed CUDA 13.0 location is used on Windows and
# 'libcudart.so.13' elsewhere. Every bound function returns the integer
# status code of the underlying call.
#
# @return [Module] responds to handle, cudaMalloc, cudaMemset, cudaFree,
#   cudaMemcpy, cudaHostAlloc and cudaFreeHost
def load_cuda_runtime
  library = if defined?(Ignis::Platform)
              Ignis::Platform.cudart_path
            elsif RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
              File.join('C:', 'Program Files', 'NVIDIA GPU Computing Toolkit', 'CUDA', 'v13.0', 'bin',
                        'cudart64_130.dll')
            else
              'libcudart.so.13'
            end

  handle = Fiddle::Handle.new(library)

  void_p = Fiddle::TYPE_VOIDP
  size_t = Fiddle::TYPE_SIZE_T
  int = Fiddle::TYPE_INT
  # Every CUDA runtime call bound here returns an int status.
  bind = ->(symbol, arg_types) { Fiddle::Function.new(handle[symbol], arg_types, int) }

  # cudaMalloc(void **devPtr, size_t size)
  malloc_fn = bind.call('cudaMalloc', [void_p, size_t])
  # cudaMemset(void *devPtr, int value, size_t count)
  memset_fn = bind.call('cudaMemset', [void_p, int, size_t])
  # cudaFree(void *devPtr)
  free_fn = bind.call('cudaFree', [void_p])
  # cudaMemcpy(void *dst, const void *src, size_t count, int kind)
  memcpy_fn = bind.call('cudaMemcpy', [void_p, void_p, size_t, int])
  # cudaHostAlloc(void **pHost, size_t size, unsigned int flags)
  host_alloc_fn = bind.call('cudaHostAlloc', [void_p, size_t, int])
  # cudaFreeHost(void *ptr)
  free_host_fn = bind.call('cudaFreeHost', [void_p])

  runtime = Module.new
  runtime.define_singleton_method(:handle) { handle }
  runtime.define_singleton_method(:cudaMalloc) { |ptr, size| malloc_fn.call(ptr, size) }
  runtime.define_singleton_method(:cudaMemset) { |ptr, value, count| memset_fn.call(ptr, value, count) }
  runtime.define_singleton_method(:cudaFree) { |ptr| free_fn.call(ptr) }
  runtime.define_singleton_method(:cudaMemcpy) { |dst, src, count, kind| memcpy_fn.call(dst, src, count, kind) }
  runtime.define_singleton_method(:cudaHostAlloc) { |ptr, size, flags| host_alloc_fn.call(ptr, size, flags) }
  runtime.define_singleton_method(:cudaFreeHost) { |ptr| free_host_fn.call(ptr) }
  runtime
end
701
+ end
702
+ end
703
+ end
704
+ end
705
+
706
# Public ecosystem alias for the canonical GPU n-dimensional array.
# Ignis::NDArray is the name the Ignis API exposes; the class itself is
# defined at Ignis::Shared::NvArray (legacy path). The nvmath-style
# Ignis::NvArray is a separate, deferred numerics array (see
# gems/MIGRATION.md).
module Ignis
  NDArray = Shared::NvArray
end