ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +15 -0
  3. data/lib/ignis.rb +94 -0
  4. data/lib/nnw/platform.rb +304 -0
  5. data/lib/nnw/shared/event_bus.rb +240 -0
  6. data/lib/nnw/shared/ffi_loader.rb +63 -0
  7. data/lib/nnw/shared/memory_contract.rb +204 -0
  8. data/lib/nnw/shared/nv_array.rb +710 -0
  9. data/lib/nnw/shared/recovery_protocol.rb +307 -0
  10. data/lib/nvruby/configuration.rb +217 -0
  11. data/lib/nvruby/cuda/device.rb +275 -0
  12. data/lib/nvruby/cuda/device_props.rb +202 -0
  13. data/lib/nvruby/cuda/graph.rb +265 -0
  14. data/lib/nvruby/cuda/graph_bindings.rb +119 -0
  15. data/lib/nvruby/cuda/library_loader.rb +285 -0
  16. data/lib/nvruby/cuda/memory.rb +410 -0
  17. data/lib/nvruby/cuda/runtime_api.rb +804 -0
  18. data/lib/nvruby/cuda/stream.rb +234 -0
  19. data/lib/nvruby/dtype.rb +139 -0
  20. data/lib/nvruby/epilogues.rb +438 -0
  21. data/lib/nvruby/errors.rb +303 -0
  22. data/lib/nvruby/half.rb +97 -0
  23. data/lib/nvruby/jit/compiled_kernel.rb +80 -0
  24. data/lib/nvruby/jit/compiler.rb +231 -0
  25. data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
  26. data/lib/nvruby/jit/kernel.rb +240 -0
  27. data/lib/nvruby/jit/kernel_module.rb +133 -0
  28. data/lib/nvruby/jit/kernels/activations.rb +179 -0
  29. data/lib/nvruby/jit/kernels/attention.rb +504 -0
  30. data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
  31. data/lib/nvruby/jit/kernels/loss.rb +213 -0
  32. data/lib/nvruby/jit/kernels/normalization.rb +200 -0
  33. data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
  34. data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
  35. data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
  36. data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
  37. data/lib/nvruby/linalg/epilog.rb +67 -0
  38. data/lib/nvruby/linalg/matmul.rb +247 -0
  39. data/lib/nvruby/linalg/matmul_plan.rb +229 -0
  40. data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
  41. data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
  42. data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
  43. data/lib/nvruby/memory/device_memory_resource.rb +106 -0
  44. data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
  45. data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
  46. data/lib/nvruby/memory/stats.rb +107 -0
  47. data/lib/nvruby/memory.rb +124 -0
  48. data/lib/nvruby/version.rb +5 -0
  49. metadata +108 -0
@@ -0,0 +1,410 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fiddle'
4
+
5
+ module Ignis
6
+ module CUDA
7
# Manages GPU device memory allocation and transfers.
#
# Every CUDA runtime call goes through the Fiddle-based RuntimeAPI and every
# pointer handled here is a Fiddle::Pointer (no FFI::MemoryPointer usage).
#
# Operations that temporarily switch the current CUDA device always switch it
# back, even when the runtime call raises.
class Memory
  # @return [Fiddle::Pointer] Device pointer
  attr_reader :device_ptr

  # @return [Integer] Size in bytes
  attr_reader :size

  # @return [Integer] Device index
  attr_reader :device_index

  # @param size [Integer] Size in bytes to allocate
  # @param device [Device, Integer, nil] Device to allocate on (nil for the configured default)
  # @param ptr [Fiddle::Pointer, nil] Existing pointer to wrap (nil to allocate new)
  # @param owned [Boolean] Whether this object owns the memory and should free it
  # @raise [ArgumentError] if +device+ is not a Device, an Integer, or nil
  def initialize(size, device: nil, ptr: nil, owned: true)
    @size = size
    @device_index = resolve_device_index(device)
    @owned = owned
    @device_ptr =
      if ptr
        ptr.is_a?(Fiddle::Pointer) ? ptr : Fiddle::Pointer.new(ptr.to_i)
      else
        allocate_device_memory
      end
    @freed = false

    return unless @owned

    # The finalizer is built by a class method from plain values so that the
    # proc does not hold a reference to this instance.
    ObjectSpace.define_finalizer(self, self.class.release_finalizer(@device_ptr, @device_index))
  end

  # @return [Fiddle::Pointer] pointer for interop
  def to_ptr
    @device_ptr
  end

  # Device pointer wrapped as an FFI::Pointer.
  #
  # The hot path (this class' own cudaMemcpy etc.) is Fiddle-based, but the
  # CUDA-X library bindings (cuBLAS/cuSOLVER/cuFFT/cuRAND/cuSPARSE) are FFI
  # and cannot accept a Fiddle::Pointer. Use this when handing a device
  # buffer to an FFI-bound library call.
  # @return [FFI::Pointer]
  def ffi_ptr
    FFI::Pointer.new(@device_ptr.to_i)
  end

  # @return [Integer] raw address
  def address
    @device_ptr.to_i
  end

  # @return [Boolean] true once {#free!} has released the memory
  def freed?
    @freed
  end

  # Free the device memory.
  #
  # Does nothing when the memory was already freed or is not owned by this
  # object (in which case {#freed?} stays false).
  # @return [void]
  def free!
    return if @freed || !@owned

    RuntimeAPI.ensure_loaded!
    ensure_correct_device { RuntimeAPI.free(@device_ptr) }

    @freed = true
    ObjectSpace.undefine_finalizer(self)
  end

  # Copy data from host to device.
  # @param host_data [Fiddle::Pointer, String, #address] Source data
  # @param count [Integer, nil] Number of bytes to copy (defaults to size)
  # @param stream [Stream, nil] Optional stream for async copy
  # @raise [MemoryError] if the memory was freed, +count+ is negative or larger
  #   than the allocation, or a String source holds fewer than +count+ bytes
  # @raise [ArgumentError] if +host_data+ is of an unsupported type
  # @return [void]
  def copy_from_host(host_data, count: nil, stream: nil)
    raise MemoryError, 'Memory has been freed' if @freed

    count ||= @size
    validate_count!(count, 'Copy')
    # A String is the only source whose length is known here; copying more
    # bytes than it holds would read past the end of the staging buffer.
    if host_data.is_a?(String) && count > host_data.bytesize
      raise MemoryError, "Copy count #{count} exceeds host data size #{host_data.bytesize}"
    end

    RuntimeAPI.ensure_loaded!
    host_ptr = prepare_host_pointer(host_data)

    ensure_correct_device do
      transfer(@device_ptr, host_ptr, count, RuntimeAPI::MEMCPY_HOST_TO_DEVICE, stream)
    end
  end

  # Copy data from device to host.
  #
  # When +host_buffer+ is nil a buffer of +count+ bytes is allocated with
  # Fiddle::Pointer.malloc and no free function, so releasing that buffer is
  # left to the caller.
  # @param host_buffer [Fiddle::Pointer, #address, nil] Destination buffer (created if nil)
  # @param count [Integer, nil] Number of bytes to copy (defaults to size)
  # @param stream [Stream, nil] Optional stream for async copy
  # @raise [MemoryError] if the memory was freed or +count+ is negative or
  #   larger than the allocation
  # @return [Object] The host buffer that received the data (the object passed
  #   in, or the newly allocated Fiddle::Pointer)
  def copy_to_host(host_buffer: nil, count: nil, stream: nil)
    raise MemoryError, 'Memory has been freed' if @freed

    count ||= @size
    validate_count!(count, 'Copy')

    RuntimeAPI.ensure_loaded!
    host_buffer ||= Fiddle::Pointer.malloc(count)

    # The destination may be an FFI::MemoryPointer (NvArray host buffer);
    # bridge it to a Fiddle address for the Fiddle-based memcpy, but return
    # the original object so the caller can read it back with its own API.
    destination =
      if host_buffer.is_a?(Fiddle::Pointer) || !host_buffer.respond_to?(:address)
        host_buffer
      else
        Fiddle::Pointer.new(host_buffer.address)
      end

    ensure_correct_device do
      transfer(destination, @device_ptr, count, RuntimeAPI::MEMCPY_DEVICE_TO_HOST, stream)
    end

    host_buffer
  end

  # Copy data from another device memory.
  #
  # Unlike the host transfers, the copy is issued without switching the
  # current device.
  # NOTE(review): presumably this relies on the runtime resolving both device
  # addresses regardless of the current device - confirm for multi-GPU use.
  # @param source [Memory] Source device memory
  # @param count [Integer, nil] Number of bytes to copy (defaults to the
  #   smaller of the two allocation sizes)
  # @param stream [Stream, nil] Optional stream for async copy
  # @raise [MemoryError] if either memory was freed, or +count+ is negative or
  #   larger than either allocation
  # @return [void]
  def copy_from_device(source, count: nil, stream: nil)
    raise MemoryError, 'Memory has been freed' if @freed
    raise MemoryError, 'Source memory has been freed' if source.freed?

    count ||= [source.size, @size].min
    validate_count!(count, 'Copy')
    # An explicit count must also fit inside the source allocation.
    raise MemoryError, "Copy count #{count} exceeds source size #{source.size}" if count > source.size

    RuntimeAPI.ensure_loaded!

    transfer(@device_ptr, source.device_ptr, count, RuntimeAPI::MEMCPY_DEVICE_TO_DEVICE, stream)
  end

  # Set memory to a value.
  # @param value [Integer] Byte value to set (0-255)
  # @param count [Integer, nil] Number of bytes to set (defaults to size)
  # @param stream [Stream, nil] Optional stream for async operation
  # @raise [MemoryError] if the memory was freed or +count+ is negative or
  #   larger than the allocation
  # @return [void]
  def memset(value, count: nil, stream: nil)
    raise MemoryError, 'Memory has been freed' if @freed

    count ||= @size
    validate_count!(count, 'Memset')

    RuntimeAPI.ensure_loaded!

    ensure_correct_device do
      if stream
        RuntimeAPI.memset_async(@device_ptr, value, count, stream.to_ptr)
      else
        RuntimeAPI.memset(@device_ptr, value, count)
      end
    end
  end

  # Zero out the memory.
  # @param stream [Stream, nil] Optional stream
  # @return [void]
  def zero!(stream: nil)
    memset(0, stream: stream)
  end

  # @return [String] short human-readable summary
  def to_s
    status = @freed ? 'freed' : 'allocated'
    "DeviceMemory[#{@size} bytes, device #{@device_index}, #{status}]"
  end

  # @return [String] debugging representation including the pointer address
  def inspect
    "#<Ignis::CUDA::Memory:#{object_id} size=#{@size} device=#{@device_index} " \
      "ptr=0x#{@device_ptr.to_i.to_s(16)} freed=#{@freed}>"
  end

  class << self
    # Create a finalizer proc for releasing device memory.
    #
    # Only the integer address and the device index are captured. Errors are
    # swallowed because nothing useful can be done with them during
    # finalization, but the previously current device is restored even when
    # the free call fails.
    # @param ptr [Fiddle::Pointer] Device pointer to free
    # @param device_index [Integer] Device the memory is on
    # @return [Proc]
    def release_finalizer(ptr, device_index)
      ptr_addr = ptr.to_i
      # A proc (not a lambda) so the object id passed by the GC is tolerated.
      proc do |_object_id|
        begin
          RuntimeAPI.ensure_loaded!
          current = RuntimeAPI.get_device
          switched = current != device_index
          RuntimeAPI.set_device(device_index) if switched
          begin
            RuntimeAPI.free(Fiddle::Pointer.new(ptr_addr))
          ensure
            RuntimeAPI.set_device(current) if switched
          end
        rescue StandardError
          # Silently ignore errors during finalization
        end
      end
    end
  end

  private

  # Resolve device index from various inputs.
  # @param device [Device, Integer, nil]
  # @raise [ArgumentError] for any other kind of object
  # @return [Integer]
  def resolve_device_index(device)
    case device
    when Device then device.index
    when Integer then device
    when nil then Ignis.configuration.default_device
    else raise ArgumentError, "Invalid device: #{device.inspect}"
    end
  end

  # Allocate device memory via RuntimeAPI (Fiddle hot path).
  # Restores the previously-current device afterward, also when the
  # allocation fails, so that allocating on a non-default GPU doesn't silently
  # change the process-wide current device (which would misdirect later
  # default-device kernels/copies on multi-GPU).
  # @return [Fiddle::Pointer]
  def allocate_device_memory
    ensure_correct_device { RuntimeAPI.malloc(@size) }
  end

  # Run the block with this memory's device selected as the current device.
  #
  # The previously current device is restored in an +ensure+ clause so a
  # failing runtime call cannot leave the wrong device selected.
  # @yield Block to run with correct device set
  # @return [Object] Block return value
  def ensure_correct_device
    RuntimeAPI.ensure_loaded!
    previous = RuntimeAPI.get_device
    return yield if previous == @device_index

    RuntimeAPI.set_device(@device_index)
    begin
      yield
    ensure
      RuntimeAPI.set_device(previous)
    end
  end

  # Reject byte counts that fall outside this allocation.
  # @param count [Integer] Requested number of bytes
  # @param operation [String] Operation name used as the message prefix
  # @raise [MemoryError] if +count+ is negative or larger than the allocation
  # @return [void]
  def validate_count!(count, operation)
    raise MemoryError, "#{operation} count #{count} must not be negative" if count.negative?
    raise MemoryError, "#{operation} count #{count} exceeds allocation size #{@size}" if count > @size
  end

  # Issue a memcpy, asynchronously on +stream+ when one is given.
  # @param dst [Object] Destination pointer
  # @param src [Object] Source pointer
  # @param count [Integer] Number of bytes
  # @param kind [Integer] One of the RuntimeAPI::MEMCPY_* constants
  # @param stream [Stream, nil]
  # @return [Object] Whatever the RuntimeAPI call returns
  def transfer(dst, src, count, kind, stream)
    if stream
      RuntimeAPI.memcpy_async(dst, src, count, kind, stream.to_ptr)
    else
      RuntimeAPI.memcpy(dst, src, count, kind)
    end
  end

  # Prepare host pointer from various inputs.
  #
  # NvArray stores its host buffer as an FFI::MemoryPointer, while this class
  # is Fiddle-based, so we bridge any FFI pointer (or other address-exposing
  # object) into a non-owning Fiddle::Pointer at the same address. The caller
  # must keep the source object alive for the duration of the (synchronous)
  # copy — which NvArray does, since it holds the reference.
  #
  # A String is copied into a staging buffer that carries Fiddle::RUBY_FREE
  # as its free function, so the buffer is released when the pointer is
  # garbage collected instead of leaking.
  # @param data [Fiddle::Pointer, FFI::Pointer, String]
  # @raise [ArgumentError] if +data+ is none of the above and has no #address
  # @return [Fiddle::Pointer]
  def prepare_host_pointer(data)
    case data
    when Fiddle::Pointer
      data
    when String
      length = data.bytesize
      staging = Fiddle::Pointer.malloc(length, Fiddle::RUBY_FREE)
      staging[0, length] = data
      staging
    else
      raise ArgumentError, "Unsupported host data type: #{data.class}" unless data.respond_to?(:address)

      Fiddle::Pointer.new(data.address)
    end
  end
end
318
+
319
# Pinned (page-locked) host memory for faster transfers.
#
# Uses RuntimeAPI.host_alloc (Fiddle) instead of FFI::MemoryPointer. The
# buffer is released either explicitly through {#free!} or by a finalizer
# registered at construction time.
class PinnedMemory
  # @return [Fiddle::Pointer] Host pointer
  attr_reader :host_ptr

  # @return [Integer] Size in bytes
  attr_reader :size

  # @param size [Integer] Size in bytes to allocate
  def initialize(size)
    @size = size
    @host_ptr = allocate_pinned_memory
    @freed = false

    # The finalizer is built by a class method from the pointer only, so the
    # proc does not hold a reference to this instance.
    ObjectSpace.define_finalizer(self, self.class.release_finalizer(@host_ptr))
  end

  # @return [Boolean] true once {#free!} has released the buffer
  def freed?
    @freed
  end

  # Free the pinned memory. Calling it again is a no-op.
  # @return [void]
  def free!
    return if @freed

    RuntimeAPI.ensure_loaded!
    RuntimeAPI.free_host(@host_ptr)

    @freed = true
    ObjectSpace.undefine_finalizer(self)
  end

  # Write data to the pinned memory.
  # @param data [String] Data to write
  # @param offset [Integer] Offset in bytes
  # @raise [MemoryError] if the memory was freed, +offset+ is negative, or the
  #   write would run past the end of the buffer
  # @return [void]
  def write(data, offset: 0)
    raise MemoryError, 'Memory has been freed' if @freed

    length = data.bytesize
    # A negative offset would pass the upper-bound check below and write
    # before the start of the buffer.
    raise MemoryError, "Write offset #{offset} must not be negative" if offset.negative?
    raise MemoryError, 'Write exceeds buffer size' if offset + length > @size

    @host_ptr[offset, length] = data
  end

  # Read data from the pinned memory.
  # @param count [Integer] Number of bytes to read
  # @param offset [Integer] Offset in bytes
  # @raise [MemoryError] if the memory was freed, +offset+ or +count+ is
  #   negative, or the read would run past the end of the buffer
  # @return [String]
  def read(count, offset: 0)
    raise MemoryError, 'Memory has been freed' if @freed
    # Negative values would pass the upper-bound check below and address
    # memory outside the buffer.
    raise MemoryError, "Read offset #{offset} must not be negative" if offset.negative?
    raise MemoryError, "Read count #{count} must not be negative" if count.negative?
    raise MemoryError, 'Read exceeds buffer size' if offset + count > @size

    @host_ptr[offset, count]
  end

  # @return [Fiddle::Pointer] pointer for interop
  def to_ptr
    @host_ptr
  end

  class << self
    # Create a finalizer for releasing pinned memory.
    #
    # Only the integer address is captured. Errors are swallowed because
    # nothing useful can be done with them during finalization.
    # @param ptr [Fiddle::Pointer]
    # @return [Proc]
    def release_finalizer(ptr)
      ptr_addr = ptr.to_i
      # A proc (not a lambda) so the object id passed by the GC is tolerated.
      proc do |_object_id|
        begin
          RuntimeAPI.ensure_loaded!
          RuntimeAPI.free_host(Fiddle::Pointer.new(ptr_addr))
        rescue StandardError
          # Silently ignore errors during finalization
        end
      end
    end
  end

  private

  # Allocate pinned host memory via RuntimeAPI.
  # @return [Fiddle::Pointer]
  def allocate_pinned_memory
    RuntimeAPI.ensure_loaded!
    RuntimeAPI.host_alloc(@size)
  end
end
409
+ end
410
+ end