ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +15 -0
  3. data/lib/ignis.rb +94 -0
  4. data/lib/nnw/platform.rb +304 -0
  5. data/lib/nnw/shared/event_bus.rb +240 -0
  6. data/lib/nnw/shared/ffi_loader.rb +63 -0
  7. data/lib/nnw/shared/memory_contract.rb +204 -0
  8. data/lib/nnw/shared/nv_array.rb +710 -0
  9. data/lib/nnw/shared/recovery_protocol.rb +307 -0
  10. data/lib/nvruby/configuration.rb +217 -0
  11. data/lib/nvruby/cuda/device.rb +275 -0
  12. data/lib/nvruby/cuda/device_props.rb +202 -0
  13. data/lib/nvruby/cuda/graph.rb +265 -0
  14. data/lib/nvruby/cuda/graph_bindings.rb +119 -0
  15. data/lib/nvruby/cuda/library_loader.rb +285 -0
  16. data/lib/nvruby/cuda/memory.rb +410 -0
  17. data/lib/nvruby/cuda/runtime_api.rb +804 -0
  18. data/lib/nvruby/cuda/stream.rb +234 -0
  19. data/lib/nvruby/dtype.rb +139 -0
  20. data/lib/nvruby/epilogues.rb +438 -0
  21. data/lib/nvruby/errors.rb +303 -0
  22. data/lib/nvruby/half.rb +97 -0
  23. data/lib/nvruby/jit/compiled_kernel.rb +80 -0
  24. data/lib/nvruby/jit/compiler.rb +231 -0
  25. data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
  26. data/lib/nvruby/jit/kernel.rb +240 -0
  27. data/lib/nvruby/jit/kernel_module.rb +133 -0
  28. data/lib/nvruby/jit/kernels/activations.rb +179 -0
  29. data/lib/nvruby/jit/kernels/attention.rb +504 -0
  30. data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
  31. data/lib/nvruby/jit/kernels/loss.rb +213 -0
  32. data/lib/nvruby/jit/kernels/normalization.rb +200 -0
  33. data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
  34. data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
  35. data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
  36. data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
  37. data/lib/nvruby/linalg/epilog.rb +67 -0
  38. data/lib/nvruby/linalg/matmul.rb +247 -0
  39. data/lib/nvruby/linalg/matmul_plan.rb +229 -0
  40. data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
  41. data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
  42. data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
  43. data/lib/nvruby/memory/device_memory_resource.rb +106 -0
  44. data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
  45. data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
  46. data/lib/nvruby/memory/stats.rb +107 -0
  47. data/lib/nvruby/memory.rb +124 -0
  48. data/lib/nvruby/version.rb +5 -0
  49. metadata +108 -0
@@ -0,0 +1,412 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "cublaslt_bindings"
4
+ require_relative "cublas_bindings"
5
+
6
+ module Ignis
7
+ module LinAlg
8
+ # Optimized matrix multiplication using cuBLASLt
9
+ #
10
+ # Features:
11
+ # - Heuristic-based algorithm selection
12
+ # - Large workspace for more algorithm choices
13
+ # - Descriptor-based API for optimal performance
14
+ # - Split-K support for better SM utilization
15
+ module OptimizedMatmul
16
+ class << self
17
# Multiply two matrices and return the product.
#
# NOTE: despite the module name, this entry point currently forwards every
# request to Matmul.call (the cuBLAS GEMM path). The cuBLASLt path
# (execute_cublaslt) builds COLUMN-major layouts for Ignis's ROW-major
# buffers without setting CUBLASLT_ORDER_ROW, so it computed a
# transposed/incorrect product; its benchmark used all-zero inputs, which
# made the bug invisible. The cuBLAS GEMM path IS numerically verified
# (benchmarks/verify_matmul.rb), so callers get correct results from it
# until the cuBLASLt layouts are fixed and verified.
#
# @param a [NvArray] Left matrix
# @param b [NvArray] Right matrix
# @param c [NvArray, nil] Output matrix (forwarded as-is to Matmul.call)
# @param alpha [Float] Scaling factor for A @ B
# @param beta [Float] Scaling factor for C
# @param transpose_a [Boolean] Transpose A
# @param transpose_b [Boolean] Transpose B
# @param workspace_size [Integer] Accepted for API compatibility; unused
#   while the call is delegated
# @param stream [CUDA::Stream, nil] CUDA stream
# @return [Object] Whatever Matmul.call returns
def call(a, b, c: nil, alpha: 1.0, beta: 0.0,
         transpose_a: false, transpose_b: false,
         workspace_size: 256 * 1024 * 1024, stream: nil)
  # Referenced only so the otherwise unused keyword does not trip linters.
  _ = workspace_size

  gemm_options = {
    c: c,
    alpha: alpha,
    beta: beta,
    transpose_a: transpose_a,
    transpose_b: transpose_b,
    stream: stream
  }
  Matmul.call(a, b, **gemm_options)
end
42
+
43
# Perform matrix multiplication through the cuBLASLt path, requesting a
# specific heuristic algorithm.
#
# NOTE(review): unlike #call, this method does not delegate to Matmul.call;
# it always executes via execute_cublaslt_with_algo. Confirm the numerical
# correctness of that path before relying on the result.
#
# @param a [NvArray] Left matrix (2D)
# @param b [NvArray] Right matrix (2D)
# @param algo_index [Integer] Non-negative algorithm index from heuristic results
# @param c [NvArray, nil] Output matrix (allocated via prepare_output when nil)
# @param alpha [Float] Scaling factor
# @param beta [Float] Scaling factor for C
# @param transpose_a [Boolean] Transpose A
# @param transpose_b [Boolean] Transpose B
# @param stream [CUDA::Stream, nil] CUDA stream
# @return [NvArray] Result matrix
# @raise [ArgumentError] if an operand is not an NvArray or algo_index is
#   not a non-negative Integer
# @raise [DimensionError] if an operand is not 2D or the inner dimensions differ
def call_with_algorithm(a, b, algo_index:, c: nil, alpha: 1.0, beta: 0.0,
                        transpose_a: false, transpose_b: false, stream: nil)
  validate_inputs!(a, b)

  # Reject nonsense indices up front instead of letting them travel down to
  # the native call.
  unless algo_index.is_a?(Integer) && algo_index >= 0
    raise ArgumentError, "algo_index must be a non-negative Integer, got #{algo_index.inspect}"
  end

  m, k1, k2, n = compute_dimensions(a, b, transpose_a, transpose_b)
  raise DimensionError, "K dimensions mismatch: #{k1} vs #{k2}" unless k1 == k2

  dtype = a.dtype

  # prepare_output already returns a caller-supplied matrix untouched, so no
  # extra nil guard is needed here.
  c = prepare_output(c, m, n, dtype, a.device_index)

  CuBLASLtBindings.ensure_loaded!

  execute_cublaslt_with_algo(
    a, b, c,
    m, n, k1,
    alpha, beta,
    transpose_a, transpose_b,
    dtype, algo_index, stream
  )

  c
end
79
+
80
# Query the cuBLASLt heuristic for candidate algorithms for an
# (m x k) * (k x n) product.
#
# The operands are described to cuBLASLt as A = m x k (leading dimension m),
# B = k x n (leading dimension k) and C = D = m x n (leading dimension m),
# with no transpose attributes set.
# NOTE(review): these are non-transposed layouts; the ordering returned here
# may not match what an execution path with different layouts or transposes
# receives - confirm before reusing indices across calls.
#
# @param m [Integer] Number of rows in A
# @param n [Integer] Number of columns in B
# @param k [Integer] Inner dimension
# @param dtype [Symbol] Data type
# @param workspace_size [Integer] Max workspace size in bytes
# @return [Array<Hash>] One hash per heuristic result with the keys :index,
#   :workspace_size, :status, :waves_count and :algo_data
def get_algorithms(m, n, k, dtype: :float32, workspace_size: 256 * 1024 * 1024)
  CuBLASLtBindings.ensure_loaded!

  lt_handle = CuBLASLtBindings.get_handle
  cuda_type = CuBLASLtBindings.dtype_to_cuda_type(dtype)
  compute_type = CuBLASLtBindings.compute_type_for_dtype(dtype)
  scale_type = CuBLASLtBindings.scale_type_for_dtype(dtype)

  # Handles start out nil and are created inside the protected region, so a
  # failure part-way through still releases everything created so far
  # (cleanup_descriptors skips nil handles).
  matmul_desc = layout_a = layout_b = layout_c = preference = nil

  begin
    matmul_desc = create_matmul_desc(compute_type, scale_type)
    layout_a = create_matrix_layout(cuda_type, m, k, m)
    layout_b = create_matrix_layout(cuda_type, k, n, k)
    layout_c = create_matrix_layout(cuda_type, m, n, m)
    preference = create_preference(workspace_size)

    max_algos = 32
    results_ptr = FFI::MemoryPointer.new(CuBLASLtBindings::MatmulHeuristicResult, max_algos)
    algo_count_ptr = FFI::MemoryPointer.new(:int)

    status = CuBLASLtBindings.cublasLtMatmulAlgoGetHeuristic(
      lt_handle,
      matmul_desc,
      layout_a, layout_b, layout_c, layout_c,
      preference,
      max_algos,
      results_ptr,
      algo_count_ptr
    )
    CuBLASLtBindings.check_status!(status, "cublasLtMatmulAlgoGetHeuristic")

    # Never walk past the buffer we allocated, whatever count is reported.
    algo_count = algo_count_ptr.read_int.clamp(0, max_algos)
    stride = CuBLASLtBindings::MatmulHeuristicResult.size

    Array.new(algo_count) do |i|
      result = CuBLASLtBindings::MatmulHeuristicResult.new(results_ptr + i * stride)

      {
        index: i,
        workspace_size: result[:workspaceSize],
        status: result[:state],
        waves_count: result[:wavesCount],
        algo_data: result[:algo].to_a
      }
    end
  ensure
    cleanup_descriptors(matmul_desc, layout_a, layout_b, layout_c, preference)
  end
end
144
+
145
+ private
146
+
147
# Ensure both operands are 2D NvArray instances.
#
# Type checks for both operands run before any shape check.
#
# @param a [Object] Left operand
# @param b [Object] Right operand
# @return [nil]
# @raise [ArgumentError] if either operand is not an NvArray
# @raise [DimensionError] if either operand's shape does not have exactly 2 entries
def validate_inputs!(a, b)
  [a, b].each do |operand|
    raise ArgumentError, "Expected NvArray, got #{operand.class}" unless operand.is_a?(NvArray)
  end

  [["A", a], ["B", b]].each do |label, operand|
    raise DimensionError, "Matrix #{label} must be 2D" unless operand.shape.size == 2
  end

  nil
end
153
+
154
# Derive the GEMM dimensions from the operand shapes and transpose flags.
#
# @param a [#shape] Left operand; shape[0] and shape[1] are read
# @param b [#shape] Right operand; shape[0] and shape[1] are read
# @param transpose_a [Boolean] Swap the roles of A's two dimensions
# @param transpose_b [Boolean] Swap the roles of B's two dimensions
# @return [Array(Integer, Integer, Integer, Integer)] [m, k1, k2, n] where k1
#   is the inner dimension taken from A and k2 the one taken from B
def compute_dimensions(a, b, transpose_a, transpose_b)
  a_dims = [a.shape[0], a.shape[1]]
  b_dims = [b.shape[0], b.shape[1]]

  m, k1 = transpose_a ? a_dims.reverse : a_dims
  k2, n = transpose_b ? b_dims.reverse : b_dims

  [m, k1, k2, n]
end
161
+
162
# Return the output matrix to write into.
#
# A caller-supplied matrix is returned untouched (no shape or dtype checks
# are made here); only a nil output triggers allocation of a zero-filled
# [m, n] NvArray.
#
# @param c [NvArray, nil] Caller-supplied output, or nil
# @param m [Integer] Rows of the result
# @param n [Integer] Columns of the result
# @param dtype [Symbol] Element type for a newly allocated result
# @param device_index [Integer] Device for a newly allocated result
# @return [NvArray] c itself, or a fresh NvArray.zeros([m, n], ...)
def prepare_output(c, m, n, dtype, device_index)
  return c unless c.nil?

  NvArray.zeros([m, n], dtype: dtype, device: device_index)
end
169
+
170
# Run C = alpha * op(A) * op(B) + beta * C through cuBLASLt for ROW-major
# buffers.
#
# The matrix layouts created here use cuBLASLt's default column-major
# interpretation. A row-major r x s buffer is bit-for-bit a column-major
# s x r matrix with leading dimension s, so the row-major product is obtained
# by computing its transpose in column-major terms:
#
#   C^T (n x m) = op(B)^T * op(A)^T
#
# Concretely: B's buffer is passed as the first operand and A's as the
# second, the transpose flags are swapped along with them, and C is described
# as n x m with leading dimension n.
#
# NOTE(review): this path has not been checked against a reference here;
# validate it with non-trivial inputs before routing production traffic to it.
#
# @param a [#device_ptr] Left operand buffer
# @param b [#device_ptr] Right operand buffer
# @param c [#device_ptr] Output buffer (read as C and written as D)
# @param m [Integer] Rows of op(A) and of C
# @param n [Integer] Columns of op(B) and of C
# @param k [Integer] Inner dimension
# @param alpha [Numeric] Scale applied to op(A) * op(B)
# @param beta [Numeric] Scale applied to the existing contents of C
# @param trans_a [Boolean] Whether A is used transposed
# @param trans_b [Boolean] Whether B is used transposed
# @param dtype [Symbol] Element type of the operands
# @param workspace_size [Integer] Workspace size in bytes
# @param stream [#ptr, nil] Stream whose #ptr is handed to cuBLASLt (NULL when nil)
# @param algo_index [Integer, nil] When given, use exactly this entry of the
#   heuristic result list; when nil, pick the usable entry with the lowest
#   wavesCount
# @return [Object] Result of the final check_status! call
# @raise [ArgumentError] if algo_index is outside the heuristic result list
# @raise [RuntimeError] if no usable algorithm is available
def execute_cublaslt(a, b, c, m, n, k, alpha, beta, trans_a, trans_b, dtype, workspace_size, stream,
                     algo_index = nil)
  lt_handle = CuBLASLtBindings.get_handle
  workspace = CuBLASLtBindings.get_workspace(workspace_size)

  cuda_type = CuBLASLtBindings.dtype_to_cuda_type(dtype)
  compute_type = CuBLASLtBindings.compute_type_for_dtype(dtype)
  scale_type = CuBLASLtBindings.scale_type_for_dtype(dtype)

  # Column-major view of each row-major buffer (rows, cols); the leading
  # dimension equals the row count of that view.
  first_rows, first_cols = trans_b ? [k, n] : [n, k]   # B's buffer
  second_rows, second_cols = trans_a ? [m, k] : [k, m] # A's buffer

  # Handles are created inside the protected region so that a failure
  # part-way through still releases the ones already created.
  matmul_desc = layout_first = layout_second = layout_out = preference = nil

  begin
    # Operands are swapped, so the transpose flags swap with them.
    matmul_desc = create_matmul_desc(compute_type, scale_type, trans_b, trans_a, dtype)
    layout_first = create_matrix_layout(cuda_type, first_rows, first_cols, first_rows)
    layout_second = create_matrix_layout(cuda_type, second_rows, second_cols, second_rows)
    layout_out = create_matrix_layout(cuda_type, n, m, n)
    preference = create_preference(workspace_size, dtype)

    # Request up to 32 algorithms from the heuristic.
    max_algorithms = 32
    results_ptr = FFI::MemoryPointer.new(CuBLASLtBindings::MatmulHeuristicResult, max_algorithms)
    algo_count_ptr = FFI::MemoryPointer.new(:int)

    status = CuBLASLtBindings.cublasLtMatmulAlgoGetHeuristic(
      lt_handle,
      matmul_desc,
      layout_first, layout_second, layout_out, layout_out,
      preference,
      max_algorithms, results_ptr, algo_count_ptr
    )
    CuBLASLtBindings.check_status!(status, "cublasLtMatmulAlgoGetHeuristic")

    algo_count = [algo_count_ptr.read_int, max_algorithms].min
    algo_ptr = select_heuristic_algo(results_ptr, algo_count, workspace_size, algo_index)

    alpha_ptr, beta_ptr = prepare_scalars(alpha, beta, dtype)
    stream_ptr = stream ? stream.ptr : FFI::Pointer::NULL

    status = CuBLASLtBindings.cublasLtMatmul(
      lt_handle,
      matmul_desc,
      alpha_ptr,
      b.device_ptr, layout_first,
      a.device_ptr, layout_second,
      beta_ptr,
      c.device_ptr, layout_out,
      c.device_ptr, layout_out,
      algo_ptr,
      workspace,
      workspace_size,
      stream_ptr
    )
    CuBLASLtBindings.check_status!(status, "cublasLtMatmul")
  ensure
    cleanup_descriptors(matmul_desc, layout_first, layout_second, layout_out, preference)
  end
end

# Pick the heuristic result to execute and return a pointer to it.
#
# @param results_ptr [FFI::MemoryPointer] Start of the heuristic result array
# @param algo_count [Integer] Number of valid entries in the array
# @param workspace_size [Integer] Workspace available to the algorithm
# @param algo_index [Integer, nil] Explicit entry to use, or nil to auto-select
# @return [FFI::Pointer] Pointer to the chosen result entry
# @raise [ArgumentError] if algo_index is outside 0...algo_count
# @raise [RuntimeError] if no entry is usable
def select_heuristic_algo(results_ptr, algo_count, workspace_size, algo_index)
  raise "cuBLASLt heuristic returned no algorithms" unless algo_count.positive?

  stride = CuBLASLtBindings::MatmulHeuristicResult.size
  usable = lambda do |entry|
    entry[:state] == 0 && entry[:workspaceSize] <= workspace_size
  end

  unless algo_index.nil?
    unless algo_index.is_a?(Integer) && algo_index >= 0 && algo_index < algo_count
      raise ArgumentError, "algo_index #{algo_index.inspect} is outside 0...#{algo_count}"
    end

    chosen_ptr = results_ptr + algo_index * stride
    chosen = CuBLASLtBindings::MatmulHeuristicResult.new(chosen_ptr)
    raise "cuBLASLt algorithm #{algo_index} is not usable with this workspace" unless usable.call(chosen)

    return chosen_ptr
  end

  best_ptr = nil
  best_waves = Float::INFINITY

  algo_count.times do |i|
    candidate_ptr = results_ptr + i * stride
    candidate = CuBLASLtBindings::MatmulHeuristicResult.new(candidate_ptr)
    next unless usable.call(candidate)
    next unless candidate[:wavesCount] < best_waves

    best_waves = candidate[:wavesCount]
    best_ptr = candidate_ptr
  end

  # Refuse to fall back to an entry that failed or needs more workspace than
  # we have; executing it could not succeed.
  raise "no cuBLASLt algorithm fits within #{workspace_size} bytes of workspace" unless best_ptr

  best_ptr
end

# Run the cuBLASLt GEMM with an explicitly requested heuristic entry.
#
# Takes the same operand arguments as execute_cublaslt, uses a fixed
# 256 MiB workspace, and forwards algo_index so the requested entry is
# actually used.
# NOTE(review): the index refers to the heuristic ordering of the problem as
# executed; confirm it matches how callers obtained the index.
#
# @param algo_index [Integer] Entry of the heuristic result list to execute
# @return [Object] Result of execute_cublaslt
def execute_cublaslt_with_algo(a, b, c, m, n, k, alpha, beta, trans_a, trans_b, dtype, algo_index, stream)
  execute_cublaslt(
    a, b, c, m, n, k, alpha, beta, trans_a, trans_b, dtype,
    256 * 1024 * 1024, stream, algo_index
  )
end
260
+
261
# Create a cuBLASLt matmul descriptor and apply the requested transposes.
#
# The status of every attribute call is checked: a transpose flag that
# silently failed to apply would change the mathematical result. If applying
# an attribute fails, the freshly created descriptor is destroyed before the
# error propagates so it does not leak.
#
# @param compute_type [Integer] Compute type passed to cublasLtMatmulDescCreate
# @param scale_type [Integer] Scale type passed to cublasLtMatmulDescCreate
# @param trans_a [Boolean] Set CUBLAS_OP_T on the first operand
# @param trans_b [Boolean] Set CUBLAS_OP_T on the second operand
# @param dtype [Symbol] Only referenced by the disabled FAST_ACCUM snippet below
# @return [FFI::Pointer] The descriptor handle; the caller must destroy it
def create_matmul_desc(compute_type, scale_type, trans_a = false, trans_b = false, dtype = :float32)
  desc_ptr = FFI::MemoryPointer.new(:pointer)
  status = CuBLASLtBindings.cublasLtMatmulDescCreate(desc_ptr, compute_type, scale_type)
  CuBLASLtBindings.check_status!(status, "cublasLtMatmulDescCreate")

  desc = desc_ptr.read_pointer

  begin
    [
      [trans_a, :CUBLASLT_MATMUL_DESC_TRANSA],
      [trans_b, :CUBLASLT_MATMUL_DESC_TRANSB]
    ].each do |transposed, attribute_name|
      next unless transposed

      op_val = FFI::MemoryPointer.new(:int)
      op_val.write_int(CuBLASBindings::CUBLAS_OP_T)
      status = CuBLASLtBindings.cublasLtMatmulDescSetAttribute(
        desc,
        CuBLASLtBindings.const_get(attribute_name),
        op_val, FFI.type_size(:int)
      )
      CuBLASLtBindings.check_status!(status, "cublasLtMatmulDescSetAttribute(#{attribute_name})")
    end
  rescue StandardError
    # The caller never sees this handle, so release it here.
    CuBLASLtBindings.cublasLtMatmulDescDestroy(desc)
    raise
  end

  # NOTE: FAST_ACCUM is disabled on RTX 3060 as it causes regression.
  # Enable on RTX 4090/H100/A100 for potential +5-10% FP16 performance.
  # if %i[float16 half bfloat16].include?(dtype)
  #   fast_accum_ptr = FFI::MemoryPointer.new(:int8)
  #   fast_accum_ptr.write_int8(1)
  #   CuBLASLtBindings.cublasLtMatmulDescSetAttribute(
  #     desc,
  #     CuBLASLtBindings::CUBLASLT_MATMUL_DESC_FAST_ACCUM,
  #     fast_accum_ptr, 1
  #   )
  # end

  desc
end
303
+
304
# Create a cuBLASLt matrix layout descriptor.
#
# @param data_type [Integer] Element type passed to cublasLtMatrixLayoutCreate
# @param rows [Integer] Number of rows
# @param cols [Integer] Number of columns
# @param ld [Integer] Leading dimension
# @return [FFI::Pointer] The layout handle; the caller must destroy it
def create_matrix_layout(data_type, rows, cols, ld)
  handle_slot = FFI::MemoryPointer.new(:pointer)

  CuBLASLtBindings.check_status!(
    CuBLASLtBindings.cublasLtMatrixLayoutCreate(handle_slot, data_type, rows, cols, ld),
    "cublasLtMatrixLayoutCreate"
  )

  handle_slot.read_pointer
end
315
+
316
# Create a cuBLASLt matmul preference and configure the search hints.
#
# Attributes are applied in this order:
# 1. maximum workspace size (size_t)
# 2. reduction scheme mask set to CUBLASLT_REDUCTION_SCHEME_MASK (uint32)
# 3. implementation mask with every bit set (uint64), only for the
#    float16/half/float32/float/bfloat16 dtypes
# 4. a 16-byte minimum alignment for A, B, C and D (uint32 each)
#
# The return codes of the attribute calls are not inspected.
# NOTE(review): the original author described step 4 as relaxing alignment
# requirements to allow more algorithms - confirm against the cuBLASLt
# documentation for the MIN_ALIGNMENT_*_BYTES preferences.
#
# @param workspace_size [Integer] Maximum workspace in bytes
# @param dtype [Symbol] Element type; decides whether step 3 is applied
# @return [FFI::Pointer] The preference handle; the caller must destroy it
def create_preference(workspace_size, dtype = :float32)
  handle_slot = FFI::MemoryPointer.new(:pointer)
  create_status = CuBLASLtBindings.cublasLtMatmulPreferenceCreate(handle_slot)
  CuBLASLtBindings.check_status!(create_status, "cublasLtMatmulPreferenceCreate")

  preference = handle_slot.read_pointer

  # 1. Workspace ceiling.
  workspace_value = FFI::MemoryPointer.new(:size_t)
  workspace_value.write(:size_t, workspace_size)
  CuBLASLtBindings.cublasLtMatmulPreferenceSetAttribute(
    preference,
    CuBLASLtBindings::CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
    workspace_value, FFI.type_size(:size_t)
  )

  # 2. Reduction schemes (used by Split-K algorithms).
  reduction_value = FFI::MemoryPointer.new(:uint32)
  reduction_value.write_uint32(CuBLASLtBindings::CUBLASLT_REDUCTION_SCHEME_MASK)
  CuBLASLtBindings.cublasLtMatmulPreferenceSetAttribute(
    preference,
    CuBLASLtBindings::CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
    reduction_value, FFI.type_size(:uint32)
  )

  # 3. Implementation mask.
  if %i[float16 half float32 float bfloat16].include?(dtype)
    impl_value = FFI::MemoryPointer.new(:uint64)
    impl_value.write_uint64(0xFFFFFFFFFFFFFFFF)
    CuBLASLtBindings.cublasLtMatmulPreferenceSetAttribute(
      preference,
      CuBLASLtBindings::CUBLASLT_MATMUL_PREF_IMPL_MASK,
      impl_value, FFI.type_size(:uint64)
    )
  end

  # 4. One shared 16-byte alignment value for all four matrices.
  alignment_value = FFI::MemoryPointer.new(:uint32)
  alignment_value.write_uint32(16)
  %i[
    CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES
    CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES
    CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES
    CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES
  ].each do |attribute_name|
    CuBLASLtBindings.cublasLtMatmulPreferenceSetAttribute(
      preference,
      CuBLASLtBindings.const_get(attribute_name),
      alignment_value, FFI.type_size(:uint32)
    )
  end

  preference
end
378
+
379
# Box alpha and beta into host memory for the native call.
#
# :float64 / :double use a double each; every other dtype (including
# :float16 / :half) uses a single-precision float.
#
# @param alpha [#to_f] Scale for the product
# @param beta [#to_f] Scale for the existing output
# @param dtype [Symbol] Element type of the matrices
# @return [Array(FFI::MemoryPointer, FFI::MemoryPointer)] [alpha_ptr, beta_ptr]
def prepare_scalars(alpha, beta, dtype)
  double_precision = %i[float64 double].include?(dtype)
  scalar_type = double_precision ? :double : :float

  [alpha, beta].map do |scalar|
    slot = FFI::MemoryPointer.new(scalar_type)
    if double_precision
      slot.write_double(scalar.to_f)
    else
      slot.write_float(scalar.to_f)
    end
    slot
  end
end
399
+
400
# Destroy the cuBLASLt handles created for one matmul, best effort.
#
# Each handle is released independently: an error while destroying one of
# them is swallowed and does not prevent the remaining handles from being
# destroyed. nil handles are skipped, so this is safe to call with partially
# created sets.
#
# @param matmul_desc [FFI::Pointer, nil] Matmul descriptor
# @param layout_a [FFI::Pointer, nil] Layout of the first operand
# @param layout_b [FFI::Pointer, nil] Layout of the second operand
# @param layout_c [FFI::Pointer, nil] Layout of the output
# @param preference [FFI::Pointer, nil] Matmul preference
# @return [nil]
def cleanup_descriptors(matmul_desc, layout_a, layout_b, layout_c, preference)
  [
    [:cublasLtMatmulDescDestroy, matmul_desc],
    [:cublasLtMatrixLayoutDestroy, layout_a],
    [:cublasLtMatrixLayoutDestroy, layout_b],
    [:cublasLtMatrixLayoutDestroy, layout_c],
    [:cublasLtMatmulPreferenceDestroy, preference]
  ].each do |destroyer, handle|
    next unless handle

    begin
      CuBLASLtBindings.public_send(destroyer, handle)
    rescue StandardError
      # Cleanup runs from ensure blocks; never mask the primary error.
    end
  end

  nil
end
409
+ end
410
+ end
411
+ end
412
+ end
@@ -0,0 +1,123 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'device_memory_resource'
4
+
5
+ module Ignis
6
+ module Memory
7
+ # Stream-ordered memory resource using cudaMallocAsync/cudaFreeAsync
8
+ # Uses CUDA's built-in stream-ordered allocator (CUDA 11.2+)
9
+ # Optimal for workloads with many allocations on the same stream
10
+ class CudaAsyncMemoryResource < DeviceMemoryResource
11
# Numeric values of the cudaMemPoolAttr enum, as passed to
# cudaMemPoolSetAttribute.

# Reuse policy: follow event dependencies.
MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 0x1
# Reuse policy: allow opportunistic reuse.
MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC = 0x2
# Reuse policy: allow internal dependencies.
MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES = 0x3
# Release threshold of the pool.
MEMPOOL_ATTR_RELEASE_THRESHOLD = 0x4
16
+
17
# Build a stream-ordered memory resource and attach it to the device's
# memory pool.
#
# The CUDA runtime bindings are loaded explicitly before the pool is set up,
# matching CudaMemoryResource, because setup_pool! calls into
# CUDA::RuntimeAPI immediately.
#
# @param device_index [Integer] GPU device index
# @param initial_pool_size [Integer] Initial pool size in bytes (default: 0 = driver managed).
#   NOTE(review): stored but not consumed by any method visible in this class.
# @param release_threshold [Integer] Release threshold in bytes (default: 0 = driver managed)
def initialize(device_index: nil, initial_pool_size: 0, release_threshold: 0)
  super(device_index: device_index)
  CUDA::RuntimeAPI.ensure_loaded!

  @initial_pool_size = initial_pool_size
  @release_threshold = release_threshold
  @pool_handle = nil

  setup_pool!
end
28
+
29
# Whether allocations made by this resource are ordered on a stream.
#
# @return [Boolean] always true for this resource
def supports_streams?
  true
end
33
+
34
# Detach this resource from its memory pool.
#
# The only pool this class ever holds is the one returned by
# cudaDeviceGetDefaultMemPool. CUDA does not allow a device's default pool to
# be destroyed, so the handle is simply dropped instead of being passed to
# cudaMemPoolDestroy. Calling this more than once is harmless.
#
# @return [void]
def destroy!
  @mutex.synchronize do
    # The default pool is owned by the driver; just forget our reference.
    @pool_handle = nil
  end

  nil
end
46
+
47
+ protected
48
+
49
# Allocate device memory in stream order.
#
# Uses cudaMallocFromPoolAsync while a pool handle is held and
# cudaMallocAsync otherwise. A nil stream is passed to CUDA as a NULL handle.
#
# @param bytes [Integer] Number of bytes to allocate
# @param stream [Ignis::CUDA::Stream, nil] Stream to order the allocation on
# @return [FFI::Pointer] Pointer read back from the allocation call
def do_allocate(bytes, stream)
  ensure_device do
    out_slot = FFI::MemoryPointer.new(:pointer)
    cuda_stream = stream&.handle || FFI::Pointer::NULL

    status, label =
      if @pool_handle
        [
          CUDA::RuntimeAPI.cudaMallocFromPoolAsync(out_slot, bytes, @pool_handle, cuda_stream),
          "cudaMallocFromPoolAsync(#{bytes} bytes)"
        ]
      else
        [
          CUDA::RuntimeAPI.cudaMallocAsync(out_slot, bytes, cuda_stream),
          "cudaMallocAsync(#{bytes} bytes)"
        ]
      end
    CUDA::RuntimeAPI.check_status!(status, label)

    out_slot.read_pointer
  end
end
70
+
71
# Free device memory in stream order via cudaFreeAsync.
#
# @param ptr [FFI::Pointer] Pointer to release
# @param _bytes [Integer] Size of the allocation (unused)
# @param stream [Ignis::CUDA::Stream, nil] Stream to order the free on; nil
#   is passed to CUDA as a NULL handle
# @return [void]
def do_deallocate(ptr, _bytes, stream)
  ensure_device do
    cuda_stream = stream&.handle || FFI::Pointer::NULL
    CUDA::RuntimeAPI.check_status!(
      CUDA::RuntimeAPI.cudaFreeAsync(ptr, cuda_stream),
      "cudaFreeAsync"
    )
  end
end
82
+
83
+ private
84
+
85
# Fetch the device's default memory pool and, when a positive release
# threshold was configured, apply it to that pool.
#
# The threshold is written as a uint64 and set through
# cudaMemPoolSetAttribute with MEMPOOL_ATTR_RELEASE_THRESHOLD.
#
# @return [Object, nil] Result of the last check_status! call, or nil when
#   no threshold is applied
def setup_pool!
  ensure_device do
    pool_slot = FFI::MemoryPointer.new(:pointer)
    CUDA::RuntimeAPI.check_status!(
      CUDA::RuntimeAPI.cudaDeviceGetDefaultMemPool(pool_slot, @device_index),
      "cudaDeviceGetDefaultMemPool"
    )
    @pool_handle = pool_slot.read_pointer

    # Zero (or negative) means "leave the driver default alone".
    next unless @release_threshold > 0

    threshold_slot = FFI::MemoryPointer.new(:uint64)
    threshold_slot.write_uint64(@release_threshold)
    CUDA::RuntimeAPI.check_status!(
      CUDA::RuntimeAPI.cudaMemPoolSetAttribute(
        @pool_handle, MEMPOOL_ATTR_RELEASE_THRESHOLD, threshold_slot
      ),
      "cudaMemPoolSetAttribute(RELEASE_THRESHOLD)"
    )
  end
end
102
+
103
# Run the block with this resource's device selected as the current device.
#
# If another device is current, it is switched to @device_index for the
# duration of the block and switched back afterwards - also when the block
# raises, so a failed CUDA call cannot leave the thread on the wrong device.
#
# @yield Work to perform on the resource's device
# @return [Object] The block's return value
def ensure_device
  current_ptr = FFI::MemoryPointer.new(:int)
  status = CUDA::RuntimeAPI.cudaGetDevice(current_ptr)
  CUDA::RuntimeAPI.check_status!(status, "cudaGetDevice")
  original = current_ptr.read_int

  # Already on the right device: nothing to switch or restore.
  return yield if original == @device_index

  status = CUDA::RuntimeAPI.cudaSetDevice(@device_index)
  CUDA::RuntimeAPI.check_status!(status, "cudaSetDevice(#{@device_index})")

  begin
    yield
  ensure
    # Best effort: the status is deliberately not checked so that a restore
    # failure never masks an exception raised by the block.
    CUDA::RuntimeAPI.cudaSetDevice(original)
  end
end
121
+ end
122
+ end
123
+ end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'device_memory_resource'
4
+
5
+ module Ignis
6
+ module Memory
7
+ # Simple memory resource using cudaMalloc/cudaFree
8
+ # This is the baseline allocator with no pooling
9
+ class CudaMemoryResource < DeviceMemoryResource
10
# Build a plain cudaMalloc/cudaFree resource for one device and make sure
# the CUDA runtime bindings are loaded.
#
# @param device_index [Integer] GPU device index
def initialize(device_index: nil)
  super(device_index: device_index)
  CUDA::RuntimeAPI.ensure_loaded!
end
15
+
16
# Whether allocations made by this resource are ordered on a stream.
#
# @return [Boolean] always false; allocation here is synchronous
def supports_streams?
  false
end
20
+
21
+ protected
22
+
23
# Allocate device memory with cudaMalloc on this resource's device.
#
# @param bytes [Integer] Number of bytes to allocate
# @param _stream [Ignis::CUDA::Stream, nil] Ignored for sync allocation
# @return [FFI::Pointer] Pointer read back from cudaMalloc
def do_allocate(bytes, _stream)
  ensure_device do
    out_slot = FFI::MemoryPointer.new(:pointer)
    CUDA::RuntimeAPI.check_status!(
      CUDA::RuntimeAPI.cudaMalloc(out_slot, bytes),
      "cudaMalloc(#{bytes} bytes)"
    )
    out_slot.read_pointer
  end
end
34
+
35
# Release device memory with cudaFree on this resource's device.
#
# @param ptr [FFI::Pointer] Pointer to release
# @param _bytes [Integer] Size of the allocation (unused)
# @param _stream [Ignis::CUDA::Stream, nil] Ignored for sync deallocation
# @return [void]
def do_deallocate(ptr, _bytes, _stream)
  ensure_device do
    CUDA::RuntimeAPI.check_status!(CUDA::RuntimeAPI.cudaFree(ptr), "cudaFree")
  end
end
45
+
46
+ private
47
+
48
# Run the block with this resource's device selected as the current device.
#
# If another device is current, it is switched to @device_index for the
# duration of the block and switched back afterwards - also when the block
# raises (for example when cudaMalloc fails), so the thread is never left on
# the wrong device.
#
# @yield Work to perform on the resource's device
# @return [Object] The block's return value
def ensure_device
  current_ptr = FFI::MemoryPointer.new(:int)
  status = CUDA::RuntimeAPI.cudaGetDevice(current_ptr)
  CUDA::RuntimeAPI.check_status!(status, "cudaGetDevice")
  original = current_ptr.read_int

  # Already on the right device: nothing to switch or restore.
  return yield if original == @device_index

  status = CUDA::RuntimeAPI.cudaSetDevice(@device_index)
  CUDA::RuntimeAPI.check_status!(status, "cudaSetDevice(#{@device_index})")

  begin
    yield
  ensure
    # Best effort: the status is deliberately not checked so that a restore
    # failure never masks an exception raised by the block.
    CUDA::RuntimeAPI.cudaSetDevice(original)
  end
end
66
+ end
67
+ end
68
+ end