ignis 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +15 -0
- data/lib/ignis.rb +94 -0
- data/lib/nnw/platform.rb +304 -0
- data/lib/nnw/shared/event_bus.rb +240 -0
- data/lib/nnw/shared/ffi_loader.rb +63 -0
- data/lib/nnw/shared/memory_contract.rb +204 -0
- data/lib/nnw/shared/nv_array.rb +710 -0
- data/lib/nnw/shared/recovery_protocol.rb +307 -0
- data/lib/nvruby/configuration.rb +217 -0
- data/lib/nvruby/cuda/device.rb +275 -0
- data/lib/nvruby/cuda/device_props.rb +202 -0
- data/lib/nvruby/cuda/graph.rb +265 -0
- data/lib/nvruby/cuda/graph_bindings.rb +119 -0
- data/lib/nvruby/cuda/library_loader.rb +285 -0
- data/lib/nvruby/cuda/memory.rb +410 -0
- data/lib/nvruby/cuda/runtime_api.rb +804 -0
- data/lib/nvruby/cuda/stream.rb +234 -0
- data/lib/nvruby/dtype.rb +139 -0
- data/lib/nvruby/epilogues.rb +438 -0
- data/lib/nvruby/errors.rb +303 -0
- data/lib/nvruby/half.rb +97 -0
- data/lib/nvruby/jit/compiled_kernel.rb +80 -0
- data/lib/nvruby/jit/compiler.rb +231 -0
- data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
- data/lib/nvruby/jit/kernel.rb +240 -0
- data/lib/nvruby/jit/kernel_module.rb +133 -0
- data/lib/nvruby/jit/kernels/activations.rb +179 -0
- data/lib/nvruby/jit/kernels/attention.rb +504 -0
- data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
- data/lib/nvruby/jit/kernels/loss.rb +213 -0
- data/lib/nvruby/jit/kernels/normalization.rb +200 -0
- data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
- data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
- data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
- data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
- data/lib/nvruby/linalg/epilog.rb +67 -0
- data/lib/nvruby/linalg/matmul.rb +247 -0
- data/lib/nvruby/linalg/matmul_plan.rb +229 -0
- data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
- data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
- data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
- data/lib/nvruby/memory/device_memory_resource.rb +106 -0
- data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
- data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
- data/lib/nvruby/memory/stats.rb +107 -0
- data/lib/nvruby/memory.rb +124 -0
- data/lib/nvruby/version.rb +5 -0
- metadata +108 -0
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "cublaslt_bindings"
|
|
4
|
+
require_relative "cublas_bindings"
|
|
5
|
+
|
|
6
|
+
module Ignis
|
|
7
|
+
module LinAlg
|
|
8
|
+
# Optimized matrix multiplication using cuBLASLt
|
|
9
|
+
#
|
|
10
|
+
# Features:
|
|
11
|
+
# - Heuristic-based algorithm selection
|
|
12
|
+
# - Large workspace for more algorithm choices
|
|
13
|
+
# - Descriptor-based API for optimal performance
|
|
14
|
+
# - Split-K support for better SM utilization
|
|
15
|
+
module OptimizedMatmul
|
|
16
|
+
class << self
|
|
17
|
+
# Multiply two matrices by forwarding the request to Matmul.call.
#
# This entry point does not run cuBLASLt: every argument except
# +workspace_size+ is handed unchanged to Matmul.call, and
# +workspace_size+ is accepted only to keep the signature stable.
#
# @param a [NvArray] Left matrix
# @param b [NvArray] Right matrix
# @param c [NvArray, nil] Output matrix, forwarded as given
# @param alpha [Float] Scaling factor for A @ B
# @param beta [Float] Scaling factor for C
# @param transpose_a [Boolean] Transpose A
# @param transpose_b [Boolean] Transpose B
# @param workspace_size [Integer] Workspace size in bytes (currently ignored)
# @param stream [CUDA::Stream, nil] CUDA stream
# @return [NvArray] whatever Matmul.call returns
def call(a, b, c: nil, alpha: 1.0, beta: 0.0,
         transpose_a: false, transpose_b: false,
         workspace_size: 256 * 1024 * 1024, stream: nil)
  # NOTE: execute_cublaslt builds COLUMN-major layouts for Ignis's ROW-major
  # buffers without setting CUBLASLT_ORDER_ROW, so it computed a
  # transposed/incorrect product (its benchmark used all-zero inputs, which
  # hid the bug). Until those layouts are fixed and verified, requests go to
  # the cuBLAS GEMM path, which is numerically verified by
  # benchmarks/verify_matmul.rb.
  _ = workspace_size

  gemm_options = {
    c: c,
    alpha: alpha,
    beta: beta,
    transpose_a: transpose_a,
    transpose_b: transpose_b,
    stream: stream
  }
  Matmul.call(a, b, **gemm_options)
end
|
|
42
|
+
|
|
43
|
+
# Multiply two matrices through the cuBLASLt path.
#
# NOTE: +algo_index+ is passed down to execute_cublaslt_with_algo, which
# currently does not use it to select an algorithm. This path also relies
# on execute_cublaslt, whose layouts the comment in +call+ reports as
# producing a transposed/incorrect product.
#
# @param a [NvArray] Left matrix
# @param b [NvArray] Right matrix
# @param algo_index [Integer] Algorithm index from heuristic results
# @param c [NvArray, nil] Output matrix (allocated via prepare_output if nil)
# @param alpha [Float] Scaling factor
# @param beta [Float] Scaling factor for C
# @param transpose_a [Boolean] Transpose A
# @param transpose_b [Boolean] Transpose B
# @param stream [CUDA::Stream, nil] CUDA stream
# @return [NvArray] Result matrix (+c+ itself when one was supplied)
# @raise [DimensionError] if the inner dimensions of A and B differ
def call_with_algorithm(a, b, algo_index:, c: nil, alpha: 1.0, beta: 0.0,
                        transpose_a: false, transpose_b: false, stream: nil)
  validate_inputs!(a, b)

  m, inner_a, inner_b, n = compute_dimensions(a, b, transpose_a, transpose_b)
  if inner_a != inner_b
    raise DimensionError, "K dimensions mismatch: #{inner_a} vs #{inner_b}"
  end

  element_type = a.dtype
  output = c.nil? ? prepare_output(c, m, n, element_type, a.device_index) : c

  CuBLASLtBindings.ensure_loaded!
  execute_cublaslt_with_algo(a, b, output, m, n, inner_a, alpha, beta,
                             transpose_a, transpose_b, element_type, algo_index, stream)

  output
end
|
|
79
|
+
|
|
80
|
+
# List the algorithms the cuBLASLt heuristic proposes for an m x n x k matmul.
#
# Descriptors are built for untransposed operands with leading dimensions
# equal to the row counts (m for A and C, k for B), and the preference is
# created with the default dtype of create_preference.
#
# @param m [Integer] Number of rows in A
# @param n [Integer] Number of columns in B
# @param k [Integer] Inner dimension
# @param dtype [Symbol] Data type
# @param workspace_size [Integer] Max workspace size
# @return [Array<Hash>] one hash per heuristic result, in the order returned,
#   with keys :index, :workspace_size, :status, :waves_count and :algo_data
def get_algorithms(m, n, k, dtype: :float32, workspace_size: 256 * 1024 * 1024)
  CuBLASLtBindings.ensure_loaded!

  lt_handle = CuBLASLtBindings.get_handle
  cuda_type = CuBLASLtBindings.dtype_to_cuda_type(dtype)
  compute_type = CuBLASLtBindings.compute_type_for_dtype(dtype)
  scale_type = CuBLASLtBindings.scale_type_for_dtype(dtype)

  # Start as nil and create inside the begin block, so a failure while
  # building a later descriptor still destroys the ones already created.
  matmul_desc = layout_a = layout_b = layout_c = preference = nil

  begin
    # Create descriptors with correct scale type for alpha/beta
    matmul_desc = create_matmul_desc(compute_type, scale_type)
    layout_a = create_matrix_layout(cuda_type, m, k, m)
    layout_b = create_matrix_layout(cuda_type, k, n, k)
    layout_c = create_matrix_layout(cuda_type, m, n, m)

    # Create preference with workspace size
    preference = create_preference(workspace_size)

    # Query heuristics
    max_algos = 32
    result_size = CuBLASLtBindings::MatmulHeuristicResult.size
    results_ptr = FFI::MemoryPointer.new(CuBLASLtBindings::MatmulHeuristicResult, max_algos)
    algo_count_ptr = FFI::MemoryPointer.new(:int)

    status = CuBLASLtBindings.cublasLtMatmulAlgoGetHeuristic(
      lt_handle,
      matmul_desc,
      layout_a, layout_b, layout_c, layout_c,
      preference,
      max_algos,
      results_ptr,
      algo_count_ptr
    )
    CuBLASLtBindings.check_status!(status, "cublasLtMatmulAlgoGetHeuristic")

    algo_count_ptr.read_int.times.map do |i|
      result = CuBLASLtBindings::MatmulHeuristicResult.new(results_ptr + i * result_size)

      {
        index: i,
        workspace_size: result[:workspaceSize],
        status: result[:state],
        waves_count: result[:wavesCount],
        algo_data: result[:algo].to_a
      }
    end
  ensure
    cleanup_descriptors(matmul_desc, layout_a, layout_b, layout_c, preference)
  end
end
|
|
144
|
+
|
|
145
|
+
private
|
|
146
|
+
|
|
147
|
+
# Check that both operands are 2-D NvArray instances.
#
# Type checks for A and B run before either rank check.
#
# @param a [Object] candidate left matrix
# @param b [Object] candidate right matrix
# @return [nil]
# @raise [ArgumentError] if either operand is not an NvArray
# @raise [DimensionError] if either operand's shape does not have 2 entries
def validate_inputs!(a, b)
  [a, b].each do |operand|
    next if operand.is_a?(NvArray)

    raise ArgumentError, "Expected NvArray, got #{operand.class}"
  end

  [[a, "Matrix A must be 2D"], [b, "Matrix B must be 2D"]].each do |operand, message|
    raise DimensionError, message if operand.shape.size != 2
  end

  nil
end
|
|
153
|
+
|
|
154
|
+
# Derive the GEMM dimensions from the operand shapes and transpose flags.
#
# A transposed operand contributes its two shape entries in swapped order.
#
# @param a [#shape] left operand
# @param b [#shape] right operand
# @param transpose_a [Boolean] whether A is used transposed
# @param transpose_b [Boolean] whether B is used transposed
# @return [Array] [m, k1, k2, n], where k1 comes from A and k2 from B
def compute_dimensions(a, b, transpose_a, transpose_b)
  outer_a, inner_a = transpose_a ? [1, 0] : [0, 1]
  inner_b, outer_b = transpose_b ? [1, 0] : [0, 1]

  [a.shape[outer_a], a.shape[inner_a], b.shape[inner_b], b.shape[outer_b]]
end
|
|
161
|
+
|
|
162
|
+
# Return the output matrix to write into, allocating one when none is given.
#
# @param c [NvArray, nil] caller-supplied output, returned untouched if not nil
# @param m [Integer] first entry of the shape passed to NvArray.zeros
# @param n [Integer] second entry of the shape passed to NvArray.zeros
# @param dtype [Symbol] dtype passed to NvArray.zeros
# @param device_index [Integer] value passed as +device:+ to NvArray.zeros
# @return [NvArray] +c+, or the result of NvArray.zeros([m, n], ...)
def prepare_output(c, m, n, dtype, device_index)
  return c unless c.nil?

  NvArray.zeros([m, n], dtype: dtype, device: device_index)
end
|
|
169
|
+
|
|
170
|
+
# Run one cuBLASLt matmul: build the descriptors, query the heuristic for
# candidate algorithms, pick one and launch cublasLtMatmul writing into +c+.
#
# NOTE: layouts are created with leading dimension equal to the row count
# (column-major) and no row-major order attribute is set; the comment in
# +call+ records that this produced a transposed/incorrect product for
# Ignis's row-major buffers.
#
# @param a [NvArray] left operand (its device_ptr is passed to cuBLASLt)
# @param b [NvArray] right operand
# @param c [NvArray] buffer used as both the C input and the D output
# @param m [Integer] rows of the result
# @param n [Integer] columns of the result
# @param k [Integer] inner dimension
# @param alpha [Numeric] scale applied to the product
# @param beta [Numeric] scale applied to C
# @param trans_a [Boolean] whether A is used transposed
# @param trans_b [Boolean] whether B is used transposed
# @param dtype [Symbol] element type, used to derive the CUDA/compute/scale types
# @param workspace_size [Integer] workspace size in bytes
# @param stream [#ptr, nil] stream whose +ptr+ is passed to cuBLASLt (NULL when nil)
# @return [Object] the result of CuBLASLtBindings.check_status! for the launch
# @raise [RuntimeError] if the heuristic reports no candidate algorithm
def execute_cublaslt(a, b, c, m, n, k, alpha, beta, trans_a, trans_b, dtype, workspace_size, stream)
  lt_handle = CuBLASLtBindings.get_handle
  workspace = CuBLASLtBindings.get_workspace(workspace_size)

  cuda_type = CuBLASLtBindings.dtype_to_cuda_type(dtype)
  compute_type = CuBLASLtBindings.compute_type_for_dtype(dtype)
  scale_type = CuBLASLtBindings.scale_type_for_dtype(dtype)

  # Determine leading dimensions (column-major)
  lda = trans_a ? k : m
  ldb = trans_b ? n : k
  ldc = m

  # Start as nil and create inside the begin block, so a failure while
  # building a later descriptor still destroys the ones already created.
  matmul_desc = layout_a = layout_b = layout_c = preference = nil

  begin
    # Create descriptors with correct scale type for alpha/beta (FP32 for Tensor Core ops)
    matmul_desc = create_matmul_desc(compute_type, scale_type, trans_a, trans_b, dtype)
    layout_a = create_matrix_layout(cuda_type, trans_a ? k : m, trans_a ? m : k, lda)
    layout_b = create_matrix_layout(cuda_type, trans_b ? n : k, trans_b ? k : n, ldb)
    layout_c = create_matrix_layout(cuda_type, m, n, ldc)

    preference = create_preference(workspace_size, dtype)

    # Request up to 32 algorithms from heuristic for better selection
    max_algorithms = 32
    result_size = CuBLASLtBindings::MatmulHeuristicResult.size
    results_ptr = FFI::MemoryPointer.new(CuBLASLtBindings::MatmulHeuristicResult, max_algorithms)
    algo_count_ptr = FFI::MemoryPointer.new(:int)

    status = CuBLASLtBindings.cublasLtMatmulAlgoGetHeuristic(
      lt_handle,
      matmul_desc,
      layout_a, layout_b, layout_c, layout_c,
      preference,
      max_algorithms, results_ptr, algo_count_ptr
    )
    CuBLASLtBindings.check_status!(status, "cublasLtMatmulAlgoGetHeuristic")

    algo_count = algo_count_ptr.read_int

    # With zero results the buffer was never filled in; launching with it
    # would hand cuBLASLt a meaningless algorithm descriptor.
    unless algo_count.positive?
      raise "cublasLtMatmulAlgoGetHeuristic returned no algorithm for " \
            "#{m}x#{n}x#{k} #{dtype} matmul (workspace #{workspace_size} bytes)"
    end

    # Select best algorithm: validate workspace fits and pick lowest wavesCount
    best_algo_ptr = nil
    best_waves = Float::INFINITY

    algo_count.times do |i|
      candidate_ptr = results_ptr + i * result_size
      result = CuBLASLtBindings::MatmulHeuristicResult.new(candidate_ptr)

      next if result[:state] != 0
      next if result[:workspaceSize] > workspace_size
      next unless result[:wavesCount] < best_waves

      best_waves = result[:wavesCount]
      best_algo_ptr = candidate_ptr
    end

    # No candidate passed the filters: fall back to the first returned result
    best_algo_ptr ||= results_ptr

    # Prepare alpha/beta
    alpha_ptr, beta_ptr = prepare_scalars(alpha, beta, dtype)

    # Get stream pointer
    stream_ptr = stream ? stream.ptr : FFI::Pointer::NULL

    # Execute matmul with best algorithm; c is passed as both C and D
    status = CuBLASLtBindings.cublasLtMatmul(
      lt_handle,
      matmul_desc,
      alpha_ptr,
      a.device_ptr, layout_a,
      b.device_ptr, layout_b,
      beta_ptr,
      c.device_ptr, layout_c,
      c.device_ptr, layout_c,
      best_algo_ptr,
      workspace,
      workspace_size,
      stream_ptr
    )
    CuBLASLtBindings.check_status!(status, "cublasLtMatmul")
  ensure
    cleanup_descriptors(matmul_desc, layout_a, layout_b, layout_c, preference)
  end
end
|
|
254
|
+
|
|
255
|
+
# Backend of call_with_algorithm.
#
# For now this hands the request to execute_cublaslt with a fixed
# 256 MiB workspace; +algo_index+ is accepted but not used, so
# execute_cublaslt makes its own algorithm choice.
#
# @param algo_index [Integer] requested heuristic result index (currently unused)
# @return [Object] the return value of execute_cublaslt
def execute_cublaslt_with_algo(a, b, c, m, n, k, alpha, beta, trans_a, trans_b, dtype, algo_index, stream)
  _ = algo_index # TODO: select the heuristic result at this index

  default_workspace_bytes = 256 * 1024 * 1024
  execute_cublaslt(a, b, c, m, n, k, alpha, beta, trans_a, trans_b, dtype,
                   default_workspace_bytes, stream)
end
|
|
260
|
+
|
|
261
|
+
# Create a cuBLASLt matmul descriptor and apply the requested transposes.
#
# A failed attribute write now raises through check_status! instead of
# being ignored, because a silently missing transpose would produce a
# wrong product. The half-built descriptor is destroyed before re-raising.
#
# @param compute_type [Object] compute type passed to cublasLtMatmulDescCreate
# @param scale_type [Object] scale type passed to cublasLtMatmulDescCreate
# @param trans_a [Boolean] set CUBLASLT_MATMUL_DESC_TRANSA to CUBLAS_OP_T
# @param trans_b [Boolean] set CUBLASLT_MATMUL_DESC_TRANSB to CUBLAS_OP_T
# @param dtype [Symbol] only relevant to the disabled FAST_ACCUM block below
# @return [FFI::Pointer] the descriptor handle
def create_matmul_desc(compute_type, scale_type, trans_a = false, trans_b = false, dtype = :float32)
  _ = dtype

  desc_ptr = FFI::MemoryPointer.new(:pointer)
  status = CuBLASLtBindings.cublasLtMatmulDescCreate(desc_ptr, compute_type, scale_type)
  CuBLASLtBindings.check_status!(status, "cublasLtMatmulDescCreate")

  desc = desc_ptr.read_pointer

  begin
    # Set transpose attributes if needed
    transposes = []
    transposes << [CuBLASLtBindings::CUBLASLT_MATMUL_DESC_TRANSA, "TRANSA"] if trans_a
    transposes << [CuBLASLtBindings::CUBLASLT_MATMUL_DESC_TRANSB, "TRANSB"] if trans_b

    transposes.each do |attribute, label|
      op_val = FFI::MemoryPointer.new(:int)
      op_val.write_int(CuBLASBindings::CUBLAS_OP_T)
      status = CuBLASLtBindings.cublasLtMatmulDescSetAttribute(
        desc, attribute, op_val, FFI.type_size(:int)
      )
      CuBLASLtBindings.check_status!(status, "cublasLtMatmulDescSetAttribute(#{label})")
    end
  rescue StandardError
    # The caller never receives the handle, so release it here
    CuBLASLtBindings.cublasLtMatmulDescDestroy(desc)
    raise
  end

  # NOTE: FAST_ACCUM is disabled on RTX 3060 as it causes regression.
  # Enable on RTX 4090/H100/A100 for potential +5-10% FP16 performance.
  # if %i[float16 half bfloat16].include?(dtype)
  #   fast_accum_ptr = FFI::MemoryPointer.new(:int8)
  #   fast_accum_ptr.write_int8(1)
  #   CuBLASLtBindings.cublasLtMatmulDescSetAttribute(
  #     desc,
  #     CuBLASLtBindings::CUBLASLT_MATMUL_DESC_FAST_ACCUM,
  #     fast_accum_ptr, 1
  #   )
  # end

  desc
end
|
|
303
|
+
|
|
304
|
+
# Create a cuBLASLt matrix layout descriptor.
#
# @param data_type [Object] CUDA data type passed to cublasLtMatrixLayoutCreate
# @param rows [Integer] row count
# @param cols [Integer] column count
# @param ld [Integer] leading dimension
# @return [FFI::Pointer] the layout handle written by cuBLASLt
def create_matrix_layout(data_type, rows, cols, ld)
  handle_out = FFI::MemoryPointer.new(:pointer)

  CuBLASLtBindings.check_status!(
    CuBLASLtBindings.cublasLtMatrixLayoutCreate(handle_out, data_type, rows, cols, ld),
    "cublasLtMatrixLayoutCreate"
  )

  handle_out.read_pointer
end
|
|
315
|
+
|
|
316
|
+
# Create a cuBLASLt matmul preference and fill in its search attributes.
#
# Only the creation status is checked; the return values of the
# attribute setters are not inspected.
#
# @param workspace_size [Integer] value written to MAX_WORKSPACE_BYTES
# @param dtype [Symbol] decides whether the implementation mask is written
# @return [FFI::Pointer] the preference handle
def create_preference(workspace_size, dtype = :float32)
  handle_out = FFI::MemoryPointer.new(:pointer)
  create_status = CuBLASLtBindings.cublasLtMatmulPreferenceCreate(handle_out)
  CuBLASLtBindings.check_status!(create_status, "cublasLtMatmulPreferenceCreate")

  pref = handle_out.read_pointer

  apply = lambda do |attribute, value_ptr, ffi_type|
    CuBLASLtBindings.cublasLtMatmulPreferenceSetAttribute(
      pref, attribute, value_ptr, FFI.type_size(ffi_type)
    )
  end

  # Upper bound on the workspace an algorithm may ask for
  max_workspace = FFI::MemoryPointer.new(:size_t)
  max_workspace.write(:size_t, workspace_size)
  apply.call(CuBLASLtBindings::CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, max_workspace, :size_t)

  # Allow all reduction schemes for Split-K algorithms
  reduction_mask = FFI::MemoryPointer.new(:uint32)
  reduction_mask.write_uint32(CuBLASLtBindings::CUBLASLT_REDUCTION_SCHEME_MASK)
  apply.call(CuBLASLtBindings::CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, reduction_mask, :uint32)

  # For these dtypes every bit of the implementation mask is set.
  # NOTE(review): the previous comment described this as preferring Tensor
  # Core implementations; an all-ones mask looks like "allow everything" -
  # confirm against the cuBLASLt documentation.
  if %i[float16 half float32 float bfloat16].include?(dtype)
    impl_mask = FFI::MemoryPointer.new(:uint64)
    impl_mask.write_uint64(0xFFFFFFFFFFFFFFFF)
    apply.call(CuBLASLtBindings::CUBLASLT_MATMUL_PREF_IMPL_MASK, impl_mask, :uint64)
  end

  # Declare a 16-byte minimum alignment for the A, B, C and D buffers.
  # NOTE(review): the previous comment said this relaxes alignment to allow
  # more algorithms; a smaller declared minimum may instead exclude
  # algorithms that need stricter alignment - confirm the intent.
  min_alignment = FFI::MemoryPointer.new(:uint32)
  min_alignment.write_uint32(16)
  apply.call(CuBLASLtBindings::CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, min_alignment, :uint32)
  apply.call(CuBLASLtBindings::CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, min_alignment, :uint32)
  apply.call(CuBLASLtBindings::CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, min_alignment, :uint32)
  apply.call(CuBLASLtBindings::CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES, min_alignment, :uint32)

  pref
end
|
|
378
|
+
|
|
379
|
+
# Box alpha and beta into host memory for cublasLtMatmul.
#
# :float64 / :double use double-precision scalars; every other dtype
# (including :float16 / :half) uses single-precision scalars.
#
# @param alpha [#to_f] scale for the product
# @param beta [#to_f] scale for C
# @param dtype [Symbol] element type of the matmul
# @return [Array<FFI::MemoryPointer>] [alpha_ptr, beta_ptr]
def prepare_scalars(alpha, beta, dtype)
  wide = %i[float64 double].include?(dtype)
  scalar_type = wide ? :double : :float
  writer = wide ? :write_double : :write_float

  pointers = Array.new(2) { FFI::MemoryPointer.new(scalar_type) }
  pointers.zip([alpha, beta]) { |ptr, value| ptr.public_send(writer, value.to_f) }
  pointers
end
|
|
399
|
+
|
|
400
|
+
# Destroy the cuBLASLt descriptors created for one matmul, best-effort.
#
# Each handle is released independently, so an error while destroying one
# handle no longer prevents the remaining handles from being destroyed.
# nil handles are skipped and errors are swallowed, because this runs
# from ensure clauses.
#
# @param matmul_desc [FFI::Pointer, nil] matmul descriptor
# @param layout_a [FFI::Pointer, nil] layout of A
# @param layout_b [FFI::Pointer, nil] layout of B
# @param layout_c [FFI::Pointer, nil] layout of C
# @param preference [FFI::Pointer, nil] matmul preference
# @return [nil]
def cleanup_descriptors(matmul_desc, layout_a, layout_b, layout_c, preference)
  releases = [
    [matmul_desc, ->(handle) { CuBLASLtBindings.cublasLtMatmulDescDestroy(handle) }],
    [layout_a, ->(handle) { CuBLASLtBindings.cublasLtMatrixLayoutDestroy(handle) }],
    [layout_b, ->(handle) { CuBLASLtBindings.cublasLtMatrixLayoutDestroy(handle) }],
    [layout_c, ->(handle) { CuBLASLtBindings.cublasLtMatrixLayoutDestroy(handle) }],
    [preference, ->(handle) { CuBLASLtBindings.cublasLtMatmulPreferenceDestroy(handle) }]
  ]

  releases.each do |handle, destroy|
    next unless handle

    begin
      destroy.call(handle)
    rescue StandardError
      # Ignore cleanup errors and carry on with the remaining handles
      next
    end
  end

  nil
end
|
|
409
|
+
end
|
|
410
|
+
end
|
|
411
|
+
end
|
|
412
|
+
end
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'device_memory_resource'
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module Memory
|
|
7
|
+
# Stream-ordered memory resource using cudaMallocAsync/cudaFreeAsync
|
|
8
|
+
# Uses CUDA's built-in stream-ordered allocator (CUDA 11.2+)
|
|
9
|
+
# Optimal for workloads with many allocations on the same stream
|
|
10
|
+
class CudaAsyncMemoryResource < DeviceMemoryResource
|
|
11
|
+
# cudaMemPoolAttr enum values
|
|
12
|
+
MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 0x1
|
|
13
|
+
MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC = 0x2
|
|
14
|
+
MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES = 0x3
|
|
15
|
+
MEMPOOL_ATTR_RELEASE_THRESHOLD = 0x4
|
|
16
|
+
|
|
17
|
+
# Build the resource and bind it to the device's memory pool.
#
# The CUDA runtime bindings are loaded explicitly before setup_pool!
# issues its first runtime call, matching CudaMemoryResource#initialize.
#
# @param device_index [Integer] GPU device index
# @param initial_pool_size [Integer] Initial pool size in bytes (default: 0 = driver managed);
#   stored in @initial_pool_size, which no code in this class reads
# @param release_threshold [Integer] Release threshold in bytes (default: 0 = driver managed)
def initialize(device_index: nil, initial_pool_size: 0, release_threshold: 0)
  super(device_index: device_index)
  @initial_pool_size = initial_pool_size
  @release_threshold = release_threshold
  @pool_handle = nil

  CUDA::RuntimeAPI.ensure_loaded!
  setup_pool!
end
|
|
28
|
+
|
|
29
|
+
# Whether allocations can be ordered on a CUDA stream.
#
# @return [Boolean] always true for this resource
def supports_streams?
  true
end
|
|
33
|
+
|
|
34
|
+
# Detach this resource from its memory pool.
#
# The handle is obtained from cudaDeviceGetDefaultMemPool in setup_pool!,
# so it refers to the device's default pool. CUDA does not allow a default
# pool to be destroyed, so this only drops the reference and does not call
# cudaMemPoolDestroy. Later allocations take the cudaMallocAsync branch of
# do_allocate.
#
# @return [void]
def destroy!
  return unless @pool_handle

  @mutex.synchronize do
    @pool_handle = nil
  end
end
|
|
46
|
+
|
|
47
|
+
protected
|
|
48
|
+
|
|
49
|
+
# Allocate device memory on the given stream.
#
# Uses cudaMallocFromPoolAsync when a pool handle is held and
# cudaMallocAsync otherwise; a nil stream maps to the NULL stream handle.
#
# @param bytes [Integer]
# @param stream [Ignis::CUDA::Stream, nil]
# @return [FFI::Pointer]
def do_allocate(bytes, stream)
  ensure_device do
    out = FFI::MemoryPointer.new(:pointer)
    stream_handle = stream&.handle || FFI::Pointer::NULL

    status, label =
      if @pool_handle
        [CUDA::RuntimeAPI.cudaMallocFromPoolAsync(out, bytes, @pool_handle, stream_handle),
         "cudaMallocFromPoolAsync(#{bytes} bytes)"]
      else
        [CUDA::RuntimeAPI.cudaMallocAsync(out, bytes, stream_handle),
         "cudaMallocAsync(#{bytes} bytes)"]
      end
    CUDA::RuntimeAPI.check_status!(status, label)

    out.read_pointer
  end
end
|
|
70
|
+
|
|
71
|
+
# Free device memory on the given stream via cudaFreeAsync.
#
# @param ptr [FFI::Pointer]
# @param _bytes [Integer] unused
# @param stream [Ignis::CUDA::Stream, nil] nil maps to the NULL stream handle
# @return [void]
def do_deallocate(ptr, _bytes, stream)
  ensure_device do
    target_stream = stream&.handle || FFI::Pointer::NULL

    CUDA::RuntimeAPI.check_status!(
      CUDA::RuntimeAPI.cudaFreeAsync(ptr, target_stream),
      "cudaFreeAsync"
    )
  end
end
|
|
82
|
+
|
|
83
|
+
private
|
|
84
|
+
|
|
85
|
+
# Fetch the device's default memory pool and store it in @pool_handle.
#
# When @release_threshold is greater than zero it is written to the pool
# as MEMPOOL_ATTR_RELEASE_THRESHOLD; otherwise the pool is left untouched.
#
# @return [Object, nil] the result of the last status check, or nil when
#   no threshold is applied
def setup_pool!
  ensure_device do
    handle_out = FFI::MemoryPointer.new(:pointer)
    CUDA::RuntimeAPI.check_status!(
      CUDA::RuntimeAPI.cudaDeviceGetDefaultMemPool(handle_out, @device_index),
      "cudaDeviceGetDefaultMemPool"
    )
    @pool_handle = handle_out.read_pointer

    next unless @release_threshold > 0

    threshold = FFI::MemoryPointer.new(:uint64)
    threshold.write_uint64(@release_threshold)
    CUDA::RuntimeAPI.check_status!(
      CUDA::RuntimeAPI.cudaMemPoolSetAttribute(@pool_handle, MEMPOOL_ATTR_RELEASE_THRESHOLD, threshold),
      "cudaMemPoolSetAttribute(RELEASE_THRESHOLD)"
    )
  end
end
|
|
102
|
+
|
|
103
|
+
# Run the block with @device_index as the current CUDA device.
#
# If another device is current it is switched for the duration of the
# block and switched back afterwards, including when the block raises.
# The status of cudaGetDevice is checked so a failed query cannot be
# mistaken for device 0. The status of the restoring cudaSetDevice is not
# checked, so it cannot mask an exception raised by the block.
#
# @yield the work to perform on @device_index
# @return [Object] the block's return value
def ensure_device
  current_ptr = FFI::MemoryPointer.new(:int)
  status = CUDA::RuntimeAPI.cudaGetDevice(current_ptr)
  CUDA::RuntimeAPI.check_status!(status, "cudaGetDevice")
  original = current_ptr.read_int

  return yield if original == @device_index

  status = CUDA::RuntimeAPI.cudaSetDevice(@device_index)
  CUDA::RuntimeAPI.check_status!(status, "cudaSetDevice(#{@device_index})")

  begin
    yield
  ensure
    CUDA::RuntimeAPI.cudaSetDevice(original)
  end
end
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
end
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'device_memory_resource'
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module Memory
|
|
7
|
+
# Simple memory resource using cudaMalloc/cudaFree
|
|
8
|
+
# This is the baseline allocator with no pooling
|
|
9
|
+
class CudaMemoryResource < DeviceMemoryResource
|
|
10
|
+
# Build the resource, then make sure the CUDA runtime bindings are loaded.
#
# @param device_index [Integer] GPU device index, forwarded to the superclass
def initialize(device_index: nil)
  super(device_index: device_index)
  CUDA::RuntimeAPI.ensure_loaded!
end
|
|
15
|
+
|
|
16
|
+
# Whether allocations can be ordered on a CUDA stream.
#
# @return [Boolean] always false; this resource allocates synchronously
def supports_streams?
  false
end
|
|
20
|
+
|
|
21
|
+
protected
|
|
22
|
+
|
|
23
|
+
# Allocate device memory with cudaMalloc on this resource's device.
#
# @param bytes [Integer]
# @param _stream [Ignis::CUDA::Stream, nil] Ignored for sync allocation
# @return [FFI::Pointer]
def do_allocate(bytes, _stream)
  ensure_device do
    out = FFI::MemoryPointer.new(:pointer)

    CUDA::RuntimeAPI.check_status!(
      CUDA::RuntimeAPI.cudaMalloc(out, bytes),
      "cudaMalloc(#{bytes} bytes)"
    )

    out.read_pointer
  end
end
|
|
34
|
+
|
|
35
|
+
# Release device memory with cudaFree on this resource's device.
#
# @param ptr [FFI::Pointer]
# @param _bytes [Integer] unused
# @param _stream [Ignis::CUDA::Stream, nil] Ignored for sync deallocation
# @return [void]
def do_deallocate(ptr, _bytes, _stream)
  ensure_device do
    CUDA::RuntimeAPI.check_status!(CUDA::RuntimeAPI.cudaFree(ptr), "cudaFree")
  end
end
|
|
45
|
+
|
|
46
|
+
private
|
|
47
|
+
|
|
48
|
+
# Run the block with @device_index as the current CUDA device.
#
# If another device is current it is switched for the duration of the
# block and switched back afterwards, including when the block raises.
# The status of cudaGetDevice is checked so a failed query cannot be
# mistaken for device 0. The status of the restoring cudaSetDevice is not
# checked, so it cannot mask an exception raised by the block.
#
# @yield the work to perform on @device_index
# @return [Object] the block's return value
def ensure_device
  current_ptr = FFI::MemoryPointer.new(:int)
  status = CUDA::RuntimeAPI.cudaGetDevice(current_ptr)
  CUDA::RuntimeAPI.check_status!(status, "cudaGetDevice")
  original = current_ptr.read_int

  return yield if original == @device_index

  status = CUDA::RuntimeAPI.cudaSetDevice(@device_index)
  CUDA::RuntimeAPI.check_status!(status, "cudaSetDevice(#{@device_index})")

  begin
    yield
  ensure
    CUDA::RuntimeAPI.cudaSetDevice(original)
  end
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|