ignis 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +15 -0
- data/lib/ignis.rb +94 -0
- data/lib/nnw/platform.rb +304 -0
- data/lib/nnw/shared/event_bus.rb +240 -0
- data/lib/nnw/shared/ffi_loader.rb +63 -0
- data/lib/nnw/shared/memory_contract.rb +204 -0
- data/lib/nnw/shared/nv_array.rb +710 -0
- data/lib/nnw/shared/recovery_protocol.rb +307 -0
- data/lib/nvruby/configuration.rb +217 -0
- data/lib/nvruby/cuda/device.rb +275 -0
- data/lib/nvruby/cuda/device_props.rb +202 -0
- data/lib/nvruby/cuda/graph.rb +265 -0
- data/lib/nvruby/cuda/graph_bindings.rb +119 -0
- data/lib/nvruby/cuda/library_loader.rb +285 -0
- data/lib/nvruby/cuda/memory.rb +410 -0
- data/lib/nvruby/cuda/runtime_api.rb +804 -0
- data/lib/nvruby/cuda/stream.rb +234 -0
- data/lib/nvruby/dtype.rb +139 -0
- data/lib/nvruby/epilogues.rb +438 -0
- data/lib/nvruby/errors.rb +303 -0
- data/lib/nvruby/half.rb +97 -0
- data/lib/nvruby/jit/compiled_kernel.rb +80 -0
- data/lib/nvruby/jit/compiler.rb +231 -0
- data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
- data/lib/nvruby/jit/kernel.rb +240 -0
- data/lib/nvruby/jit/kernel_module.rb +133 -0
- data/lib/nvruby/jit/kernels/activations.rb +179 -0
- data/lib/nvruby/jit/kernels/attention.rb +504 -0
- data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
- data/lib/nvruby/jit/kernels/loss.rb +213 -0
- data/lib/nvruby/jit/kernels/normalization.rb +200 -0
- data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
- data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
- data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
- data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
- data/lib/nvruby/linalg/epilog.rb +67 -0
- data/lib/nvruby/linalg/matmul.rb +247 -0
- data/lib/nvruby/linalg/matmul_plan.rb +229 -0
- data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
- data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
- data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
- data/lib/nvruby/memory/device_memory_resource.rb +106 -0
- data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
- data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
- data/lib/nvruby/memory/stats.rb +107 -0
- data/lib/nvruby/memory.rb +124 -0
- data/lib/nvruby/version.rb +5 -0
- metadata +108 -0
|
@@ -0,0 +1,438 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
# Advanced fused epilogues for GPU operations
|
|
5
|
+
# Provides GELU, SiLU, ReLU, Leaky ReLU, bias addition, residual addition and scaling as CUDA kernels, including fused bias + activation variants
|
|
6
|
+
#
|
|
7
|
+
# @example Apply GELU activation
|
|
8
|
+
# output = Ignis::Epilogues.gelu(input)
|
|
9
|
+
#
|
|
10
|
+
# @example Fused GEMM + GELU + Bias
|
|
11
|
+
# output = Ignis::Epilogues.gemm_epilogue(a, b, epilogue: :gelu, bias: bias)
|
|
12
|
+
#
|
|
13
|
+
module Epilogues
|
|
14
|
+
# GELU tanh-approximation constants: sqrt(2/pi) and the cubic-term coefficient
|
|
15
|
+
GELU_COEF_A = 0.7978845608028654
|
|
16
|
+
GELU_COEF_B = 0.044715
|
|
17
|
+
|
|
18
|
+
# JIT CUDA kernels for epilogues
#
# Each constant holds the CUDA C source of one extern "C" kernel as a string.
# Every kernel works on float buffers, derives a global element index from
# blockIdx.x * blockDim.x + threadIdx.x, and does nothing in threads whose
# index is past the element count.
module Kernels
  # gelu_forward(input, output, n): tanh-approximation GELU of each element.
  GELU_KERNEL = <<~CUDA
    extern "C" __global__ void gelu_forward(
    const float* __restrict__ input,
    float* __restrict__ output,
    int n
    ) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
    float x = input[idx];
    // Approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    float x3 = x * x * x;
    float tanh_arg = 0.7978845608f * (x + 0.044715f * x3);
    output[idx] = 0.5f * x * (1.0f + tanhf(tanh_arg));
    }
    }
  CUDA

  # gelu_exact_forward(input, output, n): erf-based GELU of each element.
  GELU_EXACT_KERNEL = <<~CUDA
    extern "C" __global__ void gelu_exact_forward(
    const float* __restrict__ input,
    float* __restrict__ output,
    int n
    ) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
    float x = input[idx];
    // Exact: x * 0.5 * (1 + erf(x / sqrt(2)))
    output[idx] = x * 0.5f * (1.0f + erff(x * 0.7071067811865476f));
    }
    }
  CUDA

  # silu_forward(input, output, n): x / (1 + exp(-x)) for each element.
  SILU_KERNEL = <<~CUDA
    extern "C" __global__ void silu_forward(
    const float* __restrict__ input,
    float* __restrict__ output,
    int n
    ) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
    float x = input[idx];
    // SiLU: x * sigmoid(x) = x / (1 + exp(-x))
    output[idx] = x / (1.0f + expf(-x));
    }
    }
  CUDA

  # relu_forward(input, output, n): fmaxf(0, x) for each element.
  RELU_KERNEL = <<~CUDA
    extern "C" __global__ void relu_forward(
    const float* __restrict__ input,
    float* __restrict__ output,
    int n
    ) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
    output[idx] = fmaxf(0.0f, input[idx]);
    }
    }
  CUDA

  # leaky_relu_forward(input, output, n, negative_slope): x when x > 0,
  # otherwise x * negative_slope.
  LEAKY_RELU_KERNEL = <<~CUDA
    extern "C" __global__ void leaky_relu_forward(
    const float* __restrict__ input,
    float* __restrict__ output,
    int n,
    float negative_slope
    ) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
    float x = input[idx];
    output[idx] = x > 0.0f ? x : x * negative_slope;
    }
    }
  CUDA

  # bias_add(input, bias, output, rows, cols): adds bias[idx % cols] to each
  # of the rows * cols elements.
  BIAS_ADD_KERNEL = <<~CUDA
    extern "C" __global__ void bias_add(
    const float* __restrict__ input,
    const float* __restrict__ bias,
    float* __restrict__ output,
    int rows,
    int cols
    ) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total = rows * cols;
    if (idx < total) {
    int col = idx % cols;
    output[idx] = input[idx] + bias[col];
    }
    }
  CUDA

  # gelu_bias_forward(input, bias, output, rows, cols): adds bias[idx % cols],
  # then applies the tanh-approximation GELU, in a single pass.
  GELU_BIAS_KERNEL = <<~CUDA
    extern "C" __global__ void gelu_bias_forward(
    const float* __restrict__ input,
    const float* __restrict__ bias,
    float* __restrict__ output,
    int rows,
    int cols
    ) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total = rows * cols;
    if (idx < total) {
    int col = idx % cols;
    float x = input[idx] + bias[col];
    float x3 = x * x * x;
    float tanh_arg = 0.7978845608f * (x + 0.044715f * x3);
    output[idx] = 0.5f * x * (1.0f + tanhf(tanh_arg));
    }
    }
  CUDA

  # silu_bias_forward(input, bias, output, rows, cols): adds bias[idx % cols],
  # then applies SiLU, in a single pass.
  SILU_BIAS_KERNEL = <<~CUDA
    extern "C" __global__ void silu_bias_forward(
    const float* __restrict__ input,
    const float* __restrict__ bias,
    float* __restrict__ output,
    int rows,
    int cols
    ) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total = rows * cols;
    if (idx < total) {
    int col = idx % cols;
    float x = input[idx] + bias[col];
    output[idx] = x / (1.0f + expf(-x));
    }
    }
  CUDA

  # residual_add(input, residual, output, n): element-wise input + residual.
  RESIDUAL_ADD_KERNEL = <<~CUDA
    extern "C" __global__ void residual_add(
    const float* __restrict__ input,
    const float* __restrict__ residual,
    float* __restrict__ output,
    int n
    ) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
    output[idx] = input[idx] + residual[idx];
    }
    }
  CUDA

  # scale(input, output, scale_factor, n): multiplies each element by
  # scale_factor. Note the parameter order: the factor comes before n.
  SCALE_KERNEL = <<~CUDA
    extern "C" __global__ void scale(
    const float* __restrict__ input,
    float* __restrict__ output,
    float scale_factor,
    int n
    ) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
    output[idx] = input[idx] * scale_factor;
    }
    }
  CUDA
end
|
|
178
|
+
|
|
179
|
+
class << self
|
|
180
|
+
# Element-wise GELU using the tanh approximation.
#
# @param input [NvArray] Tensor to activate
# @param out [NvArray, nil] Destination tensor; forwarded as-is to the unary launcher
# @return [NvArray] Tensor holding GELU(input)
def gelu(input, out: nil)
  kernel_source = Kernels::GELU_KERNEL
  apply_unary(input, out, :gelu, kernel_source, "gelu_forward")
end
|
|
188
|
+
|
|
189
|
+
# Element-wise GELU using the exact erf formulation.
#
# @param input [NvArray] Tensor to activate
# @param out [NvArray, nil] Destination tensor; forwarded as-is to the unary launcher
# @return [NvArray] Tensor holding the exact GELU of input
def gelu_exact(input, out: nil)
  kernel_source = Kernels::GELU_EXACT_KERNEL
  apply_unary(input, out, :gelu_exact, kernel_source, "gelu_exact_forward")
end
|
|
197
|
+
|
|
198
|
+
# Element-wise SiLU (Swish): x * sigmoid(x).
#
# @param input [NvArray] Tensor to activate
# @param out [NvArray, nil] Destination tensor; forwarded as-is to the unary launcher
# @return [NvArray] Tensor holding SiLU(input)
def silu(input, out: nil)
  kernel_source = Kernels::SILU_KERNEL
  apply_unary(input, out, :silu, kernel_source, "silu_forward")
end
|
|
206
|
+
|
|
207
|
+
# Element-wise ReLU: max(0, x).
#
# @param input [NvArray] Tensor to activate
# @param out [NvArray, nil] Destination tensor; forwarded as-is to the unary launcher
# @return [NvArray] Tensor holding ReLU(input)
def relu(input, out: nil)
  kernel_source = Kernels::RELU_KERNEL
  apply_unary(input, out, :relu, kernel_source, "relu_forward")
end
|
|
215
|
+
|
|
216
|
+
# Apply Leaky ReLU activation: x when x > 0, otherwise x * negative_slope.
#
# @param input [NvArray] Input tensor
# @param negative_slope [Numeric] Slope applied to non-positive values (coerced with Float())
# @param out [NvArray, nil] Output tensor; allocated with input's shape and dtype when nil
# @return [NvArray] The output tensor
# @raise [TypeError, ArgumentError] if negative_slope cannot be converted to a Float
def leaky_relu(input, negative_slope: 0.01, out: nil)
  CUDA::RuntimeAPI.ensure_loaded!

  # The kernel declares `float negative_slope`; coerce up front so an Integer
  # slope (e.g. 0 or 1) is never handed to the launcher as an integer argument.
  slope = Float(negative_slope)

  n = input.size
  device = input.respond_to?(:device_index) ? input.device_index : 0
  out ||= Ignis::NvArray.zeros(input.shape, dtype: input.dtype, device: device)

  # An empty tensor would produce a zero-sized grid, which CUDA rejects as an
  # invalid launch configuration; there is nothing to compute anyway.
  return out if n.zero?

  kernel = get_kernel(:leaky_relu, Kernels::LEAKY_RELU_KERNEL, "leaky_relu_forward")

  # One thread per element, rounded up to whole blocks.
  block_size = 256
  grid_size = (n + block_size - 1) / block_size

  kernel.launch(
    grid: [grid_size, 1, 1],
    block: [block_size, 1, 1],
    args: [input.device_ffi_ptr, out.device_ffi_ptr, n, slope]
  )

  # NOTE(review): the return value of cudaDeviceSynchronize is not inspected
  # here - confirm whether RuntimeAPI raises on failure.
  CUDA::RuntimeAPI.cudaDeviceSynchronize
  out
end
|
|
243
|
+
|
|
244
|
+
# Add a bias vector to a tensor, broadcasting it along the last dimension.
#
# The input is treated as a (rows x cols) matrix where cols is the size of
# the last dimension and rows is the product of all leading dimensions, so
# 1-D and 2-D inputs behave as before and higher-rank inputs are covered
# in full instead of only their first shape[0] * shape[1] elements.
#
# @param input [NvArray] Input tensor (..., cols)
# @param bias [NvArray] Bias vector with at least cols elements
# @param out [NvArray, nil] Output tensor; allocated with input's shape and dtype when nil
# @return [NvArray] The output tensor
# @raise [ArgumentError] if input has no dimensions or bias is shorter than cols
def bias_add(input, bias, out: nil)
  CUDA::RuntimeAPI.ensure_loaded!

  shape = input.shape
  rank = shape.size
  raise ArgumentError, "input must have at least one dimension" if rank.zero?

  cols = shape[rank - 1]
  rows = (0...(rank - 1)).reduce(1) { |product, axis| product * shape[axis] }
  n = rows * cols

  # The kernel reads bias[idx % cols]; a shorter bias would be read out of bounds.
  if bias.size < cols
    raise ArgumentError, "bias has #{bias.size} elements but input's last dimension is #{cols}"
  end

  device = input.respond_to?(:device_index) ? input.device_index : 0
  out ||= Ignis::NvArray.zeros(input.shape, dtype: input.dtype, device: device)

  # A zero-sized grid is an invalid CUDA launch configuration; nothing to do.
  return out if n.zero?

  kernel = get_kernel(:bias_add, Kernels::BIAS_ADD_KERNEL, "bias_add")

  # One thread per element, rounded up to whole blocks.
  block_size = 256
  grid_size = (n + block_size - 1) / block_size

  kernel.launch(
    grid: [grid_size, 1, 1],
    block: [block_size, 1, 1],
    args: [input.device_ffi_ptr, bias.device_ffi_ptr, out.device_ffi_ptr, rows, cols]
  )

  CUDA::RuntimeAPI.cudaDeviceSynchronize
  out
end
|
|
275
|
+
|
|
276
|
+
# Bias addition followed by tanh-approximation GELU, run as one kernel.
#
# @param input [NvArray] Tensor to transform
# @param bias [NvArray] Bias vector added before the activation
# @param out [NvArray, nil] Destination tensor; forwarded as-is to the fused launcher
# @return [NvArray] Tensor holding GELU(input + bias)
def gelu_bias(input, bias, out: nil)
  kernel_source = Kernels::GELU_BIAS_KERNEL
  apply_fused_bias(input, bias, out, :gelu_bias, kernel_source, "gelu_bias_forward")
end
|
|
285
|
+
|
|
286
|
+
# Bias addition followed by SiLU, run as one kernel.
#
# @param input [NvArray] Tensor to transform
# @param bias [NvArray] Bias vector added before the activation
# @param out [NvArray, nil] Destination tensor; forwarded as-is to the fused launcher
# @return [NvArray] Tensor holding SiLU(input + bias)
def silu_bias(input, bias, out: nil)
  kernel_source = Kernels::SILU_BIAS_KERNEL
  apply_fused_bias(input, bias, out, :silu_bias, kernel_source, "silu_bias_forward")
end
|
|
295
|
+
|
|
296
|
+
# Residual addition: element-wise input + residual.
#
# @param input [NvArray] Input tensor
# @param residual [NvArray] Residual tensor with at least as many elements as input
# @param out [NvArray, nil] Output tensor; allocated with input's shape and dtype when nil
# @return [NvArray] The output tensor
# @raise [ArgumentError] if residual has fewer elements than input
def residual_add(input, residual, out: nil)
  CUDA::RuntimeAPI.ensure_loaded!

  n = input.size

  # The kernel reads residual[idx] for every idx < n; a shorter residual
  # would be read past its end on the device.
  if residual.size < n
    raise ArgumentError, "residual has #{residual.size} elements but input has #{n}"
  end

  device = input.respond_to?(:device_index) ? input.device_index : 0
  out ||= Ignis::NvArray.zeros(input.shape, dtype: input.dtype, device: device)

  # A zero-sized grid is an invalid CUDA launch configuration; nothing to do.
  return out if n.zero?

  kernel = get_kernel(:residual_add, Kernels::RESIDUAL_ADD_KERNEL, "residual_add")

  # One thread per element, rounded up to whole blocks.
  block_size = 256
  grid_size = (n + block_size - 1) / block_size

  kernel.launch(
    grid: [grid_size, 1, 1],
    block: [block_size, 1, 1],
    args: [input.device_ffi_ptr, residual.device_ffi_ptr, out.device_ffi_ptr, n]
  )

  CUDA::RuntimeAPI.cudaDeviceSynchronize
  out
end
|
|
323
|
+
|
|
324
|
+
# Scale tensor by factor: output[i] = input[i] * factor.
#
# @param input [NvArray] Input tensor
# @param factor [Numeric] Scale factor (coerced with Float())
# @param out [NvArray, nil] Output tensor; allocated with input's shape and dtype when nil
# @return [NvArray] The output tensor
# @raise [TypeError, ArgumentError] if factor cannot be converted to a Float
def scale(input, factor, out: nil)
  CUDA::RuntimeAPI.ensure_loaded!

  # The kernel declares `float scale_factor`; coerce up front so an Integer
  # factor such as 2 is never handed to the launcher as an integer argument.
  scale_factor = Float(factor)

  n = input.size
  device = input.respond_to?(:device_index) ? input.device_index : 0
  out ||= Ignis::NvArray.zeros(input.shape, dtype: input.dtype, device: device)

  # A zero-sized grid is an invalid CUDA launch configuration; nothing to do.
  return out if n.zero?

  kernel = get_kernel(:scale, Kernels::SCALE_KERNEL, "scale")

  # One thread per element, rounded up to whole blocks.
  block_size = 256
  grid_size = (n + block_size - 1) / block_size

  # Argument order follows the kernel signature: the factor precedes n.
  kernel.launch(
    grid: [grid_size, 1, 1],
    block: [block_size, 1, 1],
    args: [input.device_ffi_ptr, out.device_ffi_ptr, scale_factor, n]
  )

  CUDA::RuntimeAPI.cudaDeviceSynchronize
  out
end
|
|
351
|
+
|
|
352
|
+
# Matrix multiply followed by an optional bias and activation.
#
# The product is computed with Ignis::LinAlg.matmul first; the epilogue is
# then applied to that result by the activation/bias helpers of this module.
#
# @param a [NvArray] Left matrix
# @param b [NvArray] Right matrix
# @param epilogue [Symbol] :gelu, :relu or :silu; any other value applies
#   no activation (only the bias, when one is given)
# @param bias [NvArray, nil] Optional bias added before the activation
# @return [NvArray] Result of the selected epilogue, or the plain product
#   when neither an activation nor a bias applies
def gemm_epilogue(a, b, epilogue:, bias: nil)
  product = Ignis::LinAlg.matmul(a, b)

  case epilogue
  when :gelu
    return gelu(product) unless bias

    gelu_bias(product, bias)
  when :silu
    return silu(product) unless bias

    silu_bias(product, bias)
  when :relu
    # No fused bias + ReLU kernel exists, so the bias is added separately.
    relu(bias ? bias_add(product, bias) : product)
  else
    return product unless bias

    bias_add(product, bias)
  end
end
|
|
378
|
+
|
|
379
|
+
private
|
|
380
|
+
|
|
381
|
+
# Launch a unary element-wise kernel with signature (input, output, n).
#
# @param input [NvArray] Input tensor
# @param out [NvArray, nil] Output tensor; allocated with input's shape and dtype when nil
# @param name [Symbol] Cache key for the compiled kernel
# @param kernel_code [String] CUDA source containing the kernel
# @param kernel_name [String] Entry-point name inside kernel_code
# @return [NvArray] The output tensor
def apply_unary(input, out, name, kernel_code, kernel_name)
  CUDA::RuntimeAPI.ensure_loaded!

  n = input.size
  device = input.respond_to?(:device_index) ? input.device_index : 0
  out ||= Ignis::NvArray.zeros(input.shape, dtype: input.dtype, device: device)

  # An empty tensor would produce a zero-sized grid, which CUDA rejects as an
  # invalid launch configuration; there is nothing to compute anyway.
  return out if n.zero?

  kernel = get_kernel(name, kernel_code, kernel_name)

  # One thread per element, rounded up to whole blocks.
  block_size = 256
  grid_size = (n + block_size - 1) / block_size

  kernel.launch(
    grid: [grid_size, 1, 1],
    block: [block_size, 1, 1],
    args: [input.device_ffi_ptr, out.device_ffi_ptr, n]
  )

  # NOTE(review): the return value of cudaDeviceSynchronize is not inspected
  # here - confirm whether RuntimeAPI raises on failure.
  CUDA::RuntimeAPI.cudaDeviceSynchronize
  out
end
|
|
402
|
+
|
|
403
|
+
# Launch a fused bias + activation kernel with signature
# (input, bias, output, rows, cols).
#
# The input is treated as a (rows x cols) matrix where cols is the size of
# the last dimension and rows is the product of all leading dimensions, so
# 1-D and 2-D inputs behave as before and higher-rank inputs are covered
# in full instead of only their first shape[0] * shape[1] elements.
#
# @param input [NvArray] Input tensor (..., cols)
# @param bias [NvArray] Bias vector with at least cols elements
# @param out [NvArray, nil] Output tensor; allocated with input's shape and dtype when nil
# @param name [Symbol] Cache key for the compiled kernel
# @param kernel_code [String] CUDA source containing the kernel
# @param kernel_name [String] Entry-point name inside kernel_code
# @return [NvArray] The output tensor
# @raise [ArgumentError] if input has no dimensions or bias is shorter than cols
def apply_fused_bias(input, bias, out, name, kernel_code, kernel_name)
  CUDA::RuntimeAPI.ensure_loaded!

  shape = input.shape
  rank = shape.size
  raise ArgumentError, "input must have at least one dimension" if rank.zero?

  cols = shape[rank - 1]
  rows = (0...(rank - 1)).reduce(1) { |product, axis| product * shape[axis] }
  n = rows * cols

  # The kernel reads bias[idx % cols]; a shorter bias would be read out of bounds.
  if bias.size < cols
    raise ArgumentError, "bias has #{bias.size} elements but input's last dimension is #{cols}"
  end

  device = input.respond_to?(:device_index) ? input.device_index : 0
  out ||= Ignis::NvArray.zeros(input.shape, dtype: input.dtype, device: device)

  # A zero-sized grid is an invalid CUDA launch configuration; nothing to do.
  return out if n.zero?

  kernel = get_kernel(name, kernel_code, kernel_name)

  # One thread per element, rounded up to whole blocks.
  block_size = 256
  grid_size = (n + block_size - 1) / block_size

  kernel.launch(
    grid: [grid_size, 1, 1],
    block: [block_size, 1, 1],
    args: [input.device_ffi_ptr, bias.device_ffi_ptr, out.device_ffi_ptr, rows, cols]
  )

  CUDA::RuntimeAPI.cudaDeviceSynchronize
  out
end
|
|
428
|
+
|
|
429
|
+
# Return the compiled kernel registered under +name+, compiling it on first use.
#
# Compiled kernels are memoized in @kernels, keyed by +name+.
#
# @param name [Symbol] Cache key
# @param kernel_code [String] CUDA source to compile on a cache miss
# @param kernel_name [String] Entry-point name passed to the compiler
# @return [Object] Whatever CUDA::JITCompiler#compile returned for this kernel
def get_kernel(name, kernel_code, kernel_name)
  cache = (@kernels ||= {})
  cached = cache[name]
  return cached if cached

  cache[name] = CUDA::JITCompiler.new.compile(kernel_code, kernel_name)
end
|
|
436
|
+
end
|
|
437
|
+
end
|
|
438
|
+
end
|