ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +15 -0
  3. data/lib/ignis.rb +94 -0
  4. data/lib/nnw/platform.rb +304 -0
  5. data/lib/nnw/shared/event_bus.rb +240 -0
  6. data/lib/nnw/shared/ffi_loader.rb +63 -0
  7. data/lib/nnw/shared/memory_contract.rb +204 -0
  8. data/lib/nnw/shared/nv_array.rb +710 -0
  9. data/lib/nnw/shared/recovery_protocol.rb +307 -0
  10. data/lib/nvruby/configuration.rb +217 -0
  11. data/lib/nvruby/cuda/device.rb +275 -0
  12. data/lib/nvruby/cuda/device_props.rb +202 -0
  13. data/lib/nvruby/cuda/graph.rb +265 -0
  14. data/lib/nvruby/cuda/graph_bindings.rb +119 -0
  15. data/lib/nvruby/cuda/library_loader.rb +285 -0
  16. data/lib/nvruby/cuda/memory.rb +410 -0
  17. data/lib/nvruby/cuda/runtime_api.rb +804 -0
  18. data/lib/nvruby/cuda/stream.rb +234 -0
  19. data/lib/nvruby/dtype.rb +139 -0
  20. data/lib/nvruby/epilogues.rb +438 -0
  21. data/lib/nvruby/errors.rb +303 -0
  22. data/lib/nvruby/half.rb +97 -0
  23. data/lib/nvruby/jit/compiled_kernel.rb +80 -0
  24. data/lib/nvruby/jit/compiler.rb +231 -0
  25. data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
  26. data/lib/nvruby/jit/kernel.rb +240 -0
  27. data/lib/nvruby/jit/kernel_module.rb +133 -0
  28. data/lib/nvruby/jit/kernels/activations.rb +179 -0
  29. data/lib/nvruby/jit/kernels/attention.rb +504 -0
  30. data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
  31. data/lib/nvruby/jit/kernels/loss.rb +213 -0
  32. data/lib/nvruby/jit/kernels/normalization.rb +200 -0
  33. data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
  34. data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
  35. data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
  36. data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
  37. data/lib/nvruby/linalg/epilog.rb +67 -0
  38. data/lib/nvruby/linalg/matmul.rb +247 -0
  39. data/lib/nvruby/linalg/matmul_plan.rb +229 -0
  40. data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
  41. data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
  42. data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
  43. data/lib/nvruby/memory/device_memory_resource.rb +106 -0
  44. data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
  45. data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
  46. data/lib/nvruby/memory/stats.rb +107 -0
  47. data/lib/nvruby/memory.rb +124 -0
  48. data/lib/nvruby/version.rb +5 -0
  49. metadata +108 -0
@@ -0,0 +1,438 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ # Advanced fused epilogues for GPU operations
5
+ # Provides GELU, ReLU, SiLU, Bias addition as fused CUDA kernels
6
+ #
7
+ # @example Apply GELU activation
8
+ # output = Ignis::Epilogues.gelu(input)
9
+ #
10
+ # @example Fused GEMM + GELU + Bias
11
+ # output = Ignis::Epilogues.gemm_epilogue(a, b, epilogue: :gelu, bias: bias)
12
+ #
13
+ module Epilogues
14
+ # GELU tanh-approximation constants: A = sqrt(2/pi), B = weight of the cubic term
15
+ GELU_COEF_A = 0.7978845608028654
16
+ GELU_COEF_B = 0.044715
17
+
18
# CUDA C sources for the epilogue kernels, compiled at runtime.
#
# Each *_KERNEL constant holds the source of exactly one `extern "C"`
# entry point; the Ruby wrappers pass that entry-point name to the JIT
# compiler. All kernels work elementwise on `float` buffers, map one
# thread to one element, and bounds-check the thread index against the
# element count.
module Kernels
  # Shared device helper for the tanh-based GELU approximation.
  # It is interpolated into every kernel that needs the formula so the
  # math and its constants live in a single place.
  GELU_TANH_DEVICE_FN = <<~CUDA
    // Approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    static __device__ __forceinline__ float gelu_tanh_approx(float x) {
        float x3 = x * x * x;
        float tanh_arg = 0.7978845608f * (x + 0.044715f * x3);
        return 0.5f * x * (1.0f + tanhf(tanh_arg));
    }
  CUDA

  # gelu_forward(input, output, n): output[i] = gelu_tanh_approx(input[i])
  GELU_KERNEL = <<~CUDA.freeze
    #{GELU_TANH_DEVICE_FN}
    extern "C" __global__ void gelu_forward(
        const float* __restrict__ input,
        float* __restrict__ output,
        int n
    ) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < n) {
            output[idx] = gelu_tanh_approx(input[idx]);
        }
    }
  CUDA

  # gelu_exact_forward(input, output, n): erf-based GELU
  GELU_EXACT_KERNEL = <<~CUDA
    extern "C" __global__ void gelu_exact_forward(
        const float* __restrict__ input,
        float* __restrict__ output,
        int n
    ) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < n) {
            float x = input[idx];
            // Exact: x * 0.5 * (1 + erf(x / sqrt(2)))
            output[idx] = x * 0.5f * (1.0f + erff(x * 0.7071067811865476f));
        }
    }
  CUDA

  # silu_forward(input, output, n): output[i] = x / (1 + exp(-x))
  SILU_KERNEL = <<~CUDA
    extern "C" __global__ void silu_forward(
        const float* __restrict__ input,
        float* __restrict__ output,
        int n
    ) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < n) {
            float x = input[idx];
            // SiLU: x * sigmoid(x) = x / (1 + exp(-x))
            output[idx] = x / (1.0f + expf(-x));
        }
    }
  CUDA

  # relu_forward(input, output, n): output[i] = max(0, input[i])
  RELU_KERNEL = <<~CUDA
    extern "C" __global__ void relu_forward(
        const float* __restrict__ input,
        float* __restrict__ output,
        int n
    ) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < n) {
            output[idx] = fmaxf(0.0f, input[idx]);
        }
    }
  CUDA

  # leaky_relu_forward(input, output, n, negative_slope):
  # positive values pass through, the rest are multiplied by negative_slope
  LEAKY_RELU_KERNEL = <<~CUDA
    extern "C" __global__ void leaky_relu_forward(
        const float* __restrict__ input,
        float* __restrict__ output,
        int n,
        float negative_slope
    ) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < n) {
            float x = input[idx];
            output[idx] = x > 0.0f ? x : x * negative_slope;
        }
    }
  CUDA

  # bias_add(input, bias, output, rows, cols):
  # output[i] = input[i] + bias[i % cols] over rows * cols elements
  BIAS_ADD_KERNEL = <<~CUDA
    extern "C" __global__ void bias_add(
        const float* __restrict__ input,
        const float* __restrict__ bias,
        float* __restrict__ output,
        int rows,
        int cols
    ) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        int total = rows * cols;
        if (idx < total) {
            int col = idx % cols;
            output[idx] = input[idx] + bias[col];
        }
    }
  CUDA

  # gelu_bias_forward(input, bias, output, rows, cols):
  # bias add followed by the tanh GELU approximation, in one pass
  GELU_BIAS_KERNEL = <<~CUDA.freeze
    #{GELU_TANH_DEVICE_FN}
    extern "C" __global__ void gelu_bias_forward(
        const float* __restrict__ input,
        const float* __restrict__ bias,
        float* __restrict__ output,
        int rows,
        int cols
    ) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        int total = rows * cols;
        if (idx < total) {
            int col = idx % cols;
            output[idx] = gelu_tanh_approx(input[idx] + bias[col]);
        }
    }
  CUDA

  # silu_bias_forward(input, bias, output, rows, cols):
  # bias add followed by SiLU, in one pass
  SILU_BIAS_KERNEL = <<~CUDA
    extern "C" __global__ void silu_bias_forward(
        const float* __restrict__ input,
        const float* __restrict__ bias,
        float* __restrict__ output,
        int rows,
        int cols
    ) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        int total = rows * cols;
        if (idx < total) {
            int col = idx % cols;
            float x = input[idx] + bias[col];
            output[idx] = x / (1.0f + expf(-x));
        }
    }
  CUDA

  # residual_add(input, residual, output, n): output[i] = input[i] + residual[i]
  RESIDUAL_ADD_KERNEL = <<~CUDA
    extern "C" __global__ void residual_add(
        const float* __restrict__ input,
        const float* __restrict__ residual,
        float* __restrict__ output,
        int n
    ) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < n) {
            output[idx] = input[idx] + residual[idx];
        }
    }
  CUDA

  # scale(input, output, scale_factor, n): output[i] = input[i] * scale_factor
  SCALE_KERNEL = <<~CUDA
    extern "C" __global__ void scale(
        const float* __restrict__ input,
        float* __restrict__ output,
        float scale_factor,
        int n
    ) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < n) {
            output[idx] = input[idx] * scale_factor;
        }
    }
  CUDA
end
178
+
179
+ class << self
180
# GELU activation using the tanh approximation, applied elementwise.
#
# @param input [NvArray] Tensor to activate
# @param out [NvArray, nil] Destination tensor; allocated when nil
# @return [NvArray] Tensor holding the activated values
def gelu(input, out: nil)
  source = Kernels::GELU_KERNEL
  apply_unary(input, out, :gelu, source, "gelu_forward")
end
188
+
189
# GELU activation using the exact erf formulation, applied elementwise.
#
# @param input [NvArray] Tensor to activate
# @param out [NvArray, nil] Destination tensor; allocated when nil
# @return [NvArray] Tensor holding the activated values
def gelu_exact(input, out: nil)
  source = Kernels::GELU_EXACT_KERNEL
  apply_unary(input, out, :gelu_exact, source, "gelu_exact_forward")
end
197
+
198
# SiLU (Swish) activation, x * sigmoid(x), applied elementwise.
#
# @param input [NvArray] Tensor to activate
# @param out [NvArray, nil] Destination tensor; allocated when nil
# @return [NvArray] Tensor holding the activated values
def silu(input, out: nil)
  source = Kernels::SILU_KERNEL
  apply_unary(input, out, :silu, source, "silu_forward")
end
206
+
207
# ReLU activation, max(0, x), applied elementwise.
#
# @param input [NvArray] Tensor to activate
# @param out [NvArray, nil] Destination tensor; allocated when nil
# @return [NvArray] Tensor holding the activated values
def relu(input, out: nil)
  source = Kernels::RELU_KERNEL
  apply_unary(input, out, :relu, source, "relu_forward")
end
215
+
216
# Apply Leaky ReLU activation elementwise.
#
# Positive values pass through unchanged; all other values are multiplied
# by +negative_slope+.
#
# @param input [NvArray] Input tensor
# @param negative_slope [Numeric] Slope for non-positive values (coerced to Float)
# @param out [NvArray, nil] Output tensor; allocated with the input's
#   shape, dtype and device when nil
# @return [NvArray] +out+
# @raise [TypeError, ArgumentError] if +negative_slope+ is not convertible to Float
def leaky_relu(input, negative_slope: 0.01, out: nil)
  CUDA::RuntimeAPI.ensure_loaded!

  # The kernel parameter is a C `float`; coerce up front so an Integer
  # slope is never handed to the launcher as an integer argument.
  slope = Float(negative_slope)

  n = input.size
  device = input.respond_to?(:device_index) ? input.device_index : 0
  out ||= Ignis::NvArray.zeros(input.shape, dtype: input.dtype, device: device)

  # An empty tensor would give grid_size == 0, which is not a valid
  # launch configuration. There is nothing to compute, so return early.
  return out if n.zero?

  kernel = get_kernel(:leaky_relu, Kernels::LEAKY_RELU_KERNEL, "leaky_relu_forward")

  block_size = 256
  grid_size = (n + block_size - 1) / block_size # ceil(n / block_size)

  kernel.launch(
    grid: [grid_size, 1, 1],
    block: [block_size, 1, 1],
    args: [input.device_ffi_ptr, out.device_ffi_ptr, n, slope]
  )

  CUDA::RuntimeAPI.cudaDeviceSynchronize
  out
end
243
+
244
# Add a bias vector along the last dimension of a tensor.
#
# The input is treated as a (rows x cols) matrix where cols is the last
# dimension and rows is the product of all leading dimensions, so 1-D and
# 2-D inputs behave as before and higher-rank inputs are covered in full.
#
# @param input [NvArray] Input tensor with at least one dimension
# @param bias [NvArray] Bias vector with at least `cols` elements
# @param out [NvArray, nil] Output tensor; allocated with the input's
#   shape, dtype and device when nil
# @return [NvArray] +out+
# @raise [ArgumentError] if the input has no dimensions or the bias is
#   shorter than the last dimension
def bias_add(input, bias, out: nil)
  CUDA::RuntimeAPI.ensure_loaded!

  dims = input.shape.to_a
  raise ArgumentError, "bias_add requires an input with at least one dimension" if dims.empty?

  cols = dims.last
  rows = dims[0...-1].reduce(1, :*) # 1 for a 1-D input
  n = rows * cols
  device = input.respond_to?(:device_index) ? input.device_index : 0

  out ||= Ignis::NvArray.zeros(input.shape, dtype: input.dtype, device: device)

  # An empty tensor would give grid_size == 0, which is not a valid
  # launch configuration. There is nothing to compute, so return early.
  return out if n.zero?

  # The kernel reads bias[idx % cols]; a shorter bias would be read out of bounds.
  if bias.size < cols
    raise ArgumentError, "bias has #{bias.size} elements but the last dimension is #{cols}"
  end

  kernel = get_kernel(:bias_add, Kernels::BIAS_ADD_KERNEL, "bias_add")

  block_size = 256
  grid_size = (n + block_size - 1) / block_size # ceil(n / block_size)

  kernel.launch(
    grid: [grid_size, 1, 1],
    block: [block_size, 1, 1],
    args: [input.device_ffi_ptr, bias.device_ffi_ptr, out.device_ffi_ptr, rows, cols]
  )

  CUDA::RuntimeAPI.cudaDeviceSynchronize
  out
end
275
+
276
# Bias addition followed by tanh-approximated GELU, done by one kernel.
#
# @param input [NvArray] Tensor to transform
# @param bias [NvArray] Bias vector
# @param out [NvArray, nil] Destination tensor; allocated when nil
# @return [NvArray] Tensor holding gelu(input + bias)
def gelu_bias(input, bias, out: nil)
  source = Kernels::GELU_BIAS_KERNEL
  apply_fused_bias(input, bias, out, :gelu_bias, source, "gelu_bias_forward")
end
285
+
286
# Bias addition followed by SiLU, done by one kernel.
#
# @param input [NvArray] Tensor to transform
# @param bias [NvArray] Bias vector
# @param out [NvArray, nil] Destination tensor; allocated when nil
# @return [NvArray] Tensor holding silu(input + bias)
def silu_bias(input, bias, out: nil)
  source = Kernels::SILU_BIAS_KERNEL
  apply_fused_bias(input, bias, out, :silu_bias, source, "silu_bias_forward")
end
295
+
296
# Elementwise residual addition: out[i] = input[i] + residual[i].
#
# @param input [NvArray] Input tensor
# @param residual [NvArray] Residual tensor with at least as many
#   elements as +input+
# @param out [NvArray, nil] Output tensor; allocated with the input's
#   shape, dtype and device when nil
# @return [NvArray] +out+
# @raise [ArgumentError] if +residual+ has fewer elements than +input+
def residual_add(input, residual, out: nil)
  CUDA::RuntimeAPI.ensure_loaded!

  n = input.size
  device = input.respond_to?(:device_index) ? input.device_index : 0
  out ||= Ignis::NvArray.zeros(input.shape, dtype: input.dtype, device: device)

  # An empty tensor would give grid_size == 0, which is not a valid
  # launch configuration. There is nothing to compute, so return early.
  return out if n.zero?

  # The kernel reads residual[idx] for every idx < n; a shorter residual
  # would be read out of bounds.
  if residual.size < n
    raise ArgumentError, "residual has #{residual.size} elements but input has #{n}"
  end

  kernel = get_kernel(:residual_add, Kernels::RESIDUAL_ADD_KERNEL, "residual_add")

  block_size = 256
  grid_size = (n + block_size - 1) / block_size # ceil(n / block_size)

  kernel.launch(
    grid: [grid_size, 1, 1],
    block: [block_size, 1, 1],
    args: [input.device_ffi_ptr, residual.device_ffi_ptr, out.device_ffi_ptr, n]
  )

  CUDA::RuntimeAPI.cudaDeviceSynchronize
  out
end
323
+
324
# Multiply every element of a tensor by a scalar factor.
#
# @param input [NvArray] Input tensor
# @param factor [Numeric] Scale factor (coerced to Float)
# @param out [NvArray, nil] Output tensor; allocated with the input's
#   shape, dtype and device when nil
# @return [NvArray] +out+
# @raise [TypeError, ArgumentError] if +factor+ is not convertible to Float
def scale(input, factor, out: nil)
  CUDA::RuntimeAPI.ensure_loaded!

  # The kernel parameter is a C `float`; coerce up front so an Integer
  # factor (e.g. 2) is never handed to the launcher as an integer argument.
  scale_factor = Float(factor)

  n = input.size
  device = input.respond_to?(:device_index) ? input.device_index : 0
  out ||= Ignis::NvArray.zeros(input.shape, dtype: input.dtype, device: device)

  # An empty tensor would give grid_size == 0, which is not a valid
  # launch configuration. There is nothing to compute, so return early.
  return out if n.zero?

  kernel = get_kernel(:scale, Kernels::SCALE_KERNEL, "scale")

  block_size = 256
  grid_size = (n + block_size - 1) / block_size # ceil(n / block_size)

  kernel.launch(
    grid: [grid_size, 1, 1],
    block: [block_size, 1, 1],
    args: [input.device_ffi_ptr, out.device_ffi_ptr, scale_factor, n]
  )

  CUDA::RuntimeAPI.cudaDeviceSynchronize
  out
end
351
+
352
# GEMM followed by an epilogue (optional bias, optional activation).
#
# The matrix product is computed with Ignis::LinAlg.matmul and the
# epilogue then runs as one or two separate elementwise kernels. Only
# :gelu and :silu combine bias and activation in a single kernel.
#
# @param a [NvArray] First matrix
# @param b [NvArray] Second matrix
# @param epilogue [Symbol, nil] :gelu, :gelu_exact, :relu, :leaky_relu
#   (default slope) or :silu; any other value applies no activation
# @param bias [NvArray, nil] Optional bias, added before the activation
# @return [NvArray] The product, or the epilogue's output tensor when a
#   bias or activation was applied
def gemm_epilogue(a, b, epilogue:, bias: nil)
  c = Ignis::LinAlg.matmul(a, b)

  case epilogue
  when :gelu
    bias ? gelu_bias(c, bias) : gelu(c)
  when :silu
    bias ? silu_bias(c, bias) : silu(c)
  when :relu
    relu(bias ? bias_add(c, bias) : c)
  when :gelu_exact
    # Previously fell through to the no-activation branch.
    gelu_exact(bias ? bias_add(c, bias) : c)
  when :leaky_relu
    # Previously fell through to the no-activation branch.
    leaky_relu(bias ? bias_add(c, bias) : c)
  else
    # Unrecognised epilogue: bias only (or the raw product), as before.
    bias ? bias_add(c, bias) : c
  end
end
378
+
379
+ private
380
+
381
# Run a one-input elementwise kernel over every element of +input+.
#
# The kernel must take (input pointer, output pointer, element count).
#
# @param input [NvArray] Source tensor
# @param out [NvArray, nil] Destination tensor; allocated with the
#   input's shape, dtype and device when nil
# @param name [Symbol] Cache key for the compiled kernel
# @param kernel_code [String] CUDA source containing the entry point
# @param kernel_name [String] Entry-point name inside +kernel_code+
# @return [NvArray] +out+
def apply_unary(input, out, name, kernel_code, kernel_name)
  CUDA::RuntimeAPI.ensure_loaded!

  n = input.size
  device = input.respond_to?(:device_index) ? input.device_index : 0
  out ||= Ignis::NvArray.zeros(input.shape, dtype: input.dtype, device: device)

  # An empty tensor would give grid_size == 0, which is not a valid
  # launch configuration. There is nothing to compute, so return early.
  return out if n.zero?

  kernel = get_kernel(name, kernel_code, kernel_name)

  block_size = 256
  grid_size = (n + block_size - 1) / block_size # ceil(n / block_size)

  kernel.launch(
    grid: [grid_size, 1, 1],
    block: [block_size, 1, 1],
    args: [input.device_ffi_ptr, out.device_ffi_ptr, n]
  )

  CUDA::RuntimeAPI.cudaDeviceSynchronize
  out
end
402
+
403
# Run a bias-fused elementwise kernel over +input+.
#
# The kernel must take (input pointer, bias pointer, output pointer,
# rows, cols). The input is treated as a (rows x cols) matrix where cols
# is the last dimension and rows is the product of all leading
# dimensions, so higher-rank inputs are processed in full.
#
# @param input [NvArray] Source tensor with at least one dimension
# @param bias [NvArray] Bias vector with at least `cols` elements
# @param out [NvArray, nil] Destination tensor; allocated with the
#   input's shape, dtype and device when nil
# @param name [Symbol] Cache key for the compiled kernel
# @param kernel_code [String] CUDA source containing the entry point
# @param kernel_name [String] Entry-point name inside +kernel_code+
# @return [NvArray] +out+
# @raise [ArgumentError] if the input has no dimensions or the bias is
#   shorter than the last dimension
def apply_fused_bias(input, bias, out, name, kernel_code, kernel_name)
  CUDA::RuntimeAPI.ensure_loaded!

  dims = input.shape.to_a
  raise ArgumentError, "#{name} requires an input with at least one dimension" if dims.empty?

  cols = dims.last
  rows = dims[0...-1].reduce(1, :*) # 1 for a 1-D input
  n = rows * cols
  device = input.respond_to?(:device_index) ? input.device_index : 0

  out ||= Ignis::NvArray.zeros(input.shape, dtype: input.dtype, device: device)

  # An empty tensor would give grid_size == 0, which is not a valid
  # launch configuration. There is nothing to compute, so return early.
  return out if n.zero?

  # The kernel reads bias[idx % cols]; a shorter bias would be read out of bounds.
  if bias.size < cols
    raise ArgumentError, "bias has #{bias.size} elements but the last dimension is #{cols}"
  end

  kernel = get_kernel(name, kernel_code, kernel_name)

  block_size = 256
  grid_size = (n + block_size - 1) / block_size # ceil(n / block_size)

  kernel.launch(
    grid: [grid_size, 1, 1],
    block: [block_size, 1, 1],
    args: [input.device_ffi_ptr, bias.device_ffi_ptr, out.device_ffi_ptr, rows, cols]
  )

  CUDA::RuntimeAPI.cudaDeviceSynchronize
  out
end
428
+
429
# Return the compiled kernel registered under +name+, compiling it on
# first use and memoising the result for later calls.
#
# @param name [Symbol] Cache key
# @param kernel_code [String] CUDA source to compile on a cache miss
# @param kernel_name [String] Entry-point name passed to the compiler
# @return [Object] Whatever CUDA::JITCompiler#compile returned for this key
def get_kernel(name, kernel_code, kernel_name)
  cache = (@kernels ||= {})
  cached = cache[name]
  return cached if cached

  cache[name] = CUDA::JITCompiler.new.compile(kernel_code, kernel_name)
end
436
+ end
437
+ end
438
+ end