ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +15 -0
  3. data/lib/ignis.rb +94 -0
  4. data/lib/nnw/platform.rb +304 -0
  5. data/lib/nnw/shared/event_bus.rb +240 -0
  6. data/lib/nnw/shared/ffi_loader.rb +63 -0
  7. data/lib/nnw/shared/memory_contract.rb +204 -0
  8. data/lib/nnw/shared/nv_array.rb +710 -0
  9. data/lib/nnw/shared/recovery_protocol.rb +307 -0
  10. data/lib/nvruby/configuration.rb +217 -0
  11. data/lib/nvruby/cuda/device.rb +275 -0
  12. data/lib/nvruby/cuda/device_props.rb +202 -0
  13. data/lib/nvruby/cuda/graph.rb +265 -0
  14. data/lib/nvruby/cuda/graph_bindings.rb +119 -0
  15. data/lib/nvruby/cuda/library_loader.rb +285 -0
  16. data/lib/nvruby/cuda/memory.rb +410 -0
  17. data/lib/nvruby/cuda/runtime_api.rb +804 -0
  18. data/lib/nvruby/cuda/stream.rb +234 -0
  19. data/lib/nvruby/dtype.rb +139 -0
  20. data/lib/nvruby/epilogues.rb +438 -0
  21. data/lib/nvruby/errors.rb +303 -0
  22. data/lib/nvruby/half.rb +97 -0
  23. data/lib/nvruby/jit/compiled_kernel.rb +80 -0
  24. data/lib/nvruby/jit/compiler.rb +231 -0
  25. data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
  26. data/lib/nvruby/jit/kernel.rb +240 -0
  27. data/lib/nvruby/jit/kernel_module.rb +133 -0
  28. data/lib/nvruby/jit/kernels/activations.rb +179 -0
  29. data/lib/nvruby/jit/kernels/attention.rb +504 -0
  30. data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
  31. data/lib/nvruby/jit/kernels/loss.rb +213 -0
  32. data/lib/nvruby/jit/kernels/normalization.rb +200 -0
  33. data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
  34. data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
  35. data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
  36. data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
  37. data/lib/nvruby/linalg/epilog.rb +67 -0
  38. data/lib/nvruby/linalg/matmul.rb +247 -0
  39. data/lib/nvruby/linalg/matmul_plan.rb +229 -0
  40. data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
  41. data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
  42. data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
  43. data/lib/nvruby/memory/device_memory_resource.rb +106 -0
  44. data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
  45. data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
  46. data/lib/nvruby/memory/stats.rb +107 -0
  47. data/lib/nvruby/memory.rb +124 -0
  48. data/lib/nvruby/version.rb +5 -0
  49. metadata +108 -0
@@ -0,0 +1,133 @@
1
+ # frozen_string_literal: true
2
+
3
module Ignis
  module JIT
    # Represents a compiled kernel that has been loaded as a module on a
    # specific device. Owns the module handle and the function handle extracted
    # from it, and releases the module either explicitly via {#destroy!} or
    # through a GC finalizer.
    class KernelModule
      # @return [CompiledKernel] The compiled kernel this module was loaded from
      attr_reader :compiled_kernel

      # @return [Integer] Device ID this module is loaded on
      attr_reader :device_id

      # @return [FFI::Pointer, nil] The CUmodule handle (nil once destroyed)
      attr_reader :module_handle

      # @return [FFI::Pointer, nil] The CUfunction handle (nil once destroyed)
      attr_reader :function_handle

      # @return [Time] When this module was loaded
      attr_reader :loaded_at

      # Build the finalizer proc for a module handle.
      #
      # This is deliberately a *class* method: a proc created inside an instance
      # method keeps that instance as its +self+, which would keep the object
      # reachable from its own finalizer and prevent it from ever being
      # collected. Created here, the proc only closes over +module_handle+.
      #
      # @param module_handle [FFI::Pointer, nil] Module handle to unload
      # @return [Proc] finalizer; accepts (and ignores) the object id Ruby passes
      def self.destructor_proc(module_handle)
        proc do |_object_id|
          next if module_handle.nil? || module_handle.null?

          begin
            DriverAPIBindings.unload_module(module_handle)
          rescue StandardError
            # Finalizers must never raise; unloading here is best effort.
            nil
          end
        end
      end
      private_class_method :destructor_proc

      # Create a new KernelModule by loading compiled code onto a device
      # @param compiled_kernel [CompiledKernel] The compiled kernel to load
      # @param device_id [Integer] Device to load onto
      # @raise [CudaDriverError] If loading fails
      def initialize(compiled_kernel, device_id: 0)
        @compiled_kernel = compiled_kernel
        @device_id = device_id
        @loaded_at = Time.now
        @destroyed = false
        @mutex = Mutex.new

        load_module!
        begin
          extract_function!
        rescue StandardError
          # The module is already loaded but no finalizer exists yet, so
          # release it here instead of leaking it, then surface the error.
          release_after_failed_init
          raise
        end

        setup_finalizer
      end

      # Get the kernel function name
      # @return [String]
      def kernel_name
        @compiled_kernel.kernel_name
      end

      # Check if this module has been destroyed
      # @return [Boolean]
      def destroyed?
        @mutex.synchronize { @destroyed }
      end

      # Create a Kernel instance for execution
      # @return [Kernel] Executable kernel wrapper
      def to_kernel
        Kernel.new(self)
      end

      # Unload the module from GPU memory. Safe to call more than once; only
      # the first call unloads.
      # @return [void]
      def destroy!
        @mutex.synchronize do
          return if @destroyed

          if @module_handle && !@module_handle.null?
            DriverAPIBindings.unload_module(@module_handle)
          end

          # The finalizer closed over the same handle; drop it so the handle
          # is not unloaded a second time when this object is collected.
          ObjectSpace.undefine_finalizer(self)

          @module_handle = nil
          @function_handle = nil
          @destroyed = true

          Ignis.logger.debug("Unloaded kernel module: #{kernel_name} from device #{@device_id}")
        end
      end

      # Get a string representation
      # @return [String]
      def to_s
        "#<Ignis::JIT::KernelModule #{kernel_name} device=#{@device_id}>"
      end

      # Get detailed inspection
      # @return [String]
      def inspect
        # Read without the mutex on purpose: Mutex is not reentrant, and
        # inspect may be invoked (e.g. by a debugger) while the lock is held.
        status = @destroyed ? "destroyed" : "loaded"
        "#<Ignis::JIT::KernelModule:0x#{object_id.to_s(16)} " \
          "kernel=#{kernel_name.inspect} " \
          "device=#{@device_id} " \
          "status=#{status}>"
      end

      private

      # Load the CUBIN data as a module on the configured device
      # @return [void]
      def load_module!
        Ignis.set_device(@device_id)
        @module_handle = DriverAPIBindings.load_module_data(@compiled_kernel.cubin_data)
        Ignis.logger.debug("Loaded module from CUBIN (#{@compiled_kernel.cubin_size} bytes)")
      end

      # Extract the kernel function from the module
      # @return [void]
      def extract_function!
        @function_handle = DriverAPIBindings.get_module_function(@module_handle, kernel_name)
        Ignis.logger.debug("Extracted function: #{kernel_name}")
      end

      # Best-effort unload used when construction fails after the module was
      # loaded. Errors are swallowed so the original failure is the one raised.
      # @return [void]
      def release_after_failed_init
        if @module_handle && !@module_handle.null?
          DriverAPIBindings.unload_module(@module_handle)
        end
      rescue StandardError
        nil
      ensure
        @module_handle = nil
        @function_handle = nil
        @destroyed = true
      end

      # Register the finalizer for automatic cleanup
      # @return [void]
      def setup_finalizer
        finalizer = self.class.send(:destructor_proc, @module_handle)
        ObjectSpace.define_finalizer(self, finalizer)
      end
    end
  end
end
@@ -0,0 +1,179 @@
1
+ # frozen_string_literal: true
2
+
3
module Ignis
  module JIT
    module Kernels
      # Activation function CUDA kernels for AI training and inference.
      # Each activation has a forward and a backward factory method. Every
      # factory builds the CUDA C source for one elementwise float32 kernel and
      # hands it to Ignis::JIT::Compiler.compile; any caching of compiled
      # kernels is presumably done by the compiler (not visible in this module).
      #
      # All kernels index one element per thread
      # (blockIdx.x * blockDim.x + threadIdx.x) and guard with `idx < n`, where
      # `n` is the element count passed as the last kernel argument at launch.
      module Activations
        class << self
          # ReLU forward: max(0, x)
          # @param n [Integer, nil] accepted for backward compatibility and
          #   ignored; the element count is a kernel launch argument
          # @param device_id [Integer] target device
          # @return [Ignis::JIT::Kernel]
          def relu_forward(n = nil, device_id: 0)
            source = <<~CUDA
              extern "C" __global__
              void relu_forward(const float* __restrict__ input,
                                float* __restrict__ output,
                                const int n) {
                  int idx = blockIdx.x * blockDim.x + threadIdx.x;
                  if (idx < n) {
                      output[idx] = fmaxf(input[idx], 0.0f);
                  }
              }
            CUDA
            compile_cached(source, "relu_forward", device_id: device_id)
          end

          # ReLU backward: grad * (x > 0 ? 1 : 0)
          # @param device_id [Integer] target device
          # @return [Ignis::JIT::Kernel]
          def relu_backward(device_id: 0)
            source = <<~CUDA
              extern "C" __global__
              void relu_backward(const float* __restrict__ grad_output,
                                 const float* __restrict__ input,
                                 float* __restrict__ grad_input,
                                 const int n) {
                  int idx = blockIdx.x * blockDim.x + threadIdx.x;
                  if (idx < n) {
                      grad_input[idx] = input[idx] > 0.0f ? grad_output[idx] : 0.0f;
                  }
              }
            CUDA
            compile_cached(source, "relu_backward", device_id: device_id)
          end

          # GELU forward: x * Φ(x) using tanh approximation
          # GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))
          # The literal 0.7978845608 in the kernel is sqrt(2/π).
          # @param device_id [Integer] target device
          # @return [Ignis::JIT::Kernel]
          def gelu_forward(device_id: 0)
            source = <<~CUDA
              extern "C" __global__
              void gelu_forward(const float* __restrict__ input,
                                float* __restrict__ output,
                                const int n) {
                  int idx = blockIdx.x * blockDim.x + threadIdx.x;
                  if (idx < n) {
                      float x = input[idx];
                      float cdf = 0.5f * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x)));
                      output[idx] = x * cdf;
                  }
              }
            CUDA
            compile_cached(source, "gelu_forward", device_id: device_id)
          end

          # GELU backward: grad * (Φ(x) + x * φ(x)), where Φ and φ are the
          # tanh-approximated CDF used by gelu_forward and its derivative.
          # @param device_id [Integer] target device
          # @return [Ignis::JIT::Kernel]
          def gelu_backward(device_id: 0)
            source = <<~CUDA
              extern "C" __global__
              void gelu_backward(const float* __restrict__ grad_output,
                                 const float* __restrict__ input,
                                 float* __restrict__ grad_input,
                                 const int n) {
                  int idx = blockIdx.x * blockDim.x + threadIdx.x;
                  if (idx < n) {
                      float x = input[idx];
                      float s = 0.7978845608f * (x + 0.044715f * x * x * x);
                      float tanh_s = tanhf(s);
                      float cdf = 0.5f * (1.0f + tanh_s);
                      float pdf_term = 0.5f * (1.0f - tanh_s * tanh_s) * 0.7978845608f * (1.0f + 3.0f * 0.044715f * x * x);
                      grad_input[idx] = grad_output[idx] * (cdf + x * pdf_term);
                  }
              }
            CUDA
            compile_cached(source, "gelu_backward", device_id: device_id)
          end

          # SiLU forward: x * sigmoid(x)
          # @param device_id [Integer] target device
          # @return [Ignis::JIT::Kernel]
          def silu_forward(device_id: 0)
            source = <<~CUDA
              extern "C" __global__
              void silu_forward(const float* __restrict__ input,
                                float* __restrict__ output,
                                const int n) {
                  int idx = blockIdx.x * blockDim.x + threadIdx.x;
                  if (idx < n) {
                      float x = input[idx];
                      float sig = 1.0f / (1.0f + expf(-x));
                      output[idx] = x * sig;
                  }
              }
            CUDA
            compile_cached(source, "silu_forward", device_id: device_id)
          end

          # SiLU backward: sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))
          #              = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
          # The sigmoid is recomputed from the forward input.
          # @param device_id [Integer] target device
          # @return [Ignis::JIT::Kernel]
          def silu_backward(device_id: 0)
            source = <<~CUDA
              extern "C" __global__
              void silu_backward(const float* __restrict__ grad_output,
                                 const float* __restrict__ input,
                                 float* __restrict__ grad_input,
                                 const int n) {
                  int idx = blockIdx.x * blockDim.x + threadIdx.x;
                  if (idx < n) {
                      float x = input[idx];
                      float sig = 1.0f / (1.0f + expf(-x));
                      grad_input[idx] = grad_output[idx] * (sig * (1.0f + x * (1.0f - sig)));
                  }
              }
            CUDA
            compile_cached(source, "silu_backward", device_id: device_id)
          end

          # Sigmoid forward: 1 / (1 + exp(-x))
          # @param device_id [Integer] target device
          # @return [Ignis::JIT::Kernel]
          def sigmoid_forward(device_id: 0)
            source = <<~CUDA
              extern "C" __global__
              void sigmoid_forward(const float* __restrict__ input,
                                   float* __restrict__ output,
                                   const int n) {
                  int idx = blockIdx.x * blockDim.x + threadIdx.x;
                  if (idx < n) {
                      output[idx] = 1.0f / (1.0f + expf(-input[idx]));
                  }
              }
            CUDA
            compile_cached(source, "sigmoid_forward", device_id: device_id)
          end

          # Sigmoid backward: s * (1 - s) * grad
          # Unlike the other backward kernels, the second kernel argument is
          # the forward *output* s = sigmoid(x), not the forward input.
          # @param device_id [Integer] target device
          # @return [Ignis::JIT::Kernel]
          def sigmoid_backward(device_id: 0)
            source = <<~CUDA
              extern "C" __global__
              void sigmoid_backward(const float* __restrict__ grad_output,
                                    const float* __restrict__ output,
                                    float* __restrict__ grad_input,
                                    const int n) {
                  int idx = blockIdx.x * blockDim.x + threadIdx.x;
                  if (idx < n) {
                      float s = output[idx];
                      grad_input[idx] = grad_output[idx] * s * (1.0f - s);
                  }
              }
            CUDA
            compile_cached(source, "sigmoid_backward", device_id: device_id)
          end

          private

          # Hand a kernel source to the JIT compiler.
          # @param source [String] CUDA source code
          # @param name [String] kernel function name (must match the
          #   extern "C" function defined in +source+)
          # @param device_id [Integer] target device
          # @return [Ignis::JIT::Kernel] whatever Compiler.compile returns
          def compile_cached(source, name, device_id: 0)
            Ignis::JIT::Compiler.compile(source, name, device_id: device_id)
          end
        end
      end
    end
  end
end