ignis 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +15 -0
- data/lib/ignis.rb +94 -0
- data/lib/nnw/platform.rb +304 -0
- data/lib/nnw/shared/event_bus.rb +240 -0
- data/lib/nnw/shared/ffi_loader.rb +63 -0
- data/lib/nnw/shared/memory_contract.rb +204 -0
- data/lib/nnw/shared/nv_array.rb +710 -0
- data/lib/nnw/shared/recovery_protocol.rb +307 -0
- data/lib/nvruby/configuration.rb +217 -0
- data/lib/nvruby/cuda/device.rb +275 -0
- data/lib/nvruby/cuda/device_props.rb +202 -0
- data/lib/nvruby/cuda/graph.rb +265 -0
- data/lib/nvruby/cuda/graph_bindings.rb +119 -0
- data/lib/nvruby/cuda/library_loader.rb +285 -0
- data/lib/nvruby/cuda/memory.rb +410 -0
- data/lib/nvruby/cuda/runtime_api.rb +804 -0
- data/lib/nvruby/cuda/stream.rb +234 -0
- data/lib/nvruby/dtype.rb +139 -0
- data/lib/nvruby/epilogues.rb +438 -0
- data/lib/nvruby/errors.rb +303 -0
- data/lib/nvruby/half.rb +97 -0
- data/lib/nvruby/jit/compiled_kernel.rb +80 -0
- data/lib/nvruby/jit/compiler.rb +231 -0
- data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
- data/lib/nvruby/jit/kernel.rb +240 -0
- data/lib/nvruby/jit/kernel_module.rb +133 -0
- data/lib/nvruby/jit/kernels/activations.rb +179 -0
- data/lib/nvruby/jit/kernels/attention.rb +504 -0
- data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
- data/lib/nvruby/jit/kernels/loss.rb +213 -0
- data/lib/nvruby/jit/kernels/normalization.rb +200 -0
- data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
- data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
- data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
- data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
- data/lib/nvruby/linalg/epilog.rb +67 -0
- data/lib/nvruby/linalg/matmul.rb +247 -0
- data/lib/nvruby/linalg/matmul_plan.rb +229 -0
- data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
- data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
- data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
- data/lib/nvruby/memory/device_memory_resource.rb +106 -0
- data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
- data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
- data/lib/nvruby/memory/stats.rb +107 -0
- data/lib/nvruby/memory.rb +124 -0
- data/lib/nvruby/version.rb +5 -0
- metadata +108 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module JIT
|
|
5
|
+
# Represents a loaded CUDA module on a specific device.
# Manages the CUmodule handle and CUfunction extraction.
#
# Lifecycle:
# * +new+ loads the compiled code, looks up the kernel function and registers
#   a GC finalizer that unloads the module.
# * +destroy!+ unloads the module eagerly and removes that finalizer, so the
#   handle is released exactly once.
class KernelModule
  # @return [CompiledKernel] The compiled kernel this module was loaded from
  attr_reader :compiled_kernel

  # @return [Integer] Device ID this module is loaded on
  attr_reader :device_id

  # @return [FFI::Pointer, nil] The CUmodule handle (nil once destroyed)
  attr_reader :module_handle

  # @return [FFI::Pointer, nil] The CUfunction handle (nil once destroyed)
  attr_reader :function_handle

  # @return [Time] When this module was loaded
  attr_reader :loaded_at

  # Build the finalizer that unloads +module_handle+.
  #
  # This is deliberately a class-level method: a proc closes over the +self+
  # of the method that creates it, and a finalizer that references the object
  # it is attached to keeps that object from ever being finalized by the GC.
  # Here +self+ is the class, so only the handle is retained.
  #
  # @api private
  # @param module_handle [FFI::Pointer, nil] Module handle to unload
  # @return [Proc]
  def self.finalizer_for(module_handle)
    proc do
      next if module_handle.nil? || module_handle.null?

      begin
        DriverAPIBindings.unload_module(module_handle)
      rescue StandardError
        # Best effort: there is nobody to report to while finalizing.
        nil
      end
    end
  end

  # Create a new KernelModule by loading compiled code onto a device
  # @param compiled_kernel [CompiledKernel] The compiled kernel to load
  # @param device_id [Integer] Device to load onto
  # @raise [CudaDriverError] If loading fails
  def initialize(compiled_kernel, device_id: 0)
    @compiled_kernel = compiled_kernel
    @device_id = device_id
    @loaded_at = Time.now
    @destroyed = false
    @mutex = Mutex.new

    load_module!
    begin
      extract_function!
    rescue StandardError
      # The module is already loaded but no finalizer exists yet, so it
      # would otherwise stay loaded forever.
      discard_partially_loaded_module
      raise
    end

    setup_finalizer
  end

  # Get the kernel function name
  # @return [String]
  def kernel_name
    @compiled_kernel.kernel_name
  end

  # Check if this module has been destroyed
  # @return [Boolean]
  def destroyed?
    @mutex.synchronize { @destroyed }
  end

  # Create a Kernel instance for execution
  # @return [Kernel] Executable kernel wrapper
  def to_kernel
    Kernel.new(self)
  end

  # Unload the module from GPU memory. Calling it again is a no-op.
  # @return [void]
  def destroy!
    @mutex.synchronize do
      return if @destroyed

      DriverAPIBindings.unload_module(@module_handle) if @module_handle && !@module_handle.null?

      # The finalizer still holds the handle that was just unloaded; remove
      # it so the handle is not unloaded a second time at GC or exit.
      ObjectSpace.undefine_finalizer(self)

      @module_handle = nil
      @function_handle = nil
      @destroyed = true

      Ignis.logger.debug("Unloaded kernel module: #{kernel_name} from device #{@device_id}")
    end
  end

  # Get a string representation
  # @return [String]
  def to_s
    "#<Ignis::JIT::KernelModule #{kernel_name} device=#{@device_id}>"
  end

  # Get detailed inspection
  # @return [String]
  def inspect
    # Go through destroyed? so the flag is read under the same mutex as everywhere else.
    status = destroyed? ? "destroyed" : "loaded"
    "#<Ignis::JIT::KernelModule:0x#{object_id.to_s(16)} " \
      "kernel=#{kernel_name.inspect} " \
      "device=#{@device_id} " \
      "status=#{status}>"
  end

  private

  # Load the CUBIN data as a module
  # @return [void]
  def load_module!
    Ignis.set_device(@device_id)
    @module_handle = DriverAPIBindings.load_module_data(@compiled_kernel.cubin_data)
    Ignis.logger.debug("Loaded module from CUBIN (#{@compiled_kernel.cubin_size} bytes)")
  end

  # Extract the kernel function from the module
  # @return [void]
  def extract_function!
    @function_handle = DriverAPIBindings.get_module_function(@module_handle, kernel_name)
    Ignis.logger.debug("Extracted function: #{kernel_name}")
  end

  # Unload a module whose function lookup failed during construction.
  # Errors are swallowed so the original lookup error is the one that propagates.
  # @return [void]
  def discard_partially_loaded_module
    handle = @module_handle
    @module_handle = nil
    DriverAPIBindings.unload_module(handle) if handle && !handle.null?
  rescue StandardError
    nil
  end

  # Setup finalizer for automatic cleanup
  # @return [void]
  def setup_finalizer
    ObjectSpace.define_finalizer(self, destructor_proc(@module_handle))
  end

  # Create a destructor proc that doesn't capture self.
  # Delegates to the class-level builder so the proc's receiver is the class,
  # not this instance.
  # @param module_handle [FFI::Pointer] Module handle to unload
  # @return [Proc]
  def destructor_proc(module_handle)
    self.class.finalizer_for(module_handle)
  end
end
|
|
132
|
+
end
|
|
133
|
+
end
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module JIT
|
|
5
|
+
module Kernels
|
|
6
|
+
# Activation function CUDA kernels for AI training and inference.
# Each activation has a forward and a backward builder for autograd compatibility.
#
# Every builder returns whatever Ignis::JIT::Compiler.compile returns for the
# embedded source; compilation (and any caching) is delegated to that call.
#
# All kernels operate on float32 buffers and compute one element per thread
# from a 1-D index (blockIdx.x * blockDim.x + threadIdx.x), guarded by the
# element count +n+ passed at launch.
module Activations
  class << self
    # ReLU forward: max(0, x)
    # Kernel arguments: (input, output, n)
    # @param n [Integer, nil] accepted for backward compatibility and ignored;
    #   the element count is a kernel launch argument, not a compile-time value
    # @param device_id [Integer] device passed on to the compiler
    # @return [Ignis::JIT::Kernel]
    def relu_forward(n = nil, device_id: 0)
      source = <<~CUDA
        extern "C" __global__
        void relu_forward(const float* __restrict__ input,
                          float* __restrict__ output,
                          const int n) {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx < n) {
                output[idx] = fmaxf(input[idx], 0.0f);
            }
        }
      CUDA
      compile_cached(source, "relu_forward", device_id: device_id)
    end

    # ReLU backward: grad * (x > 0 ? 1 : 0)
    # Kernel arguments: (grad_output, input, grad_input, n)
    # @param device_id [Integer] device passed on to the compiler
    # @return [Ignis::JIT::Kernel]
    def relu_backward(device_id: 0)
      source = <<~CUDA
        extern "C" __global__
        void relu_backward(const float* __restrict__ grad_output,
                           const float* __restrict__ input,
                           float* __restrict__ grad_input,
                           const int n) {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx < n) {
                grad_input[idx] = input[idx] > 0.0f ? grad_output[idx] : 0.0f;
            }
        }
      CUDA
      compile_cached(source, "relu_backward", device_id: device_id)
    end

    # GELU forward: x * Φ(x) using tanh approximation
    # GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))
    # (0.7978845608 in the source is sqrt(2/π).)
    # Kernel arguments: (input, output, n)
    # @param device_id [Integer] device passed on to the compiler
    # @return [Ignis::JIT::Kernel]
    def gelu_forward(device_id: 0)
      source = <<~CUDA
        extern "C" __global__
        void gelu_forward(const float* __restrict__ input,
                          float* __restrict__ output,
                          const int n) {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx < n) {
                float x = input[idx];
                float cdf = 0.5f * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x)));
                output[idx] = x * cdf;
            }
        }
      CUDA
      compile_cached(source, "gelu_forward", device_id: device_id)
    end

    # GELU backward: grad * (Φ(x) + x * φ(x)), differentiating the same tanh
    # approximation used by the forward kernel.
    # Kernel arguments: (grad_output, input, grad_input, n)
    # @param device_id [Integer] device passed on to the compiler
    # @return [Ignis::JIT::Kernel]
    def gelu_backward(device_id: 0)
      source = <<~CUDA
        extern "C" __global__
        void gelu_backward(const float* __restrict__ grad_output,
                           const float* __restrict__ input,
                           float* __restrict__ grad_input,
                           const int n) {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx < n) {
                float x = input[idx];
                float s = 0.7978845608f * (x + 0.044715f * x * x * x);
                float tanh_s = tanhf(s);
                float cdf = 0.5f * (1.0f + tanh_s);
                float pdf_term = 0.5f * (1.0f - tanh_s * tanh_s) * 0.7978845608f * (1.0f + 3.0f * 0.044715f * x * x);
                grad_input[idx] = grad_output[idx] * (cdf + x * pdf_term);
            }
        }
      CUDA
      compile_cached(source, "gelu_backward", device_id: device_id)
    end

    # SiLU forward: x * sigmoid(x)
    # Kernel arguments: (input, output, n)
    # @param device_id [Integer] device passed on to the compiler
    # @return [Ignis::JIT::Kernel]
    def silu_forward(device_id: 0)
      source = <<~CUDA
        extern "C" __global__
        void silu_forward(const float* __restrict__ input,
                          float* __restrict__ output,
                          const int n) {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx < n) {
                float x = input[idx];
                float sig = 1.0f / (1.0f + expf(-x));
                output[idx] = x * sig;
            }
        }
      CUDA
      compile_cached(source, "silu_forward", device_id: device_id)
    end

    # SiLU backward: sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))
    #              = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
    # Kernel arguments: (grad_output, input, grad_input, n)
    # @param device_id [Integer] device passed on to the compiler
    # @return [Ignis::JIT::Kernel]
    def silu_backward(device_id: 0)
      source = <<~CUDA
        extern "C" __global__
        void silu_backward(const float* __restrict__ grad_output,
                           const float* __restrict__ input,
                           float* __restrict__ grad_input,
                           const int n) {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx < n) {
                float x = input[idx];
                float sig = 1.0f / (1.0f + expf(-x));
                grad_input[idx] = grad_output[idx] * (sig * (1.0f + x * (1.0f - sig)));
            }
        }
      CUDA
      compile_cached(source, "silu_backward", device_id: device_id)
    end

    # Sigmoid forward: 1 / (1 + exp(-x))
    # Kernel arguments: (input, output, n)
    # @param device_id [Integer] device passed on to the compiler
    # @return [Ignis::JIT::Kernel]
    def sigmoid_forward(device_id: 0)
      source = <<~CUDA
        extern "C" __global__
        void sigmoid_forward(const float* __restrict__ input,
                             float* __restrict__ output,
                             const int n) {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx < n) {
                output[idx] = 1.0f / (1.0f + expf(-input[idx]));
            }
        }
      CUDA
      compile_cached(source, "sigmoid_forward", device_id: device_id)
    end

    # Sigmoid backward: s * (1 - s) * grad
    # Unlike the other backward kernels, the second argument is the forward
    # pass *output* (s = sigmoid(x)), not the original input.
    # Kernel arguments: (grad_output, output, grad_input, n)
    # @param device_id [Integer] device passed on to the compiler
    # @return [Ignis::JIT::Kernel]
    def sigmoid_backward(device_id: 0)
      source = <<~CUDA
        extern "C" __global__
        void sigmoid_backward(const float* __restrict__ grad_output,
                              const float* __restrict__ output,
                              float* __restrict__ grad_input,
                              const int n) {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx < n) {
                float s = output[idx];
                grad_input[idx] = grad_output[idx] * s * (1.0f - s);
            }
        }
      CUDA
      compile_cached(source, "sigmoid_backward", device_id: device_id)
    end

    private

    # Hand the source to the JIT compiler.
    # @param source [String] CUDA source code
    # @param name [String] kernel function name
    # @param device_id [Integer] target device
    # @return [Ignis::JIT::Kernel]
    def compile_cached(source, name, device_id: 0)
      Ignis::JIT::Compiler.compile(source, name, device_id: device_id)
    end
  end
end
|
|
177
|
+
end
|
|
178
|
+
end
|
|
179
|
+
end
|