ignis-numerics 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +15 -0
- data/lib/ignis-numerics.rb +62 -0
- data/lib/nvruby/array.rb +646 -0
- data/lib/nvruby/fft/cufft_bindings.rb +134 -0
- data/lib/nvruby/fft/fft_plan.rb +288 -0
- data/lib/nvruby/fft/operations.rb +364 -0
- data/lib/nvruby/linalg/cutensor_bindings.rb +107 -0
- data/lib/nvruby/mathdx/fft_kernel.rb +258 -0
- data/lib/nvruby/mathdx/gemm_kernel.rb +293 -0
- data/lib/nvruby/mathdx.rb +73 -0
- data/lib/nvruby/random/curand_bindings.rb +115 -0
- data/lib/nvruby/random/generator.rb +305 -0
- data/lib/nvruby/solver/amgx_bindings.rb +172 -0
- data/lib/nvruby/solver/amgx_config.rb +142 -0
- data/lib/nvruby/solver/amgx_solver.rb +251 -0
- data/lib/nvruby/solver/cudss_bindings.rb +115 -0
- data/lib/nvruby/solver/cusolver_bindings.rb +358 -0
- data/lib/nvruby/solver/eigen.rb +226 -0
- data/lib/nvruby/solver/lu.rb +265 -0
- data/lib/nvruby/solver/sparse_solver.rb +429 -0
- data/lib/nvruby/solver/svd.rb +266 -0
- data/lib/nvruby/solver.rb +122 -0
- data/lib/nvruby/sparse/cusparse_bindings.rb +231 -0
- data/lib/nvruby/sparse/sparse_matrix.rb +456 -0
- data/lib/nvruby/tensor/contraction.rb +218 -0
- data/lib/nvruby/tensor.rb +42 -0
- metadata +85 -0
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module MathDx
|
|
5
|
+
# Device-side FFT kernel using cuFFTDx patterns
|
|
6
|
+
# Generates and compiles CUDA C++ code for thread block FFT operations
|
|
7
|
+
#
|
|
8
|
+
# cuFFTDx enables embedding FFT operations inside CUDA kernels, allowing
|
|
9
|
+
# fusion with other operations to reduce memory bandwidth.
|
|
10
|
+
#
|
|
11
|
+
# @example Basic FFT kernel
|
|
12
|
+
# kernel = FftKernel.new(size: 64, dtype: :complex64, direction: :forward)
|
|
13
|
+
# kernel.compile!
|
|
14
|
+
# kernel.execute(input, output)
|
|
15
|
+
class FftKernel
|
|
16
|
+
# @return [Integer] FFT size
|
|
17
|
+
attr_reader :size
|
|
18
|
+
|
|
19
|
+
# @return [Symbol] Data type
|
|
20
|
+
attr_reader :dtype
|
|
21
|
+
|
|
22
|
+
# @return [Symbol] FFT direction
|
|
23
|
+
attr_reader :direction
|
|
24
|
+
|
|
25
|
+
# @return [Integer] Elements per thread
|
|
26
|
+
attr_reader :elements_per_thread
|
|
27
|
+
|
|
28
|
+
# @return [Boolean] Whether kernel is compiled
|
|
29
|
+
attr_reader :compiled
|
|
30
|
+
|
|
31
|
+
# Supported FFT sizes (powers of 2)
|
|
32
|
+
SUPPORTED_SIZES = [16, 32, 64, 128, 256, 512, 1024].freeze
|
|
33
|
+
|
|
34
|
+
# Initialize FFT kernel configuration.
#
# Only validates and stores the settings; nothing is compiled until compile!
# is called.
# @param size [Integer] FFT size (power of 2, <= 1024)
# @param dtype [Symbol] Data type (:complex64, :complex128)
# @param direction [Symbol] Direction (:forward, :inverse)
# @param elements_per_thread [Integer] Elements per thread (default: 8), must be positive
# @raise [ArgumentError] if size, direction or elements_per_thread is invalid
# @raise [UnsupportedDTypeError] if dtype is not a supported complex type
def initialize(size:, dtype: :complex64, direction: :forward, elements_per_thread: 8)
  validate_size!(size)
  validate_dtype!(dtype)
  validate_direction!(direction)

  # execute divides size by elements_per_thread to choose the block size, so
  # zero, negative or non-Integer values are rejected here instead of
  # surfacing later as ZeroDivisionError or an invalid launch.
  unless elements_per_thread.is_a?(Integer) && elements_per_thread.positive?
    raise ArgumentError,
          "elements_per_thread must be a positive Integer, got #{elements_per_thread.inspect}"
  end

  @size = size
  @dtype = dtype
  @direction = direction
  @elements_per_thread = elements_per_thread
  @compiled = false
  @kernel = nil
end
|
|
51
|
+
|
|
52
|
+
# Generate the CUDA source for this configuration and JIT-compile it.
#
# The compiled kernel is stored on the instance and the compiled flag is set
# once compilation returns.
# @param device_id [Integer] Target GPU device
# @return [self]
def compile!(device_id: 0)
  cuda_source = generate_source
  compile_settings = { device_id: device_id, options: nvrtc_options }

  @kernel = Ignis::JIT::Compiler.compile(cuda_source, "cufftdx_fft", **compile_settings)
  @compiled = true
  self
end
|
|
66
|
+
|
|
67
|
+
# Execute the FFT kernel.
#
# Launches one thread block per batched transform and returns the device
# array holding the result.
# @param input [NvArray] Input array (complex); must hold at least size * batch elements
# @param output [NvArray, nil] Output array (created if nil)
# @param batch [Integer] Number of batched FFTs (positive)
# @param stream [CUDA::Stream, nil] CUDA stream
# @return [NvArray] Result array
# @raise [StateError] if the kernel has not been compiled
# @raise [ArgumentError] if input, output or batch do not fit the configured FFT
def execute(input, output: nil, batch: 1, stream: nil)
  raise StateError, "Kernel not compiled. Call compile! first." unless @compiled

  validate_execution_input!(input)

  unless batch.is_a?(Integer) && batch.positive?
    raise ArgumentError, "batch must be a positive Integer, got #{batch.inspect}"
  end

  # The generated kernel indexes both buffers as [batch_idx * size + i], so
  # each must hold at least size * batch elements; a smaller buffer would be
  # accessed out of bounds on the device.
  required = @size * batch
  { "Input" => input, "Output" => output }.each do |label, array|
    next if array.nil?

    count = Array(array.shape).reduce(1, :*)
    next if count >= required

    raise ArgumentError,
          "#{label} has #{count} elements, need at least #{required} (size #{@size} x batch #{batch})"
  end

  # Ensure input is on device
  input_dev = input.on_device? ? input : input.to_device

  # Create output if needed
  output_dev = if output
                 output.on_device? ? output : output.to_device
               else
                 NvArray.zeros(input.shape, dtype: @dtype, device: input_dev.device_index).to_device
               end

  # Integer division yields 0 when elements_per_thread exceeds size, which is
  # not a valid block dimension. The kernel's loops stride by blockDim.x, so
  # any positive thread count produces the same result.
  threads_per_fft = [@size / @elements_per_thread, 1].max

  @kernel.launch(
    grid: [batch],
    block: [threads_per_fft],
    shared_memory: shared_memory_size,
    args: [
      input_dev.device_ptr,
      output_dev.device_ptr,
      @size,
      batch
    ],
    stream: stream
  )

  output_dev
end
|
|
108
|
+
|
|
109
|
+
# Drop the reference to the compiled kernel and mark this instance as not
# compiled, so execute refuses to run until compile! is called again.
# @return [void]
def destroy!
  @compiled = false
  @kernel = nil
  false # same value the method evaluated to before the statements were reordered
end
|
|
115
|
+
|
|
116
|
+
private
|
|
117
|
+
|
|
118
|
+
# Validate FFT size.
#
# @param size [Integer] requested FFT length
# @return [void]
# @raise [ArgumentError] unless size is an Integer power of two in 1..1024
def validate_size!(size)
  # The Integer check comes first so that nil, Float or String input raises
  # ArgumentError rather than NoMethodError from #positive? or #&.
  valid = size.is_a?(Integer) && size.between?(1, 1024) && (size & (size - 1)).zero?
  return if valid

  raise ArgumentError, "FFT size must be power of 2 <= 1024, got #{size}"
end
|
|
124
|
+
|
|
125
|
+
# Ensure the requested element type is one of the two complex types this
# kernel generates code for.
# @param dtype [Symbol] requested data type
# @return [void]
# @raise [UnsupportedDTypeError] for anything other than :complex64 / :complex128
def validate_dtype!(dtype)
  supported = %i[complex64 complex128]
  return if supported.include?(dtype)

  raise UnsupportedDTypeError.new(dtype, operation: "cuFFTDx FFT")
end
|
|
131
|
+
|
|
132
|
+
# Ensure the transform direction is one of the two supported symbols.
# @param direction [Symbol] requested direction
# @return [void]
# @raise [ArgumentError] unless direction is :forward or :inverse
def validate_direction!(direction)
  return if %i[forward inverse].include?(direction)

  raise ArgumentError, "Direction must be :forward or :inverse, got #{direction}"
end
|
|
138
|
+
|
|
139
|
+
# Check that the array handed to execute is an NvArray whose dtype matches
# the dtype this kernel was configured with.
# @param input [Object] candidate input array
# @return [void]
# @raise [ArgumentError] if input is not an NvArray or its dtype differs
def validate_execution_input!(input)
  raise ArgumentError, "Expected NvArray input" unless input.is_a?(NvArray)
  return if input.dtype == @dtype

  raise ArgumentError, "Input dtype must be #{@dtype}, got #{input.dtype}"
end
|
|
149
|
+
|
|
150
|
+
# Dynamic shared memory requested at launch: one complex element per FFT
# point, 8 bytes each for :complex64 and 16 bytes each otherwise.
# @return [Integer] Bytes of shared memory
def shared_memory_size
  element_bytes = if @dtype == :complex64
                    8
                  else
                    16
                  end

  @size * element_bytes
end
|
|
156
|
+
|
|
157
|
+
# Get NVRTC compilation options.
#
# Always passes the MathDx include directory, C++17 and NDEBUG. A CUDA toolkit
# include directory is appended when one is found: the CUDA_PATH and CUDA_HOME
# environment variables are consulted first, then the Windows CUDA v13.0
# install location that used to be the only candidate.
# @return [Array<String>]
def nvrtc_options
  opts = [
    "-I#{MathDx.include_path}",
    "--std=c++17",
    "-DNDEBUG"
  ]

  toolkit_roots = %w[CUDA_PATH CUDA_HOME].map { |name| ENV[name] }.compact.reject(&:empty?)
  candidates = toolkit_roots.map { |root| File.join(root, "include") }
  # Legacy default, kept last so existing installs behave as before.
  candidates << "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/include"

  cuda_include = candidates.find { |dir| File.directory?(dir) }
  opts << "-I#{cuda_include}" if cuda_include

  opts
end
|
|
171
|
+
|
|
172
|
+
# Build the CUDA C++ translation unit for this configuration.
#
# The element type (float2/double2), the sign of the twiddle angle and the
# final store (scaled by 1/N for the inverse direction) are chosen from
# @dtype and @direction; the FFT length itself is a runtime kernel argument.
# @return [String]
def generate_source
  double_precision = @dtype != :complex64
  complex_type = double_precision ? "double2" : "float2"
  base_type = double_precision ? "double" : "float"
  # Literal and math-function suffix: "f" for single precision, none for double.
  fsuffix = double_precision ? "" : "f"
  sign = @direction == :forward ? -1 : 1
  store = if @direction == :inverse
            "output[batch_idx * N + i] = make_#{complex_type}(shared[i].x / N, shared[i].y / N);"
          else
            "output[batch_idx * N + i] = shared[i];"
          end

  <<~CUDA
    #include <cuda_runtime.h>
    #include <math.h>

    // Complex multiplication
    __device__ inline #{complex_type} cmul(#{complex_type} a, #{complex_type} b) {
      return make_#{complex_type}(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
    }

    // Complex addition
    __device__ inline #{complex_type} cadd(#{complex_type} a, #{complex_type} b) {
      return make_#{complex_type}(a.x + b.x, a.y + b.y);
    }

    // Complex subtraction
    __device__ inline #{complex_type} csub(#{complex_type} a, #{complex_type} b) {
      return make_#{complex_type}(a.x - b.x, a.y - b.y);
    }

    // Twiddle factor
    __device__ inline #{complex_type} twiddle(int k, int N) {
      #{base_type} angle = #{sign} * 2.0#{fsuffix} * M_PI * k / N;
      return make_#{complex_type}(cos#{fsuffix}(angle), sin#{fsuffix}(angle));
    }

    // Cooley-Tukey radix-2 FFT in shared memory
    extern "C" __global__ void cufftdx_fft(
      const #{complex_type}* __restrict__ input,
      #{complex_type}* __restrict__ output,
      int N,
      int batch
    ) {
      extern __shared__ #{complex_type} shared[];

      int batch_idx = blockIdx.x;
      int tid = threadIdx.x;
      int threads = blockDim.x;

      // Load data into shared memory with bit-reversal
      for (int i = tid; i < N; i += threads) {
        // Bit-reversal permutation
        int rev = 0;
        int val = i;
        for (int j = 1; j < N; j <<= 1) {
          rev = (rev << 1) | (val & 1);
          val >>= 1;
        }
        shared[i] = input[batch_idx * N + rev];
      }
      __syncthreads();

      // Iterative Cooley-Tukey FFT
      for (int s = 1; s < N; s <<= 1) {
        int m = s << 1;
        for (int k = tid; k < N / 2; k += threads) {
          int j = k / s;
          int i = k % s;
          int idx1 = j * m + i;
          int idx2 = idx1 + s;

          #{complex_type} w = twiddle(i, m);
          #{complex_type} t = cmul(w, shared[idx2]);
          #{complex_type} u = shared[idx1];

          shared[idx1] = cadd(u, t);
          shared[idx2] = csub(u, t);
        }
        __syncthreads();
      }

      // Write output
      for (int i = tid; i < N; i += threads) {
        #{store}
      }
    }
  CUDA
end
|
|
256
|
+
end
|
|
257
|
+
end
|
|
258
|
+
end
|
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module MathDx
|
|
5
|
+
# Device-side GEMM kernel using cuBLASDx
|
|
6
|
+
# Generates and compiles CUDA C++ code that uses cuBLASDx for thread block GEMM
|
|
7
|
+
#
|
|
8
|
+
# cuBLASDx enables embedding GEMM operations inside CUDA kernels, allowing
|
|
9
|
+
# fusion with other operations (epilogs) to reduce memory bandwidth.
|
|
10
|
+
#
|
|
11
|
+
# @example Basic GEMM kernel
|
|
12
|
+
# kernel = GemmKernel.new(m: 64, n: 64, k: 64, dtype: :float16)
|
|
13
|
+
# kernel.compile!
|
|
14
|
+
# kernel.execute(a, b, c)
|
|
15
|
+
class GemmKernel
|
|
16
|
+
# @return [Integer] Matrix M dimension
|
|
17
|
+
attr_reader :m
|
|
18
|
+
|
|
19
|
+
# @return [Integer] Matrix N dimension
|
|
20
|
+
attr_reader :n
|
|
21
|
+
|
|
22
|
+
# @return [Integer] Matrix K dimension
|
|
23
|
+
attr_reader :k
|
|
24
|
+
|
|
25
|
+
# @return [Symbol] Data type
|
|
26
|
+
attr_reader :dtype
|
|
27
|
+
|
|
28
|
+
# @return [Symbol, nil] Epilog operation
|
|
29
|
+
attr_reader :epilog
|
|
30
|
+
|
|
31
|
+
# @return [Integer] Thread block size
|
|
32
|
+
attr_reader :block_size
|
|
33
|
+
|
|
34
|
+
# @return [Boolean] Whether kernel is compiled
|
|
35
|
+
attr_reader :compiled
|
|
36
|
+
|
|
37
|
+
# Initialize GEMM kernel configuration.
#
# Only validates and stores the settings; nothing is compiled until compile!
# is called.
# @param m [Integer] Output rows (must be power of 2, <= 128)
# @param n [Integer] Output columns (must be power of 2, <= 128)
# @param k [Integer] Inner dimension (must be power of 2, <= 128)
# @param dtype [Symbol] Data type (:float16, :float32, :float64)
# @param epilog [Symbol, nil] Epilog (:relu, :gelu, :sigmoid, :tanh, nil)
# @param block_size [Integer] Threads per block (1..1024)
# @raise [ArgumentError] if a dimension, the epilog or block_size is invalid
# @raise [UnsupportedDTypeError] if dtype is not a supported floating type
def initialize(m:, n:, k:, dtype: :float32, epilog: nil, block_size: 128)
  validate_dimensions!(m, n, k)
  validate_dtype!(dtype)
  validate_epilog!(epilog) if epilog

  # block_size is passed straight to the launch as the block dimension; CUDA
  # allows at most 1024 threads per block, so anything outside 1..1024 could
  # only fail later at launch time.
  unless block_size.is_a?(Integer) && block_size.between?(1, 1024)
    raise ArgumentError, "block_size must be an Integer in 1..1024, got #{block_size.inspect}"
  end

  @m = m
  @n = n
  @k = k
  @dtype = dtype
  @epilog = epilog
  @block_size = block_size
  @compiled = false
  @kernel = nil
end
|
|
58
|
+
|
|
59
|
+
# Generate the CUDA source for this configuration and JIT-compile it.
#
# The compiled kernel is stored on the instance and the compiled flag is set
# once compilation returns.
# @param device_id [Integer] Target GPU device
# @return [self]
def compile!(device_id: 0)
  cuda_source = generate_source
  compile_settings = { device_id: device_id, options: nvrtc_options }

  @kernel = Ignis::JIT::Compiler.compile(cuda_source, "cublasdx_gemm", **compile_settings)
  @compiled = true
  self
end
|
|
73
|
+
|
|
74
|
+
# Run the compiled kernel: C = alpha * A @ B + beta * C.
#
# Operands that are not already on the device are copied there first. The
# grid has one block per (m x n) tile of the output, and the row lengths of
# A, B and C are passed as their leading dimensions.
# @param a [NvArray] Input matrix A (M x K)
# @param b [NvArray] Input matrix B (K x N)
# @param c [NvArray] Output matrix C (M x N)
# @param alpha [Float] Scaling factor for A @ B
# @param beta [Float] Scaling factor for C
# @param stream [CUDA::Stream, nil] CUDA stream
# @return [NvArray] Result matrix C
def execute(a, b, c, alpha: 1.0, beta: 0.0, stream: nil)
  raise StateError, "Kernel not compiled. Call compile! first." unless @compiled

  validate_execution_inputs!(a, b, c)

  a_dev, b_dev, c_dev = [a, b, c].map { |array| array.on_device? ? array : array.to_device }

  rows = a_dev.shape[0]
  inner = a_dev.shape[1]
  cols = b_dev.shape[1]

  # Ceiling division: number of tiles needed to cover each output dimension.
  tiles_m = (rows + @m - 1) / @m
  tiles_n = (cols + @n - 1) / @n

  kernel_args = [
    a_dev.device_ptr,
    b_dev.device_ptr,
    c_dev.device_ptr,
    rows,           # M
    cols,           # N
    inner,          # K
    inner,          # lda
    cols,           # ldb
    c_dev.shape[1], # ldc
    alpha,
    beta
  ]

  @kernel.launch(grid: [tiles_m, tiles_n], block: [@block_size], args: kernel_args, stream: stream)

  c_dev
end
|
|
118
|
+
|
|
119
|
+
# Drop the reference to the compiled kernel and mark this instance as not
# compiled, so execute refuses to run until compile! is called again.
# @return [void]
def destroy!
  @compiled = false
  @kernel = nil
  false # same value the method evaluated to before the statements were reordered
end
|
|
125
|
+
|
|
126
|
+
private
|
|
127
|
+
|
|
128
|
+
# Validate the three tile dimensions.
#
# @param m [Integer] tile rows
# @param n [Integer] tile columns
# @param k [Integer] tile inner dimension
# @return [Array] the checked dimensions
# @raise [ArgumentError] unless every dimension is an Integer power of two in 1..128
def validate_dimensions!(m, n, k)
  [m, n, k].each do |dim|
    # The Integer check comes first so that nil, Float or String input raises
    # ArgumentError rather than NoMethodError from #positive? or #&.
    next if dim.is_a?(Integer) && dim.between?(1, 128) && (dim & (dim - 1)).zero?

    raise ArgumentError, "Dimensions must be positive powers of 2 <= 128, got #{dim}"
  end
end
|
|
136
|
+
|
|
137
|
+
# Ensure the requested element type is one of the floating-point types this
# kernel generates code for.
# @param dtype [Symbol] requested data type
# @return [void]
# @raise [UnsupportedDTypeError] for anything other than :float16 / :float32 / :float64
def validate_dtype!(dtype)
  supported = %i[float16 float32 float64]
  return if supported.include?(dtype)

  raise UnsupportedDTypeError.new(dtype, operation: "cuBLASDx GEMM")
end
|
|
143
|
+
|
|
144
|
+
# Ensure the epilog names one of the activation functions the generator
# knows how to emit.
# @param epilog [Symbol] requested epilog
# @return [void]
# @raise [ArgumentError] unless epilog is :relu, :gelu, :sigmoid or :tanh
def validate_epilog!(epilog)
  return if %i[relu gelu sigmoid tanh].include?(epilog)

  raise ArgumentError, "Unknown epilog: #{epilog}. Supported: relu, gelu, sigmoid, tanh"
end
|
|
150
|
+
|
|
151
|
+
# Validate the operands handed to execute.
#
# Besides type and dtype, the shapes are checked: execute passes a.shape and
# b.shape to the kernel as M, N, K and the leading dimensions, so operands
# that disagree would make the kernel index past the end of a buffer.
# @param a [NvArray] matrix A (M x K)
# @param b [NvArray] matrix B (K x N)
# @param c [NvArray] matrix C (at least M x N)
# @return [void]
# @raise [ArgumentError] on wrong type, dtype or incompatible shapes
def validate_execution_inputs!(a, b, c)
  operands = [a, b, c]

  unless operands.all? { |array| array.is_a?(NvArray) }
    raise ArgumentError, "Expected NvArray inputs"
  end

  unless operands.all? { |array| array.dtype == @dtype }
    raise ArgumentError, "Array dtypes must match kernel dtype #{@dtype}"
  end

  # A missing second dimension shows up as nil here.
  unless operands.all? { |array| array.shape[0].is_a?(Integer) && array.shape[1].is_a?(Integer) }
    raise ArgumentError, "Expected 2-D arrays for A, B and C"
  end

  rows, inner = a.shape[0], a.shape[1]
  cols = b.shape[1]

  unless b.shape[0] == inner
    raise ArgumentError,
          "Inner dimensions must agree: A is #{rows}x#{inner}, B is #{b.shape[0]}x#{cols}"
  end

  unless c.shape[0] >= rows && c.shape[1] >= cols
    raise ArgumentError,
          "C must be at least #{rows}x#{cols}, got #{c.shape[0]}x#{c.shape[1]}"
  end
end
|
|
161
|
+
|
|
162
|
+
# Get NVRTC compilation options.
#
# Always passes the MathDx include directory, C++17 and NDEBUG. A CUDA toolkit
# include directory is appended when one is found: the CUDA_PATH and CUDA_HOME
# environment variables are consulted first, then the Windows CUDA v13.0
# install location that used to be the only candidate.
# @return [Array<String>]
def nvrtc_options
  opts = [
    "-I#{MathDx.include_path}",
    "--std=c++17",
    "-DNDEBUG"
  ]

  toolkit_roots = %w[CUDA_PATH CUDA_HOME].map { |name| ENV[name] }.compact.reject(&:empty?)
  candidates = toolkit_roots.map { |root| File.join(root, "include") }
  # Legacy default, kept last so existing installs behave as before.
  candidates << "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/include"

  cuda_include = candidates.find { |dir| File.directory?(dir) }
  opts << "-I#{cuda_include}" if cuda_include

  opts
end
|
|
177
|
+
|
|
178
|
+
# Generate CUDA C++ source code for the tiled GEMM kernel.
#
# The tile sizes (@m, @n, @k), the element type and the block size are baked
# into the source. Each block computes one TILE_M x TILE_N tile of C. Work is
# distributed with block-strided loops: every thread loads several elements
# of the A/B tiles and owns several output elements, so the kernel is correct
# for any block size. The previous mapping (one thread per output element via
# threadIdx.x % TILE_N and / TILE_N) only covered the tile when block_size
# equalled TILE_M * TILE_N and mis-indexed the shared tiles unless M = N = K.
#
# NOTE(review): As and Bs are static shared arrays; large tile/dtype
# combinations may exceed the per-block static shared memory limit — confirm
# against the target device.
# @return [String]
def generate_source
  cuda_type = { float16: "__half", float32: "float", float64: "double" }[@dtype]
  epilog_code = generate_epilog_code

  <<~CUDA
    #include <cuda_fp16.h>

    // cuBLASDx fallback implementation when SDK not available
    // Uses shared memory tiled GEMM algorithm

    #{epilog_code[:device_function]}

    extern "C" __global__ void cublasdx_gemm(
      const #{cuda_type}* __restrict__ A,
      const #{cuda_type}* __restrict__ B,
      #{cuda_type}* __restrict__ C,
      int M, int N, int K,
      int lda, int ldb, int ldc,
      #{cuda_type} alpha, #{cuda_type} beta
    ) {
      // Thread block GEMM with shared memory tiling
      const int TILE_M = #{@m};
      const int TILE_N = #{@n};
      const int TILE_K = #{@k};
      // Threads per block; must match the block size used at launch
      const int BLOCK_THREADS = #{@block_size};
      // Output elements of the tile owned by each thread
      const int OUTPUTS_PER_THREAD = (TILE_M * TILE_N + BLOCK_THREADS - 1) / BLOCK_THREADS;

      __shared__ #{cuda_type} As[TILE_M][TILE_K];
      __shared__ #{cuda_type} Bs[TILE_K][TILE_N];

      const int tid = threadIdx.x;
      const int row0 = blockIdx.x * TILE_M;
      const int col0 = blockIdx.y * TILE_N;

      #{cuda_type} acc[OUTPUTS_PER_THREAD];
      for (int o = 0; o < OUTPUTS_PER_THREAD; ++o) {
        acc[o] = 0;
      }

      for (int t = 0; t < (K + TILE_K - 1) / TILE_K; ++t) {
        const int k0 = t * TILE_K;

        // Load tile of A into shared memory (zero-padded at the edges)
        for (int idx = tid; idx < TILE_M * TILE_K; idx += BLOCK_THREADS) {
          const int r = idx / TILE_K;
          const int c = idx % TILE_K;
          if (row0 + r < M && k0 + c < K) {
            As[r][c] = A[(row0 + r) * lda + k0 + c];
          } else {
            As[r][c] = 0;
          }
        }

        // Load tile of B into shared memory (zero-padded at the edges)
        for (int idx = tid; idx < TILE_K * TILE_N; idx += BLOCK_THREADS) {
          const int r = idx / TILE_N;
          const int c = idx % TILE_N;
          if (k0 + r < K && col0 + c < N) {
            Bs[r][c] = B[(k0 + r) * ldb + col0 + c];
          } else {
            Bs[r][c] = 0;
          }
        }

        __syncthreads();

        // Accumulate the partial dot product of every output this thread owns
        for (int o = 0; o < OUTPUTS_PER_THREAD; ++o) {
          const int idx = tid + o * BLOCK_THREADS;
          if (idx < TILE_M * TILE_N) {
            const int r = idx / TILE_N;
            const int c = idx % TILE_N;
            #pragma unroll
            for (int k = 0; k < TILE_K; ++k) {
              acc[o] += As[r][k] * Bs[k][c];
            }
          }
        }

        __syncthreads();
      }

      // Write result with epilog
      for (int o = 0; o < OUTPUTS_PER_THREAD; ++o) {
        const int idx = tid + o * BLOCK_THREADS;
        if (idx < TILE_M * TILE_N) {
          const int row = row0 + idx / TILE_N;
          const int col = col0 + idx % TILE_N;
          if (row < M && col < N) {
            #{cuda_type} result = alpha * acc[o] + beta * C[row * ldc + col];
            #{epilog_code[:apply]}
            C[row * ldc + col] = result;
          }
        }
      }
    }
  CUDA
end
|
|
258
|
+
|
|
259
|
+
# Generate epilog code for the configured activation.
#
# For :float64 kernels the helper functions are emitted in double precision.
# The float versions would round every result through float and silently
# lose precision. For :float16 and :float32 the emitted code is unchanged.
# @return [Hash] :device_function (helper definition, may be empty) and
#   :apply (statement applied to `result`, may be empty)
def generate_epilog_code
  double = @dtype == :float64

  case @epilog
  when :relu
    helper = if double
               "__device__ inline double relu(double x) { return fmax(x, 0.0); }"
             else
               "__device__ inline float relu(float x) { return fmaxf(x, 0.0f); }"
             end
    { device_function: helper, apply: "result = relu(result);" }
  when :gelu
    # tanh approximation of GELU; 0.79788456... is sqrt(2 / pi)
    helper = if double
               "__device__ inline double gelu(double x) { " \
               "return 0.5 * x * (1.0 + tanh(0.7978845608028654 * (x + 0.044715 * x * x * x))); }"
             else
               "__device__ inline float gelu(float x) { " \
               "return 0.5f * x * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x))); }"
             end
    { device_function: helper, apply: "result = gelu(result);" }
  when :sigmoid
    helper = if double
               "__device__ inline double sigmoid(double x) { return 1.0 / (1.0 + exp(-x)); }"
             else
               "__device__ inline float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); }"
             end
    { device_function: helper, apply: "result = sigmoid(result);" }
  when :tanh
    { device_function: "", apply: double ? "result = tanh(result);" : "result = tanhf(result);" }
  else
    { device_function: "", apply: "" }
  end
end
|
|
288
|
+
end
|
|
289
|
+
|
|
290
|
+
# Raised when a kernel operation is attempted in the wrong state, e.g.
# execute being called before compile!.
class StateError < StandardError
end
|
|
292
|
+
end
|
|
293
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Ignis MathDx Module
|
|
4
|
+
# Device-side BLAS/FFT extensions using cuBLASDx and cuFFTDx
|
|
5
|
+
|
|
6
|
+
require_relative "mathdx/gemm_kernel"
|
|
7
|
+
require_relative "mathdx/fft_kernel"
|
|
8
|
+
|
|
9
|
+
module Ignis
|
|
10
|
+
# MathDx module for device-side extensions
|
|
11
|
+
# Provides high-performance fused kernel generation using cuBLASDx and cuFFTDx
|
|
12
|
+
#
|
|
13
|
+
# Note: MathDx libraries (cuBLASDx, cuFFTDx) are header-only C++ template libraries
|
|
14
|
+
# that generate optimized device code at compile time. This module generates
|
|
15
|
+
# CUDA C++ source code that uses MathDx and compiles it via NVRTC.
|
|
16
|
+
#
|
|
17
|
+
# @example Fused GEMM + epilog kernel
|
|
18
|
+
# kernel = Ignis::MathDx::GemmKernel.new(
|
|
19
|
+
# m: 64, n: 64, k: 64,
|
|
20
|
+
# dtype: :float16,
|
|
21
|
+
# epilog: :relu
|
|
22
|
+
# )
|
|
23
|
+
# kernel.compile!
|
|
24
|
+
# kernel.execute(a, b, c)
|
|
25
|
+
module MathDx
|
|
26
|
+
# MathDx installation path (set via environment or default)
|
|
27
|
+
MATHDX_PATH = ENV.fetch("MATHDX_PATH", "C:/Program Files/NVIDIA/MathDx")
|
|
28
|
+
|
|
29
|
+
# Report whether a MathDx SDK is installed under MATHDX_PATH, i.e. whether
# either the cuBLASDx or the cuFFTDx umbrella header exists in its include
# directory. Any error raised while probing is treated as "not available".
# @return [Boolean]
def self.available?
  headers = %w[cublasdx.hpp cufftdx.hpp]
  headers.any? { |header| File.exist?(File.join(MATHDX_PATH, "include", header)) }
rescue StandardError
  false
end
|
|
38
|
+
|
|
39
|
+
# Directory handed to NVRTC with -I so the MathDx headers can be found:
# the "include" directory under MATHDX_PATH.
# @return [String]
def self.include_path
  sdk_root = MATHDX_PATH
  File.join(sdk_root, "include")
end
|
|
44
|
+
|
|
45
|
+
# Supported compute capabilities for MathDx
|
|
46
|
+
SUPPORTED_SM_VERSIONS = [70, 75, 80, 86, 89, 90].freeze
|
|
47
|
+
|
|
48
|
+
class << self
|
|
49
|
+
# Convenience constructor for a GemmKernel; all arguments are forwarded
# unchanged to GemmKernel.new.
# @param m [Integer] Output rows
# @param n [Integer] Output columns
# @param k [Integer] Inner dimension
# @param dtype [Symbol] Data type (:float16, :float32, :float64)
# @param epilog [Symbol, nil] Epilog operation (:relu, :gelu, :sigmoid, :tanh, nil)
# @param block_size [Integer] Thread block size
# @return [GemmKernel]
def gemm_kernel(m:, n:, k:, dtype: :float32, epilog: nil, block_size: 128)
  settings = { m: m, n: n, k: k, dtype: dtype, epilog: epilog, block_size: block_size }
  GemmKernel.new(**settings)
end
|
|
60
|
+
|
|
61
|
+
# Convenience constructor for an FftKernel; all arguments are forwarded
# unchanged to FftKernel.new.
# @param size [Integer] FFT size
# @param dtype [Symbol] Data type (:complex64, :complex128)
# @param direction [Symbol] Direction (:forward, :inverse)
# @param elements_per_thread [Integer] Elements per thread
# @return [FftKernel]
def fft_kernel(size:, dtype: :complex64, direction: :forward, elements_per_thread: 8)
  settings = {
    size: size,
    dtype: dtype,
    direction: direction,
    elements_per_thread: elements_per_thread
  }
  FftKernel.new(**settings)
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
end
|