ignis-numerics 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,258 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module MathDx
5
+ # Device-side FFT kernel using cuFFTDx patterns
6
+ # Generates and compiles CUDA C++ code for thread block FFT operations
7
+ #
8
+ # cuFFTDx enables embedding FFT operations inside CUDA kernels, allowing
9
+ # fusion with other operations to reduce memory bandwidth.
10
+ #
11
+ # @example Basic FFT kernel
12
+ # kernel = FftKernel.new(size: 64, dtype: :complex64, direction: :forward)
13
+ # kernel.compile!
14
+ # kernel.execute(input, output: output)
15
+ class FftKernel
16
+ # @return [Integer] FFT size
17
+ attr_reader :size
18
+
19
+ # @return [Symbol] Data type
20
+ attr_reader :dtype
21
+
22
+ # @return [Symbol] FFT direction
23
+ attr_reader :direction
24
+
25
+ # @return [Integer] Elements per thread
26
+ attr_reader :elements_per_thread
27
+
28
+ # @return [Boolean] Whether kernel is compiled
29
+ attr_reader :compiled
30
+
31
+ # Supported FFT sizes (powers of 2)
32
+ SUPPORTED_SIZES = [16, 32, 64, 128, 256, 512, 1024].freeze
33
+
34
+ # Initialize FFT kernel configuration
35
+ # @param size [Integer] FFT size (power of 2, <= 1024)
36
+ # @param dtype [Symbol] Data type (:complex64, :complex128)
37
+ # @param direction [Symbol] Direction (:forward, :inverse)
38
+ # @param elements_per_thread [Integer] Elements per thread (default: 8)
39
+ def initialize(size:, dtype: :complex64, direction: :forward, elements_per_thread: 8)
40
+ validate_size!(size)
41
+ validate_dtype!(dtype)
42
+ validate_direction!(direction)
43
+
44
+ @size = size
45
+ @dtype = dtype
46
+ @direction = direction
47
+ @elements_per_thread = elements_per_thread
48
+ @compiled = false
49
+ @kernel = nil
50
+ end
51
+
52
+ # Compile the FFT kernel
53
+ # @param device_id [Integer] Target GPU device
54
+ # @return [self]
55
+ def compile!(device_id: 0)
56
+ source = generate_source
57
+ @kernel = Ignis::JIT::Compiler.compile(
58
+ source,
59
+ "cufftdx_fft",
60
+ device_id: device_id,
61
+ options: nvrtc_options
62
+ )
63
+ @compiled = true
64
+ self
65
+ end
66
+
67
+ # Execute the FFT kernel
68
+ # @param input [NvArray] Input array (complex)
69
+ # @param output [NvArray, nil] Output array (created if nil)
70
+ # @param batch [Integer] Number of batched FFTs
71
+ # @param stream [CUDA::Stream, nil] CUDA stream
72
+ # @return [NvArray] Result array
73
+ def execute(input, output: nil, batch: 1, stream: nil)
74
+ raise StateError, "Kernel not compiled. Call compile! first." unless @compiled
75
+
76
+ validate_execution_input!(input)
77
+
78
+ # Ensure input is on device
79
+ input_dev = input.on_device? ? input : input.to_device
80
+
81
+ # Create output if needed
82
+ output_dev = if output
83
+ output.on_device? ? output : output.to_device
84
+ else
85
+ NvArray.zeros(input.shape, dtype: @dtype, device: input_dev.device_index).to_device
86
+ end
87
+
88
+ # Calculate grid dimensions
89
+ threads_per_fft = @size / @elements_per_thread
90
+ blocks = batch
91
+
92
+ # Launch kernel
93
+ @kernel.launch(
94
+ grid: [blocks],
95
+ block: [threads_per_fft],
96
+ shared_memory: shared_memory_size,
97
+ args: [
98
+ input_dev.device_ptr,
99
+ output_dev.device_ptr,
100
+ @size,
101
+ batch
102
+ ],
103
+ stream: stream
104
+ )
105
+
106
+ output_dev
107
+ end
108
+
109
+ # Release kernel resources
110
+ # @return [void]
111
+ def destroy!
112
+ @kernel = nil
113
+ @compiled = false
114
+ end
115
+
116
+ private
117
+
118
+ # Validate FFT size
119
+ def validate_size!(size)
120
+ unless size.positive? && (size & (size - 1)).zero? && size <= 1024
121
+ raise ArgumentError, "FFT size must be power of 2 <= 1024, got #{size}"
122
+ end
123
+ end
124
+
125
+ # Validate data type
126
+ def validate_dtype!(dtype)
127
+ unless %i[complex64 complex128].include?(dtype)
128
+ raise UnsupportedDTypeError.new(dtype, operation: "cuFFTDx FFT")
129
+ end
130
+ end
131
+
132
+ # Validate direction
133
+ def validate_direction!(direction)
134
+ unless %i[forward inverse].include?(direction)
135
+ raise ArgumentError, "Direction must be :forward or :inverse, got #{direction}"
136
+ end
137
+ end
138
+
139
+ # Validate execution input
140
+ def validate_execution_input!(input)
141
+ unless input.is_a?(NvArray)
142
+ raise ArgumentError, "Expected NvArray input"
143
+ end
144
+
145
+ unless input.dtype == @dtype
146
+ raise ArgumentError, "Input dtype must be #{@dtype}, got #{input.dtype}"
147
+ end
148
+ end
149
+
150
+ # Calculate shared memory size
151
+ # @return [Integer] Bytes of shared memory
152
+ def shared_memory_size
153
+ bytes_per_element = @dtype == :complex64 ? 8 : 16
154
+ @size * bytes_per_element
155
+ end
156
+
157
+ # Get NVRTC compilation options
158
+ # @return [Array<String>]
159
+ def nvrtc_options
160
+ opts = [
161
+ "-I#{MathDx.include_path}",
162
+ "--std=c++17",
163
+ "-DNDEBUG"
164
+ ]
165
+
166
+ cuda_include = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/include"
167
+ opts << "-I#{cuda_include}" if File.exist?(cuda_include)
168
+
169
+ opts
170
+ end
171
+
172
+ # Generate CUDA C++ source code
173
+ # @return [String]
174
+ def generate_source
175
+ complex_type = @dtype == :complex64 ? "float2" : "double2"
176
+ base_type = @dtype == :complex64 ? "float" : "double"
177
+ sign = @direction == :forward ? -1 : 1
178
+
179
+ <<~CUDA
180
+ #include <cuda_runtime.h>
181
+ #include <math.h>
182
+
183
+ // Complex multiplication
184
+ __device__ inline #{complex_type} cmul(#{complex_type} a, #{complex_type} b) {
185
+ return make_#{complex_type}(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
186
+ }
187
+
188
+ // Complex addition
189
+ __device__ inline #{complex_type} cadd(#{complex_type} a, #{complex_type} b) {
190
+ return make_#{complex_type}(a.x + b.x, a.y + b.y);
191
+ }
192
+
193
+ // Complex subtraction
194
+ __device__ inline #{complex_type} csub(#{complex_type} a, #{complex_type} b) {
195
+ return make_#{complex_type}(a.x - b.x, a.y - b.y);
196
+ }
197
+
198
+ // Twiddle factor
199
+ __device__ inline #{complex_type} twiddle(int k, int N) {
200
+ #{base_type} angle = #{sign} * 2.0#{base_type == "double" ? "" : "f"} * M_PI * k / N;
201
+ return make_#{complex_type}(cos#{base_type == "double" ? "" : "f"}(angle), sin#{base_type == "double" ? "" : "f"}(angle));
202
+ }
203
+
204
+ // Cooley-Tukey radix-2 FFT in shared memory
205
+ extern "C" __global__ void cufftdx_fft(
206
+ const #{complex_type}* __restrict__ input,
207
+ #{complex_type}* __restrict__ output,
208
+ int N,
209
+ int batch
210
+ ) {
211
+ extern __shared__ #{complex_type} shared[];
212
+
213
+ int batch_idx = blockIdx.x;
214
+ int tid = threadIdx.x;
215
+ int threads = blockDim.x;
216
+
217
+ // Load data into shared memory with bit-reversal
218
+ for (int i = tid; i < N; i += threads) {
219
+ // Bit-reversal permutation
220
+ int rev = 0;
221
+ int val = i;
222
+ for (int j = 1; j < N; j <<= 1) {
223
+ rev = (rev << 1) | (val & 1);
224
+ val >>= 1;
225
+ }
226
+ shared[i] = input[batch_idx * N + rev];
227
+ }
228
+ __syncthreads();
229
+
230
+ // Iterative Cooley-Tukey FFT
231
+ for (int s = 1; s < N; s <<= 1) {
232
+ int m = s << 1;
233
+ for (int k = tid; k < N / 2; k += threads) {
234
+ int j = k / s;
235
+ int i = k % s;
236
+ int idx1 = j * m + i;
237
+ int idx2 = idx1 + s;
238
+
239
+ #{complex_type} w = twiddle(i, m);
240
+ #{complex_type} t = cmul(w, shared[idx2]);
241
+ #{complex_type} u = shared[idx1];
242
+
243
+ shared[idx1] = cadd(u, t);
244
+ shared[idx2] = csub(u, t);
245
+ }
246
+ __syncthreads();
247
+ }
248
+
249
+ // Write output
250
+ for (int i = tid; i < N; i += threads) {
251
+ #{@direction == :inverse ? "output[batch_idx * N + i] = make_#{complex_type}(shared[i].x / N, shared[i].y / N);" : "output[batch_idx * N + i] = shared[i];"}
252
+ }
253
+ }
254
+ CUDA
255
+ end
256
+ end
257
+ end
258
+ end
@@ -0,0 +1,293 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module MathDx
5
+ # Device-side GEMM kernel using cuBLASDx
6
+ # Generates and compiles CUDA C++ code that uses cuBLASDx for thread block GEMM
7
+ #
8
+ # cuBLASDx enables embedding GEMM operations inside CUDA kernels, allowing
9
+ # fusion with other operations (epilogs) to reduce memory bandwidth.
10
+ #
11
+ # @example Basic GEMM kernel
12
+ # kernel = GemmKernel.new(m: 64, n: 64, k: 64, dtype: :float16)
13
+ # kernel.compile!
14
+ # kernel.execute(a, b, c)
15
class GemmKernel
  # Largest tile dimension accepted for m, n and k.
  MAX_DIMENSION = 128

  # Largest thread block size accepted by the constructor.
  MAX_BLOCK_SIZE = 1024

  # Statically declared __shared__ arrays are limited to 48 KB per block.
  STATIC_SHARED_LIMIT_BYTES = 48 * 1024

  # CUDA toolkit include directory used when neither CUDA_PATH nor CUDA_HOME
  # points at an existing toolkit.
  DEFAULT_CUDA_INCLUDE = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/include"

  # CUDA storage type emitted for each supported dtype.
  CUDA_TYPES = { float16: "__half", float32: "float", float64: "double" }.freeze

  # Size in bytes of one element of each supported dtype.
  ELEMENT_BYTES = { float16: 2, float32: 4, float64: 8 }.freeze

  # Supported epilog operations.
  EPILOGS = %i[relu gelu sigmoid tanh].freeze

  # @return [Integer] Matrix M dimension
  attr_reader :m

  # @return [Integer] Matrix N dimension
  attr_reader :n

  # @return [Integer] Matrix K dimension
  attr_reader :k

  # @return [Symbol] Data type
  attr_reader :dtype

  # @return [Symbol, nil] Epilog operation
  attr_reader :epilog

  # @return [Integer] Thread block size
  attr_reader :block_size

  # @return [Boolean] Whether kernel is compiled
  attr_reader :compiled

  # Initialize GEMM kernel configuration
  # @param m [Integer] Tile rows (must be power of 2, <= 128)
  # @param n [Integer] Tile columns (must be power of 2, <= 128)
  # @param k [Integer] Tile inner dimension (must be power of 2, <= 128)
  # @param dtype [Symbol] Data type (:float16, :float32, :float64)
  # @param epilog [Symbol, nil] Epilog (:relu, :gelu, :sigmoid, :tanh, nil)
  # @param block_size [Integer] Threads per block (1..1024)
  # @raise [ArgumentError] if a dimension, the epilog or block_size is invalid
  # @raise [UnsupportedDTypeError] if dtype is not supported
  def initialize(m:, n:, k:, dtype: :float32, epilog: nil, block_size: 128)
    validate_dimensions!(m, n, k)
    validate_dtype!(dtype)
    validate_epilog!(epilog) if epilog
    validate_block_size!(block_size)

    @m = m
    @n = n
    @k = k
    @dtype = dtype
    @epilog = epilog
    @block_size = block_size
    @compiled = false
    @kernel = nil
  end

  # Predicate form of {#compiled}.
  # @return [Boolean] Whether kernel is compiled
  def compiled?
    @compiled
  end

  # Compile the GEMM kernel
  # @param device_id [Integer] Target GPU device
  # @return [self]
  # @raise [ArgumentError] if the two tiles exceed the static shared-memory limit
  def compile!(device_id: 0)
    ensure_tiles_fit_shared_memory!

    @kernel = Ignis::JIT::Compiler.compile(
      generate_source,
      "cublasdx_gemm",
      device_id: device_id,
      options: nvrtc_options
    )
    @compiled = true
    self
  end

  # Execute the GEMM kernel: C = alpha * A @ B + beta * C
  # @param a [NvArray] Input matrix A (M x K)
  # @param b [NvArray] Input matrix B (K x N)
  # @param c [NvArray] Output matrix C (M x N)
  # @param alpha [Float] Scaling factor for A @ B
  # @param beta [Float] Scaling factor for C
  # @param stream [CUDA::Stream, nil] CUDA stream
  # @return [NvArray] Result matrix C
  # @raise [StateError] if compile! has not been called
  # @raise [ArgumentError] if the inputs have the wrong type, dtype or shapes
  def execute(a, b, c, alpha: 1.0, beta: 0.0, stream: nil)
    raise StateError, "Kernel not compiled. Call compile! first." unless @compiled

    validate_execution_inputs!(a, b, c)

    # Ensure arrays are on device
    a_dev, b_dev, c_dev = [a, b, c].map { |array| array.on_device? ? array : array.to_device }

    rows = a_dev.shape[0]
    cols = b_dev.shape[1]
    inner = a_dev.shape[1]

    # One block per (m x n) output tile, rounding up at the matrix edges.
    @kernel.launch(
      grid: [(rows + @m - 1) / @m, (cols + @n - 1) / @n],
      block: [@block_size],
      args: [
        a_dev.device_ptr,
        b_dev.device_ptr,
        c_dev.device_ptr,
        rows,            # M
        cols,            # N
        inner,           # K
        inner,           # lda
        cols,            # ldb
        c_dev.shape[1],  # ldc
        alpha,
        beta
      ],
      stream: stream
    )

    c_dev
  end

  # Release kernel resources
  # @return [void]
  def destroy!
    @kernel = nil
    @compiled = false
  end

  private

  # Validate dimensions: each a positive Integer power of two <= MAX_DIMENSION
  def validate_dimensions!(m, n, k)
    [m, n, k].each do |dim|
      next if dim.is_a?(Integer) && dim.positive? && (dim & (dim - 1)).zero? && dim <= MAX_DIMENSION

      raise ArgumentError, "Dimensions must be positive powers of 2 <= 128, got #{dim}"
    end
  end

  # Validate data type
  def validate_dtype!(dtype)
    return if CUDA_TYPES.key?(dtype)

    raise UnsupportedDTypeError.new(dtype, operation: "cuBLASDx GEMM")
  end

  # Validate epilog
  def validate_epilog!(epilog)
    return if EPILOGS.include?(epilog)

    raise ArgumentError, "Unknown epilog: #{epilog}. Supported: relu, gelu, sigmoid, tanh"
  end

  # Validate thread block size; it is both the launch block dimension and the
  # divisor used to size the per-thread accumulator array in the kernel.
  def validate_block_size!(block_size)
    return if block_size.is_a?(Integer) && block_size.between?(1, MAX_BLOCK_SIZE)

    raise ArgumentError,
          "block_size must be an Integer between 1 and #{MAX_BLOCK_SIZE}, got #{block_size.inspect}"
  end

  # Validate execution inputs: type, dtype and matrix shape compatibility
  def validate_execution_inputs!(a, b, c)
    unless [a, b, c].all? { |array| array.is_a?(NvArray) }
      raise ArgumentError, "Expected NvArray inputs"
    end

    unless [a, b, c].all? { |array| array.dtype == @dtype }
      raise ArgumentError, "Array dtypes must match kernel dtype #{@dtype}"
    end

    validate_shapes!(a.shape, b.shape, c.shape)
  end

  # Validate that A (M x K), B (K x N) and C (M x N) agree; the kernel derives
  # M, N and K from A and B and would otherwise access B or C out of bounds.
  def validate_shapes!(a_shape, b_shape, c_shape)
    unless a_shape[1] == b_shape[0]
      raise ArgumentError,
            "Inner dimensions must match: A is #{a_shape[0]}x#{a_shape[1]}, " \
            "B is #{b_shape[0]}x#{b_shape[1]}"
    end

    return if c_shape[0] == a_shape[0] && c_shape[1] == b_shape[1]

    raise ArgumentError,
          "C must be #{a_shape[0]}x#{b_shape[1]}, got #{c_shape[0]}x#{c_shape[1]}"
  end

  # Reject tile configurations whose two __shared__ tiles cannot be declared
  # statically, so the failure is reported before the compiler is invoked.
  def ensure_tiles_fit_shared_memory!
    bytes = (@m * @k + @k * @n) * ELEMENT_BYTES.fetch(@dtype)
    return if bytes <= STATIC_SHARED_LIMIT_BYTES

    raise ArgumentError,
          "Tiles #{@m}x#{@k} and #{@k}x#{@n} of #{@dtype} need #{bytes} bytes of shared memory, " \
          "more than the #{STATIC_SHARED_LIMIT_BYTES} byte static limit"
  end

  # Get NVRTC compilation options
  # @return [Array<String>]
  def nvrtc_options
    opts = [
      "-I#{MathDx.include_path}",
      "--std=c++17",
      "-DNDEBUG"
    ]

    # Add CUDA include path
    cuda_include = cuda_include_dir
    opts << "-I#{cuda_include}" if cuda_include

    opts
  end

  # Locate the CUDA toolkit include directory: CUDA_PATH, then CUDA_HOME,
  # then the built-in default. Only the first existing directory is used so
  # headers from different toolkit versions are never mixed.
  # @return [String, nil] nil when no candidate exists
  def cuda_include_dir
    roots = [ENV["CUDA_PATH"], ENV["CUDA_HOME"]].compact.reject(&:empty?)
    candidates = roots.map { |root| File.join(root, "include") }
    candidates << DEFAULT_CUDA_INCLUDE
    candidates.find { |dir| File.exist?(dir) }
  end

  # Type used for accumulation and the epilog: double for :float64, float
  # otherwise (so :float16 products are summed in float).
  # @return [String]
  def accumulator_type
    @dtype == :float64 ? "double" : "float"
  end

  # Generate CUDA C++ source code
  #
  # Tiles are loaded cooperatively with a blockDim.x stride and every thread
  # accumulates ceil(m * n / block_size) outputs, so the kernel covers the
  # whole tile for any tile shape and block size. The accumulator count is
  # baked into the source from block_size, which is also the launch block
  # dimension used by #execute.
  # @return [String]
  def generate_source
    cuda_type = CUDA_TYPES.fetch(@dtype)
    acc_type = accumulator_type
    half = @dtype == :float16
    to_acc = half ? "__half2float(v)" : "v"
    from_acc = half ? "__float2half(v)" : "v"
    outputs_per_thread = (@m * @n + @block_size - 1) / @block_size
    epilog_code = generate_epilog_code

    <<~CUDA
      #include <cuda_fp16.h>

      // cuBLASDx fallback implementation when SDK not available
      // Uses shared memory tiled GEMM algorithm

      // Accumulation type and conversions to/from the storage type
      typedef #{acc_type} acc_t;
      __device__ inline acc_t to_acc(#{cuda_type} v) { return #{to_acc}; }
      __device__ inline #{cuda_type} from_acc(acc_t v) { return #{from_acc}; }

      #{epilog_code[:device_function]}

      extern "C" __global__ void cublasdx_gemm(
          const #{cuda_type}* __restrict__ A,
          const #{cuda_type}* __restrict__ B,
          #{cuda_type}* __restrict__ C,
          int M, int N, int K,
          int lda, int ldb, int ldc,
          #{cuda_type} alpha, #{cuda_type} beta
      ) {
          // Thread block GEMM with shared memory tiling
          const int TILE_M = #{@m};
          const int TILE_N = #{@n};
          const int TILE_K = #{@k};
          // Outputs accumulated per thread for a block of #{@block_size} threads
          const int OUT_PER_THREAD = #{outputs_per_thread};

          __shared__ #{cuda_type} As[TILE_M][TILE_K];
          __shared__ #{cuda_type} Bs[TILE_K][TILE_N];

          const int tid = threadIdx.x;
          const int nthreads = blockDim.x;
          const int row0 = blockIdx.x * TILE_M;
          const int col0 = blockIdx.y * TILE_N;

          acc_t acc[OUT_PER_THREAD];
          #pragma unroll
          for (int o = 0; o < OUT_PER_THREAD; ++o) {
              acc[o] = 0;
          }

          for (int t = 0; t < (K + TILE_K - 1) / TILE_K; ++t) {
              const int k0 = t * TILE_K;

              // Load tile of A into shared memory (zero-padded at the edges)
              for (int e = tid; e < TILE_M * TILE_K; e += nthreads) {
                  const int r = e / TILE_K;
                  const int c = e % TILE_K;
                  const int gr = row0 + r;
                  const int gc = k0 + c;
                  As[r][c] = (gr < M && gc < K) ? A[gr * lda + gc] : from_acc(0);
              }

              // Load tile of B into shared memory (zero-padded at the edges)
              for (int e = tid; e < TILE_K * TILE_N; e += nthreads) {
                  const int r = e / TILE_N;
                  const int c = e % TILE_N;
                  const int gr = k0 + r;
                  const int gc = col0 + c;
                  Bs[r][c] = (gr < K && gc < N) ? B[gr * ldb + gc] : from_acc(0);
              }

              __syncthreads();

              // Compute partial dot products for every output this thread owns
              #pragma unroll
              for (int o = 0; o < OUT_PER_THREAD; ++o) {
                  const int e = tid + o * nthreads;
                  if (e < TILE_M * TILE_N) {
                      const int r = e / TILE_N;
                      const int c = e % TILE_N;
                      acc_t sum = acc[o];
                      for (int kk = 0; kk < TILE_K; ++kk) {
                          sum += to_acc(As[r][kk]) * to_acc(Bs[kk][c]);
                      }
                      acc[o] = sum;
                  }
              }

              __syncthreads();
          }

          // Write results with epilog
          const acc_t a_scale = to_acc(alpha);
          const acc_t b_scale = to_acc(beta);
          #pragma unroll
          for (int o = 0; o < OUT_PER_THREAD; ++o) {
              const int e = tid + o * nthreads;
              if (e < TILE_M * TILE_N) {
                  const int row = row0 + e / TILE_N;
                  const int col = col0 + e % TILE_N;
                  if (row < M && col < N) {
                      acc_t result = a_scale * acc[o];
                      // Skip reading C when beta is zero so uninitialised
                      // output memory cannot leak NaNs into the result
                      if (b_scale != 0) {
                          result += b_scale * to_acc(C[row * ldc + col]);
                      }
                      #{epilog_code[:apply]}
                      C[row * ldc + col] = from_acc(result);
                  }
              }
          }
      }
    CUDA
  end

  # Generate epilog code in the accumulator type, using the float math
  # functions (f suffix) for float accumulation and the double ones otherwise.
  # @return [Hash] :device_function and :apply code
  def generate_epilog_code
    acc = accumulator_type
    f = acc == "double" ? "" : "f"
    # sqrt(2 / pi); the float literal matches single precision
    gelu_scale = acc == "double" ? "0.7978845608028654" : "0.7978845608f"

    case @epilog
    when :relu
      {
        device_function: "__device__ inline #{acc} relu(#{acc} x) { return fmax#{f}(x, 0.0#{f}); }",
        apply: "result = relu(result);"
      }
    when :gelu
      {
        device_function: "__device__ inline #{acc} gelu(#{acc} x) { " \
                         "return 0.5#{f} * x * (1.0#{f} + tanh#{f}(#{gelu_scale} * " \
                         "(x + 0.044715#{f} * x * x * x))); }",
        apply: "result = gelu(result);"
      }
    when :sigmoid
      {
        device_function: "__device__ inline #{acc} sigmoid(#{acc} x) { " \
                         "return 1.0#{f} / (1.0#{f} + exp#{f}(-x)); }",
        apply: "result = sigmoid(result);"
      }
    when :tanh
      {
        device_function: "",
        apply: "result = tanh#{f}(result);"
      }
    else
      { device_function: "", apply: "" }
    end
  end
end
289
+
290
# Raised when a kernel operation is attempted in the wrong state,
# e.g. executing a kernel before compile! has been called.
class StateError < StandardError
end
292
+ end
293
+ end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Ignis MathDx Module
4
+ # Device-side BLAS/FFT extensions using cuBLASDx and cuFFTDx
5
+
6
+ require_relative "mathdx/gemm_kernel"
7
+ require_relative "mathdx/fft_kernel"
8
+
9
+ module Ignis
10
+ # MathDx module for device-side extensions
11
+ # Provides high-performance fused kernel generation using cuBLASDx and cuFFTDx
12
+ #
13
+ # Note: MathDx libraries (cuBLASDx, cuFFTDx) are header-only C++ template libraries
14
+ # that generate optimized device code at compile time. This module generates
15
+ # CUDA C++ source code that uses MathDx and compiles it via NVRTC.
16
+ #
17
+ # @example Fused GEMM + epilog kernel
18
+ # kernel = Ignis::MathDx::GemmKernel.new(
19
+ # m: 64, n: 64, k: 64,
20
+ # dtype: :float16,
21
+ # epilog: :relu
22
+ # )
23
+ # kernel.compile!
24
+ # kernel.execute(a, b, c)
25
module MathDx
  # MathDx installation path (set via environment or default)
  MATHDX_PATH = ENV.fetch("MATHDX_PATH", "C:/Program Files/NVIDIA/MathDx")

  # Supported compute capabilities for MathDx
  SUPPORTED_SM_VERSIONS = [70, 75, 80, 86, 89, 90].freeze

  class << self
    # Check if MathDx SDK is available: true when either the cuBLASDx or the
    # cuFFTDx header exists under the include directory of MATHDX_PATH.
    # Any StandardError raised while probing is reported as unavailable.
    # @return [Boolean]
    def available?
      headers = %w[cublasdx.hpp cufftdx.hpp].map do |header|
        File.join(MATHDX_PATH, "include", header)
      end
      headers.any? { |path| File.exist?(path) }
    rescue StandardError
      false
    end

    # Get MathDx include path for NVRTC compilation
    # @return [String]
    def include_path
      File.join(MATHDX_PATH, "include")
    end

    # Create a fused GEMM kernel with device-side MathDx acceleration
    # @param m [Integer] Output rows
    # @param n [Integer] Output columns
    # @param k [Integer] Inner dimension
    # @param dtype [Symbol] Data type (:float16, :float32, :float64)
    # @param epilog [Symbol, nil] Epilog operation (:relu, :gelu, :sigmoid, :tanh, nil)
    # @param block_size [Integer] Thread block size
    # @return [GemmKernel]
    def gemm_kernel(m:, n:, k:, dtype: :float32, epilog: nil, block_size: 128)
      GemmKernel.new(
        m: m,
        n: n,
        k: k,
        dtype: dtype,
        epilog: epilog,
        block_size: block_size
      )
    end

    # Create a fused FFT kernel with device-side MathDx acceleration
    # @param size [Integer] FFT size
    # @param dtype [Symbol] Data type (:complex64, :complex128)
    # @param direction [Symbol] Direction (:forward, :inverse)
    # @param elements_per_thread [Integer] Elements per thread
    # @return [FftKernel]
    def fft_kernel(size:, dtype: :complex64, direction: :forward, elements_per_thread: 8)
      FftKernel.new(
        size: size,
        dtype: dtype,
        direction: direction,
        elements_per_thread: elements_per_thread
      )
    end
  end
end
73
+ end