ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +15 -0
  3. data/lib/ignis.rb +94 -0
  4. data/lib/nnw/platform.rb +304 -0
  5. data/lib/nnw/shared/event_bus.rb +240 -0
  6. data/lib/nnw/shared/ffi_loader.rb +63 -0
  7. data/lib/nnw/shared/memory_contract.rb +204 -0
  8. data/lib/nnw/shared/nv_array.rb +710 -0
  9. data/lib/nnw/shared/recovery_protocol.rb +307 -0
  10. data/lib/nvruby/configuration.rb +217 -0
  11. data/lib/nvruby/cuda/device.rb +275 -0
  12. data/lib/nvruby/cuda/device_props.rb +202 -0
  13. data/lib/nvruby/cuda/graph.rb +265 -0
  14. data/lib/nvruby/cuda/graph_bindings.rb +119 -0
  15. data/lib/nvruby/cuda/library_loader.rb +285 -0
  16. data/lib/nvruby/cuda/memory.rb +410 -0
  17. data/lib/nvruby/cuda/runtime_api.rb +804 -0
  18. data/lib/nvruby/cuda/stream.rb +234 -0
  19. data/lib/nvruby/dtype.rb +139 -0
  20. data/lib/nvruby/epilogues.rb +438 -0
  21. data/lib/nvruby/errors.rb +303 -0
  22. data/lib/nvruby/half.rb +97 -0
  23. data/lib/nvruby/jit/compiled_kernel.rb +80 -0
  24. data/lib/nvruby/jit/compiler.rb +231 -0
  25. data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
  26. data/lib/nvruby/jit/kernel.rb +240 -0
  27. data/lib/nvruby/jit/kernel_module.rb +133 -0
  28. data/lib/nvruby/jit/kernels/activations.rb +179 -0
  29. data/lib/nvruby/jit/kernels/attention.rb +504 -0
  30. data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
  31. data/lib/nvruby/jit/kernels/loss.rb +213 -0
  32. data/lib/nvruby/jit/kernels/normalization.rb +200 -0
  33. data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
  34. data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
  35. data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
  36. data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
  37. data/lib/nvruby/linalg/epilog.rb +67 -0
  38. data/lib/nvruby/linalg/matmul.rb +247 -0
  39. data/lib/nvruby/linalg/matmul_plan.rb +229 -0
  40. data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
  41. data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
  42. data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
  43. data/lib/nvruby/memory/device_memory_resource.rb +106 -0
  44. data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
  45. data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
  46. data/lib/nvruby/memory/stats.rb +107 -0
  47. data/lib/nvruby/memory.rb +124 -0
  48. data/lib/nvruby/version.rb +5 -0
  49. metadata +108 -0
@@ -0,0 +1,200 @@
# frozen_string_literal: true

module Ignis
  module JIT
    module Kernels
      # Layer normalization and RMS normalization CUDA kernels.
      #
      # Every kernel here assigns one thread to one row: the row index is
      # blockIdx.x * blockDim.x + threadIdx.x, and that thread walks the
      # norm_size elements of its row sequentially. Launches therefore need at
      # least outer_size threads in total; surplus threads exit immediately.
      #
      # Row offsets are computed in 64-bit arithmetic so that tensors with more
      # than 2^31 elements do not overflow the 32-bit product row * norm_size.
      module Normalization
        class << self
          # LayerNorm forward: y = gamma * (x - mean) / sqrt(var + eps) + beta
          # Each row (last dim) is normalized independently. The per-row mean
          # and reciprocal standard deviation are written to mean_out / rstd_out
          # when those pointers are non-null, for use by the backward kernel.
          # @return [Ignis::JIT::Kernel]
          def layer_norm_forward
            source = <<~CUDA
              extern "C" __global__
              void layer_norm_forward(const float* __restrict__ input,
                                      const float* __restrict__ gamma,
                                      const float* __restrict__ beta,
                                      float* __restrict__ output,
                                      float* __restrict__ mean_out,
                                      float* __restrict__ rstd_out,
                                      const int outer_size,
                                      const int norm_size,
                                      const float eps) {
                int row = blockIdx.x * blockDim.x + threadIdx.x;
                if (row < outer_size) {
                  // 64-bit offset: row * norm_size can exceed INT_MAX.
                  const long long offset = (long long)row * norm_size;
                  const float* in_row = input + offset;
                  float* out_row = output + offset;

                  // Compute mean
                  float mean = 0.0f;
                  for (int j = 0; j < norm_size; j++) {
                    mean += in_row[j];
                  }
                  mean /= (float)norm_size;

                  // Compute variance
                  float var = 0.0f;
                  for (int j = 0; j < norm_size; j++) {
                    float diff = in_row[j] - mean;
                    var += diff * diff;
                  }
                  var /= (float)norm_size;

                  float rstd = rsqrtf(var + eps);

                  // Save for backward pass
                  if (mean_out) mean_out[row] = mean;
                  if (rstd_out) rstd_out[row] = rstd;

                  // Normalize, scale, shift
                  for (int j = 0; j < norm_size; j++) {
                    float normalized = (in_row[j] - mean) * rstd;
                    out_row[j] = gamma[j] * normalized + beta[j];
                  }
                }
              }
            CUDA
            compile_cached(source, "layer_norm_forward")
          end

          # LayerNorm backward: computes dL/dx, dL/dgamma, dL/dbeta.
          # grad_input is overwritten per row. grad_gamma and grad_beta are only
          # ever added to (via atomicAdd, because every row contributes to the
          # same norm_size slots), so the kernel never initializes them; they
          # must already hold the intended starting values before launch.
          # @return [Ignis::JIT::Kernel]
          def layer_norm_backward
            source = <<~CUDA
              extern "C" __global__
              void layer_norm_backward(const float* __restrict__ grad_output,
                                       const float* __restrict__ input,
                                       const float* __restrict__ gamma,
                                       const float* __restrict__ mean,
                                       const float* __restrict__ rstd,
                                       float* __restrict__ grad_input,
                                       float* __restrict__ grad_gamma,
                                       float* __restrict__ grad_beta,
                                       const int outer_size,
                                       const int norm_size) {
                int row = blockIdx.x * blockDim.x + threadIdx.x;
                if (row < outer_size) {
                  // 64-bit offset: row * norm_size can exceed INT_MAX.
                  const long long offset = (long long)row * norm_size;
                  const float* go = grad_output + offset;
                  const float* in_row = input + offset;
                  float* gi = grad_input + offset;
                  float m = mean[row];
                  float rs = rstd[row];

                  // Compute intermediate sums for efficient backward
                  float sum_go_x = 0.0f;
                  float sum_go = 0.0f;
                  for (int j = 0; j < norm_size; j++) {
                    float x_hat = (in_row[j] - m) * rs;
                    sum_go_x += go[j] * gamma[j] * x_hat;
                    sum_go += go[j] * gamma[j];
                  }

                  float inv_n = 1.0f / (float)norm_size;

                  // Compute grad_input
                  for (int j = 0; j < norm_size; j++) {
                    float x_hat = (in_row[j] - m) * rs;
                    gi[j] = rs * (go[j] * gamma[j] - inv_n * (sum_go + x_hat * sum_go_x));
                  }

                  // Accumulate grad_gamma and grad_beta (needs atomicAdd for multi-row)
                  for (int j = 0; j < norm_size; j++) {
                    float x_hat = (in_row[j] - m) * rs;
                    atomicAdd(&grad_gamma[j], go[j] * x_hat);
                    atomicAdd(&grad_beta[j], go[j]);
                  }
                }
              }
            CUDA
            compile_cached(source, "layer_norm_backward")
          end

          # RMSNorm forward: y = gamma * x / sqrt(mean(x^2) + eps)
          # Used in LLaMA/Mistral architectures. The per-row reciprocal RMS is
          # written to rstd_out when that pointer is non-null.
          # @return [Ignis::JIT::Kernel]
          def rms_norm_forward
            source = <<~CUDA
              extern "C" __global__
              void rms_norm_forward(const float* __restrict__ input,
                                    const float* __restrict__ gamma,
                                    float* __restrict__ output,
                                    float* __restrict__ rstd_out,
                                    const int outer_size,
                                    const int norm_size,
                                    const float eps) {
                int row = blockIdx.x * blockDim.x + threadIdx.x;
                if (row < outer_size) {
                  // 64-bit offset: row * norm_size can exceed INT_MAX.
                  const long long offset = (long long)row * norm_size;
                  const float* in_row = input + offset;
                  float* out_row = output + offset;

                  float ss = 0.0f;
                  for (int j = 0; j < norm_size; j++) {
                    ss += in_row[j] * in_row[j];
                  }
                  float rstd = rsqrtf(ss / (float)norm_size + eps);

                  if (rstd_out) rstd_out[row] = rstd;

                  for (int j = 0; j < norm_size; j++) {
                    out_row[j] = gamma[j] * in_row[j] * rstd;
                  }
                }
              }
            CUDA
            compile_cached(source, "rms_norm_forward")
          end

          # RMSNorm backward: dL/dx and dL/dgamma (no bias in RMSNorm).
          # With x_hat_j = x_j * rstd and y_j = gamma_j * x_hat_j:
          #   dL/dx_i     = rstd * (go_i*gamma_i - x_hat_i * S / n), S = sum_j go_j*gamma_j*x_hat_j
          #   dL/dgamma_j = sum_rows go_j * x_hat_j
          # grad_gamma is only added to (atomicAdd across rows); the kernel never
          # initializes it, so it must hold the intended starting values before launch.
          # @return [Ignis::JIT::Kernel]
          def rms_norm_backward
            source = <<~CUDA
              extern "C" __global__
              void rms_norm_backward(const float* __restrict__ grad_output,
                                     const float* __restrict__ input,
                                     const float* __restrict__ gamma,
                                     const float* __restrict__ rstd,
                                     float* __restrict__ grad_input,
                                     float* __restrict__ grad_gamma,
                                     const int outer_size,
                                     const int norm_size) {
                int row = blockIdx.x * blockDim.x + threadIdx.x;
                if (row < outer_size) {
                  // 64-bit offset: row * norm_size can exceed INT_MAX.
                  const long long offset = (long long)row * norm_size;
                  const float* go = grad_output + offset;
                  const float* in_row = input + offset;
                  float* gi = grad_input + offset;
                  float r = rstd[row];

                  // S = sum_j go_j * gamma_j * x_hat_j   (x_hat_j = x_j * r)
                  float s = 0.0f;
                  for (int j = 0; j < norm_size; j++) {
                    s += go[j] * gamma[j] * (in_row[j] * r);
                  }

                  float inv_n = 1.0f / (float)norm_size;
                  for (int j = 0; j < norm_size; j++) {
                    float x_hat = in_row[j] * r;
                    gi[j] = r * (go[j] * gamma[j] - x_hat * s * inv_n);
                    atomicAdd(&grad_gamma[j], go[j] * x_hat);
                  }
                }
              }
            CUDA
            compile_cached(source, "rms_norm_backward")
          end

          private

          # Hands the CUDA source to the JIT compiler under the given entry-point name.
          # @param source [String] CUDA C source text
          # @param name [String] kernel entry-point name
          # @param device_id [Integer] forwarded unchanged to the compiler
          # @return [Object] whatever Ignis::JIT::Compiler.compile returns
          def compile_cached(source, name, device_id: 0)
            Ignis::JIT::Compiler.compile(source, name, device_id: device_id)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,193 @@
# frozen_string_literal: true

module Ignis
  module JIT
    module Kernels
      # Optimizer CUDA kernels.
      # Each optimizer step is a single fused kernel per parameter
      # (update moments + param in one pass, avoiding multiple kernel launches).
      # The step kernels process one element per thread, indexed by
      # blockIdx.x * blockDim.x + threadIdx.x and bounded by n.
      module Optimizer
        class << self
          # Fused Adam step: update m, v, and param in one kernel launch.
          # Weight decay, when positive, is folded into the gradient (L2 style).
          # bias_correction1 / bias_correction2 are supplied by the caller and
          # used as divisors for the moment estimates
          # (presumably 1 - beta^t; confirm against the host-side optimizer).
          # @return [Ignis::JIT::Kernel]
          def adam_step
            source = <<~CUDA
              extern "C" __global__
              void adam_step(float* __restrict__ param,
                             const float* __restrict__ grad,
                             float* __restrict__ m,
                             float* __restrict__ v,
                             const float lr,
                             const float beta1,
                             const float beta2,
                             const float eps,
                             const float weight_decay,
                             const float bias_correction1,
                             const float bias_correction2,
                             const int n) {
                int idx = blockIdx.x * blockDim.x + threadIdx.x;
                if (idx < n) {
                  float g = grad[idx];

                  // L2 regularization (Adam-style, not decoupled)
                  if (weight_decay > 0.0f) {
                    g += weight_decay * param[idx];
                  }

                  // Update biased first moment estimate
                  float m_new = beta1 * m[idx] + (1.0f - beta1) * g;
                  m[idx] = m_new;

                  // Update biased second moment estimate
                  float v_new = beta2 * v[idx] + (1.0f - beta2) * g * g;
                  v[idx] = v_new;

                  // Bias correction
                  float m_hat = m_new / bias_correction1;
                  float v_hat = v_new / bias_correction2;

                  // Update parameter
                  param[idx] -= lr * m_hat / (sqrtf(v_hat) + eps);
                }
              }
            CUDA
            compile_cached(source, "adam_step")
          end

          # Fused AdamW step: Adam with decoupled weight decay.
          # Weight decay is applied directly to param, not through the gradient,
          # so the moment estimates see the raw gradient only.
          # @return [Ignis::JIT::Kernel]
          def adamw_step
            source = <<~CUDA
              extern "C" __global__
              void adamw_step(float* __restrict__ param,
                              const float* __restrict__ grad,
                              float* __restrict__ m,
                              float* __restrict__ v,
                              const float lr,
                              const float beta1,
                              const float beta2,
                              const float eps,
                              const float weight_decay,
                              const float bias_correction1,
                              const float bias_correction2,
                              const int n) {
                int idx = blockIdx.x * blockDim.x + threadIdx.x;
                if (idx < n) {
                  float g = grad[idx];

                  // Update biased first moment
                  float m_new = beta1 * m[idx] + (1.0f - beta1) * g;
                  m[idx] = m_new;

                  // Update biased second moment
                  float v_new = beta2 * v[idx] + (1.0f - beta2) * g * g;
                  v[idx] = v_new;

                  // Bias correction
                  float m_hat = m_new / bias_correction1;
                  float v_hat = v_new / bias_correction2;

                  // Decoupled weight decay + Adam update
                  param[idx] -= lr * (m_hat / (sqrtf(v_hat) + eps) + weight_decay * param[idx]);
                }
              }
            CUDA
            compile_cached(source, "adamw_step")
          end

          # SGD step with momentum and weight decay.
          # The velocity buffer is read and written only when momentum > 0.
          # @return [Ignis::JIT::Kernel]
          def sgd_step
            source = <<~CUDA
              extern "C" __global__
              void sgd_step(float* __restrict__ param,
                            const float* __restrict__ grad,
                            float* __restrict__ velocity,
                            const float lr,
                            const float momentum,
                            const float weight_decay,
                            const int n) {
                int idx = blockIdx.x * blockDim.x + threadIdx.x;
                if (idx < n) {
                  float g = grad[idx];

                  if (weight_decay > 0.0f) {
                    g += weight_decay * param[idx];
                  }

                  float v;
                  if (momentum > 0.0f) {
                    v = momentum * velocity[idx] + g;
                    velocity[idx] = v;
                  } else {
                    v = g;
                  }

                  param[idx] -= lr * v;
                }
              }
            CUDA
            compile_cached(source, "sgd_step")
          end

          # Gradient clipping by global norm.
          # Phase 1: add the sum of squared gradient entries to partial_sum[0].
          #
          # The kernel only adds (atomicAdd) into partial_sum, so the caller
          # decides its starting value. Dynamic shared memory of at least
          # blockDim.x floats must be provided at launch, since sdata is indexed
          # by threadIdx.x.
          #
          # Each thread accumulates a grid-strided slice of grad, so the result
          # covers all n elements for any grid size, and the in-block reduction
          # is valid for block sizes that are not powers of two.
          # @return [Ignis::JIT::Kernel]
          def grad_squared_sum
            source = <<~CUDA
              extern "C" __global__
              void grad_squared_sum(const float* __restrict__ grad,
                                    float* __restrict__ partial_sum,
                                    const int n) {
                extern __shared__ float sdata[];
                const unsigned int tid = threadIdx.x;
                const long long stride = (long long)gridDim.x * blockDim.x;

                // Strided accumulation: with a grid that covers n this is exactly
                // one element per thread; with a smaller grid nothing is skipped.
                float acc = 0.0f;
                for (long long i = (long long)blockIdx.x * blockDim.x + tid; i < n; i += stride) {
                  const float g = grad[i];
                  acc += g * g;
                }
                sdata[tid] = acc;
                __syncthreads();

                // Parallel reduction in shared memory. Each round folds the upper
                // part [half, active) onto the lower part, with half rounded up so
                // an odd-sized range keeps its middle element.
                unsigned int active = blockDim.x;
                while (active > 1) {
                  const unsigned int half = (active + 1) >> 1;
                  if (tid + half < active) {
                    sdata[tid] += sdata[tid + half];
                  }
                  __syncthreads();
                  active = half;
                }

                if (tid == 0) {
                  atomicAdd(partial_sum, sdata[0]);
                }
              }
            CUDA
            compile_cached(source, "grad_squared_sum")
          end

          # Phase 2: scale gradients by clip factor.
          # clip_factor = max_norm / (total_norm + eps), computed by the caller.
          # @return [Ignis::JIT::Kernel]
          def grad_clip_scale
            source = <<~CUDA
              extern "C" __global__
              void grad_clip_scale(float* __restrict__ grad,
                                   const float clip_factor,
                                   const int n) {
                int idx = blockIdx.x * blockDim.x + threadIdx.x;
                if (idx < n) {
                  grad[idx] *= clip_factor;
                }
              }
            CUDA
            compile_cached(source, "grad_clip_scale")
          end

          private

          # Hands the CUDA source to the JIT compiler under the given entry-point name.
          # @param source [String] CUDA C source text
          # @param name [String] kernel entry-point name
          # @param device_id [Integer] forwarded unchanged to the compiler
          # @return [Object] whatever Ignis::JIT::Compiler.compile returns
          def compile_cached(source, name, device_id: 0)
            Ignis::JIT::Compiler.compile(source, name, device_id: device_id)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,282 @@
# frozen_string_literal: true

require "ffi"

module Ignis
  module JIT
    # NVRTC (NVIDIA Runtime Compilation) library FFI bindings
    # Provides runtime compilation of CUDA C++ source code to PTX/CUBIN
    #
    # Native entry points are attached lazily on first use (see
    # {.ensure_loaded!}); each one becomes a singleton method on this module
    # named after the C function.
    module NVRTCBindings
      extend FFI::Library

      # NVRTC Result codes
      NVRTC_SUCCESS = 0
      NVRTC_ERROR_OUT_OF_MEMORY = 1
      NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2
      NVRTC_ERROR_INVALID_INPUT = 3
      NVRTC_ERROR_INVALID_PROGRAM = 4
      NVRTC_ERROR_INVALID_OPTION = 5
      NVRTC_ERROR_COMPILATION = 6
      NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
      NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8
      NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9
      NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10
      NVRTC_ERROR_INTERNAL_ERROR = 11

      # @return [Boolean] Whether bindings are loaded
      @loaded = false

      # @return [Mutex] Thread safety lock
      @mutex = Mutex.new

      class << self
        # Ensure NVRTC library is loaded. The first call loads the library and
        # attaches the functions under the module mutex; later calls return
        # immediately.
        # @return [void]
        # @raise [LibraryNotFoundError] If NVRTC cannot be loaded
        def ensure_loaded!
          @mutex.synchronize do
            return if @loaded

            Ignis::CUDA::LibraryLoader.load_library(:nvrtc)
            attach_nvrtc_functions!
            @loaded = true
            Ignis.logger.info("NVRTC bindings initialized")
          end
        end

        # Check if NVRTC is loaded
        # @return [Boolean]
        def loaded?
          @mutex.synchronize { @loaded }
        end

        # Get NVRTC version
        # @return [String] Version string (e.g., "12.6")
        # @raise [NVRTCError] If version cannot be retrieved
        def version
          ensure_loaded!

          major_ptr = FFI::MemoryPointer.new(:int)
          minor_ptr = FFI::MemoryPointer.new(:int)

          result = nvrtcVersion(major_ptr, minor_ptr)
          check_result!(result, "nvrtcVersion")

          "#{major_ptr.read_int}.#{minor_ptr.read_int}"
        end

        # Create an NVRTC program from source code
        # @param source [String] CUDA C++ source code
        # @param name [String] Program name (for error messages)
        # @param headers [Array<String>] Header contents (optional)
        # @param header_names [Array<String>] Header names (optional); must have
        #   one entry per element of +headers+ when headers are given
        # @return [FFI::Pointer] Program handle
        # @raise [ArgumentError] If headers are given and header_names has a different length
        # @raise [NVRTCError] If program creation fails
        def create_program(source, name: "kernel.cu", headers: [], header_names: [])
          ensure_loaded!

          prog_ptr = FFI::MemoryPointer.new(:pointer)
          source_ptr = FFI::MemoryPointer.from_string(source)
          name_ptr = FFI::MemoryPointer.from_string(name)

          num_headers = headers.size
          headers_ptr = nil
          header_names_ptr = nil
          retained = []

          if num_headers.positive?
            unless header_names.size == num_headers
              raise ArgumentError,
                    "headers (#{num_headers}) and header_names (#{header_names.size}) must have the same length"
            end

            headers_ptr, header_buffers = string_pointer_array(headers)
            header_names_ptr, header_name_buffers = string_pointer_array(header_names)
            retained = [header_buffers, header_name_buffers]
          end

          result = nvrtcCreateProgram(prog_ptr, source_ptr, name_ptr, num_headers, headers_ptr, header_names_ptr)
          # Touch the retained buffers after the native call so they are
          # guaranteed to be reachable (not garbage collected) while NVRTC reads them.
          retained.clear
          check_result!(result, "nvrtcCreateProgram")

          prog_ptr.read_pointer
        end

        # Compile an NVRTC program
        # @param program [FFI::Pointer] Program handle
        # @param options [Array<String>] Compilation options
        # @return [void]
        # @raise [NVRTCError] If compilation fails (includes error log)
        def compile_program(program, options: [])
          ensure_loaded!

          options_array = nil
          option_buffers = []
          options_array, option_buffers = string_pointer_array(options) if options.any?

          result = nvrtcCompileProgram(program, options.size, options_array)
          # Keep the option strings reachable until the native call has returned.
          option_buffers.clear

          if result != NVRTC_SUCCESS
            log = get_program_log(program)
            raise NVRTCError.new(result, compilation_log: log)
          end

          nil
        end

        # Get the program compilation log. Failures while querying the log are
        # reported as an empty string rather than raised.
        # @param program [FFI::Pointer] Program handle
        # @return [String] Compilation log
        def get_program_log(program)
          ensure_loaded!

          size_ptr = FFI::MemoryPointer.new(:size_t)
          result = nvrtcGetProgramLogSize(program, size_ptr)
          return "" unless result == NVRTC_SUCCESS

          log_size = size_ptr.read(:size_t)
          return "" if log_size.zero?

          log_ptr = FFI::MemoryPointer.new(:char, log_size)
          result = nvrtcGetProgramLog(program, log_ptr)
          return "" unless result == NVRTC_SUCCESS

          # The reported size is read minus one byte (the trailing NUL).
          log_ptr.read_string(log_size - 1)
        end

        # Get compiled CUBIN size
        # @param program [FFI::Pointer] Program handle
        # @return [Integer] CUBIN size in bytes
        # @raise [NVRTCError] If size cannot be retrieved
        def get_cubin_size(program)
          ensure_loaded!

          size_ptr = FFI::MemoryPointer.new(:size_t)
          result = nvrtcGetCUBINSize(program, size_ptr)
          check_result!(result, "nvrtcGetCUBINSize")

          size_ptr.read(:size_t)
        end

        # Get compiled CUBIN binary
        # @param program [FFI::Pointer] Program handle
        # @return [String] CUBIN binary data
        # @raise [NVRTCError] If CUBIN cannot be retrieved
        def get_cubin(program)
          ensure_loaded!

          cubin_size = get_cubin_size(program)
          cubin_ptr = FFI::MemoryPointer.new(:char, cubin_size)

          result = nvrtcGetCUBIN(program, cubin_ptr)
          check_result!(result, "nvrtcGetCUBIN")

          cubin_ptr.read_bytes(cubin_size)
        end

        # Get compiled PTX size
        # @param program [FFI::Pointer] Program handle
        # @return [Integer] PTX size in bytes
        # @raise [NVRTCError] If size cannot be retrieved
        def get_ptx_size(program)
          ensure_loaded!

          size_ptr = FFI::MemoryPointer.new(:size_t)
          result = nvrtcGetPTXSize(program, size_ptr)
          check_result!(result, "nvrtcGetPTXSize")

          size_ptr.read(:size_t)
        end

        # Get compiled PTX code
        # @param program [FFI::Pointer] Program handle
        # @return [String] PTX code (empty when NVRTC reports a size of zero)
        # @raise [NVRTCError] If PTX cannot be retrieved
        def get_ptx(program)
          ensure_loaded!

          ptx_size = get_ptx_size(program)
          # A zero size would otherwise lead to read_string(-1) below.
          return "" if ptx_size.zero?

          ptx_ptr = FFI::MemoryPointer.new(:char, ptx_size)

          result = nvrtcGetPTX(program, ptx_ptr)
          check_result!(result, "nvrtcGetPTX")

          # The reported size is read minus one byte (the trailing NUL).
          ptx_ptr.read_string(ptx_size - 1)
        end

        # Destroy an NVRTC program. Nil and null handles are ignored; the
        # result code of nvrtcDestroyProgram is not checked.
        # @param program [FFI::Pointer] Program handle
        # @return [void]
        def destroy_program(program)
          return if program.nil? || program.null?

          ensure_loaded!

          # nvrtcDestroyProgram takes a pointer to the handle, not the handle itself.
          prog_ptr = FFI::MemoryPointer.new(:pointer)
          prog_ptr.write_pointer(program)
          nvrtcDestroyProgram(prog_ptr)
        end

        # Check NVRTC result and raise on error
        # @param result [Integer] NVRTC result code
        # @param context [String] Context for error message
        # @return [void]
        # @raise [NVRTCError] If result is not success
        def check_result!(result, context)
          return if result == NVRTC_SUCCESS

          raise NVRTCError.new(result, context: context)
        end

        private

        # Build a native array of C-string pointers from Ruby strings.
        #
        # Writing a MemoryPointer into a pointer array stores only its address,
        # so the per-string buffers are returned alongside the array and the
        # caller must keep them referenced for as long as native code may read
        # the array.
        # @param strings [Array<String>] Strings to expose to native code
        # @return [Array(FFI::MemoryPointer, Array<FFI::MemoryPointer>)]
        #   the pointer array and the string buffers it points at
        def string_pointer_array(strings)
          buffers = strings.map { |str| FFI::MemoryPointer.from_string(str) }
          array_ptr = FFI::MemoryPointer.new(:pointer, buffers.size)
          buffers.each_with_index do |buffer, i|
            array_ptr.put_pointer(i * FFI::Pointer.size, buffer)
          end
          [array_ptr, buffers]
        end

        # Attach all NVRTC FFI functions
        # @return [void]
        def attach_nvrtc_functions!
          handle = Ignis::CUDA::LibraryLoader.load_library(:nvrtc)

          define_nvrtc_function(handle, :nvrtcVersion, [:pointer, :pointer], :int)
          define_nvrtc_function(handle, :nvrtcGetErrorString, [:int], :string)
          define_nvrtc_function(handle, :nvrtcCreateProgram, [:pointer, :pointer, :pointer, :int, :pointer, :pointer], :int)
          define_nvrtc_function(handle, :nvrtcDestroyProgram, [:pointer], :int)
          define_nvrtc_function(handle, :nvrtcCompileProgram, [:pointer, :int, :pointer], :int)
          define_nvrtc_function(handle, :nvrtcGetPTXSize, [:pointer, :pointer], :int)
          define_nvrtc_function(handle, :nvrtcGetPTX, [:pointer, :pointer], :int)
          define_nvrtc_function(handle, :nvrtcGetCUBINSize, [:pointer, :pointer], :int)
          define_nvrtc_function(handle, :nvrtcGetCUBIN, [:pointer, :pointer], :int)
          define_nvrtc_function(handle, :nvrtcGetProgramLogSize, [:pointer, :pointer], :int)
          define_nvrtc_function(handle, :nvrtcGetProgramLog, [:pointer, :pointer], :int)
          define_nvrtc_function(handle, :nvrtcAddNameExpression, [:pointer, :pointer], :int)
          define_nvrtc_function(handle, :nvrtcGetLoweredName, [:pointer, :pointer, :pointer], :int)
        end

        # Define an NVRTC function from the loaded library
        # @param handle [Fiddle::Handle] Library handle
        # @param name [Symbol] Function name
        # @param args [Array] Argument types
        # @param ret [Symbol] Return type
        # @return [void]
        # @raise [NVRTCError] If the symbol cannot be resolved
        def define_nvrtc_function(handle, name, args, ret)
          # LibraryLoader returns a Fiddle::Handle; resolve the symbol address with
          # Fiddle::Handle#[] and build an FFI::Function from it (Fiddle::Handle has
          # no #find_function method).
          func_ptr = begin
            handle[name.to_s]
          rescue Fiddle::DLError
            nil
          end
          raise NVRTCError.new(NVRTC_ERROR_INTERNAL_ERROR, context: "Function #{name} not found") unless func_ptr

          func = FFI::Function.new(ret, args, FFI::Pointer.new(func_ptr))
          define_singleton_method(name) { |*call_args| func.call(*call_args) }
        end
      end
    end
  end
end