ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +15 -0
  3. data/lib/ignis.rb +94 -0
  4. data/lib/nnw/platform.rb +304 -0
  5. data/lib/nnw/shared/event_bus.rb +240 -0
  6. data/lib/nnw/shared/ffi_loader.rb +63 -0
  7. data/lib/nnw/shared/memory_contract.rb +204 -0
  8. data/lib/nnw/shared/nv_array.rb +710 -0
  9. data/lib/nnw/shared/recovery_protocol.rb +307 -0
  10. data/lib/nvruby/configuration.rb +217 -0
  11. data/lib/nvruby/cuda/device.rb +275 -0
  12. data/lib/nvruby/cuda/device_props.rb +202 -0
  13. data/lib/nvruby/cuda/graph.rb +265 -0
  14. data/lib/nvruby/cuda/graph_bindings.rb +119 -0
  15. data/lib/nvruby/cuda/library_loader.rb +285 -0
  16. data/lib/nvruby/cuda/memory.rb +410 -0
  17. data/lib/nvruby/cuda/runtime_api.rb +804 -0
  18. data/lib/nvruby/cuda/stream.rb +234 -0
  19. data/lib/nvruby/dtype.rb +139 -0
  20. data/lib/nvruby/epilogues.rb +438 -0
  21. data/lib/nvruby/errors.rb +303 -0
  22. data/lib/nvruby/half.rb +97 -0
  23. data/lib/nvruby/jit/compiled_kernel.rb +80 -0
  24. data/lib/nvruby/jit/compiler.rb +231 -0
  25. data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
  26. data/lib/nvruby/jit/kernel.rb +240 -0
  27. data/lib/nvruby/jit/kernel_module.rb +133 -0
  28. data/lib/nvruby/jit/kernels/activations.rb +179 -0
  29. data/lib/nvruby/jit/kernels/attention.rb +504 -0
  30. data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
  31. data/lib/nvruby/jit/kernels/loss.rb +213 -0
  32. data/lib/nvruby/jit/kernels/normalization.rb +200 -0
  33. data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
  34. data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
  35. data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
  36. data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
  37. data/lib/nvruby/linalg/epilog.rb +67 -0
  38. data/lib/nvruby/linalg/matmul.rb +247 -0
  39. data/lib/nvruby/linalg/matmul_plan.rb +229 -0
  40. data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
  41. data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
  42. data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
  43. data/lib/nvruby/memory/device_memory_resource.rb +106 -0
  44. data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
  45. data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
  46. data/lib/nvruby/memory/stats.rb +107 -0
  47. data/lib/nvruby/memory.rb +124 -0
  48. data/lib/nvruby/version.rb +5 -0
  49. metadata +108 -0
@@ -0,0 +1,363 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ffi"
4
+
5
+ module Ignis
6
+ module JIT
7
+ # CUDA Driver API FFI bindings for module and kernel management
8
+ # Provides low-level access to load compiled code and execute kernels
9
+ module DriverAPIBindings
10
+ extend FFI::Library
11
+
# CUDA Driver result codes, grouped by numeric range.

# 0-5: success and general/initialization errors
CUDA_SUCCESS                              = 0
CUDA_ERROR_INVALID_VALUE                  = 1
CUDA_ERROR_OUT_OF_MEMORY                  = 2
CUDA_ERROR_NOT_INITIALIZED                = 3
CUDA_ERROR_DEINITIALIZED                  = 4
CUDA_ERROR_PROFILER_DISABLED              = 5

# 1xx: device errors
CUDA_ERROR_NO_DEVICE                      = 100
CUDA_ERROR_INVALID_DEVICE                 = 101

# 2xx: image, context and mapping errors
CUDA_ERROR_INVALID_IMAGE                  = 200
CUDA_ERROR_INVALID_CONTEXT                = 201
CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202
CUDA_ERROR_MAP_FAILED                     = 205
CUDA_ERROR_UNMAP_FAILED                   = 206
CUDA_ERROR_ARRAY_IS_MAPPED                = 207
CUDA_ERROR_ALREADY_MAPPED                 = 208
CUDA_ERROR_NO_BINARY_FOR_GPU              = 209
CUDA_ERROR_ALREADY_ACQUIRED               = 210
CUDA_ERROR_NOT_MAPPED                     = 211
CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212
CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213
CUDA_ERROR_ECC_UNCORRECTABLE              = 214
CUDA_ERROR_UNSUPPORTED_LIMIT              = 215
CUDA_ERROR_CONTEXT_ALREADY_IN_USE         = 216
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED        = 217
CUDA_ERROR_INVALID_PTX                    = 218
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT       = 219

# 3xx: source, file and shared-object errors
CUDA_ERROR_INVALID_SOURCE                 = 300
CUDA_ERROR_FILE_NOT_FOUND                 = 301
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303
CUDA_ERROR_OPERATING_SYSTEM               = 304

# 4xx-6xx: handle, lookup and readiness errors
CUDA_ERROR_INVALID_HANDLE                 = 400
CUDA_ERROR_ILLEGAL_STATE                  = 401
CUDA_ERROR_NOT_FOUND                      = 500
CUDA_ERROR_NOT_READY                      = 600

# 7xx: launch and execution errors
CUDA_ERROR_ILLEGAL_ADDRESS                = 700
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701
CUDA_ERROR_LAUNCH_TIMEOUT                 = 702
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    = 704
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        = 705
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708
CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709
CUDA_ERROR_ASSERT                         = 710
CUDA_ERROR_TOO_MANY_PEERS                 = 711
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED     = 713
CUDA_ERROR_HARDWARE_STACK_ERROR           = 714
CUDA_ERROR_ILLEGAL_INSTRUCTION            = 715
CUDA_ERROR_MISALIGNED_ADDRESS             = 716
CUDA_ERROR_INVALID_ADDRESS_SPACE          = 717
CUDA_ERROR_INVALID_PC                     = 718
CUDA_ERROR_LAUNCH_FAILED                  = 719

# Catch-all
CUDA_ERROR_UNKNOWN                        = 999

# cuLaunchKernel `extra` parameter constants
CU_LAUNCH_PARAM_END            = 0
CU_LAUNCH_PARAM_BUFFER_POINTER = 1
CU_LAUNCH_PARAM_BUFFER_SIZE    = 2

# Lazy-loading state of the module itself.
# @loaded      - flipped to true by ensure_loaded! once all steps have run
# @initialized - flipped to true by initialize_driver! after cuInit was attempted
# @mutex       - lock taken by ensure_loaded! and loaded?
@loaded = false
@initialized = false
@mutex = Mutex.new
81
+
82
+ class << self
# Lazily load the CUDA driver, attach its functions and initialize it.
#
# The whole sequence runs under the module mutex and is skipped once
# @loaded has been set, so repeated calls are cheap no-ops.
#
# @return [void]
# @raise [CudaDriverError] If a driver function cannot be resolved from the
#   loaded library (raised by define_driver_function)
def ensure_loaded!
  @mutex.synchronize do
    unless @loaded
      load_cuda_driver!
      attach_driver_functions!
      initialize_driver!
      @loaded = true
      Ignis.logger.info("CUDA Driver API bindings initialized")
    end
  end
end
97
+
# Report whether ensure_loaded! has completed.
# The flag is read while holding the module mutex.
# @return [Boolean]
def loaded?
  @mutex.synchronize do
    @loaded
  end
end
103
+
# Load a module from CUBIN/PTX data
#
# The image is copied into a native buffer that is one byte longer than the
# input and zero-filled, so the copy always ends with a NUL byte. PTX images
# are consumed by the driver as C strings; without the terminator the driver
# could read past the end of the allocation. The extra byte is harmless for
# binary (CUBIN) images.
#
# @param data [String] CUBIN or PTX binary data
# @return [FFI::Pointer] Module handle (CUmodule)
# @raise [CudaDriverError] If module loading fails
def load_module_data(data)
  ensure_loaded!

  module_ptr = FFI::MemoryPointer.new(:pointer)
  # +1 and clear=true: guarantees a trailing NUL after the copied image.
  data_ptr = FFI::MemoryPointer.new(:char, data.bytesize + 1, true)
  data_ptr.put_bytes(0, data)

  result = cuModuleLoadData(module_ptr, data_ptr)
  check_result!(result, "cuModuleLoadData")

  module_ptr.read_pointer
end
120
+
# Look up a kernel entry point inside a loaded module.
# @param cuda_module [FFI::Pointer] Module handle
# @param function_name [String] Kernel function name
# @return [FFI::Pointer] Function handle (CUfunction)
# @raise [CudaDriverError] If cuModuleGetFunction reports a non-success result
def get_module_function(cuda_module, function_name)
  ensure_loaded!

  handle_out = FFI::MemoryPointer.new(:pointer)
  # from_string copies the name into native memory for the driver call.
  symbol = FFI::MemoryPointer.from_string(function_name)

  status = cuModuleGetFunction(handle_out, cuda_module, symbol)
  check_result!(status, "cuModuleGetFunction(#{function_name})")
  handle_out.read_pointer
end
137
+
# Launch a kernel function through cuLaunchKernel.
# @param function [FFI::Pointer] Function handle (CUfunction)
# @param grid_dim [Array<Integer>] Grid dimensions [x, y, z]
# @param block_dim [Array<Integer>] Block dimensions [x, y, z]
# @param kernel_params [Array<FFI::Pointer>] Kernel parameter pointers
# @param shared_mem [Integer] Dynamic shared memory in bytes
# @param stream [FFI::Pointer, nil] CUDA stream (nil is replaced by a NULL pointer)
# @return [void]
# @raise [CudaDriverError] If kernel launch fails
def launch_kernel(function, grid_dim:, block_dim:, kernel_params:, shared_mem: 0, stream: nil)
  ensure_loaded!

  # Positional argument list in cuLaunchKernel order. The params buffer is
  # referenced from this array, so it stays reachable for the whole call.
  call_args = [
    function,
    *normalize_dims(grid_dim),
    *normalize_dims(block_dim),
    shared_mem,
    stream || FFI::Pointer::NULL,
    build_params_array(kernel_params),
    nil # `extra` is unused
  ]

  check_result!(cuLaunchKernel(*call_args), "cuLaunchKernel")
end
168
+
# Unload a module. Nil and NULL handles are ignored.
#
# The status returned by cuModuleUnload is not checked here; it is simply the
# method's return value.
#
# @param cuda_module [FFI::Pointer] Module handle
# @return [void]
def unload_module(cuda_module)
  return if cuda_module.nil?
  return if cuda_module.null?

  ensure_loaded!
  cuModuleUnload(cuda_module)
end
178
+
# Fetch the context handle reported by cuCtxGetCurrent.
# NOTE(review): the handle may be a NULL pointer when no context is bound to
# the calling thread - confirm against the CUDA documentation and callers.
# @return [FFI::Pointer] Context handle (CUcontext)
# @raise [CudaDriverError] If cuCtxGetCurrent reports a non-success result
def get_current_context
  ensure_loaded!

  context_out = FFI::MemoryPointer.new(:pointer)
  check_result!(cuCtxGetCurrent(context_out), "cuCtxGetCurrent")
  context_out.read_pointer
end
191
+
# Make the given context current via cuCtxSetCurrent.
# @param context [FFI::Pointer] Context handle
# @return [void]
# @raise [CudaDriverError] If context cannot be set
def set_current_context(context)
  ensure_loaded!
  check_result!(cuCtxSetCurrent(context), "cuCtxSetCurrent")
end
202
+
# Query the compute capability of a device.
#
# Resolves the device handle with cuDeviceGet, then reads the major and minor
# attributes one after the other with cuDeviceGetAttribute.
#
# @param device_id [Integer] Device index
# @return [Array<Integer>] [major, minor] compute capability
# @raise [CudaDriverError] If attribute cannot be retrieved
def get_device_compute_capability(device_id)
  ensure_loaded!

  device_out = FFI::MemoryPointer.new(:int)
  check_result!(cuDeviceGet(device_out, device_id), "cuDeviceGet")
  device = device_out.read_int

  # 75 = CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
  # 76 = CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR
  { "major" => 75, "minor" => 76 }.map do |label, attribute|
    value_out = FFI::MemoryPointer.new(:int)
    status = cuDeviceGetAttribute(value_out, attribute, device)
    check_result!(status, "cuDeviceGetAttribute(#{label})")
    value_out.read_int
  end
end
227
+
# Call cuCtxSynchronize and check its result.
# @return [void]
# @raise [CudaDriverError] If synchronization fails
def context_synchronize
  ensure_loaded!
  check_result!(cuCtxSynchronize, "cuCtxSynchronize")
end
237
+
# Turn a non-success CUDA Driver result code into an exception.
# @param result [Integer] CUDA result code
# @param context [String] Context for error message
# @return [void]
# @raise [CudaDriverError] If result is not CUDA_SUCCESS
def check_result!(result, context)
  raise CudaDriverError.new(result, context: context) unless result == CUDA_SUCCESS
end
248
+
249
+ private
250
+
# Load the CUDA driver library.
#
# Asks Ignis::CUDA::LibraryLoader for the :cuda_driver library first; when it
# raises LibraryNotFoundError, falls back to FFI's own lookup of "nvcuda".
#
# @return [void]
def load_cuda_driver!
  Ignis::CUDA::LibraryLoader.load_library(:cuda_driver)
rescue LibraryNotFoundError
  ffi_lib "nvcuda"
end
260
+
# Initialize the CUDA driver once by calling cuInit(0).
#
# A non-success result is only logged as a warning; @initialized is set to
# true in either case, so cuInit is never retried.
# NOTE(review): a failing cuInit may indicate a real problem (e.g. no device)
# rather than repeated initialization - confirm that warn-and-continue is the
# intended policy.
#
# @return [void]
def initialize_driver!
  return if @initialized

  status = cuInit(0)
  unless status == CUDA_SUCCESS
    Ignis.logger.warn("cuInit returned #{status}, driver may already be initialized")
  end
  @initialized = true
end
274
+
# Attach all CUDA Driver FFI functions.
#
# One signature table drives both binding paths: when
# Ignis::CUDA::LibraryLoader returns a handle, each entry is bound with
# define_driver_function; when the loader raises LibraryNotFoundError, the
# same entries are bound with FFI's attach_function. Every function returns
# :int.
#
# @return [void]
def attach_driver_functions!
  launch_kernel_args = [
    :pointer,            # CUfunction
    :uint, :uint, :uint, # grid dims
    :uint, :uint, :uint, # block dims
    :uint,               # shared mem
    :pointer,            # stream
    :pointer,            # kernelParams
    :pointer             # extra
  ]
  signatures = [
    [:cuInit, [:uint]],
    [:cuCtxGetCurrent, [:pointer]],
    [:cuCtxSetCurrent, [:pointer]],
    [:cuCtxSynchronize, []],
    [:cuDeviceGet, [:pointer, :int]],
    [:cuDeviceGetAttribute, [:pointer, :int, :int]],
    [:cuModuleLoadData, [:pointer, :pointer]],
    [:cuModuleLoadDataEx, [:pointer, :pointer, :uint, :pointer, :pointer]],
    [:cuModuleGetFunction, [:pointer, :pointer, :pointer]],
    [:cuModuleUnload, [:pointer]],
    [:cuLaunchKernel, launch_kernel_args]
  ]

  handle = Ignis::CUDA::LibraryLoader.load_library(:cuda_driver)
  signatures.each do |function_name, arg_types|
    define_driver_function(handle, function_name, arg_types, :int)
  end
rescue LibraryNotFoundError
  signatures.each do |function_name, arg_types|
    attach_function function_name, arg_types, :int
  end
end
316
+
# Bind one Driver API function from an already-loaded library handle.
#
# The handle is indexed with the symbol name (LibraryLoader returns a
# Fiddle::Handle, whose #[] yields the symbol address and raises
# Fiddle::DLError when it is missing). The address is wrapped in an
# FFI::Function and exposed as a singleton method of the same name.
#
# @param handle [Fiddle::Handle] Library handle
# @param name [Symbol] Function name
# @param args [Array] Argument types
# @param ret [Symbol] Return type
# @return [void]
# @raise [CudaDriverError] If the symbol cannot be resolved
def define_driver_function(handle, name, args, ret)
  address =
    begin
      handle[name.to_s]
    rescue Fiddle::DLError
      nil
    end

  unless address
    raise CudaDriverError.new(CUDA_ERROR_NOT_FOUND, context: "Function #{name} not found")
  end

  native = FFI::Function.new(ret, args, FFI::Pointer.new(address))
  define_singleton_method(name) { |*call_args| native.call(*call_args) }
end
337
+
# Expand a dimension spec to exactly three components.
# A bare Integer becomes the x component, missing components default to 1,
# and anything past the third is dropped. The caller's array is not modified.
# @param dims [Array<Integer>, Integer] Dimensions
# @return [Array<Integer>] [x, y, z] with defaults of 1
def normalize_dims(dims)
  padded = dims.is_a?(Integer) ? [dims] : dims.dup
  padded << 1 until padded.size >= 3
  padded[0, 3]
end
347
+
# Pack kernel parameter pointers into a native pointer array.
# @param params [Array<FFI::Pointer>] Parameter pointers
# @return [FFI::Pointer, nil] Array of pointers for cuLaunchKernel, or nil
#   when there are no parameters
def build_params_array(params)
  return nil if params.empty?

  stride = FFI::Pointer.size
  table = FFI::MemoryPointer.new(:pointer, params.size)
  params.each.with_index do |pointer, slot|
    table.put_pointer(slot * stride, pointer)
  end
  table
end
360
+ end
361
+ end
362
+ end
363
+ end
@@ -0,0 +1,240 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module JIT
5
+ # High-level kernel execution interface
6
+ # Provides easy-to-use launch configuration and argument marshaling
7
+ class Kernel
8
+ # @return [KernelModule] The loaded kernel module this wrapper launches
9
+ attr_reader :kernel_module
10
+
11
+ # @return [Integer] Number of #launch calls that completed without raising
12
+ attr_reader :launch_count
13
+
14
+ # @return [Float] Seconds of host wall-clock time accumulated inside #launch
15
+ attr_reader :total_launch_time
16
+
# Wrap a loaded kernel module and start with zeroed launch statistics.
# @param kernel_module [KernelModule] The loaded kernel module
def initialize(kernel_module)
  @mutex = Mutex.new # taken by #launch and #stats around the counters
  @kernel_module = kernel_module
  @launch_count, @total_launch_time = 0, 0.0
end
25
+
# Kernel function name, as reported by the underlying kernel module.
# @return [String]
def name
  kernel_module.kernel_name
end
31
+
# Device index, as reported by the underlying kernel module.
# @return [Integer]
def device_id
  kernel_module.device_id
end
37
+
# Launch the kernel with the given configuration.
#
# Selects the kernel's device, marshals the arguments, hands everything to
# DriverAPIBindings.launch_kernel and then updates the launch statistics
# under the instance mutex. The recorded time covers the work done inside
# this method (device selection, marshalling and the driver call).
#
# @param grid [Array<Integer>, Integer] Grid dimensions (blocks)
# @param block [Array<Integer>, Integer] Block dimensions (threads)
# @param args [Array] Kernel arguments (NvArray, scalars, pointers)
# @param shared_mem [Integer] Dynamic shared memory in bytes
# @param stream [CUDA::Stream, nil] CUDA stream for async execution
# @return [self] Returns self for chaining
# @raise [CudaDriverError] If launch fails
# @raise [InvalidOperationError] If kernel module destroyed
def launch(grid:, block:, args: [], shared_mem: 0, stream: nil)
  if @kernel_module.destroyed?
    raise InvalidOperationError, "Kernel module has been destroyed"
  end

  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  Ignis.set_device(device_id)
  # Held in locals so the marshalled buffers stay reachable during the call.
  marshalled = marshal_arguments(args)
  raw_stream = extract_stream_pointer(stream)

  DriverAPIBindings.launch_kernel(
    @kernel_module.function_handle,
    grid_dim: normalize_dims(grid),
    block_dim: normalize_dims(block),
    kernel_params: marshalled,
    shared_mem: shared_mem,
    stream: raw_stream
  )

  @mutex.synchronize do
    @launch_count += 1
    finished_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    @total_launch_time += finished_at - started_at
  end

  self
end
72
+
# Launch on the default stream (stream: nil), then call Ignis.synchronize.
# @param grid [Array<Integer>, Integer] Grid dimensions
# @param block [Array<Integer>, Integer] Block dimensions
# @param args [Array] Kernel arguments
# @param shared_mem [Integer] Dynamic shared memory
# @return [self]
def launch_sync(grid:, block:, args: [], shared_mem: 0)
  launch_options = { grid: grid, block: block, args: args, shared_mem: shared_mem, stream: nil }
  launch(**launch_options)
  Ignis.synchronize
  self
end
84
+
# Number of blocks needed to cover an element count (ceiling division).
# @param total_elements [Integer] Total number of elements to process
# @param block_size [Integer] Desired block size (threads per block)
# @return [Array<Integer>] One-element grid dimensions, [blocks]
def self.calc_grid_size(total_elements, block_size: 256)
  # Adding block_size - 1 before the integer division rounds up.
  rounded_up = total_elements + block_size - 1
  [rounded_up / block_size]
end
93
+
# Snapshot of the launch statistics, taken under the instance mutex.
# @return [Hash] kernel_name, device_id, launch_count, total_launch_time and
#   avg_launch_time (0.0 while nothing has been launched)
def stats
  @mutex.synchronize do
    snapshot = { kernel_name: name, device_id: device_id }
    snapshot[:launch_count] = @launch_count
    snapshot[:total_launch_time] = @total_launch_time
    snapshot[:avg_launch_time] =
      if @launch_count.positive?
        @total_launch_time / @launch_count
      else
        0.0
      end
    snapshot
  end
end
107
+
108
+ # Get a string representation
109
+ # @return [String]
110
+ def to_s
111
+ "#<Ignis::JIT::Kernel #{name} device=#{device_id} launches=#{@launch_count}>"
112
+ end
113
+
114
+ # Get detailed inspection
115
+ # @return [String]
116
+ def inspect
117
+ "#<Ignis::JIT::Kernel:0x#{object_id.to_s(16)} " \
118
+ "name=#{name.inspect} " \
119
+ "device=#{device_id} " \
120
+ "launches=#{@launch_count}>"
121
+ end
122
+
123
+ private
124
+
# Convert every kernel argument into a native parameter pointer, in order.
# @param args [Array] Mixed array of NvArray, scalars, or pointers
# @return [Array<FFI::Pointer>] Array of parameter pointers
def marshal_arguments(args)
  args.map(&method(:marshal_single_argument))
end
131
+
# Dispatch one kernel argument to the matching marshaller.
#
# U64 wrappers, FFI pointers, Integers and Floats are matched by class.
# Anything else is accepted only if it responds to #device_ffi_ptr (duck-typed
# GPU array such as Ignis::Shared::NvArray, or Ignis::NvArray from
# ignis-numerics).
#
# @param arg [Object] U64, FFI::Pointer, Integer, Float, or GPU array
# @return [FFI::Pointer]
# @raise [ArgumentError] If the argument type is not supported
def marshal_single_argument(arg)
  case arg
  when U64          then marshal_u64(arg.value)
  when FFI::Pointer then marshal_pointer(arg)
  when Integer      then marshal_integer(arg)
  when Float        then marshal_float(arg)
  else
    unless arg.respond_to?(:device_ffi_ptr)
      raise ArgumentError, "Unsupported kernel argument type: #{arg.class}"
    end

    marshal_nvarray(arg)
  end
end
155
+
156
+ # Marker wrapper that routes an integer kernel argument through marshal_u64
157
+ # (64-bit, `unsigned long long` / `size_t`). Bare Integers are marshalled as
158
+ # 32-bit `int` by marshal_integer, so every 64-bit parameter (e.g. an RNG
159
+ # seed) MUST be wrapped: kernel.launch(args: [..., U64.new(seed), n]).
160
+ U64 = Struct.new(:value)
161
+
# Marshal a GPU array as a pointer-sized slot holding its device address.
# The array is moved to the device first when it reports it is not there.
# @param arr [NvArray]
# @return [FFI::Pointer]
def marshal_nvarray(arr)
  arr.to_device unless arr.on_device?

  FFI::MemoryPointer.new(:pointer).tap do |slot|
    # device_ptr is a Fiddle::Pointer; the Driver API launch path is FFI, so
    # the FFI-wrapped device pointer is written instead.
    slot.write_pointer(arr.device_ffi_ptr)
  end
end
173
+
# Wrap an existing pointer in a pointer-sized slot that holds its address.
# @param pointer [FFI::Pointer]
# @return [FFI::Pointer]
def marshal_pointer(pointer)
  FFI::MemoryPointer.new(:pointer).tap { |slot| slot.write_pointer(pointer) }
end
182
+
# Marshal a 32-bit integer value (for `int` kernel params).
# Values outside the signed 32-bit range are rejected instead of truncated.
# @param value [Integer]
# @return [FFI::Pointer]
# @raise [ArgumentError] If value does not fit in a signed 32-bit integer
def marshal_integer(value)
  if value < -(2**31) || value >= 2**31
    raise ArgumentError,
          "Integer #{value} exceeds 32-bit kernel-arg range; wrap it in " \
          "Ignis::JIT::Kernel::U64.new(x) for an unsigned long long parameter"
  end

  FFI::MemoryPointer.new(:int32).tap { |slot| slot.write_int32(value) }
end
196
+
# Marshal a 64-bit unsigned integer (for `unsigned long long` / `size_t` params).
#
# The value is validated before it is written, mirroring the range check in
# marshal_integer: a negative or oversized value would otherwise reach the
# native write unchecked.
#
# @param value [Integer] Must satisfy 0 <= value < 2**64
# @return [FFI::Pointer]
# @raise [ArgumentError] If value is not an Integer in the unsigned 64-bit range
def marshal_u64(value)
  unless value.is_a?(Integer) && value >= 0 && value < 2**64
    raise ArgumentError,
          "U64 kernel argument must be an Integer in 0...2**64, got #{value.inspect}"
  end

  ptr = FFI::MemoryPointer.new(:uint64)
  ptr.write_uint64(value)
  ptr
end
205
+
# Marshal a Float into a native `float`-typed slot.
# @param value [Float]
# @return [FFI::Pointer]
def marshal_float(value)
  FFI::MemoryPointer.new(:float).tap { |slot| slot.write_float(value) }
end
214
+
# Expand a dimension spec to [x, y, z] format.
# A bare Integer becomes the x component, missing components default to 1,
# and anything past the third is dropped. The caller's array is not modified.
# @param dims [Array<Integer>, Integer]
# @return [Array<Integer>]
def normalize_dims(dims)
  padded = dims.is_a?(Integer) ? [dims] : dims.dup
  padded << 1 until padded.size >= 3
  padded[0, 3]
end
224
+
# Extract raw stream pointer
#
# nil and FFI pointers pass through unchanged; any other object must expose
# #handle. Objects that do not are rejected: silently mapping them to nil
# would launch the kernel on the default stream instead of the one the caller
# asked for.
#
# @param stream [CUDA::Stream, FFI::Pointer, nil]
# @return [FFI::Pointer, nil]
# @raise [ArgumentError] If stream is neither nil, an FFI::Pointer, nor an
#   object responding to #handle
def extract_stream_pointer(stream)
  case stream
  when nil, FFI::Pointer
    stream
  else
    unless stream.respond_to?(:handle)
      raise ArgumentError,
            "Unsupported stream type: #{stream.class} (expected CUDA::Stream, FFI::Pointer, or nil)"
    end

    stream.handle
  end
end
238
+ end
239
+ end
240
+ end