ignis 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +15 -0
- data/lib/ignis.rb +94 -0
- data/lib/nnw/platform.rb +304 -0
- data/lib/nnw/shared/event_bus.rb +240 -0
- data/lib/nnw/shared/ffi_loader.rb +63 -0
- data/lib/nnw/shared/memory_contract.rb +204 -0
- data/lib/nnw/shared/nv_array.rb +710 -0
- data/lib/nnw/shared/recovery_protocol.rb +307 -0
- data/lib/nvruby/configuration.rb +217 -0
- data/lib/nvruby/cuda/device.rb +275 -0
- data/lib/nvruby/cuda/device_props.rb +202 -0
- data/lib/nvruby/cuda/graph.rb +265 -0
- data/lib/nvruby/cuda/graph_bindings.rb +119 -0
- data/lib/nvruby/cuda/library_loader.rb +285 -0
- data/lib/nvruby/cuda/memory.rb +410 -0
- data/lib/nvruby/cuda/runtime_api.rb +804 -0
- data/lib/nvruby/cuda/stream.rb +234 -0
- data/lib/nvruby/dtype.rb +139 -0
- data/lib/nvruby/epilogues.rb +438 -0
- data/lib/nvruby/errors.rb +303 -0
- data/lib/nvruby/half.rb +97 -0
- data/lib/nvruby/jit/compiled_kernel.rb +80 -0
- data/lib/nvruby/jit/compiler.rb +231 -0
- data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
- data/lib/nvruby/jit/kernel.rb +240 -0
- data/lib/nvruby/jit/kernel_module.rb +133 -0
- data/lib/nvruby/jit/kernels/activations.rb +179 -0
- data/lib/nvruby/jit/kernels/attention.rb +504 -0
- data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
- data/lib/nvruby/jit/kernels/loss.rb +213 -0
- data/lib/nvruby/jit/kernels/normalization.rb +200 -0
- data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
- data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
- data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
- data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
- data/lib/nvruby/linalg/epilog.rb +67 -0
- data/lib/nvruby/linalg/matmul.rb +247 -0
- data/lib/nvruby/linalg/matmul_plan.rb +229 -0
- data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
- data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
- data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
- data/lib/nvruby/memory/device_memory_resource.rb +106 -0
- data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
- data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
- data/lib/nvruby/memory/stats.rb +107 -0
- data/lib/nvruby/memory.rb +124 -0
- data/lib/nvruby/version.rb +5 -0
- metadata +108 -0
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ffi"
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module JIT
|
|
7
|
+
# CUDA Driver API FFI bindings for module and kernel management
|
|
8
|
+
# Provides low-level access to load compiled code and execute kernels
|
|
9
|
+
module DriverAPIBindings
|
|
10
|
+
extend FFI::Library
|
|
11
|
+
|
|
12
|
+
# CUDA Driver result codes
|
|
13
|
+
# CUDA Driver result codes, grouped by numeric range.

# 0-99
CUDA_SUCCESS                 = 0
CUDA_ERROR_INVALID_VALUE     = 1
CUDA_ERROR_OUT_OF_MEMORY     = 2
CUDA_ERROR_NOT_INITIALIZED   = 3
CUDA_ERROR_DEINITIALIZED     = 4
CUDA_ERROR_PROFILER_DISABLED = 5

# 100-199
CUDA_ERROR_NO_DEVICE      = 100
CUDA_ERROR_INVALID_DEVICE = 101

# 200-299
CUDA_ERROR_INVALID_IMAGE            = 200
CUDA_ERROR_INVALID_CONTEXT          = 201
CUDA_ERROR_CONTEXT_ALREADY_CURRENT  = 202
CUDA_ERROR_MAP_FAILED               = 205
CUDA_ERROR_UNMAP_FAILED             = 206
CUDA_ERROR_ARRAY_IS_MAPPED          = 207
CUDA_ERROR_ALREADY_MAPPED           = 208
CUDA_ERROR_NO_BINARY_FOR_GPU        = 209
CUDA_ERROR_ALREADY_ACQUIRED         = 210
CUDA_ERROR_NOT_MAPPED               = 211
CUDA_ERROR_NOT_MAPPED_AS_ARRAY      = 212
CUDA_ERROR_NOT_MAPPED_AS_POINTER    = 213
CUDA_ERROR_ECC_UNCORRECTABLE        = 214
CUDA_ERROR_UNSUPPORTED_LIMIT        = 215
CUDA_ERROR_CONTEXT_ALREADY_IN_USE   = 216
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED  = 217
CUDA_ERROR_INVALID_PTX              = 218
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219

# 300-399
CUDA_ERROR_INVALID_SOURCE                 = 300
CUDA_ERROR_FILE_NOT_FOUND                 = 301
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303
CUDA_ERROR_OPERATING_SYSTEM               = 304

# 400-699
CUDA_ERROR_INVALID_HANDLE = 400
CUDA_ERROR_ILLEGAL_STATE  = 401
CUDA_ERROR_NOT_FOUND      = 500
CUDA_ERROR_NOT_READY      = 600

# 700-799
CUDA_ERROR_ILLEGAL_ADDRESS                = 700
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701
CUDA_ERROR_LAUNCH_TIMEOUT                 = 702
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    = 704
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        = 705
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708
CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709
CUDA_ERROR_ASSERT                         = 710
CUDA_ERROR_TOO_MANY_PEERS                 = 711
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED     = 713
CUDA_ERROR_HARDWARE_STACK_ERROR           = 714
CUDA_ERROR_ILLEGAL_INSTRUCTION            = 715
CUDA_ERROR_MISALIGNED_ADDRESS             = 716
CUDA_ERROR_INVALID_ADDRESS_SPACE          = 717
CUDA_ERROR_INVALID_PC                     = 718
CUDA_ERROR_LAUNCH_FAILED                  = 719

CUDA_ERROR_UNKNOWN = 999

# cuLaunchKernel extra parameter constants
CU_LAUNCH_PARAM_END            = 0
CU_LAUNCH_PARAM_BUFFER_POINTER = 1
CU_LAUNCH_PARAM_BUFFER_SIZE    = 2

# Module-level state, read and written by the singleton methods below.

# @return [Boolean] Whether bindings are loaded
@loaded = false

# @return [Boolean] Whether driver is initialized
@initialized = false

# @return [Mutex] Thread safety lock
@mutex = Mutex.new
|
|
81
|
+
|
|
82
|
+
class << self
|
|
83
|
+
# Ensure CUDA Driver API is loaded and initialized
|
|
84
|
+
# @return [void]
|
|
85
|
+
# @raise [LibraryNotFoundError] If CUDA driver cannot be loaded
|
|
86
|
+
def ensure_loaded!
  # The whole bootstrap runs under the lock so concurrent callers cannot
  # both observe @loaded == false and load the driver twice.
  @mutex.synchronize do
    unless @loaded
      load_cuda_driver!
      attach_driver_functions!
      initialize_driver!
      # Only flipped after every step above returned without raising.
      @loaded = true
      Ignis.logger.info("CUDA Driver API bindings initialized")
    end
  end
end
|
|
97
|
+
|
|
98
|
+
# Check if Driver API is loaded
|
|
99
|
+
# @return [Boolean]
|
|
100
|
+
def loaded?
  # Read under the same lock ensure_loaded! writes under.
  @mutex.synchronize do
    @loaded
  end
end
|
|
103
|
+
|
|
104
|
+
# Load a module from CUBIN/PTX data
|
|
105
|
+
# @param data [String] CUBIN or PTX binary data
|
|
106
|
+
# @return [FFI::Pointer] Module handle (CUmodule)
|
|
107
|
+
# @raise [CudaDriverError] If module loading fails
|
|
108
|
+
def load_module_data(data)
  ensure_loaded!

  # Out-parameter that receives the CUmodule handle.
  module_ptr = FFI::MemoryPointer.new(:pointer)

  # cuModuleLoadData expects a PTX image to be NUL-terminated. The buffer is
  # allocated one byte larger than the payload; FFI::MemoryPointer zero-fills
  # new memory by default, so that spare byte is the terminator even when the
  # Ruby string has none. For CUBIN input the extra byte is simply unused.
  # It also keeps the allocation non-empty when `data` is "".
  data_ptr = FFI::MemoryPointer.new(:char, data.bytesize + 1)
  data_ptr.put_bytes(0, data)

  result = cuModuleLoadData(module_ptr, data_ptr)
  check_result!(result, "cuModuleLoadData")

  module_ptr.read_pointer
end
|
|
120
|
+
|
|
121
|
+
# Get a kernel function from a loaded module
|
|
122
|
+
# @param cuda_module [FFI::Pointer] Module handle
|
|
123
|
+
# @param function_name [String] Kernel function name
|
|
124
|
+
# @return [FFI::Pointer] Function handle (CUfunction)
|
|
125
|
+
# @raise [CudaDriverError] If function is not found
|
|
126
|
+
def get_module_function(cuda_module, function_name)
  ensure_loaded!

  # Out-parameter for the CUfunction handle, plus a C string holding the name.
  handle_out = FFI::MemoryPointer.new(:pointer)
  symbol = FFI::MemoryPointer.from_string(function_name)

  status = cuModuleGetFunction(handle_out, cuda_module, symbol)
  check_result!(status, "cuModuleGetFunction(#{function_name})")
  handle_out.read_pointer
end
|
|
137
|
+
|
|
138
|
+
# Launch a kernel function
|
|
139
|
+
# @param function [FFI::Pointer] Function handle (CUfunction)
|
|
140
|
+
# @param grid_dim [Array<Integer>] Grid dimensions [x, y, z]
|
|
141
|
+
# @param block_dim [Array<Integer>] Block dimensions [x, y, z]
|
|
142
|
+
# @param kernel_params [Array<FFI::Pointer>] Kernel parameter pointers
|
|
143
|
+
# @param shared_mem [Integer] Dynamic shared memory in bytes
|
|
144
|
+
# @param stream [FFI::Pointer, nil] CUDA stream (nil for default)
|
|
145
|
+
# @return [void]
|
|
146
|
+
# @raise [CudaDriverError] If kernel launch fails
|
|
147
|
+
def launch_kernel(function, grid_dim:, block_dim:, kernel_params:, shared_mem: 0, stream: nil)
  ensure_loaded!

  grid = normalize_dims(grid_dim)
  block = normalize_dims(block_dim)

  # Held in a local so the packed pointer array stays referenced for the
  # duration of the driver call.
  packed_params = build_params_array(kernel_params)
  # A nil stream is passed to the driver as the NULL pointer.
  raw_stream = stream || FFI::Pointer::NULL

  status = cuLaunchKernel(
    function,
    grid[0], grid[1], grid[2],
    block[0], block[1], block[2],
    shared_mem,
    raw_stream,
    packed_params,
    nil # `extra` is not used; arguments travel via kernelParams
  )

  check_result!(status, "cuLaunchKernel")
end
|
|
168
|
+
|
|
169
|
+
# Unload a module
|
|
170
|
+
# @param cuda_module [FFI::Pointer] Module handle
|
|
171
|
+
# @return [void]
|
|
172
|
+
def unload_module(cuda_module)
  # Nothing to release for a missing or NULL handle.
  return if cuda_module.nil?
  return if cuda_module.null?

  ensure_loaded!
  # The driver's status code is returned to the caller unchecked.
  cuModuleUnload(cuda_module)
end
|
|
178
|
+
|
|
179
|
+
# Get current CUDA context
|
|
180
|
+
# @return [FFI::Pointer] Context handle (CUcontext)
|
|
181
|
+
# @raise [CudaDriverError] If no context is current
|
|
182
|
+
def get_current_context
  ensure_loaded!

  # Out-parameter that receives the CUcontext handle.
  context_out = FFI::MemoryPointer.new(:pointer)
  status = cuCtxGetCurrent(context_out)
  check_result!(status, "cuCtxGetCurrent")
  context_out.read_pointer
end
|
|
191
|
+
|
|
192
|
+
# Set current CUDA context
|
|
193
|
+
# @param context [FFI::Pointer] Context handle
|
|
194
|
+
# @return [void]
|
|
195
|
+
# @raise [CudaDriverError] If context cannot be set
|
|
196
|
+
def set_current_context(context)
  ensure_loaded!
  # Any non-success status is turned into an exception by check_result!.
  status = cuCtxSetCurrent(context)
  check_result!(status, "cuCtxSetCurrent")
end
|
|
202
|
+
|
|
203
|
+
# Get device compute capability
|
|
204
|
+
# @param device_id [Integer] Device index
|
|
205
|
+
# @return [Array<Integer>] [major, minor] compute capability
|
|
206
|
+
# @raise [CudaDriverError] If attribute cannot be retrieved
|
|
207
|
+
def get_device_compute_capability(device_id)
  ensure_loaded!

  # Translate the ordinal into the device handle the attribute query takes.
  device_out = FFI::MemoryPointer.new(:int)
  check_result!(cuDeviceGet(device_out, device_id), "cuDeviceGet")
  device = device_out.read_int

  major_out = FFI::MemoryPointer.new(:int)
  minor_out = FFI::MemoryPointer.new(:int)

  # 75 = CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
  major_status = cuDeviceGetAttribute(major_out, 75, device)
  check_result!(major_status, "cuDeviceGetAttribute(major)")

  # 76 = CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR
  minor_status = cuDeviceGetAttribute(minor_out, 76, device)
  check_result!(minor_status, "cuDeviceGetAttribute(minor)")

  [major_out, minor_out].map(&:read_int)
end
|
|
227
|
+
|
|
228
|
+
# Synchronize current context
|
|
229
|
+
# @return [void]
|
|
230
|
+
# @raise [CudaDriverError] If synchronization fails
|
|
231
|
+
def context_synchronize
  ensure_loaded!
  # Status of the synchronize call is validated by check_result!.
  check_result!(cuCtxSynchronize(), "cuCtxSynchronize")
end
|
|
237
|
+
|
|
238
|
+
# Check CUDA Driver result and raise on error
|
|
239
|
+
# @param result [Integer] CUDA result code
|
|
240
|
+
# @param context [String] Context for error message
|
|
241
|
+
# @return [void]
|
|
242
|
+
# @raise [CudaDriverError] If result is not success
|
|
243
|
+
def check_result!(result, context)
  # Anything other than CUDA_SUCCESS becomes an exception carrying the raw
  # status code and the name of the call that produced it.
  raise CudaDriverError.new(result, context: context) unless result == CUDA_SUCCESS
end
|
|
248
|
+
|
|
249
|
+
private
|
|
250
|
+
|
|
251
|
+
# Load CUDA driver library
|
|
252
|
+
# @return [void]
|
|
253
|
+
def load_cuda_driver!
  Ignis::CUDA::LibraryLoader.load_library(:cuda_driver)
rescue LibraryNotFoundError
  # Fallback when the project loader cannot find the driver: let FFI resolve
  # the library by name instead.
  ffi_lib "nvcuda"
end
|
|
260
|
+
|
|
261
|
+
# Initialize CUDA driver
|
|
262
|
+
# @return [void]
|
|
263
|
+
def initialize_driver!
  return if @initialized

  status = cuInit(0)
  # NOTE(review): a non-success status is only logged, and the driver is
  # still marked initialized below. Confirm this best-effort behaviour is
  # intended, since later calls will then fail with their own error codes.
  unless status == CUDA_SUCCESS
    Ignis.logger.warn("cuInit returned #{status}, driver may already be initialized")
  end
  @initialized = true
end
|
|
274
|
+
|
|
275
|
+
# Attach all CUDA Driver FFI functions
|
|
276
|
+
# @return [void]
|
|
277
|
+
def attach_driver_functions!
  # Entry point => argument types. Every function returns an :int status.
  signatures = {
    cuInit: [:uint],
    cuCtxGetCurrent: [:pointer],
    cuCtxSetCurrent: [:pointer],
    cuCtxSynchronize: [],
    cuDeviceGet: [:pointer, :int],
    cuDeviceGetAttribute: [:pointer, :int, :int],
    cuModuleLoadData: [:pointer, :pointer],
    cuModuleLoadDataEx: [:pointer, :pointer, :uint, :pointer, :pointer],
    cuModuleGetFunction: [:pointer, :pointer, :pointer],
    cuModuleUnload: [:pointer],
    cuLaunchKernel: [
      :pointer,            # CUfunction
      :uint, :uint, :uint, # grid dims
      :uint, :uint, :uint, # block dims
      :uint,               # shared mem
      :pointer,            # stream
      :pointer,            # kernelParams
      :pointer             # extra
    ]
  }

  # Preferred path: bind each symbol from the handle the project loader returns.
  library = Ignis::CUDA::LibraryLoader.load_library(:cuda_driver)
  signatures.map { |name, arg_types| define_driver_function(library, name, arg_types, :int) }.last
rescue LibraryNotFoundError
  # Fallback path: bind the same signatures through FFI's attach_function.
  signatures.map { |name, arg_types| attach_function(name, arg_types, :int) }.last
end
|
|
316
|
+
|
|
317
|
+
# Define a Driver API function from the loaded library
|
|
318
|
+
# @param handle [FFI::DynamicLibrary] Library handle
|
|
319
|
+
# @param name [Symbol] Function name
|
|
320
|
+
# @param args [Array] Argument types
|
|
321
|
+
# @param ret [Symbol] Return type
|
|
322
|
+
# @return [void]
|
|
323
|
+
def define_driver_function(handle, name, args, ret)
  # The handle comes from LibraryLoader as a Fiddle::Handle, which has no
  # #find_function; Fiddle::Handle#[] yields the raw symbol address instead.
  # A failed lookup is mapped to nil so it can be reported as a driver error.
  address =
    begin
      handle[name.to_s]
    rescue Fiddle::DLError
      nil
    end

  unless address
    raise CudaDriverError.new(CUDA_ERROR_NOT_FOUND, context: "Function #{name} not found")
  end

  # Wrap the address in an FFI::Function and expose it as a singleton method
  # under the C function's own name.
  native = FFI::Function.new(ret, args, FFI::Pointer.new(address))
  define_singleton_method(name) { |*call_args| native.call(*call_args) }
end
|
|
337
|
+
|
|
338
|
+
# Normalize dimensions to [x, y, z]
|
|
339
|
+
# @param dims [Array<Integer>, Integer] Dimensions
|
|
340
|
+
# @return [Array<Integer>] [x, y, z] with defaults of 1
|
|
341
|
+
def normalize_dims(dims)
  # Work on a copy so the caller's array is never modified.
  padded = dims.is_a?(Integer) ? [dims] : dims.dup
  # Missing trailing dimensions default to 1; extras beyond z are dropped.
  padded.push(1) until padded.size >= 3
  padded[0, 3]
end
|
|
347
|
+
|
|
348
|
+
# Build kernel params array from pointers
|
|
349
|
+
# @param params [Array<FFI::Pointer>] Parameter pointers
|
|
350
|
+
# @return [FFI::Pointer] Array of pointers for cuLaunchKernel
|
|
351
|
+
def build_params_array(params)
  # No arguments: the caller receives nil instead of an empty buffer.
  return nil if params.empty?

  stride = FFI::Pointer.size
  slots = FFI::MemoryPointer.new(:pointer, params.size)
  # Slot i holds the address of the i-th argument's storage.
  params.each.with_index do |argument, index|
    slots.put_pointer(index * stride, argument)
  end
  slots
end
|
|
360
|
+
end
|
|
361
|
+
end
|
|
362
|
+
end
|
|
363
|
+
end
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module JIT
|
|
5
|
+
# High-level kernel execution interface
|
|
6
|
+
# Provides easy-to-use launch configuration and argument marshaling
|
|
7
|
+
class Kernel
|
|
8
|
+
# @return [KernelModule] The underlying kernel module
|
|
9
|
+
attr_reader :kernel_module
|
|
10
|
+
|
|
11
|
+
# @return [Integer] Total number of times this kernel has been launched
|
|
12
|
+
attr_reader :launch_count
|
|
13
|
+
|
|
14
|
+
# @return [Float] Total time spent in kernel launches (seconds)
|
|
15
|
+
attr_reader :total_launch_time
|
|
16
|
+
|
|
17
|
+
# Create a new Kernel wrapper
|
|
18
|
+
# @param kernel_module [KernelModule] The loaded kernel module
|
|
19
|
+
def initialize(kernel_module)
  # Guards the two launch counters below.
  @mutex = Mutex.new
  @kernel_module = kernel_module
  @total_launch_time = 0.0
  @launch_count = 0
end
|
|
25
|
+
|
|
26
|
+
# Get the kernel function name
|
|
27
|
+
# @return [String]
|
|
28
|
+
def name
  # Delegated: the module object owns the kernel's name.
  loaded_module = @kernel_module
  loaded_module.kernel_name
end
|
|
31
|
+
|
|
32
|
+
# Get the device this kernel is loaded on
|
|
33
|
+
# @return [Integer]
|
|
34
|
+
def device_id
  # Delegated: the module object records which device it was loaded on.
  loaded_module = @kernel_module
  loaded_module.device_id
end
|
|
37
|
+
|
|
38
|
+
# Launch the kernel with specified configuration
|
|
39
|
+
# @param grid [Array<Integer>, Integer] Grid dimensions (blocks)
|
|
40
|
+
# @param block [Array<Integer>, Integer] Block dimensions (threads)
|
|
41
|
+
# @param args [Array] Kernel arguments (NvArray, scalars, pointers)
|
|
42
|
+
# @param shared_mem [Integer] Dynamic shared memory in bytes
|
|
43
|
+
# @param stream [CUDA::Stream, nil] CUDA stream for async execution
|
|
44
|
+
# @return [self] Returns self for chaining
|
|
45
|
+
# @raise [CudaDriverError] If launch fails
|
|
46
|
+
# @raise [InvalidOperationError] If kernel module destroyed
|
|
47
|
+
def launch(grid:, block:, args: [], shared_mem: 0, stream: nil)
  if @kernel_module.destroyed?
    raise InvalidOperationError, "Kernel module has been destroyed"
  end

  # Monotonic clock; the measured interval covers device selection, argument
  # marshalling and the driver call.
  began = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  Ignis.set_device(device_id)
  # Kept in a local so the marshalled argument storage stays referenced
  # until the driver call has returned.
  params = marshal_arguments(args)
  raw_stream = extract_stream_pointer(stream)

  DriverAPIBindings.launch_kernel(
    @kernel_module.function_handle,
    grid_dim: normalize_dims(grid),
    block_dim: normalize_dims(block),
    kernel_params: params,
    shared_mem: shared_mem,
    stream: raw_stream
  )

  # Counters are only touched after a successful launch, under the lock.
  @mutex.synchronize do
    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - began
    @launch_count += 1
    @total_launch_time += elapsed
  end

  self
end
|
|
72
|
+
|
|
73
|
+
# Synchronous kernel launch with automatic device synchronization
|
|
74
|
+
# @param grid [Array<Integer>, Integer] Grid dimensions
|
|
75
|
+
# @param block [Array<Integer>, Integer] Block dimensions
|
|
76
|
+
# @param args [Array] Kernel arguments
|
|
77
|
+
# @param shared_mem [Integer] Dynamic shared memory
|
|
78
|
+
# @return [self]
|
|
79
|
+
def launch_sync(grid:, block:, args: [], shared_mem: 0)
  # Always launched with a nil stream, then blocked on Ignis.synchronize.
  options = { grid: grid, block: block, args: args, shared_mem: shared_mem, stream: nil }
  launch(**options)
  Ignis.synchronize
  self
end
|
|
84
|
+
|
|
85
|
+
# Calculate optimal grid size for a given element count
|
|
86
|
+
# @param total_elements [Integer] Total number of elements to process
|
|
87
|
+
# @param block_size [Integer] Desired block size (threads per block)
|
|
88
|
+
# @return [Array<Integer>] Grid dimensions
|
|
89
|
+
def self.calc_grid_size(total_elements, block_size: 256)
  # Fail fast on a non-positive block size: zero would otherwise surface as a
  # bare ZeroDivisionError, and a negative value would silently produce a
  # negative block count.
  unless block_size.positive?
    raise ArgumentError, "block_size must be positive, got #{block_size}"
  end

  # Integer ceiling division: enough blocks to cover every element, with the
  # last block possibly only partly used. Zero elements yields [0].
  blocks = (total_elements + block_size - 1) / block_size
  [blocks]
end
|
|
93
|
+
|
|
94
|
+
# Get execution statistics
|
|
95
|
+
# @return [Hash] Statistics including launch count and timing
|
|
96
|
+
def stats
  # Both counters are read under the lock so the snapshot is consistent.
  @mutex.synchronize do
    launches = @launch_count
    elapsed = @total_launch_time
    # Avoid dividing by zero before the first launch.
    average = launches.positive? ? elapsed / launches : 0.0

    {
      kernel_name: name,
      device_id: device_id,
      launch_count: launches,
      total_launch_time: elapsed,
      avg_launch_time: average
    }
  end
end
|
|
107
|
+
|
|
108
|
+
# Get a string representation
|
|
109
|
+
# @return [String]
|
|
110
|
+
def to_s
|
|
111
|
+
"#<Ignis::JIT::Kernel #{name} device=#{device_id} launches=#{@launch_count}>"
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
# Get detailed inspection
|
|
115
|
+
# @return [String]
|
|
116
|
+
def inspect
|
|
117
|
+
"#<Ignis::JIT::Kernel:0x#{object_id.to_s(16)} " \
|
|
118
|
+
"name=#{name.inspect} " \
|
|
119
|
+
"device=#{device_id} " \
|
|
120
|
+
"launches=#{@launch_count}>"
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
private
|
|
124
|
+
|
|
125
|
+
# Marshal Ruby arguments into FFI pointers for kernel params
|
|
126
|
+
# @param args [Array] Mixed array of NvArray, scalars, or pointers
|
|
127
|
+
# @return [Array<FFI::Pointer>] Array of parameter pointers
|
|
128
|
+
def marshal_arguments(args)
  # One parameter pointer per argument, in the order given.
  args.map(&method(:marshal_single_argument))
end
|
|
131
|
+
|
|
132
|
+
# Marshal a single argument to an FFI pointer
|
|
133
|
+
# @param arg [Object] NvArray, Numeric, or FFI::Pointer
|
|
134
|
+
# @return [FFI::Pointer]
|
|
135
|
+
def marshal_single_argument(arg)
  # U64 is checked first so a wrapped integer takes the 64-bit path instead
  # of the default 32-bit Integer path.
  case arg
  when U64          then marshal_u64(arg.value)
  when FFI::Pointer then marshal_pointer(arg)
  when Integer      then marshal_integer(arg)
  when Float        then marshal_float(arg)
  else
    # Duck-typed GPU array: anything exposing the device-pointer API
    # (Ignis::Shared::NvArray, or Ignis::NvArray from ignis-numerics).
    return marshal_nvarray(arg) if arg.respond_to?(:device_ffi_ptr)

    raise ArgumentError, "Unsupported kernel argument type: #{arg.class}"
  end
end
|
|
155
|
+
|
|
156
|
+
# Wrapper forcing 64-bit (unsigned long long / size_t) marshalling of an
|
|
157
|
+
# integer kernel argument. The default Integer path is 32-bit (`int`); the
|
|
158
|
+
# kernel ABI is fixed by the parameter type, so a 64-bit parameter (e.g. an
|
|
159
|
+
# RNG seed) MUST be wrapped: kernel.launch(args: [..., U64.new(seed), n]).
|
|
160
|
+
# Although this assignment sits below the `private` marker, `private` only
# affects method visibility, so the constant remains reachable as
# Ignis::JIT::Kernel::U64, the path the marshal_integer error message tells
# callers to use.
U64 = Struct.new(:value)
|
|
161
|
+
|
|
162
|
+
# Marshal NvArray to device pointer
|
|
163
|
+
# @param arr [NvArray]
|
|
164
|
+
# @return [FFI::Pointer]
|
|
165
|
+
def marshal_nvarray(arr)
  # Move the array to the device first if it is not already there.
  arr.on_device? || arr.to_device

  # device_ptr is a Fiddle::Pointer; the Driver API launch path is FFI, so
  # the FFI-wrapped device pointer is stored in the parameter slot.
  FFI::MemoryPointer.new(:pointer).tap do |slot|
    slot.write_pointer(arr.device_ffi_ptr)
  end
end
|
|
173
|
+
|
|
174
|
+
# Marshal an existing pointer
|
|
175
|
+
# @param pointer [FFI::Pointer]
|
|
176
|
+
# @return [FFI::Pointer]
|
|
177
|
+
def marshal_pointer(pointer)
  # A fresh pointer-sized slot holding the given pointer's address.
  FFI::MemoryPointer.new(:pointer).tap { |slot| slot.write_pointer(pointer) }
end
|
|
182
|
+
|
|
183
|
+
# Marshal a 32-bit integer value (for `int` kernel params).
|
|
184
|
+
# @param value [Integer]
|
|
185
|
+
# @return [FFI::Pointer]
|
|
186
|
+
def marshal_integer(value)
  # Plain Integers are written as a signed 32-bit value, so anything outside
  # that range is rejected here.
  int32_range = -(2**31)...(2**31)
  unless int32_range.cover?(value)
    raise ArgumentError,
          "Integer #{value} exceeds 32-bit kernel-arg range; wrap it in " \
          "Ignis::JIT::Kernel::U64.new(x) for an unsigned long long parameter"
  end

  FFI::MemoryPointer.new(:int32).tap { |slot| slot.write_int32(value) }
end
|
|
196
|
+
|
|
197
|
+
# Marshal a 64-bit unsigned integer (for `unsigned long long` / `size_t` params).
|
|
198
|
+
# @param value [Integer]
|
|
199
|
+
# @return [FFI::Pointer]
|
|
200
|
+
def marshal_u64(value)
  # Eight-byte slot written as an unsigned 64-bit value; no range check is
  # applied here before the write.
  FFI::MemoryPointer.new(:uint64).tap { |slot| slot.write_uint64(value) }
end
|
|
205
|
+
|
|
206
|
+
# Marshal a float value
|
|
207
|
+
# @param value [Float]
|
|
208
|
+
# @return [FFI::Pointer]
|
|
209
|
+
def marshal_float(value)
  # Stored with FFI's :float type via write_float.
  FFI::MemoryPointer.new(:float).tap { |slot| slot.write_float(value) }
end
|
|
214
|
+
|
|
215
|
+
# Normalize dimensions to [x, y, z] format
|
|
216
|
+
# @param dims [Array<Integer>, Integer]
|
|
217
|
+
# @return [Array<Integer>]
|
|
218
|
+
def normalize_dims(dims)
  # Copy first so the caller's array is left alone.
  triple = (dims.is_a?(Integer) ? [dims] : dims).dup
  # Pad missing y/z with 1, then keep at most the first three entries.
  missing = 3 - triple.size
  missing.times { triple << 1 }
  triple[0..2]
end
|
|
224
|
+
|
|
225
|
+
# Extract raw stream pointer
|
|
226
|
+
# @param stream [CUDA::Stream, FFI::Pointer, nil]
|
|
227
|
+
# @return [FFI::Pointer, nil]
|
|
228
|
+
def extract_stream_pointer(stream)
  # nil and raw FFI pointers pass through unchanged.
  return stream if stream.nil? || stream.is_a?(FFI::Pointer)

  # Stream wrappers are unwrapped via #handle.
  # NOTE(review): any other object silently becomes nil rather than raising.
  # Confirm that falling back to "no stream" is intended for unsupported types.
  stream.handle if stream.respond_to?(:handle)
end
|
|
238
|
+
end
|
|
239
|
+
end
|
|
240
|
+
end
|