ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +15 -0
  3. data/lib/ignis.rb +94 -0
  4. data/lib/nnw/platform.rb +304 -0
  5. data/lib/nnw/shared/event_bus.rb +240 -0
  6. data/lib/nnw/shared/ffi_loader.rb +63 -0
  7. data/lib/nnw/shared/memory_contract.rb +204 -0
  8. data/lib/nnw/shared/nv_array.rb +710 -0
  9. data/lib/nnw/shared/recovery_protocol.rb +307 -0
  10. data/lib/nvruby/configuration.rb +217 -0
  11. data/lib/nvruby/cuda/device.rb +275 -0
  12. data/lib/nvruby/cuda/device_props.rb +202 -0
  13. data/lib/nvruby/cuda/graph.rb +265 -0
  14. data/lib/nvruby/cuda/graph_bindings.rb +119 -0
  15. data/lib/nvruby/cuda/library_loader.rb +285 -0
  16. data/lib/nvruby/cuda/memory.rb +410 -0
  17. data/lib/nvruby/cuda/runtime_api.rb +804 -0
  18. data/lib/nvruby/cuda/stream.rb +234 -0
  19. data/lib/nvruby/dtype.rb +139 -0
  20. data/lib/nvruby/epilogues.rb +438 -0
  21. data/lib/nvruby/errors.rb +303 -0
  22. data/lib/nvruby/half.rb +97 -0
  23. data/lib/nvruby/jit/compiled_kernel.rb +80 -0
  24. data/lib/nvruby/jit/compiler.rb +231 -0
  25. data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
  26. data/lib/nvruby/jit/kernel.rb +240 -0
  27. data/lib/nvruby/jit/kernel_module.rb +133 -0
  28. data/lib/nvruby/jit/kernels/activations.rb +179 -0
  29. data/lib/nvruby/jit/kernels/attention.rb +504 -0
  30. data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
  31. data/lib/nvruby/jit/kernels/loss.rb +213 -0
  32. data/lib/nvruby/jit/kernels/normalization.rb +200 -0
  33. data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
  34. data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
  35. data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
  36. data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
  37. data/lib/nvruby/linalg/epilog.rb +67 -0
  38. data/lib/nvruby/linalg/matmul.rb +247 -0
  39. data/lib/nvruby/linalg/matmul_plan.rb +229 -0
  40. data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
  41. data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
  42. data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
  43. data/lib/nvruby/memory/device_memory_resource.rb +106 -0
  44. data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
  45. data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
  46. data/lib/nvruby/memory/stats.rb +107 -0
  47. data/lib/nvruby/memory.rb +124 -0
  48. data/lib/nvruby/version.rb +5 -0
  49. metadata +108 -0
@@ -0,0 +1,275 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module CUDA
5
+ # Numeric IDs of CUDA device attributes (cudaDeviceAttr enum values). Device#get_attribute passes them to RuntimeAPI.device_get_attribute; the numbering is sparse and the last two entries are listed out of numeric order.
6
+ module DeviceAttribute
7
+ MAX_THREADS_PER_BLOCK = 1
8
+ MAX_BLOCK_DIM_X = 2
9
+ MAX_BLOCK_DIM_Y = 3
10
+ MAX_BLOCK_DIM_Z = 4
11
+ MAX_GRID_DIM_X = 5
12
+ MAX_GRID_DIM_Y = 6
13
+ MAX_GRID_DIM_Z = 7
14
+ MAX_SHARED_MEMORY_PER_BLOCK = 8
15
+ TOTAL_CONSTANT_MEMORY = 9
16
+ WARP_SIZE = 10
17
+ MAX_PITCH = 11
18
+ MAX_REGISTERS_PER_BLOCK = 12
19
+ CLOCK_RATE = 13
20
+ TEXTURE_ALIGNMENT = 14
21
+ MULTIPROCESSOR_COUNT = 16
22
+ KERNEL_EXEC_TIMEOUT = 17
23
+ INTEGRATED = 18
24
+ CAN_MAP_HOST_MEMORY = 19
25
+ COMPUTE_MODE = 20
26
+ CONCURRENT_KERNELS = 31
27
+ ECC_ENABLED = 32
28
+ MEMORY_CLOCK_RATE = 36
29
+ GLOBAL_MEMORY_BUS_WIDTH = 37
30
+ L2_CACHE_SIZE = 38
31
+ MAX_THREADS_PER_MULTIPROCESSOR = 39
32
+ ASYNC_ENGINE_COUNT = 40
33
+ UNIFIED_ADDRESSING = 41
34
+ COMPUTE_CAPABILITY_MAJOR = 75
35
+ COMPUTE_CAPABILITY_MINOR = 76
36
+ MANAGED_MEMORY = 83
37
+ COOPERATIVE_LAUNCH = 95
38
+ MAX_BLOCKS_PER_MULTIPROCESSOR = 106
39
+ GPU_DIRECT_RDMA_SUPPORTED = 116
40
+ MEMORY_POOLS_SUPPORTED = 115
41
+ end
42
+
43
+ # Represents a CUDA GPU device.
44
+ #
45
+ # Uses RuntimeAPI (Fiddle-based) for hot-path attribute queries
46
+ # and DeviceProperties (FFI::Struct) for full device property reads.
47
+ class Device
48
+ # @return [Integer] Device index
49
+ attr_reader :index
50
+
51
+ # @param index [Integer] Device index
52
+ def initialize(index)
53
+ @index = index
54
+ @attribute_cache = {}
55
+ @name = nil
56
+ @total_memory = nil
57
+ @props_cache = nil
58
+ end
59
+
60
+ # @return [String] Device name
61
+ def name
62
+ return @name if @name
63
+
64
+ @name = properties[:name]
65
+ end
66
+
67
+ # @return [Integer] Total global memory in bytes (memoized in @total_memory; the first call goes through #memory_info, which also makes this device the current one)
68
+ def total_memory
69
+ return @total_memory if @total_memory
70
+
71
+ @total_memory = memory_info[:total_bytes]
72
+ end
73
+
74
+ # @return [Integer] Shared memory per block in bytes
75
+ def shared_memory_per_block
76
+ get_attribute(DeviceAttribute::MAX_SHARED_MEMORY_PER_BLOCK)
77
+ end
78
+
79
+ # @return [Integer] Number of streaming multiprocessors
80
+ def multiprocessor_count
81
+ get_attribute(DeviceAttribute::MULTIPROCESSOR_COUNT)
82
+ end
83
+
84
+ # @return [Integer] Warp size (typically 32)
85
+ def warp_size
86
+ get_attribute(DeviceAttribute::WARP_SIZE)
87
+ end
88
+
89
+ # @return [String] Compute capability as "major.minor"
90
+ def compute_capability
91
+ "#{compute_capability_major}.#{compute_capability_minor}"
92
+ end
93
+
94
+ # @return [Integer] Major compute capability
95
+ def compute_capability_major
96
+ get_attribute(DeviceAttribute::COMPUTE_CAPABILITY_MAJOR)
97
+ end
98
+
99
+ # @return [Integer] Minor compute capability
100
+ def compute_capability_minor
101
+ get_attribute(DeviceAttribute::COMPUTE_CAPABILITY_MINOR)
102
+ end
103
+
104
+ # @return [Integer] Max threads per block
105
+ def max_threads_per_block
106
+ get_attribute(DeviceAttribute::MAX_THREADS_PER_BLOCK)
107
+ end
108
+
109
+ # @return [Array<Integer>] Max threads per dimension [x, y, z]
110
+ def max_threads_dim
111
+ [
112
+ get_attribute(DeviceAttribute::MAX_BLOCK_DIM_X),
113
+ get_attribute(DeviceAttribute::MAX_BLOCK_DIM_Y),
114
+ get_attribute(DeviceAttribute::MAX_BLOCK_DIM_Z)
115
+ ]
116
+ end
117
+
118
+ # @return [Array<Integer>] Max grid size [x, y, z]
119
+ def max_grid_size
120
+ [
121
+ get_attribute(DeviceAttribute::MAX_GRID_DIM_X),
122
+ get_attribute(DeviceAttribute::MAX_GRID_DIM_Y),
123
+ get_attribute(DeviceAttribute::MAX_GRID_DIM_Z)
124
+ ]
125
+ end
126
+
127
+ # @return [Integer] Clock rate in kHz
128
+ def clock_rate
129
+ get_attribute(DeviceAttribute::CLOCK_RATE)
130
+ end
131
+
132
+ # @return [Integer] Memory clock rate in kHz
133
+ def memory_clock_rate
134
+ get_attribute(DeviceAttribute::MEMORY_CLOCK_RATE)
135
+ end
136
+
137
+ # @return [Integer] Memory bus width in bits
138
+ def memory_bus_width
139
+ get_attribute(DeviceAttribute::GLOBAL_MEMORY_BUS_WIDTH)
140
+ end
141
+
142
+ # @return [Integer] L2 cache size in bytes
143
+ def l2_cache_size
144
+ get_attribute(DeviceAttribute::L2_CACHE_SIZE)
145
+ end
146
+
147
+ # @return [Boolean] Whether ECC memory is enabled
148
+ def ecc_enabled?
149
+ get_attribute(DeviceAttribute::ECC_ENABLED) != 0
150
+ end
151
+
152
+ # @return [Boolean] Whether concurrent kernels are supported
153
+ def concurrent_kernels?
154
+ get_attribute(DeviceAttribute::CONCURRENT_KERNELS) != 0
155
+ end
156
+
157
+ # @return [Boolean] Whether unified addressing is supported
158
+ def unified_addressing?
159
+ get_attribute(DeviceAttribute::UNIFIED_ADDRESSING) != 0
160
+ end
161
+
162
+ # @return [Boolean] Whether managed memory is supported
163
+ def managed_memory?
164
+ get_attribute(DeviceAttribute::MANAGED_MEMORY) != 0
165
+ end
166
+
167
+ # @return [Boolean] Whether cooperative launch is supported
168
+ def cooperative_launch?
169
+ get_attribute(DeviceAttribute::COOPERATIVE_LAUNCH) != 0
170
+ end
171
+
172
+ # Get available and total memory via RuntimeAPI (Fiddle hot path). Side effect: calls RuntimeAPI.set_device(@index) first and does not restore the previously selected device.
173
+ # @return [Hash] {free_bytes:, total_bytes:} as returned by RuntimeAPI.mem_get_info (not cached; queried on every call)
174
+ def memory_info
175
+ RuntimeAPI.ensure_loaded!
176
+ RuntimeAPI.set_device(@index)
177
+ RuntimeAPI.mem_get_info
178
+ end
179
+
180
+ # @return [Integer] Free memory in bytes
181
+ def free_memory
182
+ memory_info[:free_bytes]
183
+ end
184
+
185
+ # Get full device properties via DeviceProperties (FFI::Struct).
186
+ # Cached after first call.
187
+ # @return [Hash] device property summary
188
+ def properties
189
+ @props_cache ||= DeviceProperties.summary(@index)
190
+ end
191
+
192
+ # Set this device as the current device.
193
+ # @return [void]
194
+ def set_current!
195
+ RuntimeAPI.ensure_loaded!
196
+ RuntimeAPI.set_device(@index)
197
+ end
198
+
199
+ # Synchronize this device.
200
+ # @return [void]
201
+ def synchronize
202
+ set_current!
203
+ RuntimeAPI.device_synchronize
204
+ end
205
+
206
+ # Reset this device: makes it current via #set_current!, then calls RuntimeAPI.device_reset. The values memoized on this object (name, total memory, properties, attribute cache) are left in place.
207
+ # @return [void]
208
+ def reset!
209
+ set_current!
210
+ RuntimeAPI.device_reset
211
+ end
212
+
213
+ # @return [String] Human-readable device description
214
+ def to_s
215
+ mem_mb = total_memory / (1024 * 1024)
216
+ "Device[#{@index}]: #{name} (CC #{compute_capability}, #{mem_mb} MB)"
217
+ end
218
+
219
+ # @return [String] Detailed inspection
220
+ def inspect
221
+ "#<Ignis::CUDA::Device:#{object_id} index=#{@index} name=#{name.inspect} " \
222
+ "compute=#{compute_capability} memory=#{total_memory}>"
223
+ end
224
+
225
+ class << self
226
+ # Get the number of CUDA devices.
227
+ # @return [Integer]
228
+ def count
229
+ RuntimeAPI.ensure_loaded!
230
+ RuntimeAPI.get_device_count
231
+ end
232
+
233
+ # List all available devices.
234
+ # @return [Array<Device>]
235
+ def list
236
+ count.times.map { |i| new(i) }
237
+ end
238
+
239
+ # Get the current device.
240
+ # @return [Device]
241
+ def current
242
+ RuntimeAPI.ensure_loaded!
243
+ new(RuntimeAPI.get_device)
244
+ end
245
+
246
+ # Get the default device from configuration.
247
+ # @return [Device]
248
+ def default
249
+ new(Ignis.configuration.default_device)
250
+ end
251
+
252
+ # Check if any CUDA device is available. A CudaRuntimeError raised while counting is treated as "no device"; any other exception propagates to the caller.
253
+ # @return [Boolean] true when the device count is positive
254
+ def available?
255
+ count.positive?
256
+ rescue CudaRuntimeError
257
+ false
258
+ end
259
+ end
260
+
261
+ private
262
+
263
+ # Get a device attribute value, memoized per Device instance in @attribute_cache (key? is checked, so any stored value counts as a hit).
264
+ # Uses RuntimeAPI.device_get_attribute (Fiddle hot path); unlike #memory_info it does not call set_device, the index is passed explicitly.
265
+ # @param attribute [Integer] Attribute constant from DeviceAttribute
266
+ # @return [Integer] Attribute value
267
+ def get_attribute(attribute)
268
+ return @attribute_cache[attribute] if @attribute_cache.key?(attribute)
269
+
270
+ RuntimeAPI.ensure_loaded!
271
+ @attribute_cache[attribute] = RuntimeAPI.device_get_attribute(attribute, @index)
272
+ end
273
+ end
274
+ end
275
+ end
@@ -0,0 +1,202 @@
1
+ # frozen_string_literal: true
2
+
3
+ # CUDA Device Properties — FFI::Struct definition
4
+ #
5
+ # Rule 4: FFI structs live in their own file, never mixed with Fiddle hot-path calls.
6
+ # This file is used by device.rb for one-shot property reads.
7
+ #
8
+ # cudaDeviceProp has 268+ fields. We define the most commonly used subset
9
+ # and provide a raw accessor for the rest.
10
+
11
+ require_relative '../../nnw/shared/ffi_loader'
12
+ Ignis::Shared::FFILoader.load!
13
+
14
+ module Ignis
15
+ module CUDA
16
+ # cudaDeviceProp struct for FFI binding.
17
+ #
18
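+ # This covers the essential fields. For the full struct, use raw_props (NOTE: raw_props is not defined in this file; DeviceProperties.get returns the CudaDeviceProp struct itself).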
19
+ module DeviceProperties
20
+ extend FFI::Library
21
+
22
+ # Resolve the CUDA runtime library path: Ignis::Platform.cudart_path when that module is defined, otherwise a hard-coded CUDA 13.0 default (the Windows toolkit DLL path, or the libcudart.so.13 soname elsewhere). The ffi_lib call below only warns on $stderr when loading fails, so this module is still defined without the library.
23
+ CUDART_PATH = if defined?(Ignis::Platform)
24
+ Ignis::Platform.cudart_path
25
+ elsif RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
26
+ File.join('C:', 'Program Files', 'NVIDIA GPU Computing Toolkit',
27
+ 'CUDA', 'v13.0', 'bin', 'cudart64_130.dll')
28
+ else
29
+ 'libcudart.so.13'
30
+ end
31
+
32
+ begin
33
+ ffi_lib CUDART_PATH
34
+ rescue LoadError => e
35
+ $stderr.puts "[Ignis] WARNING: Cannot load #{CUDART_PATH}: #{e.message}"
36
+ end
37
+
38
+ # cudaDeviceProp struct definition — essential fields.
39
+ #
40
+ # The full struct is ~2KB. We define the leading fields that are
41
+ # most commonly queried. The trailing 512-byte :_reserved_padding member stands in for everything after the last named field.
42
+ class CudaDeviceProp < FFI::Struct
43
+ # The struct layout is intended to match CUDA 12.x / 13.x cudaDeviceProp ordering; every offset depends on the members before it.
44
+ # Field names match the CUDA documentation exactly. NOTE(review): CUDA 13.0 reportedly removed several deprecated members (e.g. clockRate, memoryClockRate) from cudaDeviceProp — confirm this layout against the driver_types.h of the runtime that CUDART_PATH loads.
45
+ layout \
46
+ :name, [:char, 256], # Device name
47
+ :uuid, [:char, 16], # 16-byte UUID
48
+ :luid, [:char, 8], # 8-byte LUID
49
+ :luidDeviceNodeMask, :uint,
50
+ :totalGlobalMem, :size_t,
51
+ :sharedMemPerBlock, :size_t,
52
+ :regsPerBlock, :int,
53
+ :warpSize, :int,
54
+ :memPitch, :size_t,
55
+ :maxThreadsPerBlock, :int,
56
+ :maxThreadsDim, [:int, 3],
57
+ :maxGridSize, [:int, 3],
58
+ :clockRate, :int,
59
+ :totalConstMem, :size_t,
60
+ :major, :int, # Compute capability major
61
+ :minor, :int, # Compute capability minor
62
+ :textureAlignment, :size_t,
63
+ :texturePitchAlignment, :size_t,
64
+ :deviceOverlap, :int,
65
+ :multiProcessorCount, :int,
66
+ :kernelExecTimeoutEnabled, :int,
67
+ :integrated, :int,
68
+ :canMapHostMemory, :int,
69
+ :computeMode, :int,
70
+ :maxTexture1D, :int,
71
+ :maxTexture1DMipmap, :int,
72
+ :maxTexture1DLinear, :int,
73
+ :maxTexture2D, [:int, 2],
74
+ :maxTexture2DMipmap, [:int, 2],
75
+ :maxTexture2DLinear, [:int, 3],
76
+ :maxTexture2DGather, [:int, 2],
77
+ :maxTexture3D, [:int, 3],
78
+ :maxTexture3DAlt, [:int, 3],
79
+ :maxTextureCubemap, :int,
80
+ :maxTexture1DLayered, [:int, 2],
81
+ :maxTexture2DLayered, [:int, 3],
82
+ :maxTextureCubemapLayered, [:int, 2],
83
+ :maxSurface1D, :int,
84
+ :maxSurface2D, [:int, 2],
85
+ :maxSurface3D, [:int, 3],
86
+ :maxSurface1DLayered, [:int, 2],
87
+ :maxSurface2DLayered, [:int, 3],
88
+ :maxSurfaceCubemap, :int,
89
+ :maxSurfaceCubemapLayered, [:int, 2],
90
+ :surfaceAlignment, :size_t,
91
+ :concurrentKernels, :int,
92
+ :ECCEnabled, :int,
93
+ :pciBusID, :int,
94
+ :pciDeviceID, :int,
95
+ :pciDomainID, :int,
96
+ :tccDriver, :int,
97
+ :asyncEngineCount, :int,
98
+ :unifiedAddressing, :int,
99
+ :memoryClockRate, :int,
100
+ :memoryBusWidth, :int,
101
+ :l2CacheSize, :int,
102
+ :persistingL2CacheMaxSize, :int,
103
+ :maxThreadsPerMultiProcessor, :int,
104
+ :streamPrioritiesSupported, :int,
105
+ :globalL1CacheSupported, :int,
106
+ :localL1CacheSupported, :int,
107
+ :sharedMemPerMultiprocessor, :size_t,
108
+ :regsPerMultiprocessor, :int,
109
+ :managedMemory, :int,
110
+ :isMultiGpuBoard, :int,
111
+ :multiGpuBoardGroupID, :int,
112
+ :hostNativeAtomicSupported, :int,
113
+ :singleToDoublePrecisionPerfRatio, :int,
114
+ :pageableMemoryAccess, :int,
115
+ :concurrentManagedAccess, :int,
116
+ :computePreemptionSupported, :int,
117
+ :canUseHostPointerForRegisteredMem, :int,
118
+ :cooperativeLaunch, :int,
119
+ :cooperativeMultiDeviceLaunch, :int,
120
+ :sharedMemPerBlockOptin, :size_t,
121
+ :pageableMemoryAccessUsesHostPageTables, :int,
122
+ :directManagedMemAccessFromHost, :int,
123
+ :maxBlocksPerMultiProcessor, :int,
124
+ :accessPolicyMaxWindowSize, :int,
125
+ :reservedSharedMemPerBlock, :size_t,
126
+ :hostRegisterSupported, :int,
127
+ :sparseCudaArraySupported, :int,
128
+ :hostRegisterReadOnlySupported, :int,
129
+ :timelineSemaphoreInteropSupported, :int,
130
+ :memoryPoolsSupported, :int,
131
+ :gpuDirectRDMASupported, :int,
132
+ :gpuDirectRDMAFlushWritesOptions, :uint,
133
+ :gpuDirectRDMAWritesOrdering, :int,
134
+ :memoryPoolSupportedHandleTypes, :uint,
135
+ :deferredMappingCudaArraySupported, :int,
136
+ :ipcEventSupported, :int,
137
+ :clusterLaunch, :int,
138
+ :unifiedFunctionPointers, :int,
139
+ :_reserved_padding, [:char, 512] # Padding for future fields
140
+ end
141
+
142
+ # Bind cudaGetDeviceProperties_v2
143
+ begin
144
+ attach_function :cudaGetDeviceProperties_v2, [:pointer, :int], :int
145
+ rescue FFI::NotFoundError
146
+ # Fall back to non-v2 variant
147
+ begin
148
+ attach_function :cudaGetDeviceProperties, [:pointer, :int], :int
149
+ rescue FFI::NotFoundError
150
+ $stderr.puts "[Ignis] WARNING: cudaGetDeviceProperties not found in #{CUDART_DLL}"
151
+ end
152
+ end
153
+
154
+ # Get device properties for a given device index, using whichever of the two bindings was attached when this file was loaded (the _v2 symbol is preferred).
155
+ #
156
+ # @param device_id [Integer] GPU device index
157
+ # @return [CudaDeviceProp] newly allocated struct filled in by the runtime call
158
+ # @raise [RuntimeError] if neither binding is available, or if the CUDA call returns a non-zero status
159
+ def self.get(device_id)
160
+ prop = CudaDeviceProp.new
161
+
162
+ if respond_to?(:cudaGetDeviceProperties_v2)
163
+ status = cudaGetDeviceProperties_v2(prop.pointer, device_id)
164
+ elsif respond_to?(:cudaGetDeviceProperties)
165
+ status = cudaGetDeviceProperties(prop.pointer, device_id)
166
+ else
167
+ raise 'cudaGetDeviceProperties not available'
168
+ end
169
+
170
+ raise "cudaGetDeviceProperties failed with status #{status}" unless status.zero?
171
+
172
+ prop
173
+ end
174
+
175
+ # Helper to extract a human-readable property hash from a fresh DeviceProperties.get call (nothing is cached here).
176
+ #
177
+ # @param device_id [Integer]
178
+ # @return [Hash] symbol-keyed summary: stripped name, "major.minor" compute capability, total memory in MB rounded to one decimal, memory clock and L2 size scaled down by integer division, and boolean feature flags (non-zero struct field => true)
179
+ def self.summary(device_id)
180
+ prop = get(device_id)
181
+ {
182
+ name: prop[:name].to_s.strip,
183
+ compute_capability: "#{prop[:major]}.#{prop[:minor]}",
184
+ total_memory_mb: (prop[:totalGlobalMem] / (1024.0 * 1024.0)).round(1),
185
+ multiprocessor_count: prop[:multiProcessorCount],
186
+ max_threads_per_block: prop[:maxThreadsPerBlock],
187
+ warp_size: prop[:warpSize],
188
+ memory_clock_mhz: prop[:memoryClockRate] / 1000,
189
+ memory_bus_width: prop[:memoryBusWidth],
190
+ l2_cache_kb: prop[:l2CacheSize] / 1024,
191
+ ecc_enabled: prop[:ECCEnabled] != 0,
192
+ unified_addressing: prop[:unifiedAddressing] != 0,
193
+ managed_memory: prop[:managedMemory] != 0,
194
+ cooperative_launch: prop[:cooperativeLaunch] != 0,
195
+ gpu_direct_rdma: prop[:gpuDirectRDMASupported] != 0,
196
+ memory_pools: prop[:memoryPoolsSupported] != 0,
197
+ cluster_launch: prop[:clusterLaunch] != 0
198
+ }
199
+ end
200
+ end
201
+ end
202
+ end