ignis 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +15 -0
- data/lib/ignis.rb +94 -0
- data/lib/nnw/platform.rb +304 -0
- data/lib/nnw/shared/event_bus.rb +240 -0
- data/lib/nnw/shared/ffi_loader.rb +63 -0
- data/lib/nnw/shared/memory_contract.rb +204 -0
- data/lib/nnw/shared/nv_array.rb +710 -0
- data/lib/nnw/shared/recovery_protocol.rb +307 -0
- data/lib/nvruby/configuration.rb +217 -0
- data/lib/nvruby/cuda/device.rb +275 -0
- data/lib/nvruby/cuda/device_props.rb +202 -0
- data/lib/nvruby/cuda/graph.rb +265 -0
- data/lib/nvruby/cuda/graph_bindings.rb +119 -0
- data/lib/nvruby/cuda/library_loader.rb +285 -0
- data/lib/nvruby/cuda/memory.rb +410 -0
- data/lib/nvruby/cuda/runtime_api.rb +804 -0
- data/lib/nvruby/cuda/stream.rb +234 -0
- data/lib/nvruby/dtype.rb +139 -0
- data/lib/nvruby/epilogues.rb +438 -0
- data/lib/nvruby/errors.rb +303 -0
- data/lib/nvruby/half.rb +97 -0
- data/lib/nvruby/jit/compiled_kernel.rb +80 -0
- data/lib/nvruby/jit/compiler.rb +231 -0
- data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
- data/lib/nvruby/jit/kernel.rb +240 -0
- data/lib/nvruby/jit/kernel_module.rb +133 -0
- data/lib/nvruby/jit/kernels/activations.rb +179 -0
- data/lib/nvruby/jit/kernels/attention.rb +504 -0
- data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
- data/lib/nvruby/jit/kernels/loss.rb +213 -0
- data/lib/nvruby/jit/kernels/normalization.rb +200 -0
- data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
- data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
- data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
- data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
- data/lib/nvruby/linalg/epilog.rb +67 -0
- data/lib/nvruby/linalg/matmul.rb +247 -0
- data/lib/nvruby/linalg/matmul_plan.rb +229 -0
- data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
- data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
- data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
- data/lib/nvruby/memory/device_memory_resource.rb +106 -0
- data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
- data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
- data/lib/nvruby/memory/stats.rb +107 -0
- data/lib/nvruby/memory.rb +124 -0
- data/lib/nvruby/version.rb +5 -0
- metadata +108 -0
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module CUDA
|
|
5
|
+
# CUDA device attribute constants (cudaDeviceAttr enum values)
|
|
6
|
+
module DeviceAttribute
  # Integer codes handed to RuntimeAPI.device_get_attribute. The numbers are
  # the cudaDeviceAttr enum values, listed in ascending order and grouped by
  # theme purely for readability.

  # --- Launch geometry limits -------------------------------------------
  MAX_THREADS_PER_BLOCK          = 1
  MAX_BLOCK_DIM_X                = 2
  MAX_BLOCK_DIM_Y                = 3
  MAX_BLOCK_DIM_Z                = 4
  MAX_GRID_DIM_X                 = 5
  MAX_GRID_DIM_Y                 = 6
  MAX_GRID_DIM_Z                 = 7

  # --- Per-block resources and basic hardware figures ---------------------
  MAX_SHARED_MEMORY_PER_BLOCK    = 8
  TOTAL_CONSTANT_MEMORY          = 9
  WARP_SIZE                      = 10
  MAX_PITCH                      = 11
  MAX_REGISTERS_PER_BLOCK        = 12
  CLOCK_RATE                     = 13
  TEXTURE_ALIGNMENT              = 14
  MULTIPROCESSOR_COUNT           = 16

  # --- Device mode / host interaction -------------------------------------
  KERNEL_EXEC_TIMEOUT            = 17
  INTEGRATED                     = 18
  CAN_MAP_HOST_MEMORY            = 19
  COMPUTE_MODE                   = 20
  CONCURRENT_KERNELS             = 31
  ECC_ENABLED                    = 32

  # --- Memory subsystem ---------------------------------------------------
  MEMORY_CLOCK_RATE              = 36
  GLOBAL_MEMORY_BUS_WIDTH        = 37
  L2_CACHE_SIZE                  = 38
  MAX_THREADS_PER_MULTIPROCESSOR = 39
  ASYNC_ENGINE_COUNT             = 40
  UNIFIED_ADDRESSING             = 41

  # --- Compute capability -------------------------------------------------
  COMPUTE_CAPABILITY_MAJOR       = 75
  COMPUTE_CAPABILITY_MINOR       = 76

  # --- Feature flags ------------------------------------------------------
  MANAGED_MEMORY                 = 83
  COOPERATIVE_LAUNCH             = 95
  MAX_BLOCKS_PER_MULTIPROCESSOR  = 106
  MEMORY_POOLS_SUPPORTED         = 115
  GPU_DIRECT_RDMA_SUPPORTED      = 116
end
|
|
42
|
+
|
|
43
|
+
# Represents a CUDA GPU device.
|
|
44
|
+
#
|
|
45
|
+
# Uses RuntimeAPI (Fiddle-based) for hot-path attribute queries
|
|
46
|
+
# and DeviceProperties (FFI::Struct) for full device property reads.
|
|
47
|
+
class Device
  # @return [Integer] Device index
  attr_reader :index

  # Builds a lightweight handle for one device. Nothing is queried here:
  # the name, memory size, property summary and attributes are all fetched
  # on first use and then cached on the instance.
  #
  # @param index [Integer] Device index
  def initialize(index)
    @index = index
    @attribute_cache = {}
    @name = nil
    @total_memory = nil
    @props_cache = nil
  end

  # @return [String] Device name (taken from #properties, then cached)
  def name
    @name ||= properties[:name]
  end

  # @return [Integer] Total global memory in bytes (cached after first read)
  def total_memory
    @total_memory ||= memory_info[:total_bytes]
  end

  # @return [Integer] Shared memory per block in bytes
  def shared_memory_per_block
    get_attribute(DeviceAttribute::MAX_SHARED_MEMORY_PER_BLOCK)
  end

  # @return [Integer] Number of streaming multiprocessors
  def multiprocessor_count
    get_attribute(DeviceAttribute::MULTIPROCESSOR_COUNT)
  end

  # @return [Integer] Warp size (typically 32)
  def warp_size
    get_attribute(DeviceAttribute::WARP_SIZE)
  end

  # @return [String] Compute capability as "major.minor"
  def compute_capability
    [compute_capability_major, compute_capability_minor].join('.')
  end

  # @return [Integer] Major compute capability
  def compute_capability_major
    get_attribute(DeviceAttribute::COMPUTE_CAPABILITY_MAJOR)
  end

  # @return [Integer] Minor compute capability
  def compute_capability_minor
    get_attribute(DeviceAttribute::COMPUTE_CAPABILITY_MINOR)
  end

  # @return [Integer] Max threads per block
  def max_threads_per_block
    get_attribute(DeviceAttribute::MAX_THREADS_PER_BLOCK)
  end

  # @return [Array<Integer>] Max threads per dimension [x, y, z]
  def max_threads_dim
    [
      DeviceAttribute::MAX_BLOCK_DIM_X,
      DeviceAttribute::MAX_BLOCK_DIM_Y,
      DeviceAttribute::MAX_BLOCK_DIM_Z
    ].map { |attribute| get_attribute(attribute) }
  end

  # @return [Array<Integer>] Max grid size [x, y, z]
  def max_grid_size
    [
      DeviceAttribute::MAX_GRID_DIM_X,
      DeviceAttribute::MAX_GRID_DIM_Y,
      DeviceAttribute::MAX_GRID_DIM_Z
    ].map { |attribute| get_attribute(attribute) }
  end

  # @return [Integer] Clock rate in kHz
  def clock_rate
    get_attribute(DeviceAttribute::CLOCK_RATE)
  end

  # @return [Integer] Memory clock rate in kHz
  def memory_clock_rate
    get_attribute(DeviceAttribute::MEMORY_CLOCK_RATE)
  end

  # @return [Integer] Memory bus width in bits
  def memory_bus_width
    get_attribute(DeviceAttribute::GLOBAL_MEMORY_BUS_WIDTH)
  end

  # @return [Integer] L2 cache size in bytes
  def l2_cache_size
    get_attribute(DeviceAttribute::L2_CACHE_SIZE)
  end

  # @return [Boolean] Whether ECC memory is enabled
  def ecc_enabled?
    attribute_enabled?(DeviceAttribute::ECC_ENABLED)
  end

  # @return [Boolean] Whether concurrent kernels are supported
  def concurrent_kernels?
    attribute_enabled?(DeviceAttribute::CONCURRENT_KERNELS)
  end

  # @return [Boolean] Whether unified addressing is supported
  def unified_addressing?
    attribute_enabled?(DeviceAttribute::UNIFIED_ADDRESSING)
  end

  # @return [Boolean] Whether managed memory is supported
  def managed_memory?
    attribute_enabled?(DeviceAttribute::MANAGED_MEMORY)
  end

  # @return [Boolean] Whether cooperative launch is supported
  def cooperative_launch?
    attribute_enabled?(DeviceAttribute::COOPERATIVE_LAUNCH)
  end

  # Get available and total memory via RuntimeAPI (Fiddle hot path).
  #
  # The query is issued with this device selected, so the device is switched
  # for the duration of the call. The previously selected device is put back
  # afterwards: this is a read-only query that is also reached from
  # #total_memory, #to_s and #inspect, and printing a device must not change
  # which device subsequent CUDA calls target.
  #
  # @return [Hash] {free_bytes:, total_bytes:}
  def memory_info
    RuntimeAPI.ensure_loaded!
    previous_device = RuntimeAPI.get_device
    RuntimeAPI.set_device(@index)
    begin
      RuntimeAPI.mem_get_info
    ensure
      # Skip the extra call when this device was already the selected one.
      RuntimeAPI.set_device(previous_device) unless previous_device == @index
    end
  end

  # @return [Integer] Free memory in bytes
  def free_memory
    memory_info[:free_bytes]
  end

  # Get full device properties via DeviceProperties (FFI::Struct).
  # Cached after first call.
  # @return [Hash] device property summary
  def properties
    @props_cache ||= DeviceProperties.summary(@index)
  end

  # Set this device as the current device.
  # @return [void]
  def set_current!
    RuntimeAPI.ensure_loaded!
    RuntimeAPI.set_device(@index)
  end

  # Synchronize this device. Selects the device first and leaves it selected.
  # @return [void]
  def synchronize
    set_current!
    RuntimeAPI.device_synchronize
  end

  # Reset this device. Selects the device first and leaves it selected.
  # @return [void]
  def reset!
    set_current!
    RuntimeAPI.device_reset
  end

  # @return [String] Human-readable device description
  def to_s
    # Integer division: the size is reported in whole mebibytes.
    mem_mb = total_memory / (1024 * 1024)
    "Device[#{@index}]: #{name} (CC #{compute_capability}, #{mem_mb} MB)"
  end

  # @return [String] Detailed inspection
  def inspect
    "#<Ignis::CUDA::Device:#{object_id} index=#{@index} name=#{name.inspect} " \
      "compute=#{compute_capability} memory=#{total_memory}>"
  end

  class << self
    # Get the number of CUDA devices.
    # @return [Integer]
    def count
      RuntimeAPI.ensure_loaded!
      RuntimeAPI.get_device_count
    end

    # List all available devices.
    # @return [Array<Device>]
    def list
      Array.new(count) { |device_index| new(device_index) }
    end

    # Get the current device.
    # @return [Device]
    def current
      RuntimeAPI.ensure_loaded!
      new(RuntimeAPI.get_device)
    end

    # Get the default device from configuration.
    # @return [Device]
    def default
      new(Ignis.configuration.default_device)
    end

    # Check if any CUDA device is available.
    # A CudaRuntimeError raised while counting devices is reported as
    # "not available" rather than propagated.
    # @return [Boolean]
    def available?
      count.positive?
    rescue CudaRuntimeError
      false
    end
  end

  private

  # Get a device attribute value (cached).
  # Uses RuntimeAPI.device_get_attribute (Fiddle hot path).
  # @param attribute [Integer] Attribute constant from DeviceAttribute
  # @return [Integer] Attribute value
  def get_attribute(attribute)
    @attribute_cache.fetch(attribute) do
      RuntimeAPI.ensure_loaded!
      @attribute_cache[attribute] = RuntimeAPI.device_get_attribute(attribute, @index)
    end
  end

  # Interpret an integer attribute as a flag: any non-zero value is true.
  # @param attribute [Integer] Attribute constant from DeviceAttribute
  # @return [Boolean]
  def attribute_enabled?(attribute)
    get_attribute(attribute) != 0
  end
end
|
|
274
|
+
end
|
|
275
|
+
end
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# CUDA Device Properties — FFI::Struct definition
|
|
4
|
+
#
|
|
5
|
+
# Rule 4: FFI structs live in their own file, never mixed with Fiddle hot-path calls.
|
|
6
|
+
# This file is used by device.rb for one-shot property reads.
|
|
7
|
+
#
|
|
8
|
+
# cudaDeviceProp has 268+ fields. We define the most commonly used subset
|
|
9
|
+
# and provide a raw accessor for the rest.
|
|
10
|
+
|
|
11
|
+
require_relative '../../nnw/shared/ffi_loader'
|
|
12
|
+
Ignis::Shared::FFILoader.load!
|
|
13
|
+
|
|
14
|
+
module Ignis
|
|
15
|
+
module CUDA
|
|
16
|
+
# cudaDeviceProp struct for FFI binding.
|
|
17
|
+
#
|
|
18
|
+
# This covers the essential fields. For the full struct, use raw_props.
|
|
19
|
+
module DeviceProperties
  extend FFI::Library

  # Resolve CUDA runtime library path per platform.
  # Ignis::Platform decides when it is loaded; otherwise fall back to the
  # default CUDA 13.0 install location on Windows, or to the versioned
  # shared-object name everywhere else.
  CUDART_PATH = if defined?(Ignis::Platform)
                  Ignis::Platform.cudart_path
                elsif RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
                  File.join('C:', 'Program Files', 'NVIDIA GPU Computing Toolkit',
                            'CUDA', 'v13.0', 'bin', 'cudart64_130.dll')
                else
                  'libcudart.so.13'
                end

  # A missing runtime library is reported but does not abort loading this
  # file; the failure surfaces later, when DeviceProperties.get is called.
  begin
    ffi_lib CUDART_PATH
  rescue LoadError => e
    $stderr.puts "[Ignis] WARNING: Cannot load #{CUDART_PATH}: #{e.message}"
  end

  # cudaDeviceProp struct definition — essential fields.
  #
  # The full struct is ~2KB. We define the first ~80 fields that are
  # most commonly queried. Remaining bytes are padded.
  class CudaDeviceProp < FFI::Struct
    # The struct layout matches CUDA 12.x / 13.x cudaDeviceProp ordering.
    # Field names match the CUDA documentation exactly.
    # Field order and types determine the byte offsets, so do not reorder.
    layout \
      :name, [:char, 256], # Device name
      :uuid, [:char, 16],  # 16-byte UUID
      :luid, [:char, 8],   # 8-byte LUID
      :luidDeviceNodeMask, :uint,
      :totalGlobalMem, :size_t,
      :sharedMemPerBlock, :size_t,
      :regsPerBlock, :int,
      :warpSize, :int,
      :memPitch, :size_t,
      :maxThreadsPerBlock, :int,
      :maxThreadsDim, [:int, 3],
      :maxGridSize, [:int, 3],
      :clockRate, :int,
      :totalConstMem, :size_t,
      :major, :int, # Compute capability major
      :minor, :int, # Compute capability minor
      :textureAlignment, :size_t,
      :texturePitchAlignment, :size_t,
      :deviceOverlap, :int,
      :multiProcessorCount, :int,
      :kernelExecTimeoutEnabled, :int,
      :integrated, :int,
      :canMapHostMemory, :int,
      :computeMode, :int,
      :maxTexture1D, :int,
      :maxTexture1DMipmap, :int,
      :maxTexture1DLinear, :int,
      :maxTexture2D, [:int, 2],
      :maxTexture2DMipmap, [:int, 2],
      :maxTexture2DLinear, [:int, 3],
      :maxTexture2DGather, [:int, 2],
      :maxTexture3D, [:int, 3],
      :maxTexture3DAlt, [:int, 3],
      :maxTextureCubemap, :int,
      :maxTexture1DLayered, [:int, 2],
      :maxTexture2DLayered, [:int, 3],
      :maxTextureCubemapLayered, [:int, 2],
      :maxSurface1D, :int,
      :maxSurface2D, [:int, 2],
      :maxSurface3D, [:int, 3],
      :maxSurface1DLayered, [:int, 2],
      :maxSurface2DLayered, [:int, 3],
      :maxSurfaceCubemap, :int,
      :maxSurfaceCubemapLayered, [:int, 2],
      :surfaceAlignment, :size_t,
      :concurrentKernels, :int,
      :ECCEnabled, :int,
      :pciBusID, :int,
      :pciDeviceID, :int,
      :pciDomainID, :int,
      :tccDriver, :int,
      :asyncEngineCount, :int,
      :unifiedAddressing, :int,
      :memoryClockRate, :int,
      :memoryBusWidth, :int,
      :l2CacheSize, :int,
      :persistingL2CacheMaxSize, :int,
      :maxThreadsPerMultiProcessor, :int,
      :streamPrioritiesSupported, :int,
      :globalL1CacheSupported, :int,
      :localL1CacheSupported, :int,
      :sharedMemPerMultiprocessor, :size_t,
      :regsPerMultiprocessor, :int,
      :managedMemory, :int,
      :isMultiGpuBoard, :int,
      :multiGpuBoardGroupID, :int,
      :hostNativeAtomicSupported, :int,
      :singleToDoublePrecisionPerfRatio, :int,
      :pageableMemoryAccess, :int,
      :concurrentManagedAccess, :int,
      :computePreemptionSupported, :int,
      :canUseHostPointerForRegisteredMem, :int,
      :cooperativeLaunch, :int,
      :cooperativeMultiDeviceLaunch, :int,
      :sharedMemPerBlockOptin, :size_t,
      :pageableMemoryAccessUsesHostPageTables, :int,
      :directManagedMemAccessFromHost, :int,
      :maxBlocksPerMultiProcessor, :int,
      :accessPolicyMaxWindowSize, :int,
      :reservedSharedMemPerBlock, :size_t,
      :hostRegisterSupported, :int,
      :sparseCudaArraySupported, :int,
      :hostRegisterReadOnlySupported, :int,
      :timelineSemaphoreInteropSupported, :int,
      :memoryPoolsSupported, :int,
      :gpuDirectRDMASupported, :int,
      :gpuDirectRDMAFlushWritesOptions, :uint,
      :gpuDirectRDMAWritesOrdering, :int,
      :memoryPoolSupportedHandleTypes, :uint,
      :deferredMappingCudaArraySupported, :int,
      :ipcEventSupported, :int,
      :clusterLaunch, :int,
      :unifiedFunctionPointers, :int,
      :_reserved_padding, [:char, 512] # Padding for future fields
  end

  # Bind cudaGetDeviceProperties_v2, falling back to the non-v2 variant.
  #
  # LoadError is rescued alongside FFI::NotFoundError: when ffi_lib above
  # failed there is no library to bind against and attach_function raises
  # LoadError, which would otherwise abort the require despite the warning
  # already printed. In that case .get raises when it is actually called.
  begin
    attach_function :cudaGetDeviceProperties_v2, [:pointer, :int], :int
  rescue FFI::NotFoundError, LoadError
    begin
      attach_function :cudaGetDeviceProperties, [:pointer, :int], :int
    rescue FFI::NotFoundError, LoadError
      $stderr.puts "[Ignis] WARNING: cudaGetDeviceProperties not found in #{CUDART_PATH}"
    end
  end

  # Get device properties for a given device index.
  #
  # @param device_id [Integer] GPU device index
  # @return [CudaDeviceProp]
  # @raise [RuntimeError] if neither binding is available or the CUDA call
  #   returns a non-zero status
  def self.get(device_id)
    # Prefer the _v2 entry point; use the plain one only if it is the one
    # that was bound at load time.
    entry_point = %i[cudaGetDeviceProperties_v2 cudaGetDeviceProperties].find do |candidate|
      respond_to?(candidate)
    end
    raise 'cudaGetDeviceProperties not available' unless entry_point

    prop = CudaDeviceProp.new
    status = public_send(entry_point, prop.pointer, device_id)
    raise "cudaGetDeviceProperties failed with status #{status}" unless status.zero?

    prop
  end

  # Helper to extract a human-readable property hash.
  #
  # Sizes and rates are scaled for display: total memory as a Float in MB
  # (one decimal), memory clock and L2 size by integer division.
  #
  # @param device_id [Integer]
  # @return [Hash]
  def self.summary(device_id)
    prop = get(device_id)
    enabled = ->(field) { prop[field] != 0 }

    {
      name: prop[:name].to_s.strip,
      compute_capability: "#{prop[:major]}.#{prop[:minor]}",
      total_memory_mb: (prop[:totalGlobalMem] / (1024.0 * 1024.0)).round(1),
      multiprocessor_count: prop[:multiProcessorCount],
      max_threads_per_block: prop[:maxThreadsPerBlock],
      warp_size: prop[:warpSize],
      memory_clock_mhz: prop[:memoryClockRate] / 1000,
      memory_bus_width: prop[:memoryBusWidth],
      l2_cache_kb: prop[:l2CacheSize] / 1024,
      ecc_enabled: enabled.call(:ECCEnabled),
      unified_addressing: enabled.call(:unifiedAddressing),
      managed_memory: enabled.call(:managedMemory),
      cooperative_launch: enabled.call(:cooperativeLaunch),
      gpu_direct_rdma: enabled.call(:gpuDirectRDMASupported),
      memory_pools: enabled.call(:memoryPoolsSupported),
      cluster_launch: enabled.call(:clusterLaunch)
    }
  end
end
|
|
201
|
+
end
|
|
202
|
+
end
|