ignis-numerics 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +15 -0
- data/lib/ignis-numerics.rb +62 -0
- data/lib/nvruby/array.rb +646 -0
- data/lib/nvruby/fft/cufft_bindings.rb +134 -0
- data/lib/nvruby/fft/fft_plan.rb +288 -0
- data/lib/nvruby/fft/operations.rb +364 -0
- data/lib/nvruby/linalg/cutensor_bindings.rb +107 -0
- data/lib/nvruby/mathdx/fft_kernel.rb +258 -0
- data/lib/nvruby/mathdx/gemm_kernel.rb +293 -0
- data/lib/nvruby/mathdx.rb +73 -0
- data/lib/nvruby/random/curand_bindings.rb +115 -0
- data/lib/nvruby/random/generator.rb +305 -0
- data/lib/nvruby/solver/amgx_bindings.rb +172 -0
- data/lib/nvruby/solver/amgx_config.rb +142 -0
- data/lib/nvruby/solver/amgx_solver.rb +251 -0
- data/lib/nvruby/solver/cudss_bindings.rb +115 -0
- data/lib/nvruby/solver/cusolver_bindings.rb +358 -0
- data/lib/nvruby/solver/eigen.rb +226 -0
- data/lib/nvruby/solver/lu.rb +265 -0
- data/lib/nvruby/solver/sparse_solver.rb +429 -0
- data/lib/nvruby/solver/svd.rb +266 -0
- data/lib/nvruby/solver.rb +122 -0
- data/lib/nvruby/sparse/cusparse_bindings.rb +231 -0
- data/lib/nvruby/sparse/sparse_matrix.rb +456 -0
- data/lib/nvruby/tensor/contraction.rb +218 -0
- data/lib/nvruby/tensor.rb +42 -0
- metadata +85 -0
data/lib/nvruby/array.rb
ADDED
|
@@ -0,0 +1,646 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
# GPU-aware multi-dimensional array
|
|
5
|
+
# Similar to NumPy ndarray but with CUDA memory backing
|
|
6
|
+
class NvArray
|
|
7
|
+
# @return [Array<Integer>] Shape of the array
|
|
8
|
+
attr_reader :shape
|
|
9
|
+
|
|
10
|
+
# @return [Symbol] Data type (:float32, :float64, :complex64, etc.)
|
|
11
|
+
attr_reader :dtype
|
|
12
|
+
|
|
13
|
+
# @return [Array<Integer>] Strides in bytes for each dimension
|
|
14
|
+
attr_reader :strides
|
|
15
|
+
|
|
16
|
+
# @return [CUDA::Memory, nil] Device memory (nil if on host)
|
|
17
|
+
attr_reader :device_memory
|
|
18
|
+
|
|
19
|
+
# @return [FFI::Pointer, nil] Host memory pointer (nil if on device)
|
|
20
|
+
attr_reader :host_memory
|
|
21
|
+
|
|
22
|
+
# @return [:host, :device] Current memory location
|
|
23
|
+
attr_reader :location
|
|
24
|
+
|
|
25
|
+
# @return [Integer] Device index
|
|
26
|
+
attr_reader :device_index
|
|
27
|
+
|
|
28
|
+
# Create a new NvArray
|
|
29
|
+
# @param shape [Array<Integer>] Shape of the array
|
|
30
|
+
# @param dtype [Symbol] Data type (default: :float32)
|
|
31
|
+
# @param device [Integer, nil] Device index (nil for host-only)
|
|
32
|
+
# @param data [Array, String, nil] Initial data
|
|
33
|
+
# Set up shape/dtype metadata, then either fill a host buffer from +data+ or
# allocate an empty buffer (on the device when +device+ is given, else on host).
def initialize(shape:, dtype: :float32, device: nil, data: nil)
  @shape        = normalize_shape(shape)
  @dtype        = DType.validate!(dtype)
  # Fall back to the configured default device when none is requested.
  @device_index = device || Ignis.configuration.default_device
  @strides      = compute_strides
  @size_bytes   = compute_size_bytes

  @device_memory = nil
  @host_memory   = nil

  # Supplied data is always written to a host buffer (see initialize_with_data).
  return initialize_with_data(data) if data

  allocate_memory(device ? :device : :host)
end
|
|
49
|
+
|
|
50
|
+
# @return [Integer] Total number of elements
|
|
51
|
+
# Product of all extents in @shape; an empty shape yields 1.
def size
  @shape.inject(1) { |total, extent| total * extent }
end
|
|
54
|
+
|
|
55
|
+
# @return [Integer] Number of dimensions
|
|
56
|
+
# Rank of the array, i.e. how many entries @shape has.
def ndim
  @shape.length
end
|
|
59
|
+
|
|
60
|
+
# @return [Integer] Total size in bytes
|
|
61
|
+
# Byte size cached in @size_bytes when the array metadata was set up.
def nbytes
  @size_bytes
end
|
|
64
|
+
|
|
65
|
+
# @return [Integer] Size of each element in bytes
|
|
66
|
+
# Per-element byte width, looked up from DType for this array's dtype.
def itemsize
  DType.byte_size(@dtype)
end
|
|
69
|
+
|
|
70
|
+
# Check if data is on device
|
|
71
|
+
# @return [Boolean]
|
|
72
|
+
# True only while @location is :device.
def on_device?
  @location == :device
end
|
|
75
|
+
|
|
76
|
+
# Check if data is on host
|
|
77
|
+
# @return [Boolean]
|
|
78
|
+
# True only while @location is :host.
def on_host?
  @location == :host
end
|
|
81
|
+
|
|
82
|
+
# Transfer data to GPU
|
|
83
|
+
# @param device [Integer, nil] Target device (nil for current)
|
|
84
|
+
# @param stream [CUDA::Stream, nil] Stream for async transfer
|
|
85
|
+
# @return [self]
|
|
86
|
+
# Move the array's bytes into a freshly allocated device buffer.
# No-op when the data already lives on the requested (or current) device.
def to_device(device: nil, stream: nil)
  already_resident = on_device? && (device.nil? || device == @device_index)
  return self if already_resident

  destination = device || @device_index

  # Stage through host memory; if the data sits on another device this pulls
  # it back to the host first (ensure_host_data! calls to_host).
  ensure_host_data!

  @device_memory = CUDA::Memory.new(@size_bytes, device: destination)
  @device_memory.copy_from_host(@host_memory, stream: stream)

  @device_index = destination
  @location = :device

  # Keep host copy option.
  # NOTE(review): `with_lock { false }` presumably evaluates to false, so the
  # host reference is always dropped here — confirm. If the copy above is
  # asynchronous when a stream is given, dropping the only reference to the
  # host buffer before the transfer completes may be unsafe — TODO confirm.
  keep_host_copy = Ignis.configuration.with_lock { false }
  @host_memory = nil unless keep_host_copy

  self
end
|
|
104
|
+
|
|
105
|
+
# Transfer data to host
|
|
106
|
+
# @param stream [CUDA::Stream, nil] Stream for async transfer
|
|
107
|
+
# @return [self]
|
|
108
|
+
# Copy the device buffer into a new host buffer, then release the device
# buffer. No-op when the data is already on the host.
def to_host(stream: nil)
  return self if on_host?

  ensure_device_data!

  staging = FFI::MemoryPointer.new(:uint8, @size_bytes)
  @host_memory = staging
  @device_memory.copy_to_host(host_buffer: staging, stream: stream)
  @location = :host

  # NOTE(review): the device buffer is freed right after the copy is issued;
  # if the copy is asynchronous when a stream is given this may need a
  # synchronisation first — TODO confirm against CUDA::Memory.
  @device_memory.free!
  @device_memory = nil

  self
end
|
|
122
|
+
|
|
123
|
+
# Get device pointer for CUDA operations
|
|
124
|
+
# @return [Fiddle::Pointer] Device pointer
|
|
125
|
+
# @raise [InvalidOperationError] If not on device
|
|
126
|
+
# Pointer exposed by the device buffer; the data is moved to the device first
# if it currently lives on the host.
def device_ptr
  ensure_device_data!
  buffer = @device_memory
  buffer.device_ptr
end
|
|
130
|
+
|
|
131
|
+
# Get device pointer wrapped as an FFI::Pointer for FFI-bound CUDA-X
|
|
132
|
+
# library calls (cuBLAS/cuSOLVER/cuFFT/cuRAND/cuSPARSE), which cannot
|
|
133
|
+
# accept the Fiddle::Pointer returned by #device_ptr.
|
|
134
|
+
# @return [FFI::Pointer] Device pointer
|
|
135
|
+
# @raise [InvalidOperationError] If not on device
|
|
136
|
+
# FFI-flavoured pointer exposed by the device buffer; the data is moved to the
# device first if it currently lives on the host.
def device_ffi_ptr
  ensure_device_data!
  buffer = @device_memory
  buffer.ffi_ptr
end
|
|
140
|
+
|
|
141
|
+
# Get host pointer
|
|
142
|
+
# @return [FFI::Pointer] Host pointer
|
|
143
|
+
# @raise [InvalidOperationError] If not on host
|
|
144
|
+
# Host buffer pointer; the data is pulled back to the host first if it
# currently lives on the device.
def host_ptr
  ensure_host_data!
  @host_memory
end
|
|
148
|
+
|
|
149
|
+
# Get data as Ruby array (copies to host if needed)
|
|
150
|
+
# @return [Array] Nested Ruby array with data
|
|
151
|
+
# Read every element into a nested Ruby array shaped like the array.
#
# Data on the device is moved to the host first (via ensure_host_data!).
# Complex dtypes are read as interleaved [real, imag] components, so the
# innermost dimension of the result is twice as long as the array's last
# extent; a zero-dimensional complex array yields the [real, imag] pair.
#
# @return [Array] Nested Ruby array with data
def to_a
  synchronize_if_needed
  ensure_host_data!

  values = read_flat_data

  layout = @shape
  if DType.complex?(@dtype)
    # read_flat_data returns 2 * size components for complex dtypes; nesting
    # them by the element shape would split rows at the wrong boundaries.
    layout = @shape.empty? ? [2] : @shape[0...-1] + [@shape.last * 2]
  end

  reshape_to_nested(values, layout)
end
|
|
158
|
+
|
|
159
|
+
# Get flat data as Ruby array
|
|
160
|
+
# @return [Array] 1D Ruby array with all elements
|
|
161
|
+
# One-dimensional Ruby array of the stored values, as produced by
# read_flat_data. Device data is brought to the host beforehand.
def flatten
  synchronize_if_needed
  ensure_host_data!
  values = read_flat_data
  values
end
|
|
166
|
+
|
|
167
|
+
# Reshape the array
|
|
168
|
+
# @param new_shape [Array<Integer>] New shape
|
|
169
|
+
# @return [NvArray] Reshaped array (view if contiguous)
|
|
170
|
+
# Return a copy of this array with a different shape and the same element count.
#
# A single -1 entry is inferred from the remaining extents. Requests that
# cannot be satisfied (more than one -1, a zero/non-dividing product next to
# -1, any other negative extent, or a mismatched element count) raise
# DimensionError.
#
# @param new_shape [Array<Integer>, Integer] New shape
# @return [NvArray] Reshaped copy with freshly computed row-major strides
# @raise [DimensionError] If the requested shape is invalid for this array
def reshape(new_shape)
  target = normalize_shape(new_shape)

  wildcards = target.count(-1)
  if wildcards > 1
    raise DimensionError, "Cannot reshape array of size #{size} to #{target}: only one dimension may be -1"
  end

  if wildcards == 1
    known = target.reject { |extent| extent == -1 }.reduce(1, :*)
    # A zero (or negative) product cannot be divided into the element count;
    # previously a zero product surfaced as ZeroDivisionError.
    unless known.positive? && (size % known).zero?
      raise DimensionError, "Cannot reshape array of size #{size} to #{target}"
    end
    target[target.index(-1)] = size / known
  end

  if target.any?(&:negative?) || target.reduce(1, :*) != size
    raise DimensionError, "Cannot reshape array of size #{size} to #{target}"
  end

  # Row-major strides are recomputed below, so start from whatever #contiguous
  # provides for strided sources; a contiguous source is duplicated instead.
  base = contiguous
  result = base.equal?(self) ? dup : base
  result.instance_variable_set(:@shape, target)
  result.instance_variable_set(:@strides, result.send(:compute_strides))
  result
end
|
|
188
|
+
|
|
189
|
+
# Transpose the array
|
|
190
|
+
# @param axes [Array<Integer>, nil] Permutation of axes (reverses if nil)
|
|
191
|
+
# @return [NvArray] Transposed array
|
|
192
|
+
# Permute the axes of a duplicate of this array. With no +axes+ the axis
# order is reversed. Shape and strides of the duplicate are permuted together.
def transpose(axes: nil)
  order = axes || (ndim - 1).downto(0).to_a
  identity = (0...ndim).to_a

  # +order+ must be a permutation of every axis index.
  raise DimensionError, "Invalid axes for transpose" unless order.sort == identity

  permuted_shape = @shape.values_at(*order)
  permuted_strides = @strides.values_at(*order)

  dup.tap do |transposed|
    transposed.instance_variable_set(:@shape, permuted_shape)
    transposed.instance_variable_set(:@strides, permuted_strides)
  end
end
|
|
205
|
+
|
|
206
|
+
# Create a contiguous copy
|
|
207
|
+
# @return [NvArray] Contiguous copy
|
|
208
|
+
# Return an array whose bytes are laid out in row-major order.
#
# Returns self when the layout is already contiguous. Otherwise the elements
# are gathered in logical order (following @strides via iterate_shape) into a
# new array; a raw byte-for-byte copy would keep the strided layout while the
# new array advertises row-major strides.
#
# @return [NvArray] self, or a newly allocated contiguous copy
def contiguous
  return self if contiguous?

  result = NvArray.new(shape: @shape.dup, dtype: @dtype, device: on_device? ? @device_index : nil)

  if on_device?
    # No strided device-side copy is used in this class, so gather on the host.
    # NOTE(review): assumes copy_to_host/copy_from_host complete before
    # returning when stream is nil — confirm against CUDA::Memory.
    source = FFI::MemoryPointer.new(:uint8, @size_bytes)
    @device_memory.copy_to_host(host_buffer: source, stream: nil)
    packed = FFI::MemoryPointer.new(:uint8, @size_bytes)
  else
    source = @host_memory
    packed = result.host_memory
  end

  element_bytes = itemsize
  cursor = 0
  iterate_shape do |byte_offset|
    packed.put_bytes(cursor, source.get_bytes(byte_offset, element_bytes))
    cursor += element_bytes
  end

  result.device_memory.copy_from_host(packed, stream: nil) if on_device?
  result
end
|
|
224
|
+
|
|
225
|
+
# Check if memory layout is contiguous
|
|
226
|
+
# @return [Boolean]
|
|
227
|
+
# Walk the axes from innermost to outermost and check that each stride equals
# the byte span of everything inside it. Axes of extent 1 are exempt because
# their stride is never used to step.
def contiguous?
  expected_stride = itemsize
  (ndim - 1).downto(0) do |axis|
    extent = @shape[axis]
    return false if @strides[axis] != expected_stride && extent != 1

    expected_stride *= extent
  end
  true
end
|
|
237
|
+
|
|
238
|
+
# Duplicate the array
|
|
239
|
+
# @return [NvArray] Copy of the array
|
|
240
|
+
# Deep-copy the array: same shape, dtype, strides, device index and location,
# with its own buffer holding a byte-for-byte copy of the data.
#
# The copy is built from bare metadata so exactly one buffer is allocated
# (going through NvArray.new would allocate a buffer that is then replaced),
# and the source strides are carried over so a strided array's duplicate
# still describes the copied bytes correctly.
#
# @return [NvArray] Copy of the array
def dup
  copy = NvArray.allocate_empty_metadata(@shape.dup, @dtype)
  copy.instance_variable_set(:@device_index, @device_index)
  copy.instance_variable_set(:@device_memory, nil)
  copy.instance_variable_set(:@host_memory, nil)

  if on_device?
    device_copy = CUDA::Memory.new(@size_bytes, device: @device_index)
    device_copy.copy_from_device(@device_memory)
    copy.instance_variable_set(:@device_memory, device_copy)
    copy.instance_variable_set(:@location, :device)
  else
    host_copy = FFI::MemoryPointer.new(:uint8, @size_bytes)
    host_copy.put_bytes(0, @host_memory.get_bytes(0, @size_bytes))
    copy.instance_variable_set(:@host_memory, host_copy)
    copy.instance_variable_set(:@location, :host)
  end

  # The bytes were copied verbatim, so the layout description must match too.
  copy.instance_variable_set(:@strides, @strides.dup)
  copy
end
|
|
256
|
+
|
|
257
|
+
# Zero out the array
|
|
258
|
+
# @param stream [CUDA::Stream, nil] Stream for async operation
|
|
259
|
+
# @return [self]
|
|
260
|
+
# Zero whichever buffer currently holds the data: the device buffer (passing
# +stream+ through) or the host buffer.
def zero!(stream: nil)
  on_device? ? @device_memory.zero!(stream: stream) : @host_memory.clear
  self
end
|
|
268
|
+
|
|
269
|
+
# Free all memory
|
|
270
|
+
# @return [void]
|
|
271
|
+
# Release the device buffer (if any) and forget both buffers and the location.
def free!
  @device_memory.free! unless @device_memory.nil?
  @device_memory = @host_memory = @location = nil
end
|
|
277
|
+
|
|
278
|
+
# @return [String] String representation
|
|
279
|
+
# Short summary: shape, dtype and where the data lives ("host" unless the
# array is on a device).
def to_s
  placement = on_device? ? "device:#{@device_index}" : "host"
  format("NvArray(shape=%s, dtype=%s, %s)", @shape, @dtype, placement)
end
|
|
283
|
+
|
|
284
|
+
# @return [String] Detailed inspection
|
|
285
|
+
# Debug representation listing object id, shape, dtype, location, device
# index and byte size.
def inspect
  format(
    "#<Ignis::NvArray:%s shape=%s dtype=%s location=%s device=%s bytes=%s>",
    object_id, @shape, @dtype, @location, @device_index, @size_bytes
  )
end
|
|
289
|
+
|
|
290
|
+
class << self
  # Create array filled with zeros
  # @param shape [Array<Integer>] Shape
  # @param dtype [Symbol] Data type
  # @param device [Integer, nil] Device index
  # @return [NvArray]
  def zeros(shape, dtype: :float32, device: nil)
    arr = new(shape: shape, dtype: dtype, device: device)
    arr.zero!
    arr
  end

  # Create array filled with ones
  # @param shape [Array<Integer>] Shape
  # @param dtype [Symbol] Data type
  # @param device [Integer, nil] Device index
  # @return [NvArray]
  def ones(shape, dtype: :float32, device: nil)
    count = Array(shape).map(&:to_i).reduce(1, :*)
    data = with_zero_imaginary(Array.new(count, 1.0), dtype)
    new(shape: shape, dtype: dtype, device: device, data: data)
  end

  # Create array with evenly spaced values from +start+ to +stop+ inclusive.
  # A single sample yields just +start+ (the step is taken as 0.0 rather than
  # dividing by zero).
  # @param start [Numeric] Start value
  # @param stop [Numeric] End value
  # @param num [Integer] Number of samples
  # @param dtype [Symbol] Data type
  # @param device [Integer, nil] Device index
  # @return [NvArray]
  # @raise [ArgumentError] If +num+ is negative
  def linspace(start, stop, num, dtype: :float32, device: nil)
    raise ArgumentError, "num must be non-negative (got #{num})" if num.negative?

    step = num > 1 ? (stop - start).to_f / (num - 1) : 0.0
    samples = (0...num).map { |i| start + step * i }
    new(shape: [num], dtype: dtype, device: device, data: with_zero_imaginary(samples, dtype))
  end

  # Create array from existing device memory
  # @param ptr [FFI::Pointer, CUDA::Memory] Device pointer or Memory object
  # @param shape [Array<Integer>] Shape
  # @param dtype [Symbol] Data type
  # @param take_ownership [Boolean] If true, the array will manage the memory lifecycle
  # @return [NvArray]
  def from_device_ptr(ptr, shape:, dtype: :float32, take_ownership: false)
    arr = allocate_empty_metadata(shape, dtype)

    memory =
      if ptr.is_a?(CUDA::Memory)
        if take_ownership
          # Adopt the caller's Memory object as-is.
          ptr
        else
          # Wrap the same address in a second Memory flagged as not owned.
          CUDA::Memory.new(arr.nbytes, device: ptr.device_index, ptr: ptr.device_ptr, owned: false)
        end
      else
        # Raw pointer: wrap it, passing the ownership flag through.
        CUDA::Memory.new(arr.nbytes, ptr: ptr, owned: take_ownership)
      end

    arr.instance_variable_set(:@device_memory, memory)
    arr.instance_variable_set(:@location, :device)
    arr.instance_variable_set(:@device_index, arr.device_memory.device_index)
    arr
  end

  # Create an empty NvArray without allocating memory
  # @param shape [Array<Integer>] Shape
  # @param dtype [Symbol] Data type
  # @return [NvArray]
  def allocate_empty_metadata(shape, dtype)
    # `allocate` skips #initialize, which would allocate a buffer.
    arr = allocate
    arr.instance_variable_set(:@shape, Array(shape).map(&:to_i))
    arr.instance_variable_set(:@dtype, DType.validate!(dtype))
    arr.instance_variable_set(:@strides, arr.send(:compute_strides))
    arr.instance_variable_set(:@size_bytes, arr.send(:compute_size_bytes))
    arr
  end

  # Create array from Ruby array
  # @param data [Array] Nested Ruby array
  # @param dtype [Symbol] Data type
  # @param device [Integer, nil] Device index
  # @return [NvArray]
  def from_array(data, dtype: :float32, device: nil)
    shape = infer_shape(data)
    flat = flatten_nested(data)
    new(shape: shape, dtype: dtype, device: device, data: flat)
  end

  # Create identity matrix
  # @param size [Integer] Size of the square matrix
  # @param dtype [Symbol] Data type
  # @param device [Integer, nil] Device index
  # @return [NvArray]
  def eye(size, dtype: :float32, device: nil)
    # Flat index i sits on the diagonal when its row (i / size) equals its column (i % size).
    cells = Array.new(size * size) { |i| i / size == i % size ? 1.0 : 0.0 }
    new(shape: [size, size], dtype: dtype, device: device, data: with_zero_imaginary(cells, dtype))
  end

  private

  # Infer shape from nested array by following the first element at each depth
  # @param data [Array] Nested array
  # @return [Array<Integer>]
  def infer_shape(data)
    shape = []
    current = data
    while current.is_a?(Array)
      shape << current.size
      current = current.first
    end
    shape
  end

  # Flatten nested array
  # @param data [Array] Nested array
  # @return [Array]
  def flatten_nested(data)
    data.flatten
  end

  # Complex dtypes are written as interleaved [real, imag] components, so each
  # real value is followed by a 0.0 imaginary part; other dtypes pass through.
  # @param values [Array<Numeric>] Real values, one per element
  # @param dtype [Symbol] Data type
  # @return [Array<Numeric>]
  def with_zero_imaginary(values, dtype)
    return values unless DType.complex?(DType.validate!(dtype))

    values.flat_map { |real| [real, 0.0] }
  end
end
|
|
438
|
+
|
|
439
|
+
private
|
|
440
|
+
|
|
441
|
+
# Normalize shape input
|
|
442
|
+
# @param shape [Array<Integer>, Integer]
|
|
443
|
+
# @return [Array<Integer>]
|
|
444
|
+
# Coerce a scalar or list of extents into an array of Integers.
def normalize_shape(shape)
  Array(shape).map { |extent| extent.to_i }
end
|
|
447
|
+
|
|
448
|
+
# Compute strides for row-major layout
|
|
449
|
+
# @return [Array<Integer>]
|
|
450
|
+
# Byte strides for a row-major layout: the last axis steps by one element and
# each earlier axis steps by the byte span of all axes after it.
def compute_strides
  layout = Array.new(@shape.size)
  step = itemsize
  (@shape.size - 1).downto(0) do |axis|
    layout[axis] = step
    step *= @shape[axis]
  end
  layout
end
|
|
459
|
+
|
|
460
|
+
# Compute total size in bytes
|
|
461
|
+
# @return [Integer]
|
|
462
|
+
# Element count multiplied by the per-element byte width.
def compute_size_bytes
  element_count = size
  element_count * itemsize
end
|
|
465
|
+
|
|
466
|
+
# Allocate memory on specified location
|
|
467
|
+
# @param location [:host, :device]
|
|
468
|
+
# Record +location+ and allocate @size_bytes there: a CUDA::Memory on
# @device_index for :device, otherwise a host byte buffer.
def allocate_memory(location)
  @location = location

  case location
  when :device
    @device_memory = CUDA::Memory.new(@size_bytes, device: @device_index)
  else
    @host_memory = FFI::MemoryPointer.new(:uint8, @size_bytes)
  end
end
|
|
477
|
+
|
|
478
|
+
# Initialize with provided data
|
|
479
|
+
# @param data [Array, String]
|
|
480
|
+
# Allocate a host buffer of @size_bytes, mark the array as host-resident and
# write +data+ into it. Arrays are flattened first; anything else is handed to
# write_data unchanged.
def initialize_with_data(data)
  @host_memory = FFI::MemoryPointer.new(:uint8, @size_bytes)
  @location = :host

  payload = data
  payload = data.flatten if data.is_a?(Array)
  write_data(payload)
end
|
|
488
|
+
|
|
489
|
+
# Write data to host memory
|
|
490
|
+
# @param data [Array, String]
|
|
491
|
+
# Write initial data into the host buffer.
#
# Strings are copied verbatim as bytes starting at offset 0. Arrays are
# written element by element, through the complex or scalar writer depending
# on the dtype. Any other type is rejected instead of being silently ignored,
# which would leave the buffer unwritten.
#
# @param data [Array, String]
# @raise [ArgumentError] If +data+ is neither an Array nor a String
def write_data(data)
  case data
  when String
    @host_memory.put_bytes(0, data)
  when Array
    DType.complex?(@dtype) ? write_complex_data(data) : write_scalar_data(data)
  else
    raise ArgumentError, "Unsupported initial data type #{data.class} (expected Array or String)"
  end
end
|
|
503
|
+
|
|
504
|
+
# Write complex data (stored as [real0, imag0, real1, imag1, ...])
|
|
505
|
+
# @param data [Array] Interleaved real/imag values
|
|
506
|
+
# Write interleaved real/imag components back to back into the host buffer,
# each as a float or double depending on the complex dtype's component type.
# Component types other than :float32/:float64 write nothing.
def write_complex_data(data)
  part_dtype = DType.real_dtype(@dtype)
  part_bytes = DType.byte_size(part_dtype)
  writer = { float32: :put_float, float64: :put_double }[part_dtype]

  data.each_with_index do |component, slot|
    next unless writer

    @host_memory.public_send(writer, slot * part_bytes, component.to_f)
  end
end
|
|
520
|
+
|
|
521
|
+
# Write scalar (non-complex) or component data
|
|
522
|
+
# @param data [Array] Values to write
|
|
523
|
+
# Write non-complex values into the host buffer, one element after another.
#
# Half-precision dtypes are encoded to their 16-bit patterns via Half and
# stored as uint16. Every other dtype is dispatched on its FFI type: float
# kinds are converted with to_f, integer kinds with to_i. FFI types outside
# the lists below write nothing.
def write_scalar_data(data)
  case @dtype
  when :float16
    data.each_with_index { |value, index| @host_memory.put_uint16(index * 2, Half.f32_to_f16(value)) }
    return
  when :bfloat16
    data.each_with_index { |value, index| @host_memory.put_uint16(index * 2, Half.f32_to_bf16(value)) }
    return
  end

  ffi_type = DType.ffi_type(@dtype)
  comp_size = DType.complex?(@dtype) ? itemsize / 2 : itemsize

  float_kinds = %i[float double]
  integer_kinds = %i[int8 int16 int32 int64 uint8 uint16 uint32 uint64]
  coercion =
    if float_kinds.include?(ffi_type)
      :to_f
    elsif integer_kinds.include?(ffi_type)
      :to_i
    end

  data.each_with_index do |value, index|
    next unless coercion

    @host_memory.public_send(:"put_#{ffi_type}", index * comp_size, value.public_send(coercion))
  end
end
|
|
562
|
+
|
|
563
|
+
# Read flat data from host memory (logically ordered)
|
|
564
|
+
# @return [Array]
|
|
565
|
+
# Read all stored components from host memory in logical order.
#
# Complex dtypes yield two components (half an element wide each) per
# element. Contiguous arrays are read sequentially; otherwise element byte
# offsets come from iterate_shape. Half-precision dtypes are read as raw bit
# patterns and decoded through Half afterwards.
def read_flat_data
  ensure_host_data!

  getter = "get_#{DType.ffi_type(@dtype)}"
  complex = DType.complex?(@dtype)
  component_count = complex ? size * 2 : size
  component_bytes = complex ? itemsize / 2 : itemsize

  decode =
    case @dtype
    when :float16 then Half.method(:f16_to_f32)
    when :bfloat16 then Half.method(:bf16_to_f32)
    end

  raw =
    if contiguous?
      (0...component_count).map { |slot| @host_memory.send(getter, slot * component_bytes) }
    else
      gathered = []
      iterate_shape do |byte_offset|
        gathered << @host_memory.send(getter, byte_offset)
        # The second component of a complex element sits right after the first.
        gathered << @host_memory.send(getter, byte_offset + component_bytes) if complex
      end
      gathered
    end

  decode ? raw.map { |bits| decode.call(bits) } : raw
end
|
|
599
|
+
|
|
600
|
+
# Iterate over shape and yield byte offsets
|
|
601
|
+
# Yield the byte offset of every element in logical order by recursing over
# the axes: each axis adds index * stride to the running offset, and the
# innermost axis yields the total.
def iterate_shape(dim = 0, current_offset = 0, &block)
  innermost = (dim == ndim - 1)
  stride = @strides[dim]

  @shape[dim].times do |index|
    position = current_offset + index * stride
    if innermost
      yield position
    else
      iterate_shape(dim + 1, position, &block)
    end
  end
end
|
|
612
|
+
|
|
613
|
+
# Reshape flat array to nested structure
|
|
614
|
+
# @param flat [Array] Flat array
|
|
615
|
+
# @param shape [Array<Integer>] Target shape
|
|
616
|
+
# @return [Array]
|
|
617
|
+
# Rebuild a nested array of the given shape from a flat list of values.
#
# An empty shape returns the single value; a one-dimensional shape returns the
# flat list itself. When the inner extents multiply to zero there is nothing
# to slice (and each_slice rejects a slice size of 0), so the outer extent is
# honoured with empty inner structures, e.g. shape [2, 0] gives [[], []].
#
# @param flat [Array] Flat array
# @param shape [Array<Integer>] Target shape
# @return [Array]
def reshape_to_nested(flat, shape)
  return flat.first if shape.empty?
  return flat if shape.size == 1

  inner_shape = shape[1..]
  chunk_size = inner_shape.reduce(1, :*)
  return Array.new(shape.first) { reshape_to_nested([], inner_shape) } if chunk_size.zero?

  flat.each_slice(chunk_size).map { |chunk| reshape_to_nested(chunk, inner_shape) }
end
|
|
626
|
+
|
|
627
|
+
# Ensure data is on host
|
|
628
|
+
# Pull the data back to the host if it lives on the device, then insist that
# a host buffer exists.
def ensure_host_data!
  to_host if on_device?
  return if @host_memory

  raise InvalidOperationError, "No host data available"
end
|
|
632
|
+
|
|
633
|
+
# Ensure data is on device
|
|
634
|
+
# Push the data to the device if it lives on the host, then insist that a
# device buffer exists.
def ensure_device_data!
  to_device if on_host?
  return if @device_memory

  raise InvalidOperationError, "No device data available"
end
|
|
638
|
+
|
|
639
|
+
# Synchronize if on device
|
|
640
|
+
# Synchronise the current CUDA device when the data is device-resident.
# Any StandardError raised along the way is deliberately swallowed so that a
# failed sync does not abort a read; nil is returned in that case.
def synchronize_if_needed
  return unless on_device?

  CUDA::Device.current.synchronize
rescue StandardError
  nil
end
|
|
645
|
+
end
|
|
646
|
+
end
|