ignis-numerics 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,646 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ # GPU-aware multi-dimensional array
5
+ # Similar to NumPy ndarray but with CUDA memory backing
6
+ class NvArray
7
+ # @return [Array<Integer>] Shape of the array
8
+ attr_reader :shape
9
+
10
+ # @return [Symbol] Data type (:float32, :float64, :complex64, etc.)
11
+ attr_reader :dtype
12
+
13
+ # @return [Array<Integer>] Strides in bytes for each dimension
14
+ attr_reader :strides
15
+
16
+ # @return [CUDA::Memory, nil] Device memory (nil if on host)
17
+ attr_reader :device_memory
18
+
19
+ # @return [FFI::Pointer, nil] Host memory pointer (nil if on device)
20
+ attr_reader :host_memory
21
+
22
+ # @return [:host, :device] Current memory location
23
+ attr_reader :location
24
+
25
+ # @return [Integer] Device index
26
+ attr_reader :device_index
27
+
28
# Build the array metadata and back it with a buffer.
#
# With +data+, a host buffer is allocated and filled (the array starts on
# the host even when +device+ is given). Without +data+, a buffer is
# allocated on the device when +device+ is given, otherwise on the host.
#
# @param shape [Array<Integer>, Integer] Extent of every dimension
# @param dtype [Symbol] Element type, checked through DType.validate!
# @param device [Integer, nil] Device index; nil falls back to the configured
#   default index and selects host allocation
# @param data [Array, String, nil] Optional initial contents
def initialize(shape:, dtype: :float32, device: nil, data: nil)
  @shape = normalize_shape(shape)
  @dtype = DType.validate!(dtype)
  @device_index = device || Ignis.configuration.default_device
  @strides = compute_strides
  @size_bytes = compute_size_bytes
  @device_memory = @host_memory = nil

  return initialize_with_data(data) if data

  placement = device ? :device : :host
  allocate_memory(placement)
end
49
+
50
# Number of elements, i.e. the product of all extents (1 for an empty shape).
# @return [Integer]
def size
  @shape.inject(1) { |total, extent| total * extent }
end
54
+
55
# Rank of the array: how many entries the shape has.
# @return [Integer]
def ndim
  @shape.length
end
59
+
60
# Byte size of the whole buffer, as stored in @size_bytes when the array
# metadata was set up.
# @return [Integer]
def nbytes
  total_bytes = @size_bytes
  total_bytes
end
64
+
65
# Byte width of a single element, looked up from the dtype table.
# @return [Integer]
def itemsize
  element_type = @dtype
  DType.byte_size(element_type)
end
69
+
70
# Whether the current location marker is :device.
# @return [Boolean]
def on_device?
  :device == @location
end
75
+
76
# Whether the current location marker is :host.
# @return [Boolean]
def on_host?
  :host == @location
end
81
+
82
# Move the data into device memory.
#
# Does nothing when the array is already on the device and either no device
# was requested or the requested one matches. Otherwise the data is first
# brought to the host (ensure_host_data!), a new device buffer is allocated
# on the target device and filled from the host buffer.
#
# NOTE(review): the host buffer reference is dropped right after the copy is
# issued; if a +stream+ makes that copy asynchronous the buffer may need to
# outlive this call — confirm against CUDA::Memory#copy_from_host.
#
# @param device [Integer, nil] Target device (nil keeps the current index)
# @param stream [CUDA::Stream, nil] Stream handed to the copy
# @return [self]
def to_device(device: nil, stream: nil)
  already_there = on_device? && (device.nil? || device == @device_index)
  return self if already_there

  destination = device || @device_index
  ensure_host_data!

  @device_memory = CUDA::Memory.new(@size_bytes, device: destination)
  @device_memory.copy_from_host(@host_memory, stream: stream)

  @device_index = destination
  @location = :device

  # Placeholder for a "keep host copy" option: the block currently yields false.
  keep_host_copy = Ignis.configuration.with_lock { false }
  @host_memory = nil unless keep_host_copy

  self
end
104
+
105
# Move the data into host memory.
#
# Does nothing when the array is already on the host. Otherwise a host
# buffer of nbytes bytes is allocated, the device buffer is copied into it,
# and the device buffer is freed and forgotten.
#
# @param stream [CUDA::Stream, nil] Stream handed to the copy
# @return [self]
def to_host(stream: nil)
  unless on_host?
    ensure_device_data!

    @host_memory = FFI::MemoryPointer.new(:uint8, @size_bytes)
    @device_memory.copy_to_host(host_buffer: @host_memory, stream: stream)

    @location = :host
    @device_memory.free!
    @device_memory = nil
  end

  self
end
122
+
123
# Device pointer for CUDA operations.
#
# Calls ensure_device_data! first, so an array that is currently on the
# host is transferred to the device as a side effect.
#
# @return [Fiddle::Pointer] Device pointer
# @raise [InvalidOperationError] If no device memory is available afterwards
def device_ptr
  ensure_device_data!
  memory = @device_memory
  memory.device_ptr
end
130
+
131
# Device pointer wrapped as an FFI::Pointer, for the FFI-bound CUDA-X
# library calls (cuBLAS/cuSOLVER/cuFFT/cuRAND/cuSPARSE) that cannot take
# the Fiddle::Pointer returned by #device_ptr.
#
# Calls ensure_device_data! first, so an array that is currently on the
# host is transferred to the device as a side effect.
#
# @return [FFI::Pointer] Device pointer
# @raise [InvalidOperationError] If no device memory is available afterwards
def device_ffi_ptr
  ensure_device_data!
  memory = @device_memory
  memory.ffi_ptr
end
140
+
141
# Host buffer pointer.
#
# Calls ensure_host_data! first, so an array that is currently on the
# device is transferred to the host as a side effect.
#
# @return [FFI::Pointer] Host pointer
# @raise [InvalidOperationError] If no host memory is available afterwards
def host_ptr
  ensure_host_data!
  buffer = @host_memory
  buffer
end
148
+
149
# Get the data as a (nested) Ruby array.
#
# The array is moved to the host first when it lives on the device
# (ensure_host_data!), so calling this changes the array's location.
#
# Complex dtypes are read back as interleaved real/imag components, i.e.
# twice as many values as elements. To keep the leading dimensions intact
# the innermost dimension is doubled for them: a complex array of shape
# [2, 3] yields 2 rows of 6 numbers (re0, im0, re1, im1, ...). A 1-D
# complex array therefore still yields the flat interleaved list, and a
# zero-dimensional complex array yields [re, im].
#
# @return [Array, Numeric] Nested Ruby array (a bare value for a
#   zero-dimensional non-complex array)
def to_a
  synchronize_if_needed
  ensure_host_data!

  flat_data = read_flat_data
  return reshape_to_nested(flat_data, @shape) unless DType.complex?(@dtype)

  # Two stored components per element: widen the last axis accordingly.
  interleaved_shape = @shape.empty? ? [2] : @shape[0...-1] + [@shape.last * 2]
  reshape_to_nested(flat_data, interleaved_shape)
end
158
+
159
# Read every element into a one-dimensional Ruby array.
#
# The array is moved to the host first when it lives on the device
# (ensure_host_data!), so calling this changes the array's location.
#
# @return [Array] 1D Ruby array with all elements
def flatten
  synchronize_if_needed
  ensure_host_data!
  values = read_flat_data
  values
end
166
+
167
# Reshape the array.
#
# A single -1 entry is inferred from the element count. The result is a
# new array produced by #dup (or by #contiguous when this array's strides
# are not dense row-major), with the shape replaced and row-major strides
# recomputed; it does not share memory with the receiver.
#
# @param new_shape [Array<Integer>, Integer] New shape; at most one entry
#   may be -1
# @return [NvArray] Reshaped copy
# @raise [DimensionError] If the shape contains more than one -1, a
#   negative extent, or does not match the element count
def reshape(new_shape)
  new_shape = normalize_shape(new_shape)
  mismatch = "Cannot reshape array of size #{size} to #{new_shape}"

  if new_shape.include?(-1)
    raise DimensionError, mismatch if new_shape.count(-1) > 1

    known = new_shape.reject { |d| d == -1 }.reduce(1, :*)
    # A zero or negative partial product, or a non-divisible count, cannot
    # be completed (and would otherwise divide by zero or truncate).
    raise DimensionError, mismatch unless known.positive? && (size % known).zero?

    new_shape[new_shape.index(-1)] = size / known
  end

  # Negative extents can multiply to the right count ([-2, -3] vs 6).
  raise DimensionError, mismatch if new_shape.any?(&:negative?)
  raise DimensionError, mismatch unless new_shape.reduce(1, :*) == size

  # #contiguous returns a fresh array whenever the layout is not dense, so
  # no extra #dup is needed on that path.
  result = contiguous? ? dup : contiguous
  result.instance_variable_set(:@shape, new_shape)
  result.instance_variable_set(:@strides, result.send(:compute_strides))
  result
end
188
+
189
# Permute the axes of the array.
#
# The result is obtained from #dup and then given the permuted shape and
# the correspondingly permuted strides of the receiver.
#
# @param axes [Array<Integer>, nil] Permutation of 0...ndim (all axes
#   reversed when nil)
# @return [NvArray] Transposed array
# @raise [DimensionError] If +axes+ is not a permutation of 0...ndim
def transpose(axes: nil)
  all_axes = (0...ndim).to_a
  order = axes || all_axes.reverse

  raise DimensionError, "Invalid axes for transpose" unless order.sort == all_axes

  permuted_shape = @shape.values_at(*order)
  permuted_strides = @strides.values_at(*order)

  dup.tap do |view|
    view.instance_variable_set(:@shape, permuted_shape)
    view.instance_variable_set(:@strides, permuted_strides)
  end
end
205
+
206
# Return an array whose memory is densely packed in row-major order.
#
# Returns self when the layout is already dense. Otherwise a new array of
# the same shape is allocated and every element is copied in logical
# (row-major) order, following this array's strides. Copying the buffer
# byte-for-byte, as a plain memcpy would, keeps the strided physical order
# and yields wrong data under the new array's row-major strides.
#
# For device arrays the repacking goes through two temporary host buffers;
# the receiver stays on the device.
# NOTE(review): assumes the copy helpers block until done when called with
# stream: nil — confirm against CUDA::Memory.
#
# @return [NvArray] self, or a densely packed copy
def contiguous
  return self if contiguous?

  result = NvArray.new(shape: @shape, dtype: @dtype, device: on_device? ? @device_index : nil)

  # Gather one element per logical position into consecutive slots.
  width = itemsize
  pack = lambda do |source, target|
    cursor = 0
    iterate_shape do |offset|
      target.put_bytes(cursor, source.get_bytes(offset, width))
      cursor += width
    end
  end

  if on_device?
    staging = FFI::MemoryPointer.new(:uint8, @size_bytes)
    @device_memory.copy_to_host(host_buffer: staging, stream: nil)
    packed = FFI::MemoryPointer.new(:uint8, @size_bytes)
    pack.call(staging, packed)
    result.device_memory.copy_from_host(packed, stream: nil)
  else
    pack.call(@host_memory, result.host_memory)
  end

  result
end
224
+
225
# Whether the strides describe a dense row-major layout.
#
# Walks the axes from last to first and compares each stride with the
# running product of the trailing extents times the item size. Axes of
# extent 1 are exempt, since their stride never contributes to an offset.
#
# @return [Boolean]
def contiguous?
  expected_stride = itemsize
  (ndim - 1).downto(0) do |axis|
    extent = @shape[axis]
    return false if @strides[axis] != expected_stride && extent != 1

    expected_stride *= extent
  end
  true
end
237
+
238
# Duplicate the array, including its buffer contents.
#
# The copy lives in the same place as the receiver (same device, or host)
# and keeps the receiver's strides and device index, so a transposed array
# stays a correct transposed array after duplication.
#
# The buffer allocated by NvArray.new is filled directly; allocating a
# second buffer and swapping it in would orphan the first one.
#
# @return [NvArray] Copy of the array
# @raise [InvalidOperationError] If the array has no memory (after #free!)
def dup
  unless on_device? || on_host?
    raise InvalidOperationError, "Cannot duplicate an array whose memory has been freed"
  end

  result = NvArray.new(shape: @shape.dup, dtype: @dtype, device: on_device? ? @device_index : nil)

  if on_device?
    result.device_memory.copy_from_device(@device_memory)
  else
    result.host_memory.put_bytes(0, @host_memory.get_bytes(0, @size_bytes))
  end

  # NvArray.new derives row-major strides and, for host arrays, the default
  # device index; carry over the receiver's actual values instead.
  result.instance_variable_set(:@strides, @strides.dup)
  result.instance_variable_set(:@device_index, @device_index)
  result
end
256
+
257
# Overwrite the whole buffer with zero bytes, wherever it currently lives.
#
# @param stream [CUDA::Stream, nil] Stream handed to the device-side clear
#   (unused for host arrays)
# @return [self]
def zero!(stream: nil)
  on_device? ? @device_memory.zero!(stream: stream) : @host_memory.clear
  self
end
268
+
269
# Release the device buffer (if any) and drop every buffer reference.
# Afterwards the array has no location: both #on_device? and #on_host?
# are false.
# @return [void]
def free!
  device_buffer = @device_memory
  device_buffer.free! unless device_buffer.nil?
  @device_memory = @host_memory = @location = nil
end
277
+
278
# Short description: shape, dtype and where the data lives. Anything that
# is not on a device (including a freed array) is reported as "host".
# @return [String]
def to_s
  place = if on_device?
            "device:#{@device_index}"
          else
            "host"
          end
  "NvArray(shape=#{@shape}, dtype=#{@dtype}, #{place})"
end
283
+
284
# Debug representation listing object id, shape, dtype, location, device
# index and byte size.
# @return [String]
def inspect
  identity = "#<Ignis::NvArray:#{object_id} shape=#{@shape} dtype=#{@dtype} "
  placement = "location=#{@location} device=#{@device_index} bytes=#{@size_bytes}>"
  identity + placement
end
289
+
290
+ class << self
291
# Allocate an array and clear its buffer to zero.
# @param shape [Array<Integer>, Integer] Shape
# @param dtype [Symbol] Data type
# @param device [Integer, nil] Device index (nil allocates on the host)
# @return [NvArray]
def zeros(shape, dtype: :float32, device: nil)
  new(shape: shape, dtype: dtype, device: device).tap(&:zero!)
end
301
+
302
# Create an array filled with ones.
#
# Complex dtypes are initialised from interleaved real/imag components, so
# each element contributes the pair (1.0, 0.0); a plain run of 1.0 values
# would fill only half the buffer, with 1+1i.
#
# NOTE(review): this singleton class defines `ones` a second time further
# down; in Ruby the later definition is the one in effect.
#
# @param shape [Array<Integer>, Integer] Shape
# @param dtype [Symbol] Data type
# @param device [Integer, nil] Device index
# @return [NvArray]
def ones(shape, dtype: :float32, device: nil)
  count = Array(shape).map(&:to_i).reduce(1, :*)
  unit = DType.complex?(DType.validate!(dtype)) ? [1.0, 0.0] : [1.0]
  new(shape: shape, dtype: dtype, device: device, data: unit * count)
end
312
+
313
# Create a 1-D array of +num+ evenly spaced samples from +start+ to +stop+
# (both inclusive).
#
# A single sample yields just +start+ (the step would otherwise be a
# division by zero and produce NaN). The last sample is pinned to +stop+ so
# rounding in the accumulated step cannot miss the endpoint. Complex dtypes
# receive interleaved (value, 0.0) pairs.
#
# NOTE(review): this singleton class defines `linspace` a second time
# further down; in Ruby the later definition is the one in effect.
#
# @param start [Numeric] Start value
# @param stop [Numeric] End value
# @param num [Integer] Number of samples (>= 0)
# @param dtype [Symbol] Data type
# @param device [Integer, nil] Device index
# @return [NvArray]
# @raise [ArgumentError] If +num+ is negative
def linspace(start, stop, num, dtype: :float32, device: nil)
  count = Integer(num)
  raise ArgumentError, "Number of samples must be non-negative, got #{num}" if count.negative?

  origin = start.to_f
  step = count > 1 ? (stop - start).to_f / (count - 1) : 0.0
  data = Array.new(count) { |i| origin + step * i }
  data[-1] = stop.to_f if count > 1
  data = data.flat_map { |value| [value, 0.0] } if DType.complex?(DType.validate!(dtype))

  new(shape: [count], dtype: dtype, device: device, data: data)
end
325
+
326
# Wrap existing device memory in an NvArray without copying it.
#
# Three cases:
# * a CUDA::Memory with +take_ownership+: the object is adopted as is;
# * a CUDA::Memory without ownership: a second CUDA::Memory is built over
#   the same device pointer with owned: false;
# * anything else is treated as a raw pointer and wrapped in a
#   CUDA::Memory whose ownership flag follows +take_ownership+.
#
# The array's device index is read back from the resulting memory object.
#
# @param ptr [FFI::Pointer, CUDA::Memory] Device pointer or Memory object
# @param shape [Array<Integer>] Shape
# @param dtype [Symbol] Data type
# @param take_ownership [Boolean] If true, the array will manage the memory lifecycle
# @return [NvArray]
def from_device_ptr(ptr, shape:, dtype: :float32, take_ownership: false)
  arr = allocate_empty_metadata(shape, dtype)

  memory =
    if !ptr.is_a?(CUDA::Memory)
      CUDA::Memory.new(arr.nbytes, ptr: ptr, owned: take_ownership)
    elsif take_ownership
      ptr
    else
      CUDA::Memory.new(arr.nbytes, device: ptr.device_index, ptr: ptr.device_ptr, owned: false)
    end

  arr.instance_variable_set(:@device_memory, memory)
  arr.instance_variable_set(:@location, :device)
  arr.instance_variable_set(:@device_index, arr.device_memory.device_index)
  arr
end
356
+
357
# Build an NvArray that carries only metadata (shape, dtype, strides, byte
# size) and no buffer. Uses Class#allocate so that #initialize, which
# would allocate memory, is not run.
# @param shape [Array<Integer>, Integer] Shape
# @param dtype [Symbol] Data type
# @return [NvArray]
def allocate_empty_metadata(shape, dtype)
  allocate.tap do |arr|
    extents = Array(shape).map { |extent| extent.to_i }
    arr.instance_variable_set(:@shape, extents)
    arr.instance_variable_set(:@dtype, DType.validate!(dtype))
    arr.instance_variable_set(:@strides, arr.send(:compute_strides))
    arr.instance_variable_set(:@size_bytes, arr.send(:compute_size_bytes))
  end
end
370
+
371
# Create an array filled with ones.
#
# Complex dtypes are initialised from interleaved real/imag components, so
# each element contributes the pair (1.0, 0.0); a plain run of 1.0 values
# would fill only half the buffer, with 1+1i.
#
# NOTE(review): an earlier definition of `ones` exists above in this
# singleton class; this later definition is the one in effect.
#
# @param shape [Array<Integer>, Integer] Shape
# @param dtype [Symbol] Data type
# @param device [Integer, nil] Device index
# @return [NvArray]
def ones(shape, dtype: :float32, device: nil)
  count = Array(shape).map(&:to_i).reduce(1, :*)
  unit = DType.complex?(DType.validate!(dtype)) ? [1.0, 0.0] : [1.0]
  new(shape: shape, dtype: dtype, device: device, data: unit * count)
end
381
+
382
# Create a 1-D array of +num+ evenly spaced samples from +start+ to +stop+
# (both inclusive).
#
# A single sample yields just +start+ (the step would otherwise be a
# division by zero and produce NaN). The last sample is pinned to +stop+ so
# rounding in the accumulated step cannot miss the endpoint. Complex dtypes
# receive interleaved (value, 0.0) pairs.
#
# NOTE(review): an earlier definition of `linspace` exists above in this
# singleton class; this later definition is the one in effect.
#
# @param start [Numeric] Start value
# @param stop [Numeric] End value
# @param num [Integer] Number of samples (>= 0)
# @param dtype [Symbol] Data type
# @param device [Integer, nil] Device index
# @return [NvArray]
# @raise [ArgumentError] If +num+ is negative
def linspace(start, stop, num, dtype: :float32, device: nil)
  count = Integer(num)
  raise ArgumentError, "Number of samples must be non-negative, got #{num}" if count.negative?

  origin = start.to_f
  step = count > 1 ? (stop - start).to_f / (count - 1) : 0.0
  data = Array.new(count) { |i| origin + step * i }
  data[-1] = stop.to_f if count > 1
  data = data.flat_map { |value| [value, 0.0] } if DType.complex?(DType.validate!(dtype))

  new(shape: [count], dtype: dtype, device: device, data: data)
end
394
+
395
# Build an NvArray from a (possibly nested) Ruby array. The shape is taken
# from the nesting structure and the values are passed on flattened.
# @param data [Array] Nested Ruby array
# @param dtype [Symbol] Data type
# @param device [Integer, nil] Device index
# @return [NvArray]
def from_array(data, dtype: :float32, device: nil)
  new(
    shape: infer_shape(data),
    dtype: dtype,
    device: device,
    data: flatten_nested(data)
  )
end
405
+
406
# Create a square identity matrix.
#
# Complex dtypes receive interleaved (value, 0.0) pairs, matching the
# layout the constructor expects for them. A negative size is rejected: its
# square is positive, so it would otherwise produce data for a matrix with
# negative extents.
#
# @param size [Integer] Number of rows and columns (>= 0)
# @param dtype [Symbol] Data type
# @param device [Integer, nil] Device index
# @return [NvArray]
# @raise [ArgumentError] If +size+ is negative
def eye(size, dtype: :float32, device: nil)
  order = Integer(size)
  raise ArgumentError, "Matrix size must be non-negative, got #{size}" if order.negative?

  data = Array.new(order * order) do |flat_index|
    row, col = flat_index.divmod(order)
    row == col ? 1.0 : 0.0
  end
  data = data.flat_map { |real| [real, 0.0] } if DType.complex?(DType.validate!(dtype))

  new(shape: [order, order], dtype: dtype, device: device, data: data)
end
415
+
416
+ private
417
+
418
# Infer the shape of a nested array and verify that it is rectangular.
#
# Walks the nesting level by level: every node on a level must be an array
# of the same length, and the innermost level must contain no arrays.
# Looking only at the first element of each level would accept ragged input
# such as [[1, 2], [3]] and report a shape with more elements than exist.
#
# @param data [Array] Nested array (a non-array yields the empty shape)
# @return [Array<Integer>]
# @raise [DimensionError] If the nesting is ragged or mixes arrays and scalars
def infer_shape(data)
  shape = []
  level = [data]

  while level.first.is_a?(Array)
    width = level.first.size
    unless level.all? { |node| node.is_a?(Array) && node.size == width }
      raise DimensionError, "Nested array is not rectangular at depth #{shape.size}"
    end

    shape << width
    level = level.flatten(1)
  end

  if level.any? { |node| node.is_a?(Array) }
    raise DimensionError, "Nested array is not rectangular at depth #{shape.size}"
  end

  shape
end
430
+
431
# Collapse a nested array into a single flat list of its leaf values.
# @param data [Array] Nested array
# @return [Array]
def flatten_nested(data)
  leaves = data.flatten
  leaves
end
437
+ end
438
+
439
+ private
440
+
441
# Coerce a shape argument into an array of integers. A bare integer becomes
# a one-element shape and nil becomes the empty shape; every entry is
# converted with #to_i.
# @param shape [Array<Integer>, Integer]
# @return [Array<Integer>]
def normalize_shape(shape)
  Array(shape).map { |extent| extent.to_i }
end
447
+
448
# Byte strides of a dense row-major layout for the current shape: the last
# axis steps by one item, each earlier axis by the byte size of everything
# after it.
# @return [Array<Integer>]
def compute_strides
  running = itemsize
  innermost_first = @shape.reverse.map do |extent|
    stride = running
    running *= extent
    stride
  end
  innermost_first.reverse
end
459
+
460
# Byte size of the whole buffer: element count times item size.
# @return [Integer]
def compute_size_bytes
  element_count = size
  element_count * itemsize
end
465
+
466
# Record the location and allocate a buffer of @size_bytes bytes there:
# device memory on @device_index for :device, a host buffer for anything
# else.
# @param location [:host, :device]
def allocate_memory(location)
  @location = location

  case location
  when :device
    @device_memory = CUDA::Memory.new(@size_bytes, device: @device_index)
  else
    @host_memory = FFI::MemoryPointer.new(:uint8, @size_bytes)
  end
end
477
+
478
# Allocate a host buffer, mark the array as host-resident and write the
# initial contents into it. Arrays are flattened first; any other value is
# handed to write_data unchanged.
# @param data [Array, String]
def initialize_with_data(data)
  @host_memory = FFI::MemoryPointer.new(:uint8, @size_bytes)
  @location = :host

  payload = data
  payload = data.flatten if data.is_a?(Array)

  write_data(payload)
end
488
+
489
# Write initial contents into the host buffer.
#
# A String is copied as raw bytes starting at offset 0. An Array is written
# element by element, through the complex writer for complex dtypes and the
# scalar writer otherwise. Any other type is rejected: skipping it silently
# would leave the buffer unwritten although data was supplied.
#
# @param data [Array, String]
# @raise [ArgumentError] If +data+ is neither an Array nor a String
def write_data(data)
  case data
  when String
    @host_memory.put_bytes(0, data)
  when Array
    DType.complex?(@dtype) ? write_complex_data(data) : write_scalar_data(data)
  else
    raise ArgumentError, "Unsupported initial data type #{data.class}; expected Array or String"
  end
end
503
+
504
# Write interleaved complex components ([real0, imag0, real1, imag1, ...])
# into the host buffer. Each value occupies one slot of the dtype's real
# component type; component types other than :float32 and :float64 write
# nothing.
# @param data [Array] Interleaved real/imag values
def write_complex_data(data)
  component_type = DType.real_dtype(@dtype)
  slot_bytes = DType.byte_size(component_type)
  writer = { float32: :put_float, float64: :put_double }[component_type]
  return data unless writer

  data.each_with_index do |component, position|
    @host_memory.public_send(writer, position * slot_bytes, component.to_f)
  end
end
520
+
521
# Write scalar (non-complex) values into the host buffer, one item slot
# per value.
#
# Half-precision types have no native FFI representation, so :float16 and
# :bfloat16 values are encoded into their 16-bit pattern by the Half helper
# and stored as uint16 instead of being truncated to an integer. All other
# dtypes are written through the put_* accessor matching their FFI type,
# floats via #to_f and integers via #to_i; an FFI type without a matching
# accessor writes nothing.
#
# @param data [Array] Values to write
def write_scalar_data(data)
  half_encoder = { float16: :f32_to_f16, bfloat16: :f32_to_bf16 }[@dtype]
  if half_encoder
    data.each_with_index do |value, position|
      @host_memory.put_uint16(position * 2, Half.public_send(half_encoder, value))
    end
    return
  end

  ffi_type = DType.ffi_type(@dtype)
  slot_bytes = DType.complex?(@dtype) ? itemsize / 2 : itemsize
  writer, coercion =
    case ffi_type
    when :float then %i[put_float to_f]
    when :double then %i[put_double to_f]
    when :int8, :int16, :int32, :int64, :uint8, :uint16, :uint32, :uint64
      [:"put_#{ffi_type}", :to_i]
    end
  return data unless writer

  data.each_with_index do |value, position|
    @host_memory.public_send(writer, position * slot_bytes, value.public_send(coercion))
  end
end
562
+
563
# Read every stored value from host memory in logical (row-major) order.
#
# Complex dtypes yield two values per element (real then imaginary). Dense
# layouts are read sequentially; other layouts are read offset by offset
# following the strides. Half-precision values are stored as raw uint16
# bit patterns and decoded through the Half helper.
#
# @return [Array]
def read_flat_data
  ensure_host_data!

  getter = "get_#{DType.ffi_type(@dtype)}"
  complex = DType.complex?(@dtype)
  value_count = complex ? size * 2 : size
  slot_bytes = complex ? itemsize / 2 : itemsize

  decoder = case @dtype
            when :float16 then Half.method(:f16_to_f32)
            when :bfloat16 then Half.method(:bf16_to_f32)
            end

  raw =
    if contiguous?
      Array.new(value_count) { |slot| @host_memory.send(getter, slot * slot_bytes) }
    else
      gathered = []
      iterate_shape do |offset|
        gathered << @host_memory.send(getter, offset)
        # The imaginary component sits directly behind the real one.
        gathered << @host_memory.send(getter, offset + slot_bytes) if complex
      end
      gathered
    end

  return raw unless decoder

  raw.map { |bits| decoder.call(bits) }
end
599
+
600
# Yield the byte offset of every element in logical (row-major) order,
# following the current strides.
#
# A zero-dimensional array has exactly one element, at the starting
# offset; without that base case the walk would index @shape[0] on an
# empty shape and call #times on nil.
#
# @param dim [Integer] Axis being expanded (internal recursion state)
# @param current_offset [Integer] Byte offset accumulated over the outer axes
# @yieldparam offset [Integer] Byte offset of one element
# @return [void]
def iterate_shape(dim = 0, current_offset = 0, &block)
  if dim >= ndim
    yield current_offset
    return
  end

  extent = @shape[dim]
  stride = @strides[dim]

  if dim == ndim - 1
    # Innermost axis: emit offsets directly instead of recursing per element.
    extent.times { |i| yield current_offset + i * stride }
  else
    extent.times { |i| iterate_shape(dim + 1, current_offset + i * stride, &block) }
  end
  nil
end
612
+
613
# Fold a flat list of values into nested arrays matching +shape+.
#
# An empty shape yields the first value; a one-dimensional shape yields the
# list unchanged. When the trailing dimensions multiply to zero there is
# nothing to slice (and each_slice rejects a slice size of 0), so the
# result is built as shape.first empty sub-structures instead.
#
# @param flat [Array] Flat array
# @param shape [Array<Integer>] Target shape
# @return [Array]
def reshape_to_nested(flat, shape)
  return flat.first if shape.empty?
  return flat if shape.size == 1

  inner_shape = shape[1..]
  chunk_size = inner_shape.reduce(1, :*)
  return Array.new(shape.first) { reshape_to_nested([], inner_shape) } if chunk_size.zero?

  flat.each_slice(chunk_size).map { |chunk| reshape_to_nested(chunk, inner_shape) }
end
626
+
627
# Make sure a host buffer exists, moving the data off the device first when
# that is where it currently lives.
# @raise [InvalidOperationError] If there is still no host buffer afterwards
def ensure_host_data!
  to_host if on_device?
  return if @host_memory

  raise InvalidOperationError, "No host data available"
end
632
+
633
# Make sure a device buffer exists, moving the data onto the device first
# when it currently lives on the host.
# @raise [InvalidOperationError] If there is still no device buffer afterwards
def ensure_device_data!
  to_device if on_host?
  return if @device_memory

  raise InvalidOperationError, "No device data available"
end
638
+
639
# Best-effort synchronisation before reading device-resident data: when the
# array is on the device, the current CUDA device is synchronised. Any
# StandardError raised on the way is deliberately swallowed so that a
# failed sync does not prevent the read.
# NOTE(review): this synchronises CUDA::Device.current, not necessarily the
# device at @device_index — confirm they coincide.
def synchronize_if_needed
  return unless on_device?

  CUDA::Device.current.synchronize
rescue StandardError
  # Ignore sync errors when reading
  nil
end
645
+ end
646
+ end