ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +15 -0
  3. data/lib/ignis.rb +94 -0
  4. data/lib/nnw/platform.rb +304 -0
  5. data/lib/nnw/shared/event_bus.rb +240 -0
  6. data/lib/nnw/shared/ffi_loader.rb +63 -0
  7. data/lib/nnw/shared/memory_contract.rb +204 -0
  8. data/lib/nnw/shared/nv_array.rb +710 -0
  9. data/lib/nnw/shared/recovery_protocol.rb +307 -0
  10. data/lib/nvruby/configuration.rb +217 -0
  11. data/lib/nvruby/cuda/device.rb +275 -0
  12. data/lib/nvruby/cuda/device_props.rb +202 -0
  13. data/lib/nvruby/cuda/graph.rb +265 -0
  14. data/lib/nvruby/cuda/graph_bindings.rb +119 -0
  15. data/lib/nvruby/cuda/library_loader.rb +285 -0
  16. data/lib/nvruby/cuda/memory.rb +410 -0
  17. data/lib/nvruby/cuda/runtime_api.rb +804 -0
  18. data/lib/nvruby/cuda/stream.rb +234 -0
  19. data/lib/nvruby/dtype.rb +139 -0
  20. data/lib/nvruby/epilogues.rb +438 -0
  21. data/lib/nvruby/errors.rb +303 -0
  22. data/lib/nvruby/half.rb +97 -0
  23. data/lib/nvruby/jit/compiled_kernel.rb +80 -0
  24. data/lib/nvruby/jit/compiler.rb +231 -0
  25. data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
  26. data/lib/nvruby/jit/kernel.rb +240 -0
  27. data/lib/nvruby/jit/kernel_module.rb +133 -0
  28. data/lib/nvruby/jit/kernels/activations.rb +179 -0
  29. data/lib/nvruby/jit/kernels/attention.rb +504 -0
  30. data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
  31. data/lib/nvruby/jit/kernels/loss.rb +213 -0
  32. data/lib/nvruby/jit/kernels/normalization.rb +200 -0
  33. data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
  34. data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
  35. data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
  36. data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
  37. data/lib/nvruby/linalg/epilog.rb +67 -0
  38. data/lib/nvruby/linalg/matmul.rb +247 -0
  39. data/lib/nvruby/linalg/matmul_plan.rb +229 -0
  40. data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
  41. data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
  42. data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
  43. data/lib/nvruby/memory/device_memory_resource.rb +106 -0
  44. data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
  45. data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
  46. data/lib/nvruby/memory/stats.rb +107 -0
  47. data/lib/nvruby/memory.rb +124 -0
  48. data/lib/nvruby/version.rb +5 -0
  49. metadata +108 -0
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module Memory
5
# Abstract base class for device memory resources.
# Modeled after RMM's device_memory_resource interface.
#
# The public #allocate / #deallocate wrappers validate their arguments,
# round the requested size up to ALIGNMENT, serialize calls through a
# per-instance mutex and report the aligned size to Stats. Subclasses only
# implement the protected hooks #do_allocate and #do_deallocate.
#
# @abstract
class DeviceMemoryResource
  # Minimum alignment for all allocations (256 bytes per RMM standard)
  ALIGNMENT = 256

  # @return [Integer] Device index this resource manages
  attr_reader :device_index

  # @param device_index [Integer, nil] GPU device index; when nil the value of
  #   Ignis.configuration.default_device is used
  # @return [void]
  def initialize(device_index: nil)
    @device_index = device_index || Ignis.configuration.default_device
    @mutex = Mutex.new
  end

  # Allocate device memory with optional stream ordering.
  #
  # @param bytes [Integer] Number of bytes to allocate (rounded up to ALIGNMENT)
  # @param stream [Ignis::CUDA::Stream, nil] Optional stream, forwarded to #do_allocate
  # @return [FFI::Pointer] Whatever the subclass' #do_allocate returns
  # @raise [ArgumentError] If bytes is not a positive Integer
  def allocate(bytes, stream: nil)
    # A non-Integer size (nil, Float, String) used to either blow up with
    # NoMethodError or yield a non-aligned Float size; reject it up front.
    unless bytes.is_a?(Integer) && bytes.positive?
      raise ArgumentError, "bytes must be positive, got #{bytes.inspect}"
    end

    aligned_bytes = align_up(bytes)

    @mutex.synchronize do
      ptr = do_allocate(aligned_bytes, stream)
      # Only recorded when the hook did not raise.
      Stats.record_allocation(aligned_bytes)
      ptr
    end
  end

  # Deallocate device memory with optional stream ordering.
  # A nil or null pointer is a no-op.
  #
  # @param ptr [FFI::Pointer, nil] Device pointer to free
  # @param bytes [Integer] Size of allocation (rounded up to ALIGNMENT)
  # @param stream [Ignis::CUDA::Stream, nil] Optional stream, forwarded to #do_deallocate
  # @return [void]
  def deallocate(ptr, bytes, stream: nil)
    return if ptr.nil? || ptr.null?

    aligned_bytes = align_up(bytes)

    @mutex.synchronize do
      do_deallocate(ptr, aligned_bytes, stream)
      Stats.record_deallocation(aligned_bytes)
    end
  end

  # Check if this resource supports stream-ordered allocation.
  # The base implementation always answers false.
  # @return [Boolean]
  def supports_streams?
    false
  end

  # Check if memory from this resource can be deallocated by another.
  # Two resources compare equal when they have the same class and device index.
  # @param other [DeviceMemoryResource]
  # @return [Boolean]
  def is_equal?(other)
    self.class == other.class && @device_index == other.device_index
  end

  # @return [String]
  def to_s
    "#{self.class.name}[device=#{@device_index}]"
  end

  # @return [String]
  def inspect
    "#<#{self.class.name}:0x#{object_id.to_s(16)} device=#{@device_index}>"
  end

  protected

  # @abstract Subclasses must implement this
  # @param bytes [Integer] Aligned bytes to allocate
  # @param stream [Ignis::CUDA::Stream, nil]
  # @return [FFI::Pointer]
  # @raise [NotImplementedError] Always, in the base class. Note that
  #   NotImplementedError is a ScriptError, so a bare `rescue` does not catch it.
  def do_allocate(bytes, stream)
    raise NotImplementedError, "#{self.class}#do_allocate not implemented"
  end

  # @abstract Subclasses must implement this
  # @param ptr [FFI::Pointer]
  # @param bytes [Integer]
  # @param stream [Ignis::CUDA::Stream, nil]
  # @return [void]
  # @raise [NotImplementedError] Always, in the base class
  def do_deallocate(ptr, bytes, stream)
    raise NotImplementedError, "#{self.class}#do_deallocate not implemented"
  end

  private

  # Align size up to the next multiple of ALIGNMENT (integer arithmetic).
  # @param bytes [Integer]
  # @return [Integer]
  def align_up(bytes)
    ((bytes + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT
  end
end
105
+ end
106
+ end
@@ -0,0 +1,112 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module Memory
5
# Abstract base class for host memory resources.
#
# The public #allocate / #deallocate wrappers validate their arguments,
# round the requested size up to ALIGNMENT, serialize calls through a
# per-instance mutex and report the aligned size to Stats. Subclasses only
# implement the protected hooks #do_allocate and #do_deallocate.
#
# @abstract
class HostMemoryResource
  # Minimum alignment for all allocations
  ALIGNMENT = 64

  # @return [void]
  def initialize
    @mutex = Mutex.new
  end

  # Allocate host memory.
  #
  # @param bytes [Integer] Number of bytes to allocate (rounded up to ALIGNMENT)
  # @return [FFI::Pointer] Whatever the subclass' #do_allocate returns
  # @raise [ArgumentError] If bytes is not a positive Integer
  def allocate(bytes)
    # A non-Integer size (nil, Float, String) used to either blow up with
    # NoMethodError or yield a non-aligned Float size; reject it up front.
    unless bytes.is_a?(Integer) && bytes.positive?
      raise ArgumentError, "bytes must be positive, got #{bytes.inspect}"
    end

    aligned_bytes = align_up(bytes)

    @mutex.synchronize do
      ptr = do_allocate(aligned_bytes)
      # Only recorded when the hook did not raise.
      Stats.record_allocation(aligned_bytes)
      ptr
    end
  end

  # Deallocate host memory. A nil or null pointer is a no-op.
  #
  # @param ptr [FFI::Pointer, nil] Host pointer to free
  # @param bytes [Integer] Size of allocation (rounded up to ALIGNMENT)
  # @return [void]
  def deallocate(ptr, bytes)
    return if ptr.nil? || ptr.null?

    aligned_bytes = align_up(bytes)

    @mutex.synchronize do
      do_deallocate(ptr, aligned_bytes)
      Stats.record_deallocation(aligned_bytes)
    end
  end

  protected

  # @abstract Subclasses must implement this
  # @param bytes [Integer] Aligned bytes to allocate
  # @return [FFI::Pointer]
  # @raise [NotImplementedError] Always, in the base class. Note that
  #   NotImplementedError is a ScriptError, so a bare `rescue` does not catch it.
  def do_allocate(bytes)
    raise NotImplementedError, "#{self.class}#do_allocate not implemented"
  end

  # @abstract Subclasses must implement this
  # @param ptr [FFI::Pointer]
  # @param bytes [Integer]
  # @return [void]
  # @raise [NotImplementedError] Always, in the base class
  def do_deallocate(ptr, bytes)
    raise NotImplementedError, "#{self.class}#do_deallocate not implemented"
  end

  private

  # Align size up to the next multiple of ALIGNMENT (integer arithmetic).
  # @param bytes [Integer]
  # @return [Integer]
  def align_up(bytes)
    ((bytes + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT
  end
end
64
+
65
# Pinned (page-locked) host memory resource.
#
# Allocations go through CUDA::RuntimeAPI.cudaMallocHost when the configured
# flags equal HOST_ALLOC_DEFAULT and through CUDA::RuntimeAPI.cudaHostAlloc
# otherwise; memory is released with CUDA::RuntimeAPI.cudaFreeHost.
class PinnedHostMemoryResource < HostMemoryResource
  # cudaHostAllocDefault flag
  HOST_ALLOC_DEFAULT = 0x00
  # cudaHostAllocPortable flag
  HOST_ALLOC_PORTABLE = 0x01
  # cudaHostAllocMapped flag
  HOST_ALLOC_MAPPED = 0x02
  # cudaHostAllocWriteCombined flag
  HOST_ALLOC_WRITE_COMBINED = 0x04

  # @param flags [Integer] cudaHostAlloc flags (default: HOST_ALLOC_DEFAULT)
  def initialize(flags: HOST_ALLOC_DEFAULT)
    super()
    @flags = flags
    CUDA::RuntimeAPI.ensure_loaded!
  end

  protected

  # Allocate +bytes+ of pinned host memory.
  # @param bytes [Integer] Aligned size handed over by HostMemoryResource#allocate
  # @return [FFI::Pointer] Pointer read back from the out-parameter
  def do_allocate(bytes)
    out_param = FFI::MemoryPointer.new(:pointer)

    # Pick the entry point first, then run the status check once with the
    # matching context string.
    status, context =
      if @flags == HOST_ALLOC_DEFAULT
        [CUDA::RuntimeAPI.cudaMallocHost(out_param, bytes),
         "cudaMallocHost(#{bytes} bytes)"]
      else
        [cuda_host_alloc(out_param, bytes, @flags),
         "cudaHostAlloc(#{bytes} bytes, flags=#{@flags})"]
      end
    CUDA::RuntimeAPI.check_status!(status, context)

    out_param.read_pointer
  end

  # Release pinned host memory; the size is not needed by cudaFreeHost.
  # @param ptr [FFI::Pointer]
  # @param _bytes [Integer] Unused
  # @return [void]
  def do_deallocate(ptr, _bytes)
    CUDA::RuntimeAPI.check_status!(CUDA::RuntimeAPI.cudaFreeHost(ptr), "cudaFreeHost")
  end

  private

  # Thin wrapper around CUDA::RuntimeAPI.cudaHostAlloc.
  # @return [Object] Status value returned by the runtime call
  def cuda_host_alloc(ptr_ptr, size, flags)
    CUDA::RuntimeAPI.cudaHostAlloc(ptr_ptr, size, flags)
  end
end
111
+ end
112
+ end
@@ -0,0 +1,242 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'device_memory_resource'
4
+
5
+ module Ignis
6
+ module Memory
7
# Coalescing best-fit pool memory resource.
# Pre-allocates a large pool from an upstream resource and sub-allocates
# from it. Inspired by RMM's pool_memory_resource.
#
# Bookkeeping:
# * @chunks    - every allocation obtained from the upstream resource, keyed
#                by its base address. These are the only pointers ever handed
#                back to the upstream resource.
# * @blocks    - all sub-blocks (used and free) carved out of the chunks.
# * @free_list - the subset of @blocks that is currently free.
class PoolMemoryResource < DeviceMemoryResource
  # Default initial pool size: 256MB
  DEFAULT_INITIAL_SIZE = 256 * 1024 * 1024

  # Default maximum pool size: 90% of device memory
  DEFAULT_MAX_RATIO = 0.9

  # Block header for tracking allocations in the pool
  Block = Struct.new(:ptr, :size, :free, keyword_init: true)

  # @return [Integer] Current pool size
  attr_reader :current_pool_size

  # @return [Integer] Maximum pool size
  attr_reader :maximum_pool_size

  # @param upstream [DeviceMemoryResource, nil] Upstream resource for pool
  #   allocations (nil = a new CudaMemoryResource for the same device)
  # @param initial_pool_size [Integer] Initial pool size in bytes
  # @param maximum_pool_size [Integer, nil] Maximum pool size
  #   (nil = DEFAULT_MAX_RATIO of the total device memory)
  # @param device_index [Integer, nil] GPU device index
  def initialize(
    upstream: nil,
    initial_pool_size: DEFAULT_INITIAL_SIZE,
    maximum_pool_size: nil,
    device_index: nil
  )
    super(device_index: device_index)

    @upstream = upstream || CudaMemoryResource.new(device_index: @device_index)
    @initial_pool_size = align_up(initial_pool_size)
    @maximum_pool_size = align_up(maximum_pool_size || calculate_max_pool_size)

    @current_pool_size = 0
    @chunks = {}
    @blocks = []
    @free_list = []

    allocate_initial_pool!
  end

  # @return [Boolean] false - synchronous pool allocation
  def supports_streams?
    false
  end

  # Release all pool memory back to upstream.
  #
  # Only the base pointers originally returned by the upstream resource are
  # released. Sub-blocks created by splitting point into the middle of an
  # upstream allocation and must never be passed to the upstream #deallocate.
  # @return [void]
  def destroy!
    @mutex.synchronize do
      @chunks.each_value do |chunk_ptr, chunk_size|
        @upstream.deallocate(chunk_ptr, chunk_size, stream: nil)
      end
      @chunks.clear
      @blocks.clear
      @free_list.clear
      @current_pool_size = 0
    end
  end

  # @return [Hash] Pool statistics
  def pool_stats
    @mutex.synchronize do
      total_free = @free_list.sum(&:size)
      {
        current_pool_size: @current_pool_size,
        maximum_pool_size: @maximum_pool_size,
        free_bytes: total_free,
        used_bytes: @current_pool_size - total_free,
        block_count: @blocks.size,
        free_block_count: @free_list.size
      }
    end
  end

  protected

  # Hand out the smallest free block that fits, growing the pool once if
  # nothing fits.
  # @param bytes [Integer]
  # @param _stream [Ignis::CUDA::Stream, nil] Ignored for pool allocation
  # @return [FFI::Pointer]
  # @raise [MemoryError] If the pool cannot satisfy the request
  def do_allocate(bytes, _stream)
    block = find_free_block(bytes)

    unless block
      expand_pool!(bytes)
      block = find_free_block(bytes)
      raise MemoryError, "Pool allocation failed for #{bytes} bytes" unless block
    end

    split_block_if_needed!(block, bytes)
    block.ptr
  end

  # Return a block to the free list and merge it with free neighbours.
  # Pointers the pool does not know, and blocks that are already free
  # (double free), are ignored so the free list never holds duplicates.
  # @param ptr [FFI::Pointer]
  # @param _bytes [Integer] Unused; the pool tracks block sizes itself
  # @param _stream [Ignis::CUDA::Stream, nil] Ignored for pool deallocation
  # @return [void]
  def do_deallocate(ptr, _bytes, _stream)
    block = @blocks.find { |b| b.ptr.address == ptr.address }
    return if block.nil? || block.free

    block.free = true
    @free_list << block
    coalesce_free_blocks!
  end

  private

  # Obtain the initial chunk from upstream (skipped for a non-positive size).
  def allocate_initial_pool!
    return if @initial_pool_size <= 0

    add_chunk!(@initial_pool_size)

    Ignis.logger.debug { "Pool initialized with #{format_bytes(@initial_pool_size)}" }
  end

  # Allocate +size+ bytes from upstream and register them as one free block.
  def add_chunk!(size)
    ptr = @upstream.allocate(size, stream: nil)
    @chunks[ptr.address] = [ptr, size]

    block = Block.new(ptr: ptr, size: size, free: true)
    @blocks << block
    @free_list << block
    @current_pool_size += size
  end

  # Best fit: the smallest free block whose size is at least +bytes+.
  def find_free_block(bytes)
    @free_list
      .select { |b| b.free && b.size >= bytes }
      .min_by(&:size)
  end

  # Mark +block+ as used and, when the tail is at least 2 * ALIGNMENT bytes,
  # split the tail off into a new free block.
  def split_block_if_needed!(block, bytes)
    block.free = false
    # Remove by identity; Struct#== compares by value.
    @free_list.delete_if { |b| b.equal?(block) }

    remainder = block.size - bytes
    return if remainder < ALIGNMENT * 2

    new_ptr = FFI::Pointer.new(block.ptr.address + bytes)
    new_block = Block.new(ptr: new_ptr, size: remainder, free: true)
    block.size = bytes

    @blocks << new_block
    @free_list << new_block
  end

  # Grow the pool by one upstream chunk large enough for +min_bytes+.
  # @raise [MemoryError] If the remaining headroom below the maximum pool
  #   size cannot hold +min_bytes+. Nothing is allocated upstream in that
  #   case, because a smaller chunk could not satisfy the request anyway.
  def expand_pool!(min_bytes)
    remaining = @maximum_pool_size - @current_pool_size

    if remaining <= 0 || min_bytes > remaining
      raise MemoryError, "Pool exhausted: current=#{format_bytes(@current_pool_size)}, " \
                         "max=#{format_bytes(@maximum_pool_size)}, requested=#{format_bytes(min_bytes)}"
    end

    # Grow by half of the remaining headroom, but at least by the request
    # and never beyond the configured maximum.
    wanted = [remaining / 2, min_bytes, ALIGNMENT].max
    grow_size = align_up([wanted, remaining].min)

    Ignis.logger.debug { "Expanding pool by #{format_bytes(grow_size)}" }

    add_chunk!(grow_size)
  end

  # Merge address-adjacent free blocks. Blocks that start a new upstream
  # chunk are never merged into their predecessor, so a block never spans
  # two separate upstream allocations.
  def coalesce_free_blocks!
    return if @free_list.size < 2

    sorted = @free_list.sort_by { |b| b.ptr.address }
    merged = [sorted.first]

    sorted.drop(1).each do |block|
      tail = merged.last
      contiguous = tail.ptr.address + tail.size == block.ptr.address

      if contiguous && !@chunks.key?(block.ptr.address)
        tail.size += block.size
        @blocks.delete_if { |b| b.equal?(block) }
      else
        merged << block
      end
    end

    @free_list.replace(merged)
  end

  # Default maximum: DEFAULT_MAX_RATIO of the total memory reported by
  # cudaMemGetInfo for this resource's device.
  def calculate_max_pool_size
    CUDA::RuntimeAPI.ensure_loaded!
    free_ptr = FFI::MemoryPointer.new(:size_t)
    total_ptr = FFI::MemoryPointer.new(:size_t)

    ensure_device do
      status = CUDA::RuntimeAPI.cudaMemGetInfo(free_ptr, total_ptr)
      CUDA::RuntimeAPI.check_status!(status, "cudaMemGetInfo")
    end

    total = total_ptr.read(:size_t)
    (total * DEFAULT_MAX_RATIO).to_i
  end

  # Run the block with @device_index as the current CUDA device and switch
  # back to the previously current device afterwards, even if the block raises.
  def ensure_device
    current_ptr = FFI::MemoryPointer.new(:int)
    status = CUDA::RuntimeAPI.cudaGetDevice(current_ptr)
    CUDA::RuntimeAPI.check_status!(status, "cudaGetDevice")
    original = current_ptr.read_int

    return yield if original == @device_index

    status = CUDA::RuntimeAPI.cudaSetDevice(@device_index)
    CUDA::RuntimeAPI.check_status!(status, "cudaSetDevice(#{@device_index})")

    begin
      yield
    ensure
      # Best effort: the status of the restoring call is not checked.
      CUDA::RuntimeAPI.cudaSetDevice(original)
    end
  end

  # Human-readable size using 1024-based units.
  def format_bytes(bytes)
    if bytes >= 1024 * 1024 * 1024
      format("%.2f GB", bytes.to_f / (1024 * 1024 * 1024))
    elsif bytes >= 1024 * 1024
      format("%.2f MB", bytes.to_f / (1024 * 1024))
    elsif bytes >= 1024
      format("%.2f KB", bytes.to_f / 1024)
    else
      "#{bytes} B"
    end
  end
end
241
+ end
242
+ end
@@ -0,0 +1,107 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module Memory
5
# Thread-safe allocation statistics tracking.
# Provides real-time visibility into memory usage patterns.
#
# All state lives in class-level instance variables; every update and the
# snapshot run under a single mutex.
class Stats
  # Created eagerly at load time. Lazily creating the lock with `||=` is
  # itself a check-then-set race: two threads hitting it first could each
  # end up synchronizing on a different Mutex.
  LOCK = Mutex.new
  private_constant :LOCK

  class << self
    # @return [Integer] Total bytes currently allocated
    def total_allocated_bytes
      @total_allocated_bytes ||= 0
    end

    # @return [Integer] Peak bytes allocated (high-water mark)
    def peak_allocated_bytes
      @peak_allocated_bytes ||= 0
    end

    # @return [Integer] Total allocation count since startup (or last reset!)
    def allocation_count
      @allocation_count ||= 0
    end

    # @return [Integer] Total deallocation count since startup (or last reset!)
    def deallocation_count
      @deallocation_count ||= 0
    end

    # Record an allocation and update the high-water mark.
    # @param bytes [Integer]
    # @return [void]
    def record_allocation(bytes)
      mutex.synchronize do
        @total_allocated_bytes = total_allocated_bytes + bytes
        @peak_allocated_bytes = [peak_allocated_bytes, @total_allocated_bytes].max
        @allocation_count = allocation_count + 1
      end
    end

    # Record a deallocation. The running total is clamped at zero.
    # @param bytes [Integer]
    # @return [void]
    def record_deallocation(bytes)
      mutex.synchronize do
        @total_allocated_bytes = [total_allocated_bytes - bytes, 0].max
        @deallocation_count = deallocation_count + 1
      end
    end

    # Reset all statistics
    # @return [void]
    def reset!
      mutex.synchronize do
        @total_allocated_bytes = 0
        @peak_allocated_bytes = 0
        @allocation_count = 0
        @deallocation_count = 0
      end
    end

    # @return [Hash] Statistics snapshot taken under the lock
    def snapshot
      mutex.synchronize do
        {
          total_allocated_bytes: total_allocated_bytes,
          peak_allocated_bytes: peak_allocated_bytes,
          allocation_count: allocation_count,
          deallocation_count: deallocation_count,
          active_allocations: allocation_count - deallocation_count
        }
      end
    end

    # @return [String] Human-readable statistics
    def to_s
      s = snapshot
      format(
        "Memory Stats: %s allocated (peak: %s), %d allocs, %d frees, %d active",
        format_bytes(s[:total_allocated_bytes]),
        format_bytes(s[:peak_allocated_bytes]),
        s[:allocation_count],
        s[:deallocation_count],
        s[:active_allocations]
      )
    end

    private

    # @return [Mutex] The single lock guarding all counters
    def mutex
      LOCK
    end

    # Human-readable size using 1024-based units.
    def format_bytes(bytes)
      if bytes >= 1024 * 1024 * 1024
        format("%.2f GB", bytes.to_f / (1024 * 1024 * 1024))
      elsif bytes >= 1024 * 1024
        format("%.2f MB", bytes.to_f / (1024 * 1024))
      elsif bytes >= 1024
        format("%.2f KB", bytes.to_f / 1024)
      else
        "#{bytes} B"
      end
    end
  end
end
106
+ end
107
+ end
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'memory/stats'
4
+ require_relative 'memory/device_memory_resource'
5
+ require_relative 'memory/cuda_memory_resource'
6
+ require_relative 'memory/cuda_async_memory_resource'
7
+ require_relative 'memory/pool_memory_resource'
8
+ require_relative 'memory/pinned_host_memory_resource'
9
+
10
+ module Ignis
11
# Memory management module providing RMM-inspired GPU memory pools
#
# This module provides efficient memory allocation for GPU workloads through:
# - DeviceMemoryResource: Base class for all device memory allocators
# - CudaMemoryResource: Simple cudaMalloc/cudaFree (baseline)
# - CudaAsyncMemoryResource: Stream-ordered cudaMallocAsync/cudaFreeAsync
# - PoolMemoryResource: Coalescing best-fit pool for high-frequency allocations
# - PinnedHostMemoryResource: Page-locked host memory for fast transfers
#
# Resources are kept in a per-module Hash keyed by device index.
#
# @example Using a memory pool
#   # Set global pool for all allocations
#   pool = Ignis::Memory::PoolMemoryResource.new(initial_pool_size: 512 * 1024 * 1024)
#   Ignis::Memory.set_current_device_resource(pool)
#
#   # All NvArray allocations now use the pool
#   arr = Ignis::NvArray.new(shape: [1024, 1024], dtype: :float32)
#
# @example Stream-ordered allocation
#   async_mr = Ignis::Memory::CudaAsyncMemoryResource.new
#   ptr = async_mr.allocate(1024 * 1024, stream: my_stream)
#   # ...use ptr...
#   async_mr.deallocate(ptr, 1024 * 1024, stream: my_stream)
#
module Memory
  class << self
    # Get the memory resource registered for the current CUDA device.
    # When none is registered yet, a default one is created and stored:
    # a PoolMemoryResource if Ignis.configuration.use_memory_pool is truthy,
    # otherwise a CudaMemoryResource.
    # @return [DeviceMemoryResource]
    def get_current_device_resource
      device_id = current_device_index
      per_device_resources[device_id] ||= build_default_resource(device_id)
    end

    # Set the memory resource for the current CUDA device
    # @param resource [DeviceMemoryResource]
    # @return [DeviceMemoryResource, nil] The previous resource, if any
    def set_current_device_resource(resource)
      set_per_device_resource(current_device_index, resource)
    end

    # Get memory resource for a specific device (never creates one)
    # @param device_id [Integer]
    # @return [DeviceMemoryResource, nil]
    def get_per_device_resource(device_id)
      per_device_resources[device_id]
    end

    # Set memory resource for a specific device
    # @param device_id [Integer]
    # @param resource [DeviceMemoryResource]
    # @return [DeviceMemoryResource, nil] The previous resource, if any
    def set_per_device_resource(device_id, resource)
      previous = per_device_resources[device_id]
      per_device_resources[device_id] = resource
      previous
    end

    # Get memory statistics
    # @return [Hash]
    def stats
      Stats.snapshot
    end

    # Reset memory statistics
    # @return [void]
    def reset_stats!
      Stats.reset!
    end

    # Allocate device memory using current resource
    # @param bytes [Integer]
    # @param stream [Ignis::CUDA::Stream, nil]
    # @return [FFI::Pointer]
    def allocate(bytes, stream: nil)
      get_current_device_resource.allocate(bytes, stream: stream)
    end

    # Deallocate device memory using current resource
    # @param ptr [FFI::Pointer]
    # @param bytes [Integer]
    # @param stream [Ignis::CUDA::Stream, nil]
    # @return [void]
    def deallocate(ptr, bytes, stream: nil)
      get_current_device_resource.deallocate(ptr, bytes, stream: stream)
    end

    private

    # @return [Hash{Integer => DeviceMemoryResource}] Lazily created registry
    def per_device_resources
      @per_device_resources ||= {}
    end

    # Build the default resource for +device_id+ according to configuration.
    # @param device_id [Integer]
    # @return [DeviceMemoryResource]
    def build_default_resource(device_id)
      if Ignis.configuration.use_memory_pool
        PoolMemoryResource.new(device_index: device_id)
      else
        CudaMemoryResource.new(device_index: device_id)
      end
    end

    # Ask the CUDA runtime for the current device index.
    # The status of cudaGetDevice is checked so that a failed call is
    # reported instead of silently reading an unset output value.
    # @return [Integer]
    def current_device_index
      CUDA::RuntimeAPI.ensure_loaded!
      ptr = FFI::MemoryPointer.new(:int)
      status = CUDA::RuntimeAPI.cudaGetDevice(ptr)
      CUDA::RuntimeAPI.check_status!(status, "cudaGetDevice")
      ptr.read_int
    end
  end
end
124
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Ignis
  # Version string exposed by the library as Ignis::VERSION.
  VERSION = '0.1.0'
end