ignis 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +15 -0
- data/lib/ignis.rb +94 -0
- data/lib/nnw/platform.rb +304 -0
- data/lib/nnw/shared/event_bus.rb +240 -0
- data/lib/nnw/shared/ffi_loader.rb +63 -0
- data/lib/nnw/shared/memory_contract.rb +204 -0
- data/lib/nnw/shared/nv_array.rb +710 -0
- data/lib/nnw/shared/recovery_protocol.rb +307 -0
- data/lib/nvruby/configuration.rb +217 -0
- data/lib/nvruby/cuda/device.rb +275 -0
- data/lib/nvruby/cuda/device_props.rb +202 -0
- data/lib/nvruby/cuda/graph.rb +265 -0
- data/lib/nvruby/cuda/graph_bindings.rb +119 -0
- data/lib/nvruby/cuda/library_loader.rb +285 -0
- data/lib/nvruby/cuda/memory.rb +410 -0
- data/lib/nvruby/cuda/runtime_api.rb +804 -0
- data/lib/nvruby/cuda/stream.rb +234 -0
- data/lib/nvruby/dtype.rb +139 -0
- data/lib/nvruby/epilogues.rb +438 -0
- data/lib/nvruby/errors.rb +303 -0
- data/lib/nvruby/half.rb +97 -0
- data/lib/nvruby/jit/compiled_kernel.rb +80 -0
- data/lib/nvruby/jit/compiler.rb +231 -0
- data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
- data/lib/nvruby/jit/kernel.rb +240 -0
- data/lib/nvruby/jit/kernel_module.rb +133 -0
- data/lib/nvruby/jit/kernels/activations.rb +179 -0
- data/lib/nvruby/jit/kernels/attention.rb +504 -0
- data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
- data/lib/nvruby/jit/kernels/loss.rb +213 -0
- data/lib/nvruby/jit/kernels/normalization.rb +200 -0
- data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
- data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
- data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
- data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
- data/lib/nvruby/linalg/epilog.rb +67 -0
- data/lib/nvruby/linalg/matmul.rb +247 -0
- data/lib/nvruby/linalg/matmul_plan.rb +229 -0
- data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
- data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
- data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
- data/lib/nvruby/memory/device_memory_resource.rb +106 -0
- data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
- data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
- data/lib/nvruby/memory/stats.rb +107 -0
- data/lib/nvruby/memory.rb +124 -0
- data/lib/nvruby/version.rb +5 -0
- metadata +108 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
  module Memory
    # @abstract Base class for device memory resources.
    #
    # Modeled after RMM's device_memory_resource interface. The public
    # {#allocate} and {#deallocate} entry points validate their arguments,
    # round the size up to {ALIGNMENT}, run the subclass hook while holding a
    # per-resource mutex and record the operation in {Stats}. Subclasses only
    # implement {#do_allocate} and {#do_deallocate}.
    class DeviceMemoryResource
      # Minimum alignment for all allocations (256 bytes per RMM standard)
      ALIGNMENT = 256

      # @return [Integer] Device index this resource manages
      attr_reader :device_index

      # @param device_index [Integer, nil] GPU device index; when nil the value
      #   of Ignis.configuration.default_device is used
      # @return [void]
      def initialize(device_index: nil)
        @device_index = device_index || Ignis.configuration.default_device
        @mutex = Mutex.new
      end

      # Allocate device memory with optional stream ordering.
      #
      # @param bytes [Integer] Number of bytes to allocate (rounded up to a
      #   multiple of ALIGNMENT before being handed to the subclass)
      # @param stream [Ignis::CUDA::Stream, nil] Optional stream, passed through
      #   unchanged to {#do_allocate}
      # @return [FFI::Pointer] Whatever {#do_allocate} returns
      # @raise [ArgumentError] If bytes is not a positive Integer
      # @raise [NotImplementedError] If the subclass does not implement {#do_allocate}
      def allocate(bytes, stream: nil)
        validate_size!(bytes)
        aligned_bytes = align_up(bytes)

        @mutex.synchronize do
          ptr = do_allocate(aligned_bytes, stream)
          # Only reached when the subclass hook did not raise.
          Stats.record_allocation(aligned_bytes)
          ptr
        end
      end

      # Deallocate device memory with optional stream ordering.
      #
      # A nil or null pointer is ignored, so callers may release an
      # allocation slot that was never filled without checking it first.
      #
      # @param ptr [FFI::Pointer, nil] Device pointer to free
      # @param bytes [Integer] Size of allocation (aligned the same way as in {#allocate})
      # @param stream [Ignis::CUDA::Stream, nil] Optional stream, passed through
      #   unchanged to {#do_deallocate}
      # @return [void]
      def deallocate(ptr, bytes, stream: nil)
        return if ptr.nil? || ptr.null?

        aligned_bytes = align_up(bytes)

        @mutex.synchronize do
          do_deallocate(ptr, aligned_bytes, stream)
          Stats.record_deallocation(aligned_bytes)
        end
      end

      # Check if this resource supports stream-ordered allocation.
      # The base implementation always answers false.
      # @return [Boolean]
      def supports_streams?
        false
      end

      # Check if memory from this resource can be deallocated by another.
      # Two resources compare equal when they have the same class and manage
      # the same device index.
      # @param other [DeviceMemoryResource]
      # @return [Boolean]
      def is_equal?(other)
        self.class == other.class && @device_index == other.device_index
      end

      # @return [String] Class name and device index, e.g. "Foo[device=0]"
      def to_s
        "#{self.class.name}[device=#{@device_index}]"
      end

      # @return [String] Class name, hexadecimal object id and device index
      def inspect
        "#<#{self.class.name}:0x#{object_id.to_s(16)} device=#{@device_index}>"
      end

      protected

      # @abstract Subclasses must implement this
      # @param bytes [Integer] Aligned bytes to allocate
      # @param stream [Ignis::CUDA::Stream, nil]
      # @return [FFI::Pointer]
      def do_allocate(bytes, stream)
        raise NotImplementedError, "#{self.class}#do_allocate not implemented"
      end

      # @abstract Subclasses must implement this
      # @param ptr [FFI::Pointer]
      # @param bytes [Integer]
      # @param stream [Ignis::CUDA::Stream, nil]
      # @return [void]
      def do_deallocate(ptr, bytes, stream)
        raise NotImplementedError, "#{self.class}#do_deallocate not implemented"
      end

      private

      # Reject sizes that cannot be aligned meaningfully. A Float such as 1.5
      # would otherwise "align" to 256.5, and nil would surface as a
      # NoMethodError instead of an argument error.
      # @param bytes [Object]
      # @return [void]
      # @raise [ArgumentError]
      def validate_size!(bytes)
        raise ArgumentError, "bytes must be an Integer, got #{bytes.inspect}" unless bytes.is_a?(Integer)
        raise ArgumentError, "bytes must be positive, got #{bytes}" if bytes <= 0
      end

      # Align size up to ALIGNMENT boundary
      # @param bytes [Integer]
      # @return [Integer]
      def align_up(bytes)
        ((bytes + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT
      end
    end
  end
end
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
  module Memory
    # @abstract Base class for host memory resources.
    #
    # {#allocate} and {#deallocate} validate their arguments, round the size
    # up to {ALIGNMENT}, run the subclass hook while holding a per-resource
    # mutex and record the operation in {Stats}.
    class HostMemoryResource
      # Minimum alignment for all allocations
      ALIGNMENT = 64

      # @return [void]
      def initialize
        @mutex = Mutex.new
      end

      # Allocate host memory.
      # @param bytes [Integer] Number of bytes to allocate (rounded up to a
      #   multiple of ALIGNMENT before being handed to the subclass)
      # @return [FFI::Pointer] Whatever {#do_allocate} returns
      # @raise [ArgumentError] If bytes is not a positive Integer
      # @raise [NotImplementedError] If the subclass does not implement {#do_allocate}
      def allocate(bytes)
        validate_size!(bytes)
        aligned_bytes = align_up(bytes)

        @mutex.synchronize do
          ptr = do_allocate(aligned_bytes)
          Stats.record_allocation(aligned_bytes)
          ptr
        end
      end

      # Deallocate host memory. A nil or null pointer is ignored.
      # @param ptr [FFI::Pointer, nil] Host pointer to free
      # @param bytes [Integer] Size of allocation
      # @return [void]
      def deallocate(ptr, bytes)
        return if ptr.nil? || ptr.null?

        aligned_bytes = align_up(bytes)

        @mutex.synchronize do
          do_deallocate(ptr, aligned_bytes)
          Stats.record_deallocation(aligned_bytes)
        end
      end

      protected

      # @abstract Subclasses must implement this
      # @param bytes [Integer] Aligned bytes to allocate
      # @return [FFI::Pointer]
      def do_allocate(bytes)
        raise NotImplementedError, "#{self.class}#do_allocate not implemented"
      end

      # @abstract Subclasses must implement this
      # @param ptr [FFI::Pointer]
      # @param bytes [Integer]
      # @return [void]
      def do_deallocate(ptr, bytes)
        raise NotImplementedError, "#{self.class}#do_deallocate not implemented"
      end

      private

      # Reject sizes that cannot be aligned meaningfully (Float, nil, ...).
      # @param bytes [Object]
      # @return [void]
      # @raise [ArgumentError]
      def validate_size!(bytes)
        raise ArgumentError, "bytes must be an Integer, got #{bytes.inspect}" unless bytes.is_a?(Integer)
        raise ArgumentError, "bytes must be positive, got #{bytes}" if bytes <= 0
      end

      # Round a size up to the next multiple of ALIGNMENT.
      # @param bytes [Integer]
      # @return [Integer]
      def align_up(bytes)
        ((bytes + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT
      end
    end

    # Pinned (page-locked) host memory resource
    # Uses cudaMallocHost (default flags) or cudaHostAlloc (any other flags)
    # and releases memory with cudaFreeHost.
    class PinnedHostMemoryResource < HostMemoryResource
      # cudaHostAllocDefault flag
      HOST_ALLOC_DEFAULT = 0x00
      # cudaHostAllocPortable flag
      HOST_ALLOC_PORTABLE = 0x01
      # cudaHostAllocMapped flag
      HOST_ALLOC_MAPPED = 0x02
      # cudaHostAllocWriteCombined flag
      HOST_ALLOC_WRITE_COMBINED = 0x04

      # @param flags [Integer] cudaHostAlloc flags (default: 0)
      def initialize(flags: HOST_ALLOC_DEFAULT)
        super()
        @flags = flags
        CUDA::RuntimeAPI.ensure_loaded!
      end

      protected

      # Allocate page-locked memory and return the pointer written by CUDA.
      # @param bytes [Integer] Aligned size
      # @return [FFI::Pointer]
      def do_allocate(bytes)
        ptr_ptr = FFI::MemoryPointer.new(:pointer)

        if @flags == HOST_ALLOC_DEFAULT
          status = CUDA::RuntimeAPI.cudaMallocHost(ptr_ptr, bytes)
          CUDA::RuntimeAPI.check_status!(status, "cudaMallocHost(#{bytes} bytes)")
        else
          status = cuda_host_alloc(ptr_ptr, bytes, @flags)
          CUDA::RuntimeAPI.check_status!(status, "cudaHostAlloc(#{bytes} bytes, flags=#{@flags})")
        end

        ptr_ptr.read_pointer
      end

      # Release page-locked memory; the size is not needed by cudaFreeHost.
      # @param ptr [FFI::Pointer]
      # @param _bytes [Integer] Unused
      # @return [void]
      def do_deallocate(ptr, _bytes)
        status = CUDA::RuntimeAPI.cudaFreeHost(ptr)
        CUDA::RuntimeAPI.check_status!(status, "cudaFreeHost")
      end

      private

      # Thin wrapper around the cudaHostAlloc binding.
      # @return [Integer] CUDA status code
      def cuda_host_alloc(ptr_ptr, size, flags)
        CUDA::RuntimeAPI.cudaHostAlloc(ptr_ptr, size, flags)
      end
    end
  end
end
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'device_memory_resource'
|
|
4
|
+
|
|
5
|
+
module Ignis
  module Memory
    # Coalescing best-fit pool memory resource
    # Pre-allocates a large pool and sub-allocates from it
    # Inspired by RMM's pool_memory_resource
    #
    # Memory is obtained from the upstream resource in chunks. Each chunk
    # starts as one free block; blocks are split on allocation and adjacent
    # free blocks of the same chunk are merged again on deallocation.
    class PoolMemoryResource < DeviceMemoryResource
      # Default initial pool size: 256MB
      DEFAULT_INITIAL_SIZE = 256 * 1024 * 1024

      # Default maximum pool size: 90% of device memory
      DEFAULT_MAX_RATIO = 0.9

      # Block header for tracking allocations in the pool
      Block = Struct.new(:ptr, :size, :free, keyword_init: true)

      # @return [Integer] Current pool size
      attr_reader :current_pool_size

      # @return [Integer] Maximum pool size
      attr_reader :maximum_pool_size

      # @param upstream [DeviceMemoryResource] Upstream resource for pool allocations
      #   (a CudaMemoryResource for the same device when nil)
      # @param initial_pool_size [Integer] Initial pool size in bytes
      # @param maximum_pool_size [Integer, nil] Maximum pool size (nil = 90% of device memory)
      # @param device_index [Integer, nil] GPU device index
      def initialize(
        upstream: nil,
        initial_pool_size: DEFAULT_INITIAL_SIZE,
        maximum_pool_size: nil,
        device_index: nil
      )
        super(device_index: device_index)

        @upstream = upstream || CudaMemoryResource.new(device_index: @device_index)
        @initial_pool_size = align_up(initial_pool_size)
        @maximum_pool_size = align_up(maximum_pool_size || calculate_max_pool_size)

        @current_pool_size = 0
        @blocks = []
        @free_list = []
        # Base address => [pointer, size] of every allocation obtained from
        # upstream. Blocks get split and merged, so @blocks cannot be used to
        # give memory back: only these exact pointers were handed out by
        # upstream.
        @chunks = {}

        allocate_initial_pool!
      end

      # @return [Boolean] false - synchronous pool allocation
      def supports_streams?
        false
      end

      # Release all pool memory back to upstream
      #
      # Every upstream chunk is returned with the pointer and size it was
      # originally allocated with; pointers handed out by the pool must not be
      # used afterwards.
      # @return [void]
      def destroy!
        @mutex.synchronize do
          @chunks.each_value do |chunk_ptr, chunk_size|
            @upstream.deallocate(chunk_ptr, chunk_size, stream: nil)
          end
          @chunks.clear
          @blocks.clear
          @free_list.clear
          @current_pool_size = 0
        end
      end

      # @return [Hash] Pool statistics
      def pool_stats
        @mutex.synchronize do
          total_free = @free_list.sum(&:size)
          {
            current_pool_size: @current_pool_size,
            maximum_pool_size: @maximum_pool_size,
            free_bytes: total_free,
            used_bytes: @current_pool_size - total_free,
            block_count: @blocks.size,
            free_block_count: @free_list.size
          }
        end
      end

      protected

      # Serve a request from the smallest free block that fits, growing the
      # pool once when nothing fits.
      # @param bytes [Integer]
      # @param stream [Ignis::CUDA::Stream, nil] Ignored for pool allocation
      # @return [FFI::Pointer]
      # @raise [MemoryError] If the pool cannot satisfy the request
      def do_allocate(bytes, _stream)
        block = find_free_block(bytes)

        unless block
          expand_pool!(bytes)
          block = find_free_block(bytes)
          raise MemoryError, "Pool allocation failed for #{bytes} bytes" unless block
        end

        split_block_if_needed!(block, bytes)
        block.ptr
      end

      # Return a block to the free list and merge it with free neighbours.
      # Pointers that are not the start of a known block are ignored.
      # @param ptr [FFI::Pointer]
      # @param bytes [Integer] Unused; the block remembers its own size
      # @param stream [Ignis::CUDA::Stream, nil] Ignored for pool deallocation
      # @return [void]
      def do_deallocate(ptr, _bytes, _stream)
        block = @blocks.find { |b| b.ptr.address == ptr.address }
        return unless block
        # Double free: the block is already on the free list. Adding it again
        # would duplicate the entry and inflate free_bytes in pool_stats.
        return if block.free

        block.free = true
        @free_list << block
        coalesce_free_blocks!
      end

      private

      # Obtain the initial chunk from upstream (skipped for a zero-sized pool).
      def allocate_initial_pool!
        return if @initial_pool_size <= 0

        add_chunk!(@initial_pool_size)

        Ignis.logger.debug { "Pool initialized with #{format_bytes(@initial_pool_size)}" }
      end

      # Allocate one chunk from upstream and register it as a single free block.
      # @param size [Integer] Aligned chunk size
      # @return [void]
      def add_chunk!(size)
        ptr = @upstream.allocate(size, stream: nil)
        @chunks[ptr.address] = [ptr, size]

        block = Block.new(ptr: ptr, size: size, free: true)
        @blocks << block
        @free_list << block
        @current_pool_size += size
      end

      # Best fit: the smallest free block that is large enough, or nil.
      # @param bytes [Integer]
      # @return [Block, nil]
      def find_free_block(bytes)
        @free_list
          .select { |b| b.free && b.size >= bytes }
          .min_by(&:size)
      end

      # Mark the block as used and give the unused tail back to the free list.
      # Tails smaller than two alignment units stay attached to the block.
      # @param block [Block]
      # @param bytes [Integer]
      # @return [void]
      def split_block_if_needed!(block, bytes)
        block.free = false
        @free_list.delete(block)

        remainder = block.size - bytes
        return if remainder < ALIGNMENT * 2

        new_ptr = FFI::Pointer.new(block.ptr.address + bytes)
        new_block = Block.new(ptr: new_ptr, size: remainder, free: true)
        block.size = bytes

        @blocks << new_block
        @free_list << new_block
      end

      # Grow the pool by one upstream chunk that can hold at least min_bytes.
      #
      # The chunk size is half of the remaining budget, but never less than the
      # request and never more than the remaining budget.
      # @param min_bytes [Integer] Aligned size of the request that did not fit
      # @return [void]
      # @raise [MemoryError] If the remaining budget cannot hold min_bytes
      def expand_pool!(min_bytes)
        remaining = @maximum_pool_size - @current_pool_size

        # Fail before touching upstream: a chunk smaller than the request
        # could not serve it and would only consume device memory.
        if remaining <= 0 || min_bytes > remaining
          raise MemoryError, "Pool exhausted: current=#{format_bytes(@current_pool_size)}, " \
                             "max=#{format_bytes(@maximum_pool_size)}, requested=#{format_bytes(min_bytes)}"
        end

        grow_size = [remaining / 2, min_bytes, ALIGNMENT].max
        grow_size = align_up([grow_size, remaining].min)

        Ignis.logger.debug { "Expanding pool by #{format_bytes(grow_size)}" }

        add_chunk!(grow_size)
      end

      # Merge address-adjacent free blocks into single larger blocks.
      #
      # A block that starts an upstream chunk is never absorbed into its
      # predecessor, so a merged block never spans two separate upstream
      # allocations even when their addresses happen to be contiguous.
      # @return [void]
      def coalesce_free_blocks!
        return if @free_list.size < 2

        sorted = @free_list.sort_by { |b| b.ptr.address }
        merged = []
        current = sorted.first

        sorted.drop(1).each do |block|
          contiguous = current.ptr.address + current.size == block.ptr.address

          if contiguous && !@chunks.key?(block.ptr.address)
            current.size += block.size
            @blocks.delete(block)
          else
            merged << current
            current = block
          end
        end
        merged << current

        @free_list.replace(merged)
      end

      # Default upper bound: DEFAULT_MAX_RATIO of the total memory reported by
      # cudaMemGetInfo for this resource's device.
      # @return [Integer]
      def calculate_max_pool_size
        CUDA::RuntimeAPI.ensure_loaded!
        free_ptr = FFI::MemoryPointer.new(:size_t)
        total_ptr = FFI::MemoryPointer.new(:size_t)

        ensure_device do
          status = CUDA::RuntimeAPI.cudaMemGetInfo(free_ptr, total_ptr)
          CUDA::RuntimeAPI.check_status!(status, "cudaMemGetInfo")
        end

        total = total_ptr.read(:size_t)
        (total * DEFAULT_MAX_RATIO).to_i
      end

      # Run the block with this resource's device selected, then switch back
      # to the previously selected device, also when the block raises.
      # @yield Work to run on the device
      # @return [Object] The block's result
      def ensure_device
        current_ptr = FFI::MemoryPointer.new(:int)
        status = CUDA::RuntimeAPI.cudaGetDevice(current_ptr)
        CUDA::RuntimeAPI.check_status!(status, "cudaGetDevice")
        original = current_ptr.read_int

        return yield if original == @device_index

        status = CUDA::RuntimeAPI.cudaSetDevice(@device_index)
        CUDA::RuntimeAPI.check_status!(status, "cudaSetDevice(#{@device_index})")

        begin
          yield
        ensure
          # Best effort: the status is not checked so that a failure here
          # cannot mask an exception raised by the block.
          CUDA::RuntimeAPI.cudaSetDevice(original)
        end
      end

      # Human-readable size using binary units (B, KB, MB, GB).
      # @param bytes [Integer]
      # @return [String]
      def format_bytes(bytes)
        if bytes >= 1024 * 1024 * 1024
          format("%.2f GB", bytes.to_f / (1024 * 1024 * 1024))
        elsif bytes >= 1024 * 1024
          format("%.2f MB", bytes.to_f / (1024 * 1024))
        elsif bytes >= 1024
          format("%.2f KB", bytes.to_f / 1024)
        else
          "#{bytes} B"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
  module Memory
    # Thread-safe allocation statistics tracking
    # Provides real-time visibility into memory usage patterns
    #
    # All state lives in class-level instance variables and is only modified
    # while holding a single mutex.
    class Stats
      # The mutex and the counters are created when the class is loaded.
      # Creating the mutex lazily (`@mutex ||= Mutex.new`) is a check-then-act
      # race: two threads can each create their own mutex and then update the
      # counters concurrently.
      @mutex = Mutex.new
      @total_allocated_bytes = 0
      @peak_allocated_bytes = 0
      @allocation_count = 0
      @deallocation_count = 0

      class << self
        # @return [Integer] Total bytes currently allocated
        attr_reader :total_allocated_bytes

        # @return [Integer] Peak bytes allocated (high-water mark)
        attr_reader :peak_allocated_bytes

        # @return [Integer] Total allocation count since startup (or last reset)
        attr_reader :allocation_count

        # @return [Integer] Total deallocation count since startup (or last reset)
        attr_reader :deallocation_count

        # Record an allocation
        # @param bytes [Integer]
        # @return [void]
        def record_allocation(bytes)
          mutex.synchronize do
            @total_allocated_bytes += bytes
            @peak_allocated_bytes = @total_allocated_bytes if @total_allocated_bytes > @peak_allocated_bytes
            @allocation_count += 1
          end
        end

        # Record a deallocation
        # The running total is clamped at zero so that unmatched frees cannot
        # drive it negative.
        # @param bytes [Integer]
        # @return [void]
        def record_deallocation(bytes)
          mutex.synchronize do
            @total_allocated_bytes = [@total_allocated_bytes - bytes, 0].max
            @deallocation_count += 1
          end
        end

        # Reset all statistics
        # @return [void]
        def reset!
          mutex.synchronize do
            @total_allocated_bytes = 0
            @peak_allocated_bytes = 0
            @allocation_count = 0
            @deallocation_count = 0
          end
        end

        # Consistent copy of all counters, taken under the mutex.
        # @return [Hash] Statistics snapshot
        def snapshot
          mutex.synchronize do
            {
              total_allocated_bytes: @total_allocated_bytes,
              peak_allocated_bytes: @peak_allocated_bytes,
              allocation_count: @allocation_count,
              deallocation_count: @deallocation_count,
              active_allocations: @allocation_count - @deallocation_count
            }
          end
        end

        # @return [String] Human-readable statistics
        def to_s
          s = snapshot
          format(
            "Memory Stats: %s allocated (peak: %s), %d allocs, %d frees, %d active",
            format_bytes(s[:total_allocated_bytes]),
            format_bytes(s[:peak_allocated_bytes]),
            s[:allocation_count],
            s[:deallocation_count],
            s[:active_allocations]
          )
        end

        private

        # @return [Mutex] Lock guarding every counter
        attr_reader :mutex

        # Human-readable size using binary units (B, KB, MB, GB).
        # @param bytes [Integer]
        # @return [String]
        def format_bytes(bytes)
          if bytes >= 1024 * 1024 * 1024
            format("%.2f GB", bytes.to_f / (1024 * 1024 * 1024))
          elsif bytes >= 1024 * 1024
            format("%.2f MB", bytes.to_f / (1024 * 1024))
          elsif bytes >= 1024
            format("%.2f KB", bytes.to_f / 1024)
          else
            "#{bytes} B"
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'memory/stats'
|
|
4
|
+
require_relative 'memory/device_memory_resource'
|
|
5
|
+
require_relative 'memory/cuda_memory_resource'
|
|
6
|
+
require_relative 'memory/cuda_async_memory_resource'
|
|
7
|
+
require_relative 'memory/pool_memory_resource'
|
|
8
|
+
require_relative 'memory/pinned_host_memory_resource'
|
|
9
|
+
|
|
10
|
+
module Ignis
  # Memory management module providing RMM-inspired GPU memory pools
  #
  # This module provides efficient memory allocation for GPU workloads through:
  # - DeviceMemoryResource: Base class for all device memory allocators
  # - CudaMemoryResource: Simple cudaMalloc/cudaFree (baseline)
  # - CudaAsyncMemoryResource: Stream-ordered cudaMallocAsync/cudaFreeAsync
  # - PoolMemoryResource: Coalescing best-fit pool for high-frequency allocations
  # - PinnedHostMemoryResource: Page-locked host memory for fast transfers
  #
  # @example Using a memory pool
  #   # Set global pool for all allocations on the current device
  #   pool = Ignis::Memory::PoolMemoryResource.new(initial_pool_size: 512 * 1024 * 1024)
  #   Ignis::Memory.set_current_device_resource(pool)
  #
  #   # Allocations made through Ignis::Memory now use the pool
  #   ptr = Ignis::Memory.allocate(1024 * 1024)
  #
  # @example Stream-ordered allocation
  #   async_mr = Ignis::Memory::CudaAsyncMemoryResource.new
  #   ptr = async_mr.allocate(1024 * 1024, stream: my_stream)
  #   # ...use ptr...
  #   async_mr.deallocate(ptr, 1024 * 1024, stream: my_stream)
  #
  module Memory
    # Registry of device index => resource, and the lock that guards it. Both
    # are created at load time so that concurrent first calls cannot race on
    # their creation.
    @per_device_resources = {}
    @resources_mutex = Mutex.new

    class << self
      # Get the current device memory resource
      #
      # When none is registered for the current device, a default one is
      # created and registered: a PoolMemoryResource if
      # Ignis.configuration.use_memory_pool is set, a CudaMemoryResource
      # otherwise. The lookup and creation happen under a lock, so concurrent
      # callers share one default resource instead of each building its own.
      # @return [DeviceMemoryResource]
      def get_current_device_resource
        device_id = current_device_index

        resources_mutex.synchronize do
          per_device_resources[device_id] ||= build_default_resource(device_id)
        end
      end

      # Set the current device memory resource
      # @param resource [DeviceMemoryResource]
      # @return [DeviceMemoryResource, nil] The previous resource, if any
      def set_current_device_resource(resource)
        set_per_device_resource(current_device_index, resource)
      end

      # Get memory resource for a specific device
      # Unlike {#get_current_device_resource} this never creates a default.
      # @param device_id [Integer]
      # @return [DeviceMemoryResource, nil]
      def get_per_device_resource(device_id)
        resources_mutex.synchronize { per_device_resources[device_id] }
      end

      # Set memory resource for a specific device
      # @param device_id [Integer]
      # @param resource [DeviceMemoryResource]
      # @return [DeviceMemoryResource, nil] The previous resource, if any
      def set_per_device_resource(device_id, resource)
        resources_mutex.synchronize do
          old = per_device_resources[device_id]
          per_device_resources[device_id] = resource
          old
        end
      end

      # Get memory statistics
      # @return [Hash]
      def stats
        Stats.snapshot
      end

      # Reset memory statistics
      # @return [void]
      def reset_stats!
        Stats.reset!
      end

      # Allocate device memory using current resource
      # @param bytes [Integer]
      # @param stream [Ignis::CUDA::Stream, nil]
      # @return [FFI::Pointer]
      def allocate(bytes, stream: nil)
        get_current_device_resource.allocate(bytes, stream: stream)
      end

      # Deallocate device memory using current resource
      # @param ptr [FFI::Pointer]
      # @param bytes [Integer]
      # @param stream [Ignis::CUDA::Stream, nil]
      # @return [void]
      def deallocate(ptr, bytes, stream: nil)
        get_current_device_resource.deallocate(ptr, bytes, stream: stream)
      end

      private

      # @return [Hash{Integer => DeviceMemoryResource}]
      def per_device_resources
        @per_device_resources ||= {}
      end

      # @return [Mutex] Lock guarding per_device_resources
      attr_reader :resources_mutex

      # Build the default resource for a device according to configuration.
      # @param device_id [Integer]
      # @return [DeviceMemoryResource]
      def build_default_resource(device_id)
        if Ignis.configuration.use_memory_pool
          PoolMemoryResource.new(device_index: device_id)
        else
          CudaMemoryResource.new(device_index: device_id)
        end
      end

      # Ask the CUDA runtime which device is selected.
      # The status is checked so that a failing cudaGetDevice is reported
      # instead of being read back as a device index.
      # @return [Integer]
      def current_device_index
        CUDA::RuntimeAPI.ensure_loaded!
        ptr = FFI::MemoryPointer.new(:int)
        status = CUDA::RuntimeAPI.cudaGetDevice(ptr)
        CUDA::RuntimeAPI.check_status!(status, "cudaGetDevice")
        ptr.read_int
      end
    end
  end
end
|