ignis-autograd 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: fcf74e67da11fb5e8dc37233c01fe0f47e4e60cbe038911e4fff43a582376084
4
+ data.tar.gz: 7a57cf46813c15e2a067195be05ffcc7bdeafdc63e7b66a2544c42e8aa2dc539
5
+ SHA512:
6
+ metadata.gz: 7fc7b3badc27f7a94d3a7ff6416db11084aa04ed10b9c88d40ed99c3e1889a8868275168907def8ad8ab8f14ba9604d4b43dcc20c3f97a99d2c51c68304af0a5
7
+ data.tar.gz: 918f846d561833195db3fce0cb41d64484fd18d907223f1f032621a4b01e6979e9acca56b7886ac70c157f6ad19a60aa36521340ce4c34cc929f416cfefdcaf8
data/README.md ADDED
@@ -0,0 +1,14 @@
1
+ # ignis-autograd
2
+
3
+ Reverse-mode automatic differentiation over GPU arrays, on the [`ignis`](https://rubygems.org/gems/ignis) foundation.
4
+
5
+ Adds `Ignis::AI::Tensor` (a differentiable tensor) and an autograd tape. Build computation graphs on the GPU and get exact gradients (verified against finite differences).
6
+
7
+ ```ruby
8
+ require "ignis-autograd"
9
+ x = Ignis::AI::Tensor.from_host([1.0, 2.0, 3.0], shape: [3], requires_grad: true)
10
+ (x * x).sum.backward!
11
+ x.grad.to_host # => [2.0, 4.0, 6.0]
12
+ ```
13
+
14
+ MIT.
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ # ignis-autograd — reverse-mode automatic differentiation over GPU arrays.
4
+ # Adds Ignis::AI::Tensor (a differentiable tensor) and the autograd tape on top
5
+ # of the Ignis GPU foundation.
6
+
7
+ require "ignis"
8
+
9
+ require_relative "nnw/ai/tape"
10
+ require_relative "nnw/ai/tensor"
11
+ require_relative "nnw/ai/device"
@@ -0,0 +1,257 @@
1
+ # frozen_string_literal: true
2
+
module Ignis
  module AI
    # Device — runtime GPU capability detection and configuration helpers.
    #
    # Device properties (VRAM, compute capability, SM count, ...) are queried
    # through Ignis::CUDA at runtime and cached until {reset!} is called.
    # Recommendations are derived from the hardware that was actually reported,
    # not from hard-coded GPU assumptions.
    class Device
      # Snapshot of one GPU's properties as reported at enumeration time.
      DeviceProperties = Struct.new(
        :id, :name, :compute_capability,
        :total_memory_bytes, :total_memory_mb, :total_memory_gb,
        :sm_count, :max_threads_per_block, :max_threads_per_sm,
        :warp_size, :clock_rate_mhz, :memory_clock_mhz,
        :l2_cache_size, :shared_mem_per_block,
        keyword_init: true
      )

      # Unit conversion factors used when deriving MB / GB figures.
      BYTES_PER_MIB = 1024 * 1024
      BYTES_PER_GIB = 1024.0**3
      private_constant :BYTES_PER_MIB, :BYTES_PER_GIB

      class << self
        # Query all GPU devices, caching the result until {reset!}.
        #
        # Devices whose property query failed are omitted, so the position of
        # an entry in this array is NOT guaranteed to equal its device id.
        # @return [Array<DeviceProperties>]
        def all_devices
          @all_devices ||= enumerate_devices
        end

        # Get properties for a specific device.
        #
        # The lookup is by device id rather than by array position, because
        # {all_devices} skips devices that could not be queried.
        # @param device_id [Integer]
        # @return [DeviceProperties]
        # @raise [RuntimeError] if no enumerated device has that id
        def properties(device_id = 0)
          all_devices.find { |dev| dev.id == device_id } ||
            raise("No GPU device #{device_id} found")
        end

        # Total VRAM on device in bytes.
        # @param device_id [Integer]
        # @return [Integer]
        # @raise [RuntimeError] if the device is unknown
        def total_memory(device_id = 0)
          properties(device_id).total_memory_bytes
        end

        # Estimate free VRAM (queries cudaMemGetInfo).
        # @param device_id [Integer]
        # @return [Integer] free bytes (0 when the runtime API is unavailable)
        def free_memory(device_id = 0)
          query_free_memory(device_id)
        end

        # Number of GPUs that were enumerated successfully.
        # @return [Integer]
        def count
          all_devices.length
        end

        # Human-readable multi-line summary for logging.
        # @return [String]
        def summary
          device_lines = all_devices.map do |dev|
            "  [#{dev.id}] #{dev.name} | #{dev.total_memory_gb}GB VRAM | " \
              "CC #{dev.compute_capability} | #{dev.sm_count} SMs"
          end
          ["GPU Devices (#{count}):", *device_lines].join("\n")
        end

        # Recommend a batch size and sequence length for training a model.
        #
        # The estimate reserves memory for weights, gradients and Adam optimizer
        # state (two FP32 moments per parameter) and spends the remainder on
        # activations, using a rough transformer-shaped heuristic.
        # @param model_params [Integer] total parameters
        # @param dtype_bytes [Integer] bytes per parameter (2 for FP16, 4 for FP32)
        # @param device_id [Integer]
        # @param target_utilization [Float] fraction of VRAM to use (0.0-1.0)
        # @return [Hash] :batch_size, :seq_len, :use_flash_attention,
        #   :use_gradient_checkpointing, plus either the *_gb estimates or a
        #   :warning when the model does not fit
        # @raise [RuntimeError] if the device is unknown
        def recommend_config(model_params, dtype_bytes: 4, device_id: 0, target_utilization: 0.85)
          dev = properties(device_id)
          available_bytes = (dev.total_memory_bytes * target_utilization).to_i

          weight_bytes = model_params * dtype_bytes
          optimizer_bytes = model_params * 4 * 2 # Adam: m and v, 4 bytes each
          gradient_bytes = model_params * dtype_bytes
          fixed_bytes = weight_bytes + optimizer_bytes + gradient_bytes

          # Whatever is left after the fixed costs is available for activations.
          activation_budget = available_bytes - fixed_bytes

          if activation_budget <= 0
            return {
              batch_size: 1,
              seq_len: 128,
              use_flash_attention: true,
              use_gradient_checkpointing: true,
              warning: "Model too large for this GPU. Consider model parallelism or FP16."
            }
          end

          # Rough estimate: 4 * hidden_dim * dtype_bytes per token per layer,
          # with hidden_dim ~ sqrt(model_params / 12) for typical transformers.
          # Clamp to 1 so tiny models (< 12 params) cannot divide by zero below.
          estimated_hidden = [Math.sqrt(model_params / 12.0).to_i, 1].max
          estimated_layers = [model_params / (estimated_hidden * estimated_hidden * 12), 1].max
          activation_per_token = 4 * estimated_hidden * dtype_bytes * estimated_layers

          # Target: batch_size * seq_len * activation_per_token <= activation_budget.
          # Always allow at least one token; otherwise a budget smaller than a
          # single token's activations would yield seq_len 0 and divide by zero.
          total_tokens = [activation_budget / [activation_per_token, 1].max, 1].max

          # Prefer seq_len of 1024 and spend the rest on batch size.
          seq_len = [1024, total_tokens].min
          batch_size = [total_tokens / seq_len, 1].max

          {
            batch_size: batch_size.to_i,
            seq_len: seq_len.to_i,
            # Flash attention avoids O(N^2) attention memory; worth it past 512.
            use_flash_attention: seq_len > 512,
            # Checkpoint when activations have less than 2x the weight memory.
            use_gradient_checkpointing: activation_budget < weight_bytes * 2,
            estimated_vram_usage_gb: (fixed_bytes / BYTES_PER_GIB).round(2),
            available_vram_gb: (available_bytes / BYTES_PER_GIB).round(2),
            activation_budget_gb: (activation_budget / BYTES_PER_GIB).round(2)
          }
        end

        # Check if multi-GPU is available and worth using.
        #
        # Devices are considered compatible when they share a compute
        # capability; with mixed hardware only the largest matching group is
        # used and a :warning is added.
        # @return [Hash] :multi_gpu, :device_ids, :strategy (and :warning)
        def multi_gpu_config
          devs = all_devices

          if devs.length <= 1
            # Report the real id of the single device. [0] is kept as the
            # historical placeholder when nothing was enumerated.
            ids = devs.empty? ? [0] : devs.map(&:id)
            return { multi_gpu: false, device_ids: ids, strategy: :single }
          end

          by_capability = devs.group_by(&:compute_capability)
          if by_capability.length == 1
            return { multi_gpu: true, device_ids: devs.map(&:id), strategy: :data_parallel }
          end

          # Heterogeneous GPUs — only use the dominant compute capability group.
          dominant_cc, matching = by_capability.max_by { |_, group| group.length }
          usable = matching.length > 1
          {
            multi_gpu: usable,
            device_ids: matching.map(&:id),
            # A single matching device cannot run data-parallel.
            strategy: usable ? :data_parallel : :single,
            warning: "Heterogeneous GPUs detected. Using #{matching.length} " \
                     "devices with CC #{dominant_cc}."
          }
        end

        # Clear cached device info (call after hardware changes).
        # @return [void]
        def reset!
          @all_devices = nil
        end

        private

        # Enumerate all CUDA devices.
        #
        # Best effort: if the device count cannot be obtained at all (for
        # example the CUDA bindings are missing) an empty list is returned, and
        # a device whose properties cannot be read is logged and skipped.
        # @return [Array<DeviceProperties>]
        def enumerate_devices
          device_count =
            begin
              Ignis::CUDA::Device.count
            rescue StandardError
              return []
            end

          devices = []
          device_count.times do |id|
            begin
              devices << query_device_properties(id)
            rescue StandardError => e
              Ignis.logger.warn("Failed to query GPU #{id}: #{e.message}")
            end
          end
          devices
        end

        # Query device properties, preferring the Ignis::CUDA::Device wrapper
        # and falling back to memory-only information otherwise.
        # @param device_id [Integer]
        # @return [DeviceProperties]
        def query_device_properties(device_id)
          if defined?(Ignis::CUDA::Device) && Ignis::CUDA::Device.respond_to?(:properties)
            props = Ignis::CUDA::Device.properties(device_id)
            total = props[:total_global_mem] || 0
            DeviceProperties.new(
              id: device_id,
              name: props[:name] || "GPU #{device_id}",
              compute_capability: props[:compute_capability] || "0.0",
              total_memory_bytes: total,
              total_memory_mb: total / BYTES_PER_MIB,
              total_memory_gb: (total / BYTES_PER_GIB).round(2),
              sm_count: props[:multi_processor_count] || 0,
              max_threads_per_block: props[:max_threads_per_block] || 1024,
              max_threads_per_sm: props[:max_threads_per_multi_processor] || 2048,
              warp_size: props[:warp_size] || 32,
              # Rates are divided by 1000 to give MHz (presumably reported in kHz).
              clock_rate_mhz: (props[:clock_rate] || 0) / 1000,
              memory_clock_mhz: (props[:memory_clock_rate] || 0) / 1000,
              l2_cache_size: props[:l2_cache_size] || 0,
              shared_mem_per_block: props[:shared_mem_per_block] || 49152
            )
          else
            # Fallback: only memory totals are known; the rest are defaults.
            _free, total = query_memory_info(device_id)
            DeviceProperties.new(
              id: device_id,
              name: "CUDA Device #{device_id}",
              compute_capability: "0.0",
              total_memory_bytes: total,
              total_memory_mb: total / BYTES_PER_MIB,
              total_memory_gb: (total / BYTES_PER_GIB).round(2),
              sm_count: 0,
              max_threads_per_block: 1024,
              max_threads_per_sm: 2048,
              warp_size: 32,
              clock_rate_mhz: 0,
              memory_clock_mhz: 0,
              l2_cache_size: 0,
              shared_mem_per_block: 49152
            )
          end
        end

        # Query free and total memory via cudaMemGetInfo.
        #
        # NOTE(review): device_id is not forwarded to the runtime call, so this
        # presumably reports the currently active CUDA device rather than
        # device_id — confirm and select the device first if needed. The call's
        # status code is also not checked.
        # @param device_id [Integer]
        # @return [Array(Integer, Integer)] [free_bytes, total_bytes], or [0, 0]
        #   when Ignis::CUDA::RuntimeAPI is not defined
        def query_memory_info(device_id)
          return [0, 0] unless defined?(Ignis::CUDA::RuntimeAPI)

          # Two 8-byte out-parameters, freed by Ruby's GC.
          free_ptr = Fiddle::Pointer.malloc(8, Fiddle::RUBY_FREE)
          total_ptr = Fiddle::Pointer.malloc(8, Fiddle::RUBY_FREE)
          Ignis::CUDA::RuntimeAPI.cudaMemGetInfo(free_ptr, total_ptr)
          [free_ptr[0, 8].unpack1("Q"), total_ptr[0, 8].unpack1("Q")]
        end

        # Query current free VRAM.
        # @param device_id [Integer]
        # @return [Integer]
        def query_free_memory(device_id)
          query_memory_info(device_id).first
        end
      end
    end
  end
end
@@ -0,0 +1,200 @@
1
+ # frozen_string_literal: true
2
+
module Ignis
  module AI
    # Tape — reverse-mode automatic differentiation.
    #
    # The tape is stored in Thread.current[...], so each Ruby fiber/thread gets
    # its own. Operations record a backward function during the forward pass;
    # backward! walks the tape in reverse (the recording order is already
    # topological) and accumulates gradients.
    #
    # @example
    #   a = Tensor.from_host([2.0], shape: [1], requires_grad: true)
    #   b = a * a
    #   b.backward!
    #   a.grad.to_host # => [4.0]
    class Tape
      # Keys under which the tape and the no_grad flag live in Thread.current.
      TAPE_KEY = :nnw_ai_tape
      NO_GRAD_KEY = :nnw_ai_no_grad

      # An entry on the tape representing one operation.
      Entry = Struct.new(:output, :inputs, :backward_fn, keyword_init: true)

      class << self
        # Record an operation on the tape.
        #
        # Nothing is recorded inside a no_grad block or when the output does
        # not require a gradient.
        # @param output [Tensor] the result tensor
        # @param inputs [Array<Tensor>] input tensors
        # @yield [Ignis::Shared::NvArray] receives the output gradient and must
        #   return an Array of gradients aligned with inputs (nil entries are
        #   skipped)
        # @return [void]
        def record(output, inputs:, &backward_fn)
          return if no_grad_active?
          return unless output.requires_grad

          tape = current_tape
          output._tape_id = tape.length
          tape << Entry.new(output: output, inputs: inputs, backward_fn: backward_fn)
        end

        # Run reverse-mode AD from a tensor.
        #
        # Gradients are first accumulated in a private map and written to leaf
        # tensors once at the end; a pre-existing leaf .grad is accumulated
        # into, so several backward! calls (e.g. micro-batching) add up. The
        # tape is cleared afterwards.
        # @param tensor [Tensor] the output tensor to differentiate
        # @param grad_output [Ignis::Shared::NvArray] initial gradient
        # @return [void]
        def backward!(tensor, grad_output)
          tape = current_tape
          return if tape.empty?

          grad_map, leaves = reverse_walk(tape, tensor, grad_output)
          assign_leaf_grads!(leaves, grad_map)

          # Each backward is a fresh computation.
          clear!
        end

        # Disable gradient recording inside block.
        # @yield block where no gradients are recorded
        # @return [Object] block return value
        def no_grad(&block)
          previous = Thread.current[NO_GRAD_KEY]
          Thread.current[NO_GRAD_KEY] = true
          block.call
        ensure
          Thread.current[NO_GRAD_KEY] = previous
        end

        # Check if no_grad is currently active.
        # @return [Boolean]
        def no_grad_active?
          Thread.current[NO_GRAD_KEY] == true
        end

        # Gradient checkpointing: recompute activations during backward.
        #
        # The forward block runs without recording, so only its inputs and
        # output stay referenced by the tape. During backward the block is run
        # again on an isolated tape and differentiated there, trading compute
        # for memory when VRAM is the limiting factor.
        #
        # Gradients for +inputs+ are handed back to the enclosing backward walk,
        # which owns their accumulation; leaf tensors that the block merely
        # captures (e.g. parameters) receive their .grad directly.
        # @param inputs [Array<Tensor>] input tensors to save
        # @yield block that computes the forward pass
        # @return [Tensor] the output tensor
        def gradient_checkpoint(inputs, &forward_fn)
          # Run forward with no_grad to avoid recording the interior ops.
          output = no_grad { forward_fn.call }
          return output unless output.requires_grad

          record(output, inputs: inputs) do |grad|
            grad_map, leaves = recompute_and_walk(grad, &forward_fn)

            # Leaves that are not checkpoint inputs are invisible to the outer
            # walk, so their gradients must be written here.
            input_ids = inputs.map(&:object_id)
            interior = leaves.reject { |oid, _leaf| input_ids.include?(oid) }
            assign_leaf_grads!(interior, grad_map)

            # grad_map already holds the TOTAL gradient per tensor; hand it out
            # once even if the same tensor is listed twice in inputs.
            handed_out = {}
            inputs.map do |input|
              oid = input.object_id
              next if handed_out[oid]

              handed_out[oid] = true
              grad_map[oid]
            end
          end

          output
        end

        # Get current thread's tape.
        # @return [Array<Entry>]
        def current_tape
          Thread.current[TAPE_KEY] ||= []
        end

        # Clear current thread's tape.
        # @return [void]
        def clear!
          Thread.current[TAPE_KEY] = []
        end

        # Get tape size (for debugging).
        # @return [Integer]
        def size
          current_tape.length
        end

        private

        # Walk +tape+ in reverse from +tensor+, accumulating gradients.
        #
        # The returned map is the single source of truth for the walk; leaf
        # .grad is deliberately not touched here so a reused leaf (x in x*x)
        # cannot be double-counted through an aliased buffer.
        # @param tape [Array<Entry>]
        # @param tensor [Tensor] tensor the walk starts from
        # @param grad_output [Ignis::Shared::NvArray] gradient of +tensor+
        # @return [Array(Hash, Hash)] [object_id => gradient,
        #   object_id => leaf Tensor that received a gradient]
        def reverse_walk(tape, tensor, grad_output)
          grad_map = { tensor.object_id => grad_output }
          leaves = {}

          # Buffers grad_map exclusively owns, tracked by object identity.
          # accumulate_grads! mutates its dst in place, so a buffer referenced
          # by two entries would be silently corrupted. Backward closures may
          # return aliased buffers (e.g. `+` returns [grad, grad]); those are
          # cloned on the way in, and only on that aliasing path.
          owned = {}.compare_by_identity
          owned[grad_output] = true

          tape.reverse_each do |entry|
            output_grad = grad_map[entry.output.object_id]
            next unless output_grad

            input_grads = entry.backward_fn.call(output_grad)

            entry.inputs.each_with_index do |input_tensor, index|
              next unless input_tensor.requires_grad

              input_grad = input_grads[index]
              next unless input_grad

              oid = input_tensor.object_id
              if grad_map.key?(oid)
                dst = grad_map[oid]
                # Never accumulate a buffer into itself (would compute 2*dst):
                # clone so a snapshot of its current value is added.
                src = input_grad.equal?(dst) ? input_grad.clone : input_grad
                accumulate_grads!(dst, src)
              else
                # Take exclusive ownership, cloning if already owned elsewhere.
                input_grad = input_grad.clone if owned[input_grad]
                grad_map[oid] = input_grad
                owned[input_grad] = true
              end

              leaves[oid] = input_tensor if input_tensor.is_leaf
            end
          end

          [grad_map, leaves]
        end

        # Write walked gradients to leaf tensors, accumulating into any
        # pre-existing .grad.
        # @param leaves [Hash] object_id => leaf Tensor
        # @param grad_map [Hash] object_id => gradient
        # @return [void]
        def assign_leaf_grads!(leaves, grad_map)
          leaves.each do |oid, leaf|
            gradient = grad_map[oid]
            next unless gradient

            if leaf.grad && !leaf.grad.equal?(gradient)
              accumulate_grads!(leaf.grad, gradient)
            else
              leaf.grad = gradient
            end
          end
        end

        # Re-run a checkpointed forward block on a fresh tape and differentiate
        # it, leaving the caller's tape exactly as it was.
        # @param grad [Ignis::Shared::NvArray] gradient of the block's output
        # @yield the forward block to recompute
        # @return [Array(Hash, Hash)] same shape as {reverse_walk}
        def recompute_and_walk(grad, &forward_fn)
          outer_tape = Thread.current[TAPE_KEY]
          Thread.current[TAPE_KEY] = []
          recomputed = forward_fn.call
          reverse_walk(current_tape, recomputed, grad)
        ensure
          Thread.current[TAPE_KEY] = outer_tape
        end

        # Accumulate gradients: dst += src using GPU kernel
        # @param dst [Ignis::Shared::NvArray]
        # @param src [Ignis::Shared::NvArray]
        # @return [void]
        def accumulate_grads!(dst, src)
          n = dst.numel
          kernel = Ignis::JIT::Kernels::Elementwise.accumulate
          # 256 threads per block, rounding the block count up to cover n.
          kernel.launch(grid: [(n + 255) / 256], block: [256], args: [dst, src, n])
          Ignis.synchronize
        end
      end
    end
  end
end