ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +15 -0
  3. data/lib/ignis.rb +94 -0
  4. data/lib/nnw/platform.rb +304 -0
  5. data/lib/nnw/shared/event_bus.rb +240 -0
  6. data/lib/nnw/shared/ffi_loader.rb +63 -0
  7. data/lib/nnw/shared/memory_contract.rb +204 -0
  8. data/lib/nnw/shared/nv_array.rb +710 -0
  9. data/lib/nnw/shared/recovery_protocol.rb +307 -0
  10. data/lib/nvruby/configuration.rb +217 -0
  11. data/lib/nvruby/cuda/device.rb +275 -0
  12. data/lib/nvruby/cuda/device_props.rb +202 -0
  13. data/lib/nvruby/cuda/graph.rb +265 -0
  14. data/lib/nvruby/cuda/graph_bindings.rb +119 -0
  15. data/lib/nvruby/cuda/library_loader.rb +285 -0
  16. data/lib/nvruby/cuda/memory.rb +410 -0
  17. data/lib/nvruby/cuda/runtime_api.rb +804 -0
  18. data/lib/nvruby/cuda/stream.rb +234 -0
  19. data/lib/nvruby/dtype.rb +139 -0
  20. data/lib/nvruby/epilogues.rb +438 -0
  21. data/lib/nvruby/errors.rb +303 -0
  22. data/lib/nvruby/half.rb +97 -0
  23. data/lib/nvruby/jit/compiled_kernel.rb +80 -0
  24. data/lib/nvruby/jit/compiler.rb +231 -0
  25. data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
  26. data/lib/nvruby/jit/kernel.rb +240 -0
  27. data/lib/nvruby/jit/kernel_module.rb +133 -0
  28. data/lib/nvruby/jit/kernels/activations.rb +179 -0
  29. data/lib/nvruby/jit/kernels/attention.rb +504 -0
  30. data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
  31. data/lib/nvruby/jit/kernels/loss.rb +213 -0
  32. data/lib/nvruby/jit/kernels/normalization.rb +200 -0
  33. data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
  34. data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
  35. data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
  36. data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
  37. data/lib/nvruby/linalg/epilog.rb +67 -0
  38. data/lib/nvruby/linalg/matmul.rb +247 -0
  39. data/lib/nvruby/linalg/matmul_plan.rb +229 -0
  40. data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
  41. data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
  42. data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
  43. data/lib/nvruby/memory/device_memory_resource.rb +106 -0
  44. data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
  45. data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
  46. data/lib/nvruby/memory/stats.rb +107 -0
  47. data/lib/nvruby/memory.rb +124 -0
  48. data/lib/nvruby/version.rb +5 -0
  49. metadata +108 -0
@@ -0,0 +1,804 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fiddle'
4
+ require 'fiddle/import'
5
+
6
+ module Ignis
7
+ module CUDA
8
+ # CUDA Runtime API — Fiddle-only hot-path bindings.
9
+ #
10
+ # Rule: This file uses ONLY Fiddle for hot-path calls (malloc, free, memcpy,
11
+ # stream, event, sync). FFI struct calls (cudaGetDeviceProperties) live in
12
+ # device_props.rb (Rule 4: never mix FFI and Fiddle in the same file).
13
+ #
14
+ # Cross-platform: Uses Ignis::Platform to resolve cudart path on Windows/Linux.
15
+ module RuntimeAPI
16
+ # CUDA memory copy directions
17
+ MEMCPY_HOST_TO_HOST = 0
18
+ MEMCPY_HOST_TO_DEVICE = 1
19
+ MEMCPY_DEVICE_TO_HOST = 2
20
+ MEMCPY_DEVICE_TO_DEVICE = 3
21
+ MEMCPY_DEFAULT = 4
22
+
23
+ # CUDA device flags
24
+ DEVICE_SCHEDULE_AUTO = 0
25
+ DEVICE_SCHEDULE_SPIN = 1
26
+ DEVICE_SCHEDULE_YIELD = 2
27
+ DEVICE_SCHEDULE_BLOCKING_SYNC = 4
28
+ DEVICE_MAP_HOST = 8
29
+ DEVICE_LMEM_RESIZE_TO_MAX = 16
30
+
31
+ # CUDA host alloc flags
32
+ HOST_ALLOC_DEFAULT = 0
33
+ HOST_ALLOC_PORTABLE = 1
34
+ HOST_ALLOC_MAPPED = 2
35
+ HOST_ALLOC_WRITECOMBINED = 4
36
+
37
+ # Resolve CUDA runtime library path at load time.
38
+ # Uses Ignis::Platform if available, falls back to OS detection.
39
+ CUDART_LIB = if defined?(Ignis::Platform)
40
+ Ignis::Platform.cudart_path
41
+ elsif RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
42
+ File.join('C:', 'Program Files', 'NVIDIA GPU Computing Toolkit',
43
+ 'CUDA', 'v13.0', 'bin', 'cudart64_130.dll')
44
+ else
45
+ 'libcudart.so.13'
46
+ end
47
+
48
+ @loaded = false
49
+ @handle = nil
50
+ @functions = {}
51
+
52
+ class << self
53
+ # @return [Fiddle::Handle, nil] the loaded DLL handle
54
+ attr_reader :handle
55
+
56
+ # Ensure the CUDA runtime is loaded and all functions are bound.
57
+ # @return [void]
58
+ # @raise [Fiddle::DLError] if the library cannot be loaded
59
+ def ensure_loaded!
60
+ return if @loaded
61
+
62
+ @handle = Fiddle::Handle.new(CUDART_LIB)
63
+ attach_all_functions!
64
+ @loaded = true
65
+ end
66
+
67
+ # @return [Boolean]
68
+ def loaded?
69
+ @loaded
70
+ end
71
+
72
+ # ================================================================
73
+ # Device Management
74
+ # ================================================================
75
+
76
# Query how many CUDA devices the runtime reports.
#
# @return [Integer] number of CUDA devices
# @raise [CudaRuntimeError] if cudaGetDeviceCount returns a non-zero status
def get_device_count
  ensure_loaded!
  # Pointer.malloc without a free function never releases the native buffer;
  # RUBY_FREE lets the GC reclaim it once the pointer object is collected.
  count_buf = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE)
  status = @functions[:cudaGetDeviceCount].call(count_buf)
  check_status!(status, 'cudaGetDeviceCount')
  # 'i' decodes a native C int, matching the SIZEOF_INT bytes sliced out.
  count_buf[0, Fiddle::SIZEOF_INT].unpack1('i')
end
84
+
85
# Ask the runtime which device is current for the calling thread.
#
# @return [Integer] current device index
# @raise [CudaRuntimeError] if cudaGetDevice returns a non-zero status
def get_device
  ensure_loaded!
  # Attach RUBY_FREE so the out-parameter buffer is released by the GC;
  # a bare Pointer.malloc(size) leaks the native allocation.
  index_buf = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE)
  status = @functions[:cudaGetDevice].call(index_buf)
  check_status!(status, 'cudaGetDevice')
  index_buf[0, Fiddle::SIZEOF_INT].unpack1('i')
end
93
+
94
+ # @param device [Integer] device index to set
95
+ # @return [void]
96
+ def set_device(device)
97
+ ensure_loaded!
98
+ status = @functions[:cudaSetDevice].call(device)
99
+ check_status!(status, 'cudaSetDevice')
100
+ end
101
+
102
+ # @return [void]
103
+ def device_synchronize
104
+ ensure_loaded!
105
+ status = @functions[:cudaDeviceSynchronize].call
106
+ check_status!(status, 'cudaDeviceSynchronize')
107
+ end
108
+
109
+ # @return [void]
110
+ def device_reset
111
+ ensure_loaded!
112
+ status = @functions[:cudaDeviceReset].call
113
+ check_status!(status, 'cudaDeviceReset')
114
+ end
115
+
116
# Read a single integer device attribute.
#
# @param attr_id [Integer] CUDA device attribute ID
# @param device [Integer] device index
# @return [Integer] attribute value
# @raise [CudaRuntimeError] if cudaDeviceGetAttribute returns a non-zero status
def device_get_attribute(attr_id, device)
  ensure_loaded!
  # RUBY_FREE makes the out-parameter buffer GC-managed; without a free
  # function Fiddle never releases it.
  value_buf = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE)
  status = @functions[:cudaDeviceGetAttribute].call(value_buf, attr_id, device)
  check_status!(status, 'cudaDeviceGetAttribute')
  value_buf[0, Fiddle::SIZEOF_INT].unpack1('i')
end
126
+
127
+ # ================================================================
128
+ # Memory Management (Hot Path)
129
+ # ================================================================
130
+
131
# Allocate device memory.
#
# @param size [Integer] bytes to allocate
# @return [Fiddle::Pointer] device pointer (no free function attached;
#   release it with {free})
# @raise [CudaRuntimeError] if cudaMalloc returns a non-zero status
def malloc(size)
  ensure_loaded!
  # Host-side slot that receives the device address. It must carry a free
  # function, otherwise every allocation leaks SIZEOF_VOIDP bytes of host
  # memory.
  addr_slot = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP, Fiddle::RUBY_FREE)
  status = @functions[:cudaMalloc].call(addr_slot, size)
  check_status!(status, "cudaMalloc(#{size})")
  # 'J' is pointer-width (uintptr_t), so it always matches SIZEOF_VOIDP.
  Fiddle::Pointer.new(addr_slot[0, Fiddle::SIZEOF_VOIDP].unpack1('J'))
end
141
+
142
+ # Free device memory.
143
+ # @param ptr [Fiddle::Pointer] device pointer to free
144
+ # @return [void]
145
+ def free(ptr)
146
+ ensure_loaded!
147
+ status = @functions[:cudaFree].call(ptr)
148
+ check_status!(status, 'cudaFree')
149
+ end
150
+
151
+ # Copy memory.
152
+ # @param dst [Fiddle::Pointer] destination
153
+ # @param src [Fiddle::Pointer] source
154
+ # @param count [Integer] bytes to copy
155
+ # @param kind [Integer] copy direction constant
156
+ # @return [void]
157
+ def memcpy(dst, src, count, kind)
158
+ ensure_loaded!
159
+ status = @functions[:cudaMemcpy].call(dst, src, count, kind)
160
+ check_status!(status, "cudaMemcpy(#{count} bytes, kind=#{kind})")
161
+ end
162
+
163
+ # Async memory copy.
164
+ # @param dst [Fiddle::Pointer]
165
+ # @param src [Fiddle::Pointer]
166
+ # @param count [Integer]
167
+ # @param kind [Integer]
168
+ # @param stream [Fiddle::Pointer] CUDA stream
169
+ # @return [void]
170
+ def memcpy_async(dst, src, count, kind, stream)
171
+ ensure_loaded!
172
+ status = @functions[:cudaMemcpyAsync].call(dst, src, count, kind, stream)
173
+ check_status!(status, "cudaMemcpyAsync(#{count} bytes)")
174
+ end
175
+
176
+ # Set device memory.
177
+ # @param ptr [Fiddle::Pointer] device pointer
178
+ # @param value [Integer] byte value to set
179
+ # @param count [Integer] bytes to set
180
+ # @return [void]
181
+ def memset(ptr, value, count)
182
+ ensure_loaded!
183
+ status = @functions[:cudaMemset].call(ptr, value, count)
184
+ check_status!(status, "cudaMemset(#{count} bytes)")
185
+ end
186
+
187
# Allocate pinned host memory.
#
# @param size [Integer] bytes
# @param flags [Integer] allocation flags (one of the HOST_ALLOC_* constants)
# @return [Fiddle::Pointer] host pointer (no free function attached;
#   release it with {free_host})
# @raise [CudaRuntimeError] if cudaHostAlloc returns a non-zero status
def host_alloc(size, flags = HOST_ALLOC_DEFAULT)
  ensure_loaded!
  # GC-managed slot for the returned address; a bare Pointer.malloc(size)
  # would leak it.
  addr_slot = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP, Fiddle::RUBY_FREE)
  status = @functions[:cudaHostAlloc].call(addr_slot, size, flags)
  check_status!(status, "cudaHostAlloc(#{size})")
  # 'J' is pointer-width, matching the SIZEOF_VOIDP bytes read.
  Fiddle::Pointer.new(addr_slot[0, Fiddle::SIZEOF_VOIDP].unpack1('J'))
end
198
+
199
+ # Free pinned host memory.
200
+ # @param ptr [Fiddle::Pointer]
201
+ # @return [void]
202
+ def free_host(ptr)
203
+ ensure_loaded!
204
+ status = @functions[:cudaFreeHost].call(ptr)
205
+ check_status!(status, 'cudaFreeHost')
206
+ end
207
+
208
# Query GPU memory info.
#
# @return [Hash] {free_bytes:, total_bytes:}
# @raise [CudaRuntimeError] if cudaMemGetInfo returns a non-zero status
def mem_get_info
  ensure_loaded!
  # Both out-parameters get RUBY_FREE so the GC releases them; without a
  # free function each call would leak two size_t-sized buffers.
  free_buf = Fiddle::Pointer.malloc(Fiddle::SIZEOF_SIZE_T, Fiddle::RUBY_FREE)
  total_buf = Fiddle::Pointer.malloc(Fiddle::SIZEOF_SIZE_T, Fiddle::RUBY_FREE)
  status = @functions[:cudaMemGetInfo].call(free_buf, total_buf)
  check_status!(status, 'cudaMemGetInfo')
  # 'J' is pointer-width unsigned.
  # NOTE(review): assumes size_t is pointer-width on every supported target.
  {
    free_bytes: free_buf[0, Fiddle::SIZEOF_SIZE_T].unpack1('J'),
    total_bytes: total_buf[0, Fiddle::SIZEOF_SIZE_T].unpack1('J')
  }
end
221
+
222
+ # ================================================================
223
+ # Stream-Ordered Memory (CUDA 11.2+)
224
+ # ================================================================
225
+
226
# Allocate device memory through cudaMallocAsync on the given stream.
#
# @param size [Integer] bytes
# @param stream [Fiddle::Pointer] CUDA stream
# @return [Fiddle::Pointer] device pointer
# @raise [CudaRuntimeError] if cudaMallocAsync returns a non-zero status
def malloc_async(size, stream)
  ensure_loaded!
  # GC-managed slot for the returned address; Pointer.malloc without a free
  # function leaks on every call.
  addr_slot = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP, Fiddle::RUBY_FREE)
  status = @functions[:cudaMallocAsync].call(addr_slot, size, stream)
  check_status!(status, "cudaMallocAsync(#{size})")
  # 'J' is pointer-width, matching the SIZEOF_VOIDP bytes read.
  Fiddle::Pointer.new(addr_slot[0, Fiddle::SIZEOF_VOIDP].unpack1('J'))
end
236
+
237
+ # @param ptr [Fiddle::Pointer] device pointer
238
+ # @param stream [Fiddle::Pointer] CUDA stream
239
+ # @return [void]
240
+ def free_async(ptr, stream)
241
+ ensure_loaded!
242
+ status = @functions[:cudaFreeAsync].call(ptr, stream)
243
+ check_status!(status, 'cudaFreeAsync')
244
+ end
245
+
246
+ # ================================================================
247
+ # Stream Management
248
+ # ================================================================
249
+
250
# Create a CUDA stream.
#
# @return [Fiddle::Pointer] stream handle
# @raise [CudaRuntimeError] if cudaStreamCreate returns a non-zero status
def stream_create
  ensure_loaded!
  # GC-managed slot that receives the handle; a bare Pointer.malloc(size)
  # would never be released.
  handle_slot = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP, Fiddle::RUBY_FREE)
  status = @functions[:cudaStreamCreate].call(handle_slot)
  check_status!(status, 'cudaStreamCreate')
  # 'J' is pointer-width, matching the SIZEOF_VOIDP bytes read.
  Fiddle::Pointer.new(handle_slot[0, Fiddle::SIZEOF_VOIDP].unpack1('J'))
end
259
+
260
+ # Destroy a CUDA stream.
261
+ # @param stream [Fiddle::Pointer]
262
+ # @return [void]
263
+ def stream_destroy(stream)
264
+ ensure_loaded!
265
+ status = @functions[:cudaStreamDestroy].call(stream)
266
+ check_status!(status, 'cudaStreamDestroy')
267
+ end
268
+
269
+ # Synchronize a stream (blocks until all commands complete).
270
+ # @param stream [Fiddle::Pointer]
271
+ # @return [void]
272
+ def stream_synchronize(stream)
273
+ ensure_loaded!
274
+ status = @functions[:cudaStreamSynchronize].call(stream)
275
+ check_status!(status, 'cudaStreamSynchronize')
276
+ end
277
+
278
# Query stream completion status without blocking.
#
# Only cudaErrorNotReady means "still running". Any other non-zero status is
# a real failure and is raised, so a polling caller cannot spin forever on a
# broken stream.
#
# @param stream [Fiddle::Pointer]
# @return [Boolean] true if all work complete, false if work is still pending
# @raise [CudaRuntimeError] if cudaStreamQuery reports an error other than
#   cudaErrorNotReady
def stream_query(stream)
  ensure_loaded!
  # NOTE(review): 600 is cudaErrorNotReady in CUDA 10.1+ headers; confirm
  # against the toolkit version this gem targets.
  not_ready = 600
  status = @functions[:cudaStreamQuery].call(stream)
  return true if status.zero?
  return false if status == not_ready

  check_status!(status, 'cudaStreamQuery')
  false
end
286
+
287
+ # ================================================================
288
+ # Event Management
289
+ # ================================================================
290
+
291
# Create a CUDA event.
#
# @return [Fiddle::Pointer] event handle
# @raise [CudaRuntimeError] if cudaEventCreate returns a non-zero status
def event_create
  ensure_loaded!
  # GC-managed slot that receives the handle; without a free function the
  # native buffer would leak on every call.
  handle_slot = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP, Fiddle::RUBY_FREE)
  status = @functions[:cudaEventCreate].call(handle_slot)
  check_status!(status, 'cudaEventCreate')
  # 'J' is pointer-width, matching the SIZEOF_VOIDP bytes read.
  Fiddle::Pointer.new(handle_slot[0, Fiddle::SIZEOF_VOIDP].unpack1('J'))
end
300
+
301
+ # Destroy a CUDA event.
302
+ # @param event [Fiddle::Pointer]
303
+ # @return [void]
304
+ def event_destroy(event)
305
+ ensure_loaded!
306
+ status = @functions[:cudaEventDestroy].call(event)
307
+ check_status!(status, 'cudaEventDestroy')
308
+ end
309
+
310
+ # Record an event in a stream.
311
+ # @param event [Fiddle::Pointer]
312
+ # @param stream [Fiddle::Pointer]
313
+ # @return [void]
314
+ def event_record(event, stream)
315
+ ensure_loaded!
316
+ status = @functions[:cudaEventRecord].call(event, stream)
317
+ check_status!(status, 'cudaEventRecord')
318
+ end
319
+
320
+ # Block until event completes.
321
+ # @param event [Fiddle::Pointer]
322
+ # @return [void]
323
+ def event_synchronize(event)
324
+ ensure_loaded!
325
+ status = @functions[:cudaEventSynchronize].call(event)
326
+ check_status!(status, 'cudaEventSynchronize')
327
+ end
328
+
329
# Compute elapsed time between two events.
#
# @param start_event [Fiddle::Pointer]
# @param end_event [Fiddle::Pointer]
# @return [Float] elapsed time in milliseconds
# @raise [CudaRuntimeError] if cudaEventElapsedTime returns a non-zero status
def event_elapsed_time(start_event, end_event)
  ensure_loaded!
  # GC-managed out-parameter; a bare Pointer.malloc(size) is never freed.
  elapsed_buf = Fiddle::Pointer.malloc(Fiddle::SIZEOF_FLOAT, Fiddle::RUBY_FREE)
  status = @functions[:cudaEventElapsedTime].call(elapsed_buf, start_event, end_event)
  check_status!(status, 'cudaEventElapsedTime')
  # 'f' decodes a native-endian C float, which is what the callee writes.
  elapsed_buf[0, Fiddle::SIZEOF_FLOAT].unpack1('f')
end
340
+
341
+ # ================================================================
342
+ # Error Handling
343
+ # ================================================================
344
+
345
+ # @return [Integer] last error code
346
+ def get_last_error
347
+ ensure_loaded!
348
+ @functions[:cudaGetLastError].call
349
+ end
350
+
351
+ # @return [Integer] last error without clearing
352
+ def peek_at_last_error
353
+ ensure_loaded!
354
+ @functions[:cudaPeekAtLastError].call
355
+ end
356
+
357
+ # Get error string for a status code.
358
+ # @param status [Integer]
359
+ # @return [String] error description
360
+ def get_error_string(status)
361
+ ensure_loaded!
362
+ ptr = @functions[:cudaGetErrorString].call(status)
363
+ Fiddle::Pointer.new(ptr).to_s
364
+ end
365
+
366
+ # Get error name for a status code.
367
+ # @param status [Integer]
368
+ # @return [String] error name
369
+ def get_error_name(status)
370
+ ensure_loaded!
371
+ ptr = @functions[:cudaGetErrorName].call(status)
372
+ Fiddle::Pointer.new(ptr).to_s
373
+ end
374
+
375
+ # ================================================================
376
+ # Version Info
377
+ # ================================================================
378
+
379
# Read the version number reported by cudaRuntimeGetVersion.
#
# @return [Integer] CUDA runtime version
# @raise [CudaRuntimeError] if cudaRuntimeGetVersion returns a non-zero status
def runtime_version
  ensure_loaded!
  # RUBY_FREE lets the GC release the out-parameter buffer; without it the
  # native allocation leaks.
  version_buf = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE)
  status = @functions[:cudaRuntimeGetVersion].call(version_buf)
  check_status!(status, 'cudaRuntimeGetVersion')
  version_buf[0, Fiddle::SIZEOF_INT].unpack1('i')
end
387
+
388
# Read the version number reported by cudaDriverGetVersion.
#
# @return [Integer] CUDA driver version
# @raise [CudaRuntimeError] if cudaDriverGetVersion returns a non-zero status
def driver_version
  ensure_loaded!
  # RUBY_FREE lets the GC release the out-parameter buffer; without it the
  # native allocation leaks.
  version_buf = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE)
  status = @functions[:cudaDriverGetVersion].call(version_buf)
  check_status!(status, 'cudaDriverGetVersion')
  version_buf[0, Fiddle::SIZEOF_INT].unpack1('i')
end
396
+
397
+ # Check CUDA status and raise error if not success.
398
+ # @param status [Integer]
399
+ # @param context [String]
400
+ # @return [void]
401
+ # @raise [CudaRuntimeError]
402
+ def check_status!(status, context = 'CUDA operation')
403
+ return if status.zero?
404
+
405
+ ensure_loaded!
406
+ error_name = get_error_name(status)
407
+ error_string = get_error_string(status)
408
+ raise CudaRuntimeError.new(
409
+ "#{context}: #{error_name} - #{error_string}",
410
+ cuda_code: status
411
+ )
412
+ end
413
+
414
+ private
415
+
416
# Bind every CUDA runtime entry point this module uses.
#
# The signatures live in one table (symbol => [argument types, return type])
# and are handed to {bind} in declaration order.
#
# @return [void]
def attach_all_functions!
  int = Fiddle::TYPE_INT
  ptr = Fiddle::TYPE_VOIDP
  len = Fiddle::TYPE_SIZE_T

  signatures = {
    # Device management
    cudaGetDeviceCount: [[ptr], int],
    cudaGetDevice: [[ptr], int],
    cudaSetDevice: [[int], int],
    cudaDeviceSynchronize: [[], int],
    cudaDeviceReset: [[], int],
    cudaDeviceGetAttribute: [[ptr, int, int], int],
    cudaGetDeviceProperties: [[ptr, int], int],

    # Memory management (hot path)
    cudaMalloc: [[ptr, len], int],
    cudaFree: [[ptr], int],
    cudaMemcpy: [[ptr, ptr, len, int], int],
    cudaMemcpyAsync: [[ptr, ptr, len, int, ptr], int],
    cudaMemset: [[ptr, int, len], int],
    cudaMemsetAsync: [[ptr, int, len, ptr], int],
    cudaHostAlloc: [[ptr, len, int], int],
    cudaFreeHost: [[ptr], int],
    cudaMemGetInfo: [[ptr, ptr], int],
    cudaHostRegister: [[ptr, len, int], int],
    cudaHostUnregister: [[ptr], int],
    cudaMallocHost: [[ptr, len], int],

    # Stream-ordered memory
    cudaMallocAsync: [[ptr, len, ptr], int],
    cudaFreeAsync: [[ptr, ptr], int],
    cudaMallocFromPoolAsync: [[ptr, len, ptr, ptr], int],
    cudaMemPoolCreate: [[ptr, ptr], int],
    cudaMemPoolDestroy: [[ptr], int],
    cudaMemPoolSetAttribute: [[ptr, int, ptr], int],
    cudaMemPoolGetAttribute: [[ptr, int, ptr], int],
    cudaDeviceGetDefaultMemPool: [[ptr, int], int],
    cudaDeviceSetMemPool: [[int, ptr], int],

    # Stream management
    cudaStreamCreate: [[ptr], int],
    cudaStreamDestroy: [[ptr], int],
    cudaStreamSynchronize: [[ptr], int],
    cudaStreamQuery: [[ptr], int],

    # Event management
    cudaEventCreate: [[ptr], int],
    cudaEventDestroy: [[ptr], int],
    cudaEventRecord: [[ptr, ptr], int],
    cudaEventSynchronize: [[ptr], int],
    cudaEventElapsedTime: [[ptr, ptr, ptr], int],

    # Error handling (these two return a C string pointer)
    cudaGetLastError: [[], int],
    cudaPeekAtLastError: [[], int],
    cudaGetErrorString: [[int], ptr],
    cudaGetErrorName: [[int], ptr],

    # Version
    cudaRuntimeGetVersion: [[ptr], int],
    cudaDriverGetVersion: [[ptr], int]
  }

  # `.last` keeps the original return value (the result of the final bind).
  signatures.map { |symbol, (params, result)| bind(symbol, params, result) }.last
end
476
+
477
# Bind a single CUDA runtime function via Fiddle.
#
# Looks the symbol up in the loaded library handle and stores the resulting
# Fiddle::Function in @functions under +name+. If the lookup raises
# Fiddle::DLError, a warning is written to $stderr and @functions[name] is
# left unset, so loading continues without that entry point.
#
# @param name [Symbol] function name
# @param arg_types [Array<Integer>] Fiddle type constants for arguments
# @param ret_type [Integer] Fiddle type constant for return
# @return [Fiddle::Function, nil] the bound function, or nil if the symbol is missing
def bind(name, arg_types, ret_type)
  @functions[name] = Fiddle::Function.new(
    @handle[name.to_s],
    arg_types,
    ret_type
  )
rescue Fiddle::DLError
  # The library path constant is CUDART_LIB; the previous CUDART_DLL does not
  # exist, so this branch raised NameError instead of printing the warning.
  $stderr.puts "[RuntimeAPI] WARNING: #{name} not found in #{CUDART_LIB}"
end
490
+
491
+ # ================================================================
492
+ # Backward-Compatible Shims
493
+ # ================================================================
494
+ # These methods maintain the old RuntimeAPI.cudaXxx call signatures
495
+ # so existing collective/transport code continues to work without
496
+ # modification. They delegate to the new Fiddle-based methods above.
497
+ # The raw Fiddle function is available for callers that need status
498
+ # codes instead of exceptions.
499
+
500
+ public
501
+
502
+ # @!group Backward-Compatible Shims
503
+
504
+ # Raw CUDA function calls returning status codes (for legacy callers).
505
+ # These call the underlying Fiddle::Function directly.
506
+
507
+ # @param device_id [Integer]
508
+ # @return [Integer] CUDA status code
509
+ def cudaSetDevice(device_id)
510
+ ensure_loaded!
511
+ @functions[:cudaSetDevice].call(device_id)
512
+ end
513
+
514
+ # @return [Integer] status
515
+ def cudaDeviceSynchronize
516
+ ensure_loaded!
517
+ @functions[:cudaDeviceSynchronize].call
518
+ end
519
+
520
+ # @return [Integer] status
521
+ def cudaDeviceReset
522
+ ensure_loaded!
523
+ @functions[:cudaDeviceReset].call
524
+ end
525
+
526
+ # @param ptr_out [Fiddle::Pointer, FFI::Pointer] pointer-to-pointer
527
+ # @return [Integer] status
528
+ def cudaGetDeviceCount(ptr_out)
529
+ ensure_loaded!
530
+ @functions[:cudaGetDeviceCount].call(ptr_out)
531
+ end
532
+
533
+ # @param ptr_out [Fiddle::Pointer, FFI::Pointer] pointer-to-int
534
+ # @return [Integer] status
535
+ def cudaGetDevice(ptr_out)
536
+ ensure_loaded!
537
+ @functions[:cudaGetDevice].call(ptr_out)
538
+ end
539
+
540
+ # @param ptr_out [Fiddle::Pointer] pointer-to-int
541
+ # @param attr_id [Integer]
542
+ # @param device [Integer]
543
+ # @return [Integer] status
544
+ def cudaDeviceGetAttribute(ptr_out, attr_id, device)
545
+ ensure_loaded!
546
+ @functions[:cudaDeviceGetAttribute].call(ptr_out, attr_id, device)
547
+ end
548
+
549
+ # @param ptr_ptr [Fiddle::Pointer] pointer-to-pointer
550
+ # @param size [Integer]
551
+ # @return [Integer] status
552
+ def cudaMalloc(ptr_ptr, size)
553
+ ensure_loaded!
554
+ @functions[:cudaMalloc].call(ptr_ptr, size)
555
+ end
556
+
557
+ # @param ptr [Fiddle::Pointer]
558
+ # @return [Integer] status
559
+ def cudaFree(ptr)
560
+ ensure_loaded!
561
+ @functions[:cudaFree].call(ptr)
562
+ end
563
+
564
+ # @return [Integer] status
565
+ def cudaMemcpy(dst, src, count, kind)
566
+ ensure_loaded!
567
+ kind_int = resolve_memcpy_kind(kind)
568
+ @functions[:cudaMemcpy].call(dst, src, count, kind_int)
569
+ end
570
+
571
+ # @return [Integer] status
572
+ def cudaMemcpyAsync(dst, src, count, kind, stream)
573
+ ensure_loaded!
574
+ kind_int = resolve_memcpy_kind(kind)
575
+ @functions[:cudaMemcpyAsync].call(dst, src, count, kind_int, stream)
576
+ end
577
+
578
+ # @return [Integer] status
579
+ def cudaMemset(ptr, value, count)
580
+ ensure_loaded!
581
+ @functions[:cudaMemset].call(ptr, value, count)
582
+ end
583
+
584
+ # @return [Integer] status
585
+ def cudaMemsetAsync(ptr, value, count, stream)
586
+ ensure_loaded!
587
+ @functions[:cudaMemsetAsync].call(ptr, value, count, stream)
588
+ end
589
+
590
+ # @return [Integer] status
591
+ def cudaMemGetInfo(free_ptr, total_ptr)
592
+ ensure_loaded!
593
+ @functions[:cudaMemGetInfo].call(free_ptr, total_ptr)
594
+ end
595
+
596
+ # @param ptr_ptr [Fiddle::Pointer] pointer-to-pointer
597
+ # @param size [Integer]
598
+ # @param flags [Integer]
599
+ # @return [Integer] status
600
+ def cudaHostAlloc(ptr_ptr, size, flags)
601
+ ensure_loaded!
602
+ @functions[:cudaHostAlloc].call(ptr_ptr, size, flags)
603
+ end
604
+
605
+ # @param ptr [Fiddle::Pointer]
606
+ # @return [Integer] status
607
+ def cudaFreeHost(ptr)
608
+ ensure_loaded!
609
+ @functions[:cudaFreeHost].call(ptr)
610
+ end
611
+
612
+ # @return [Integer] status
613
+ def cudaStreamCreate(ptr_out)
614
+ ensure_loaded!
615
+ @functions[:cudaStreamCreate].call(ptr_out)
616
+ end
617
+
618
+ # @return [Integer] status
619
+ def cudaStreamDestroy(stream)
620
+ ensure_loaded!
621
+ @functions[:cudaStreamDestroy].call(stream)
622
+ end
623
+
624
+ # @return [Integer] status
625
+ def cudaStreamSynchronize(stream)
626
+ ensure_loaded!
627
+ @functions[:cudaStreamSynchronize].call(stream)
628
+ end
629
+
630
+ # @return [Integer] status
631
+ def cudaEventCreate(ptr_out)
632
+ ensure_loaded!
633
+ @functions[:cudaEventCreate].call(ptr_out)
634
+ end
635
+
636
+ # @return [Integer] status
637
+ def cudaEventDestroy(event)
638
+ ensure_loaded!
639
+ @functions[:cudaEventDestroy].call(event)
640
+ end
641
+
642
+ # @return [Integer] status
643
+ def cudaEventRecord(event, stream)
644
+ ensure_loaded!
645
+ @functions[:cudaEventRecord].call(event, stream)
646
+ end
647
+
648
+ # @return [Integer] status
649
+ def cudaEventSynchronize(event)
650
+ ensure_loaded!
651
+ @functions[:cudaEventSynchronize].call(event)
652
+ end
653
+
654
+ # @return [Integer] status
655
+ def cudaEventElapsedTime(ms_ptr, start_event, end_event)
656
+ ensure_loaded!
657
+ @functions[:cudaEventElapsedTime].call(ms_ptr, start_event, end_event)
658
+ end
659
+
660
+ # @return [Integer] status
661
+ def cudaGetLastError
662
+ ensure_loaded!
663
+ @functions[:cudaGetLastError].call
664
+ end
665
+
666
+ # @return [Integer] status
667
+ def cudaPeekAtLastError
668
+ ensure_loaded!
669
+ @functions[:cudaPeekAtLastError].call
670
+ end
671
+
672
+ # @param status [Integer]
673
+ # @return [String]
674
+ def cudaGetErrorString(status)
675
+ get_error_string(status)
676
+ end
677
+
678
+ # @param status [Integer]
679
+ # @return [String]
680
+ def cudaGetErrorName(status)
681
+ get_error_name(status)
682
+ end
683
+
684
+ # @param ptr [Fiddle::Pointer] host pointer to register
685
+ # @param size [Integer] bytes
686
+ # @param flags [Integer]
687
+ # @return [Integer] status
688
+ def cudaHostRegister(ptr, size, flags)
689
+ ensure_loaded!
690
+ @functions[:cudaHostRegister]&.call(ptr, size, flags) || 0
691
+ end
692
+
693
+ # @param ptr [Fiddle::Pointer] host pointer to unregister
694
+ # @return [Integer] status
695
+ def cudaHostUnregister(ptr)
696
+ ensure_loaded!
697
+ @functions[:cudaHostUnregister]&.call(ptr) || 0
698
+ end
699
+
700
+ # @param ptr [Fiddle::Pointer] pointer-to-pointer
701
+ # @param size [Integer]
702
+ # @return [Integer] status
703
+ def cudaMallocHost(ptr_ptr, size)
704
+ ensure_loaded!
705
+ @functions[:cudaMallocHost]&.call(ptr_ptr, size) || cudaHostAlloc(ptr_ptr, size, 0)
706
+ end
707
+
708
+ # @param ptr_out [Fiddle::Pointer]
709
+ # @param size [Integer]
710
+ # @param stream [Fiddle::Pointer]
711
+ # @return [Integer] status
712
+ def cudaMallocAsync(ptr_out, size, stream)
713
+ ensure_loaded!
714
+ @functions[:cudaMallocAsync].call(ptr_out, size, stream)
715
+ end
716
+
717
+ # @param ptr [Fiddle::Pointer]
718
+ # @param stream [Fiddle::Pointer]
719
+ # @return [Integer] status
720
+ def cudaFreeAsync(ptr, stream)
721
+ ensure_loaded!
722
+ @functions[:cudaFreeAsync].call(ptr, stream)
723
+ end
724
+
725
+ # @param ptr_out [Fiddle::Pointer] pointer to store version
726
+ # @return [Integer] status
727
+ def cudaRuntimeGetVersion(ptr_out)
728
+ ensure_loaded!
729
+ @functions[:cudaRuntimeGetVersion].call(ptr_out)
730
+ end
731
+
732
+ # @return [Integer] status
733
+ def cudaMemPoolDestroy(pool)
734
+ ensure_loaded!
735
+ @functions[:cudaMemPoolDestroy]&.call(pool) || 0
736
+ end
737
+
738
+ # @return [Integer] status
739
+ def cudaMallocFromPoolAsync(ptr_ptr, size, pool, stream)
740
+ ensure_loaded!
741
+ @functions[:cudaMallocFromPoolAsync]&.call(ptr_ptr, size, pool, stream) || 0
742
+ end
743
+
744
+ # @return [Integer] status
745
+ def cudaDeviceGetDefaultMemPool(pool_ptr, device)
746
+ ensure_loaded!
747
+ @functions[:cudaDeviceGetDefaultMemPool]&.call(pool_ptr, device) || 0
748
+ end
749
+
750
+ # @return [Integer] status
751
+ def cudaMemPoolSetAttribute(pool, attr, value_ptr)
752
+ ensure_loaded!
753
+ @functions[:cudaMemPoolSetAttribute]&.call(pool, attr, value_ptr) || 0
754
+ end
755
+
756
+ # @return [Integer] status
757
+ def cudaMemPoolGetAttribute(pool, attr, value_ptr)
758
+ ensure_loaded!
759
+ @functions[:cudaMemPoolGetAttribute]&.call(pool, attr, value_ptr) || 0
760
+ end
761
+
762
+ # @return [Integer] status
763
+ def cudaDeviceSetMemPool(device, pool)
764
+ ensure_loaded!
765
+ @functions[:cudaDeviceSetMemPool]&.call(device, pool) || 0
766
+ end
767
+
768
+ # @return [Integer] status
769
+ def cudaMemPoolCreate(pool_ptr, props)
770
+ ensure_loaded!
771
+ @functions[:cudaMemPoolCreate]&.call(pool_ptr, props) || 0
772
+ end
773
+
774
+ # @return [Integer] status
775
+ def cudaGetDeviceProperties(prop_ptr, device)
776
+ ensure_loaded!
777
+ @functions[:cudaGetDeviceProperties]&.call(prop_ptr, device) || 0
778
+ end
779
+
780
+ # @!endgroup
781
+
782
+ private
783
+
784
# Resolve symbolic or integer memcpy kind.
#
# Integers pass through unchanged and the known symbols map to the MEMCPY_*
# constants. Any other value is converted with #to_i when it supports that.
# A value that cannot be converted (for example a misspelled symbol) raises
# ArgumentError naming the bad value.
#
# @param kind [Symbol, Integer]
# @return [Integer]
# @raise [ArgumentError] if kind is neither a known symbol nor convertible
#   with #to_i
def resolve_memcpy_kind(kind)
  case kind
  when Integer then kind
  when :host_to_host then MEMCPY_HOST_TO_HOST
  when :host_to_device then MEMCPY_HOST_TO_DEVICE
  when :device_to_host then MEMCPY_DEVICE_TO_HOST
  when :device_to_device then MEMCPY_DEVICE_TO_DEVICE
  when :default then MEMCPY_DEFAULT
  else
    # Symbol has no #to_i, so an unknown symbol used to fail with NoMethodError.
    raise ArgumentError, "unknown memcpy kind: #{kind.inspect}" unless kind.respond_to?(:to_i)

    kind.to_i
  end
end
798
+ end
799
+
800
+ # cudaHostRegister / cudaHostUnregister are bound in attach_all_functions!
801
+ # together with the other memory-management functions; nothing else is needed here.
802
+ end
803
+ end
804
+ end