RubyGems - ignis-collective - Versions diffs - 0.0.1 - Mend

ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

checksums.yaml +7 -0
data/README.md +7 -0
data/lib/ignis-collective.rb +9 -0
data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
data/lib/nvruby/collective/algorithms/ring.rb +421 -0
data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
data/lib/nvruby/collective/algorithms/tree.rb +291 -0
data/lib/nvruby/collective/array_ops.rb +240 -0
data/lib/nvruby/collective/communicator.rb +633 -0
data/lib/nvruby/collective/communicator_healer.rb +276 -0
data/lib/nvruby/collective/device_manager.rb +216 -0
data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
data/lib/nvruby/collective/health_monitor.rb +333 -0
data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
data/lib/nvruby/collective/p2p_bindings.rb +121 -0
data/lib/nvruby/collective/resilient_transport.rb +296 -0
data/lib/nvruby/collective/topology.rb +347 -0
data/lib/nvruby/collective/transport/base.rb +138 -0
data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
data/lib/nvruby/collective/transport_selector.rb +200 -0
data/lib/nvruby/collective/vmm_bindings.rb +212 -0
data/lib/nvruby/collective.rb +156 -0
metadata +92 -0

data/lib/nvruby/collective/algorithms/reduction_ops.rb ADDED Viewed

@@ -0,0 +1,168 @@
+# frozen_string_literal: true
+require "ignis"
+module Ignis
+  module Collective
+    module Algorithms
+      # Reduction operations for collective primitives
+      # These operations combine tensor elements during reduce/allreduce
+      module ReductionOps
+        # Valid reduction operations.
+        OPS = %i[sum prod min max avg].freeze
+        # Element-wise sum: result = a + b.
+        # Thin wrapper that forwards every argument to execute with op = :sum;
+        # the parameters are exactly those of execute minus the leading op.
+        # @param a [FFI::Pointer] First operand (device pointer)
+        # @param b [FFI::Pointer] Second operand (device pointer)
+        # @param result [FFI::Pointer] Result buffer
+        # @param count [Integer] Element count
+        # @param dtype [Symbol] Data type
+        # @param stream [FFI::Pointer, nil] CUDA stream (forwarded unchanged)
+        def self.sum(a, b, result, count, dtype, stream = nil)
+          execute(:sum, a, b, result, count, dtype, stream)
+        end
+        # Element-wise product: result = a * b.
+        # Thin wrapper that forwards every argument to execute with op = :prod;
+        # the parameters are exactly those of execute minus the leading op.
+        def self.prod(a, b, result, count, dtype, stream = nil)
+          execute(:prod, a, b, result, count, dtype, stream)
+        end
+        # Element-wise minimum: result = min(a, b).
+        # Thin wrapper that forwards every argument to execute with op = :min;
+        # the parameters are exactly those of execute minus the leading op.
+        def self.min(a, b, result, count, dtype, stream = nil)
+          execute(:min, a, b, result, count, dtype, stream)
+        end
+        # Element-wise maximum: result = max(a, b).
+        # Thin wrapper that forwards every argument to execute with op = :max;
+        # the parameters are exactly those of execute minus the leading op.
+        def self.max(a, b, result, count, dtype, stream = nil)
+          execute(:max, a, b, result, count, dtype, stream)
+        end
+        # Average step. NOTE: averaging is "sum across all ranks, then divide by
+        # the participant count ONCE at the end". The per-pair reduction step is
+        # therefore a plain sum; the caller (Communicator) performs the final
+        # divide-by-N. (Previously this silently returned a sum with no divide.)
+        #
+        # The remaining parameters are those of execute minus the leading op.
+        # @param _n_participants [Integer, nil] Accepted for interface
+        #   compatibility only; this method never reads it and performs no division.
+        def self.avg(a, b, result, count, dtype, stream = nil, _n_participants = nil)
+          execute(:sum, a, b, result, count, dtype, stream)
+        end
+        # Execute reduction operation by name: result = op(a, b), elementwise.
+        # @param op [Symbol] :sum, :prod, :min, :max, or :avg (avg == sum per step)
+        # @param a [FFI::Pointer] First operand (device pointer)
+        # @param b [FFI::Pointer] Second operand (device pointer)
+        # @param result [FFI::Pointer] Result buffer (may alias a for in-place)
+        # @param count [Integer] Element count
+        # @param dtype [Symbol] Data type
+        # @param stream [FFI::Pointer, nil] CUDA stream
+        # @return [void]
+        def self.execute(op, a, b, result, count, dtype, stream = nil)
+          reduce = (op == :avg ? :sum : op)
+          raise ArgumentError, "Unknown reduction operation: #{op}" unless %i[sum prod min max].include?(reduce)
+          return if count.zero?
+          if dtype == :float32
+            gpu_elementwise(reduce, a, b, result, count)
+          else
+            # Non-fp32 dtypes use the (correct, slower) host path: the fused JIT
+            # kernels are typed `float`, so reinterpreting fp16/fp64/int buffers
+            # through them would be wrong.
+            host_elementwise_fallback(host_op(reduce), a, b, result, count, dtype)
+          end
+        end
+        class << self
+          private
+          # GPU elementwise reduction for float32 via the fused JIT kernels.
+          def gpu_elementwise(op, a, b, result, count)
+            kernel = case op
+                     when :sum  then Ignis::JIT::Kernels::Elementwise.add_forward
+                     when :prod then Ignis::JIT::Kernels::Elementwise.mul_forward
+                     when :min  then Ignis::JIT::Kernels::Elementwise.min_forward
+                     when :max  then Ignis::JIT::Kernels::Elementwise.max_forward
+                     end
+            kernel.launch(grid: [(count + 255) / 256], block: [256], args: [a, b, result, count])
+            Ignis.synchronize
+          end
+          # Map a reduction op to the host-fallback op name.
+          def host_op(op)
+            op == :sum ? :add : op
+          end
+          # Fallback host-side elementwise (for when NVRTC unavailable)
+          def host_elementwise_fallback(op, a, b, result, count, dtype)
+            elem_size = dtype_size(dtype)
+            total_size = count * elem_size
+            # Allocate host buffers
+            host_a = FFI::MemoryPointer.new(:uint8, total_size)
+            host_b = FFI::MemoryPointer.new(:uint8, total_size)
+            host_result = FFI::MemoryPointer.new(:uint8, total_size)
+            # Copy from device to host
+            CUDA::RuntimeAPI.cudaMemcpy(host_a, a, total_size, CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_HOST)
+            CUDA::RuntimeAPI.cudaMemcpy(host_b, b, total_size, CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_HOST)
+            # Perform operation
+            count.times do |i|
+              offset = i * elem_size
+              val_a = read_element(host_a, offset, dtype)
+              val_b = read_element(host_b, offset, dtype)
+              val_result = case op
+                           when :add then val_a + val_b
+                           when :mul then val_a * val_b
+                           when :min then [val_a, val_b].min
+                           when :max then [val_a, val_b].max
+                           end
+              write_element(host_result, offset, val_result, dtype)
+            end
+            # Copy back to device
+            CUDA::RuntimeAPI.cudaMemcpy(result, host_result, total_size, CUDA::RuntimeAPI::MEMCPY_HOST_TO_DEVICE)
+          end
+          # Get size of dtype in bytes
+          def dtype_size(dtype)
+            case dtype
+            when :float32, :int32, :uint32 then 4
+            when :float64, :int64, :uint64 then 8
+            when :float16, :bfloat16, :int16, :uint16 then 2
+            when :int8, :uint8 then 1
+            else 4  # Default to float32
+            end
+          end
+          # Convert Ruby dtype to C type string
+          def dtype_to_ctype(dtype)
+            case dtype
+            when :float32 then "float"
+            when :float64 then "double"
+            when :float16 then "__half"
+            when :int32 then "int"
+            when :int64 then "long long"
+            else "float"
+            end
+          end
+          # Read element from host buffer
+          def read_element(buffer, offset, dtype)
+            case dtype
+            when :float32 then buffer.get_float32(offset)
+            when :float64 then buffer.get_float64(offset)
+            when :int32 then buffer.get_int32(offset)
+            when :int64 then buffer.get_int64(offset)
+            else buffer.get_float32(offset)
+            end
+          end
+          # Write element to host buffer
+          def write_element(buffer, offset, value, dtype)
+            case dtype
+            when :float32 then buffer.put_float32(offset, value)
+            when :float64 then buffer.put_float64(offset, value)
+            when :int32 then buffer.put_int32(offset, value.to_i)
+            when :int64 then buffer.put_int64(offset, value.to_i)
+            else buffer.put_float32(offset, value)
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/nvruby/collective/algorithms/ring.rb ADDED Viewed

@@ -0,0 +1,421 @@
+# frozen_string_literal: true
+require_relative "reduction_ops"
+module Ignis
+  module Collective
+    module Algorithms
+      # Ring AllReduce algorithm implementation
+      #
+      # The Ring algorithm performs AllReduce in 2*(N-1) steps where N is number of GPUs:
+      # 1. Scatter-Reduce phase: N-1 steps, each GPU sends a chunk and receives+reduces another
+      # 2. AllGather phase: N-1 steps, each GPU sends its reduced chunk and receives another
+      #
+      # Bandwidth complexity: 2 * (N-1)/N * data_size (asymptotically optimal)
+      # Latency complexity: 2 * (N-1) * alpha (linear in N)
+      #
+      # Best for: Large messages (>1MB) where bandwidth dominates latency
+      class Ring
+        # Chunk metadata for pipelining
+        ChunkInfo = Struct.new(:chunk_id, :offset, :size, :src_rank, :dst_rank, keyword_init: true)
+        # @return [Array<Integer>] Ring order (GPU IDs in ring sequence)
+        attr_reader :ring_order
+        # @return [Integer] Number of participants
+        attr_reader :n_gpus
+        # @return [TransportSelector] Transport selector for GPU pairs
+        attr_reader :transport_selector
+        # @param ring_order [Array<Integer>] GPU IDs in ring order
+        # @param transport_selector [TransportSelector] Transport selector
+        def initialize(ring_order:, transport_selector:)
+          @ring_order = ring_order.dup.freeze
+          @n_gpus = ring_order.size
+          @transport_selector = transport_selector
+          @chunk_counts = {}
+        end
+        # Perform Ring AllReduce
+        #
+        # Works in place on +buffers+: a scatter-reduce phase followed by an
+        # all-gather phase, using one temporary receive buffer per GPU that is
+        # freed again on every exit path.
+        #
+        # @param buffers [Array<FFI::Pointer>] Device buffers (one per GPU in ring_order)
+        # @param sizes [Array<Integer>] Buffer sizes in bytes (must all be equal;
+        #   sizes[0] drives the chunk layout)
+        # @param dtype [Symbol] Data type (:float32, :float64, etc.)
+        # @param op [Symbol] Reduction operation (:sum, :prod, :min, :max).
+        #   :avg is reduced as a plain sum by ReductionOps.execute; no division
+        #   is performed here.
+        # @param streams [Array<CUDA::Stream, FFI::Pointer>] CUDA streams per GPU
+        # @raise [ArgumentError] if the number of buffers, sizes or streams differs
+        #   from the ring size, or the sizes are not all equal
+        # @return [void]
+        def all_reduce(buffers:, sizes:, dtype:, op:, streams:)
+          validate_inputs!(buffers, sizes, streams)
+          return if @n_gpus == 1  # Single GPU - no-op
+          # Even element-wise chunk layout (handles non-divisible sizes without
+          # overrunning the buffer on the last chunk).
+          total_size = sizes[0]
+          layout = chunk_layout(total_size, dtype_elem_size(dtype))
+          # Allocate temp buffers for receive (full buffer size on every GPU, so
+          # chunks can land at the same byte offset they occupy in the source)
+          recv_buffers = allocate_recv_buffers(total_size)
+          begin
+            # Phase 1: Scatter-Reduce
+            scatter_reduce!(buffers, recv_buffers, layout, dtype, op, streams)
+            # Phase 2: AllGather
+            all_gather!(buffers, recv_buffers, layout, streams)
+          ensure
+            free_recv_buffers(recv_buffers)
+          end
+        end
+        # Perform only the scatter-reduce phase (for testing/benchmarking)
+        def scatter_reduce_only(buffers:, sizes:, dtype:, op:, streams:)
+          layout = chunk_layout(sizes[0], dtype_elem_size(dtype))
+          recv_buffers = allocate_recv_buffers(sizes[0])
+          begin
+            scatter_reduce!(buffers, recv_buffers, layout, dtype, op, streams)
+          ensure
+            free_recv_buffers(recv_buffers)
+          end
+        end
+        # Perform Ring AllGather - gather all chunks to all GPUs
+        #
+        # Each GPU starts with a chunk of data. After AllGather, each GPU
+        # has all chunks from all GPUs concatenated.
+        #
+        # @param send_buffers [Array<FFI::Pointer>] Source buffers (one per GPU, each with local chunk)
+        # @param recv_buffers [Array<FFI::Pointer>] Dest buffers (one per GPU, sized for all chunks)
+        # @param send_sizes [Array<Integer>] Size of each GPU's local chunk
+        # @param streams [Array<CUDA::Stream, FFI::Pointer>] CUDA streams
+        # @return [void]
+        def all_gather_standalone(send_buffers:, recv_buffers:, send_sizes:, streams:)
+          validate_inputs_gather!(send_buffers, recv_buffers, streams)
+          return if @n_gpus == 1
+          chunk_size = send_sizes[0]
+          # Copy each GPU's local chunk to its position in the result buffer
+          @n_gpus.times do |rank|
+            gpu_id = @ring_order[rank]
+            CUDA::RuntimeAPI.cudaSetDevice(gpu_id)
+            src_offset = 0
+            dst_offset = rank * chunk_size
+            stream_ptr = get_stream_ptr(streams[rank])
+            # Copy local data to correct position
+            CUDA::RuntimeAPI.cudaMemcpyAsync(
+              ptr_offset(recv_buffers[rank], dst_offset),
+              send_buffers[rank],
+              chunk_size,
+              CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_DEVICE,
+              stream_ptr
+            )
+          end
+          synchronize_all_streams!(streams)
+          # N-1 ring steps to propagate all chunks
+          (@n_gpus - 1).times do |step|
+            @n_gpus.times do |rank|
+              gpu_id = @ring_order[rank]
+              # Calculate which chunk to send (the one we just received)
+              send_chunk_id = (rank - step + @n_gpus) % @n_gpus
+              send_offset = send_chunk_id * chunk_size
+              next_rank = (rank + 1) % @n_gpus
+              next_gpu = @ring_order[next_rank]
+              transport = @transport_selector.select_transport(gpu_id, next_gpu)
+              stream_ptr = get_stream_ptr(streams[rank])
+              src_ptr = ptr_offset(recv_buffers[rank], send_offset)
+              dst_ptr = ptr_offset(recv_buffers[next_rank], send_offset)
+              move!(transport, dst_ptr, src_ptr, chunk_size, stream_ptr)
+            end
+            synchronize_all_streams!(streams)
+          end
+        end
+        # Perform Ring ReduceScatter - reduce and scatter result
+        #
+        # Each GPU starts with a full buffer. After ReduceScatter, each GPU
+        # has 1/N of the reduced result (different chunks on different GPUs).
+        #
+        # NOTE: the scatter-reduce phase runs in place, so +buffers+ are modified;
+        # the reduced chunk is then copied out to +result_buffers+.
+        #
+        # @param buffers [Array<FFI::Pointer>] Buffers (one per GPU, full size)
+        # @param result_buffers [Array<FFI::Pointer>] Result buffers (one per GPU, chunk size)
+        # @param sizes [Array<Integer>] Full buffer sizes
+        # @param dtype [Symbol] Data type
+        # @param op [Symbol] Reduction operation
+        # @param streams [Array<CUDA::Stream, FFI::Pointer>] CUDA streams
+        # @raise [ArgumentError] if buffers/sizes/streams do not match the ring
+        #   size or the sizes differ (result_buffers is not validated)
+        # @return [void]
+        def reduce_scatter(buffers:, result_buffers:, sizes:, dtype:, op:, streams:)
+          validate_inputs!(buffers, sizes, streams)
+          return if @n_gpus == 1
+          total_size = sizes[0]
+          layout = chunk_layout(total_size, dtype_elem_size(dtype))
+          # Allocate temp buffers
+          temp_buffers = allocate_recv_buffers(total_size)
+          begin
+            # Scatter-Reduce phase only (same as first half of AllReduce)
+            scatter_reduce!(buffers, temp_buffers, layout, dtype, op, streams)
+            # Copy each GPU's final chunk to result buffer
+            @n_gpus.times do |rank|
+              gpu_id = @ring_order[rank]
+              CUDA::RuntimeAPI.cudaSetDevice(gpu_id)
+              # After scatter-reduce, GPU[rank] has fully reduced chunk[(rank+1) % N]
+              final_chunk_id = (rank + 1) % @n_gpus
+              # Only the byte offset and byte length of the layout entry are needed
+              src_offset, n_bytes, = layout[final_chunk_id]
+              # Empty chunks occur when there are fewer elements than GPUs
+              next if n_bytes.zero?
+              stream_ptr = get_stream_ptr(streams[rank])
+              CUDA::RuntimeAPI.cudaMemcpyAsync(
+                result_buffers[rank],
+                ptr_offset(buffers[rank], src_offset),
+                n_bytes,
+                CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_DEVICE,
+                stream_ptr
+              )
+            end
+            synchronize_all_streams!(streams)
+          ensure
+            free_recv_buffers(temp_buffers)
+          end
+        end
+        # Largest chunk size in bytes (ceil division) — used by callers only for
+        # allocating result buffers big enough to hold any single chunk.
+        # @param total_size [Integer] Total buffer size in bytes
+        # @return [Integer] Max chunk size in bytes
+        def calculate_chunk_size(total_size)
+          (total_size + @n_gpus - 1) / @n_gpus
+        end
+        # Even element-wise chunk layout (NCCL-style). Distributes elements as
+        # evenly as possible across the N chunks so they tile the whole buffer
+        # with NO overrun even when (n_elements % N) != 0 — the previous
+        # ceil-rounded byte chunking read/wrote past the buffer on the last chunk.
+        # @return [Array<Array(Integer,Integer,Integer,Integer)>]
+        #   one [offset_bytes, n_bytes, offset_elems, n_elems] per chunk
+        def chunk_layout(total_bytes, elem_size)
+          total_elems = total_bytes / elem_size
+          base = total_elems / @n_gpus
+          rem  = total_elems % @n_gpus
+          off_e = 0
+          Array.new(@n_gpus) do |k|
+            n_e = base + (k < rem ? 1 : 0)
+            entry = [off_e * elem_size, n_e * elem_size, off_e, n_e]
+            off_e += n_e
+            entry
+          end
+        end
+        private
+        # Move bytes via the selected transport, failing LOUDLY rather than
+        # silently skipping (a non-P2P transport without copy_async would
+        # otherwise drop the chunk and corrupt the reduction with no error).
+        def move!(transport, dst, src, n_bytes, stream_ptr)
+          if transport.respond_to?(:copy_async)
+            transport.copy_async(dst, src, n_bytes, stream_ptr)
+          else
+            raise NotImplementedError,
+                  "Transport #{transport.class} has no copy_async; non-P2P ring " \
+                  "movement is not wired yet (refusing to silently drop data)"
+          end
+        end
+        def validate_inputs!(buffers, sizes, streams)
+          unless buffers.size == @n_gpus
+            raise ArgumentError, "Expected #{@n_gpus} buffers, got #{buffers.size}"
+          end
+          unless sizes.size == @n_gpus
+            raise ArgumentError, "Expected #{@n_gpus} sizes, got #{sizes.size}"
+          end
+          unless streams.size == @n_gpus
+            raise ArgumentError, "Expected #{@n_gpus} streams, got #{streams.size}"
+          end
+          # All sizes should be equal for basic ring
+          unless sizes.uniq.size == 1
+            raise ArgumentError, "All buffer sizes must be equal for Ring AllReduce"
+          end
+        end
+        def validate_inputs_gather!(send_buffers, recv_buffers, streams)
+          unless send_buffers.size == @n_gpus
+            raise ArgumentError, "Expected #{@n_gpus} send buffers, got #{send_buffers.size}"
+          end
+          unless recv_buffers.size == @n_gpus
+            raise ArgumentError, "Expected #{@n_gpus} recv buffers, got #{recv_buffers.size}"
+          end
+          unless streams.size == @n_gpus
+            raise ArgumentError, "Expected #{@n_gpus} streams, got #{streams.size}"
+          end
+        end
+        # Allocate temporary receive buffers on each GPU
+        def allocate_recv_buffers(size)
+          CUDA::RuntimeAPI.ensure_loaded!
+          @ring_order.map do |gpu_id|
+            # Set device context
+            status = CUDA::RuntimeAPI.cudaSetDevice(gpu_id)
+            CUDA::RuntimeAPI.check_status!(status, "Set device #{gpu_id}")
+            # Allocate buffer
+            ptr_ptr = FFI::MemoryPointer.new(:pointer)
+            status = CUDA::RuntimeAPI.cudaMalloc(ptr_ptr, size)
+            CUDA::RuntimeAPI.check_status!(status, "Alloc recv buffer GPU #{gpu_id}")
+            ptr_ptr.read_pointer
+          end
+        end
+        # Free temporary receive buffers
+        def free_recv_buffers(recv_buffers)
+          recv_buffers.each_with_index do |buf, i|
+            next unless buf && !buf.null?
+            CUDA::RuntimeAPI.cudaSetDevice(@ring_order[i])
+            CUDA::RuntimeAPI.cudaFree(buf)
+          rescue StandardError
+            # Ignore cleanup errors
+          end
+        end
+        # Scatter-Reduce phase: N-1 steps
+        # In each step, GPU[i] sends chunk[(i-step) % N] to GPU[(i+1) % N]
+        # and receives chunk[(i-step-1) % N] from GPU[(i-1) % N], reducing it
+        #
+        # Each step has two sub-phases separated by a full synchronization: all
+        # sends first, then all reductions. Received chunks land in recv_buffers
+        # at the same byte offset they occupy in the source buffer, and the
+        # reduction result is written back into +buffers+ in place.
+        #
+        # @param buffers [Array<FFI::Pointer>] Per-GPU data buffers (modified in place)
+        # @param recv_buffers [Array<FFI::Pointer>] Per-GPU temporary receive buffers
+        # @param layout [Array<Array>] Output of chunk_layout
+        # @param dtype [Symbol] Data type
+        # @param op [Symbol] Reduction operation
+        # @param streams [Array<CUDA::Stream, FFI::Pointer>] CUDA streams per GPU
+        def scatter_reduce!(buffers, recv_buffers, layout, dtype, op, streams)
+          (@n_gpus - 1).times do |step|
+            # Each GPU sends one chunk to its successor
+            @n_gpus.times do |rank|
+              gpu_id = @ring_order[rank]
+              # Ruby's % is non-negative for a positive modulus, so no +N is needed
+              send_chunk_id = (rank - step) % @n_gpus
+              send_offset, n_bytes, = layout[send_chunk_id]
+              # Empty chunks occur when there are fewer elements than GPUs
+              next if n_bytes.zero?
+              next_rank = (rank + 1) % @n_gpus
+              next_gpu = @ring_order[next_rank]
+              send_transport = @transport_selector.select_transport(gpu_id, next_gpu)
+              stream_ptr = get_stream_ptr(streams[rank])
+              src_ptr = ptr_offset(buffers[rank], send_offset)
+              dst_ptr = ptr_offset(recv_buffers[next_rank], send_offset)
+              move!(send_transport, dst_ptr, src_ptr, n_bytes, stream_ptr)
+            end
+            # Synchronize all GPUs after send
+            synchronize_all_streams!(streams)
+            # Now each GPU reduces the received chunk with its local chunk
+            @n_gpus.times do |rank|
+              gpu_id = @ring_order[rank]
+              # This is the chunk the predecessor sent above: ((rank-1) - step) % N
+              recv_chunk_id = (rank - step - 1) % @n_gpus
+              recv_offset, _n_bytes, _off_e, elem_count = layout[recv_chunk_id]
+              next if elem_count.zero?
+              CUDA::RuntimeAPI.cudaSetDevice(gpu_id)
+              local_ptr = ptr_offset(buffers[rank], recv_offset)
+              recv_ptr = ptr_offset(recv_buffers[rank], recv_offset)
+              stream_ptr = get_stream_ptr(streams[rank])
+              # Reduce: local = reduce(local, recv)
+              # NOTE(review): ReductionOps.execute accepts the stream but does not
+              # pass it on to either of its dispatch paths.
+              ReductionOps.execute(op, local_ptr, recv_ptr, local_ptr, elem_count, dtype, stream_ptr)
+            end
+            # Synchronize before next step
+            synchronize_all_streams!(streams)
+          end
+        end
+        # AllGather phase: N-1 steps
+        # In each step, GPU[i] sends its fully-reduced chunk to GPU[(i+1) % N]
+        #
+        # Chunks are written straight into the successor's data buffer at the
+        # same byte offset; +recv_buffers+ is accepted but not used in this phase.
+        #
+        # @param buffers [Array<FFI::Pointer>] Per-GPU data buffers (modified in place)
+        # @param recv_buffers [Array<FFI::Pointer>] Unused here
+        # @param layout [Array<Array>] Output of chunk_layout
+        # @param streams [Array<CUDA::Stream, FFI::Pointer>] CUDA streams per GPU
+        def all_gather!(buffers, recv_buffers, layout, streams)
+          (@n_gpus - 1).times do |step|
+            @n_gpus.times do |rank|
+              gpu_id = @ring_order[rank]
+              # After scatter-reduce, GPU[i] has fully reduced chunk[(i+1) % N]
+              # In later steps it forwards the chunk received in the previous step
+              send_chunk_id = (rank - step + 1) % @n_gpus
+              send_offset, n_bytes, = layout[send_chunk_id]
+              # Empty chunks occur when there are fewer elements than GPUs
+              next if n_bytes.zero?
+              next_rank = (rank + 1) % @n_gpus
+              next_gpu = @ring_order[next_rank]
+              send_transport = @transport_selector.select_transport(gpu_id, next_gpu)
+              stream_ptr = get_stream_ptr(streams[rank])
+              src_ptr = ptr_offset(buffers[rank], send_offset)
+              dst_ptr = ptr_offset(buffers[next_rank], send_offset)
+              move!(send_transport, dst_ptr, src_ptr, n_bytes, stream_ptr)
+            end
+            # All transfers of this step must complete before the next step reads them
+            synchronize_all_streams!(streams)
+          end
+        end
+        # Get stream pointer for FFI
+        def get_stream_ptr(stream)
+          case stream
+          when FFI::Pointer
+            stream
+          when CUDA::Stream
+            stream.ptr
+          else
+            FFI::Pointer::NULL
+          end
+        end
+        # Offset a pointer by bytes
+        def ptr_offset(ptr, offset)
+          FFI::Pointer.new(:uint8, ptr.address + offset)
+        end
+        # Get element size for dtype
+        def dtype_elem_size(dtype)
+          case dtype
+          when :float32, :int32 then 4
+          when :float64, :int64 then 8
+          when :float16, :bfloat16 then 2
+          else 4
+          end
+        end
+        # Synchronize all streams
+        def synchronize_all_streams!(streams)
+          streams.each_with_index do |stream, i|
+            CUDA::RuntimeAPI.cudaSetDevice(@ring_order[i])
+            stream_ptr = get_stream_ptr(stream)
+            if stream_ptr.null?
+              CUDA::RuntimeAPI.cudaDeviceSynchronize
+            else
+              CUDA::RuntimeAPI.cudaStreamSynchronize(stream_ptr)
+            end
+          end
+        end
+      end
+    end
+  end
+end