RubyGems - ignis-collective - Versions diffs - 0.0.1 - Mend

ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

checksums.yaml +7 -0
data/README.md +7 -0
data/lib/ignis-collective.rb +9 -0
data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
data/lib/nvruby/collective/algorithms/ring.rb +421 -0
data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
data/lib/nvruby/collective/algorithms/tree.rb +291 -0
data/lib/nvruby/collective/array_ops.rb +240 -0
data/lib/nvruby/collective/communicator.rb +633 -0
data/lib/nvruby/collective/communicator_healer.rb +276 -0
data/lib/nvruby/collective/device_manager.rb +216 -0
data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
data/lib/nvruby/collective/health_monitor.rb +333 -0
data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
data/lib/nvruby/collective/p2p_bindings.rb +121 -0
data/lib/nvruby/collective/resilient_transport.rb +296 -0
data/lib/nvruby/collective/topology.rb +347 -0
data/lib/nvruby/collective/transport/base.rb +138 -0
data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
data/lib/nvruby/collective/transport_selector.rb +200 -0
data/lib/nvruby/collective/vmm_bindings.rb +212 -0
data/lib/nvruby/collective.rb +156 -0
metadata +92 -0

data/lib/nvruby/collective/resilient_transport.rb ADDED Viewed

@@ -0,0 +1,296 @@
+# frozen_string_literal: true
+require_relative "transport/base"
+require_relative "transport/p2p_transport"
+require_relative "transport/ipc_transport"
+require_relative "transport/host_staged_transport"
+module Ignis
+  module Collective
+    # Resilient transport wrapper with retry, fallback, and circuit breaker
+    # Inspired by RapidsMPF's three-phase protocol and error handling patterns
+    #
+    # @example Usage
+    #   transport = ResilientTransport.new(
+    #     src_device: 0, dst_device: 1,
+    #     topology: topology_detector
+    #   )
+    #   transport.send_async(src_ptr, dst_ptr, size, stream)
+    #
+    class ResilientTransport
+      # Maximum retry attempts before fallback
+      MAX_RETRIES = 3
+      # Retry delays with exponential backoff (seconds)
+      RETRY_DELAYS = [0.1, 0.5, 1.0].freeze
+      # Transport fallback chain (highest → lowest performance)
+      FALLBACK_CHAIN = [:p2p, :ipc, :host_staged].freeze
+      # Circuit breaker threshold (failures before marking unhealthy)
+      CIRCUIT_BREAKER_THRESHOLD = 3
+      # Circuit breaker reset time (seconds)
+      CIRCUIT_BREAKER_RESET = 60.0
+      # CUDA error codes that indicate transport failure
+      RECOVERABLE_ERRORS = [
+        702,  # CUDA_ERROR_LAUNCH_TIMEOUT
+        716,  # CUDA_ERROR_PEER_ACCESS_NOT_ENABLED
+        999,  # CUDA_ERROR_UNKNOWN
+      ].freeze
+      # @return [Integer] Source GPU device ID
+      attr_reader :src_device
+      # @return [Integer] Destination GPU device ID
+      attr_reader :dst_device
+      # @return [Symbol] Current transport type
+      attr_reader :current_transport_type
+      # @return [Transport::Base] Active transport
+      attr_reader :active_transport
+      # @return [Hash] Transport health status
+      attr_reader :health_status
+      # Create resilient transport wrapper
+      #
+      # @param src_device [Integer] Source GPU
+      # @param dst_device [Integer] Destination GPU
+      # @param topology [Topology::Detector] Topology for path detection
+      # @param preferred_transport [Symbol, nil] Force specific transport
+      def initialize(src_device:, dst_device:, topology:, preferred_transport: nil)
+        @src_device = src_device
+        @dst_device = dst_device
+        @topology = topology
+        @preferred_transport = preferred_transport
+        @transports = {}
+        @health_status = Hash.new { |h, k| h[k] = { failures: 0, last_failure: nil } }
+        @current_transport_type = nil
+        @active_transport = nil
+        @initialized = false
+      end
+      # Initialize transports
+      # @return [void]
+      def initialize!
+        return if @initialized
+        select_initial_transport!
+        @initialized = true
+      end
+      # Send data with retry and fallback
+      #
+      # @param src_ptr [FFI::Pointer] Source buffer
+      # @param dst_ptr [FFI::Pointer] Destination buffer
+      # @param size [Integer] Bytes to transfer
+      # @param stream [FFI::Pointer, nil] CUDA stream
+      # @return [Boolean] Success status
+      def send_async(src_ptr, dst_ptr, size, stream = nil)
+        ensure_initialized!
+        attempt = 0
+        last_error = nil
+        while attempt < MAX_RETRIES
+          begin
+            # Transports expose copy_async(dst, src, size, stream) (P2P) or
+            # send_async(buffer, size, stream) (base) — NOT a 4-arg send_async,
+            # which raised ArgumentError on every attempt before.
+            result = if @active_transport.respond_to?(:copy_async)
+                       @active_transport.copy_async(dst_ptr, src_ptr, size, stream)
+                     else
+                       @active_transport.send_async(src_ptr, size, stream)
+                     end
+            reset_circuit_breaker!(@current_transport_type)
+            return result
+          rescue StandardError => e
+            last_error = e
+            record_failure!(@current_transport_type, e)
+            attempt += 1
+            if attempt < MAX_RETRIES
+              sleep(RETRY_DELAYS[[attempt - 1, RETRY_DELAYS.size - 1].min])
+            end
+          end
+        end
+        # All retries failed, try fallback
+        if try_fallback!
+          send_async(src_ptr, dst_ptr, size, stream)
+        else
+          raise TransportError, "All transports failed: #{last_error&.message}"
+        end
+      end
+      # Synchronize transfer completion
+      #
+      # @param stream [FFI::Pointer, nil] CUDA stream
+      # @return [void]
+      def synchronize(_stream = nil)
+        # Transports define synchronize! (no args), not synchronize(stream).
+        @active_transport&.synchronize!
+      end
+      # Check if transport is healthy
+      # @return [Boolean] True if healthy
+      def healthy?
+        @active_transport&.ready? && !circuit_open?(@current_transport_type)
+      end
+      # Check if any transport is available
+      # @return [Boolean] True if ready
+      def ready?
+        @initialized && @active_transport&.ready?
+      end
+      # Get estimated bandwidth
+      # @return [Float] GB/s
+      def estimated_bandwidth
+        @active_transport&.estimated_bandwidth || 0.0
+      end
+      # Force fallback to next transport in chain
+      # @return [Boolean] True if fallback succeeded
+      def force_fallback!
+        try_fallback!
+      end
+      # Reset all circuit breakers
+      # @return [void]
+      def reset_health!
+        @health_status.clear
+      end
+      # Clean up resources
+      # @return [void]
+      def destroy!
+        @transports.each_value(&:destroy!)
+        @transports.clear
+        @active_transport = nil
+        @initialized = false
+      end
+      # @return [String] Human-readable description
+      def to_s
+        status = healthy? ? "healthy" : "degraded"
+        "ResilientTransport[#{@src_device}→#{@dst_device}]: " \
+          "#{@current_transport_type} (#{status})"
+      end
+      private
+      def ensure_initialized!
+        initialize! unless @initialized
+      end
+      def select_initial_transport!
+        if @preferred_transport && !circuit_open?(@preferred_transport)
+          @current_transport_type = @preferred_transport
+          @active_transport = create_transport(@preferred_transport)
+          @active_transport.initialize!
+          return
+        end
+        # Select based on topology
+        path = @topology.matrix.path(@src_device, @dst_device)
+        transport_type = if path&.nvlink?
+                           :p2p
+                         elsif path&.pcie_p2p?
+                           :p2p
+                         elsif path&.p2p_supported
+                           :ipc
+                         else
+                           :host_staged
+                         end
+        @current_transport_type = transport_type
+        @active_transport = create_transport(transport_type)
+        @active_transport.initialize!
+      end
+      def create_transport(type)
+        @transports[type] ||= case type
+                              when :p2p
+                                Transport::P2PTransport.new(
+                                  src_device: @src_device,
+                                  dst_device: @dst_device,
+                                  interconnect_type: detect_interconnect_type
+                                )
+                              when :ipc
+                                Transport::IPCTransport.new(
+                                  src_device: @src_device,
+                                  dst_device: @dst_device
+                                )
+                              when :host_staged
+                                Transport::HostStagedTransport.new(
+                                  src_device: @src_device,
+                                  dst_device: @dst_device
+                                )
+                              else
+                                raise ArgumentError, "Unknown transport: #{type}"
+                              end
+      end
+      def detect_interconnect_type
+        path = @topology.matrix.path(@src_device, @dst_device)
+        path&.interconnect_type || :pcie_p2p
+      end
+      def try_fallback!
+        current_idx = FALLBACK_CHAIN.index(@current_transport_type) || -1
+        FALLBACK_CHAIN[(current_idx + 1)..].each do |transport_type|
+          next if circuit_open?(transport_type)
+          begin
+            @current_transport_type = transport_type
+            @active_transport = create_transport(transport_type)
+            @active_transport.initialize!
+            return true
+          rescue StandardError => e
+            record_failure!(transport_type, e)
+          end
+        end
+        false
+      end
+      def record_failure!(transport_type, error)
+        status = @health_status[transport_type]
+        status[:failures] += 1
+        status[:last_failure] = Time.now
+        status[:last_error] = error.message
+      end
+      def reset_circuit_breaker!(transport_type)
+        @health_status[transport_type] = { failures: 0, last_failure: nil }
+      end
+      def circuit_open?(transport_type)
+        status = @health_status[transport_type]
+        return false if status[:failures] < CIRCUIT_BREAKER_THRESHOLD
+        return false if status[:last_failure].nil?
+        # Check if reset time has passed
+        elapsed = Time.now - status[:last_failure]
+        if elapsed >= CIRCUIT_BREAKER_RESET
+          reset_circuit_breaker!(transport_type)
+          return false
+        end
+        true
+      end
+    end
+    # NOTE: TransportError is defined once in collective.rb as `< Error`.
+    # It used to be redefined here as `< StandardError`, which caused a
+    # "superclass mismatch" TypeError when collective.rb was required (it loads
+    # this file before reopening the class), blocking the entire NvCCL layer.
+  end
+end

data/lib/nvruby/collective/topology.rb ADDED Viewed

@@ -0,0 +1,347 @@
+# frozen_string_literal: true
+require_relative "p2p_bindings"
+module Ignis
+  module Collective
+    # GPU topology detection and interconnect analysis
+    # Detects NVLink, PCIe P2P, and shared memory paths between GPUs
+    module Topology
+      # Interconnect types, best first, with bandwidth/latency estimates. NOTE(review): key says gbps but Path docs say GB/s — confirm units.
+      INTERCONNECT_TYPES = {
+        nvlink: { bandwidth_gbps: 900, latency_us: 1 },
+        pcie_p2p: { bandwidth_gbps: 32, latency_us: 5 },
+        host_staged: { bandwidth_gbps: 12, latency_us: 20 },
+        none: { bandwidth_gbps: 0, latency_us: Float::INFINITY }
+      }.freeze
+      # Represents a connection path between two GPUs
+      class Path
+        # @return [Integer] Source GPU device ID
+        attr_reader :src_device
+        # @return [Integer] Destination GPU device ID
+        attr_reader :dst_device
+        # @return [Symbol] Interconnect type (:nvlink, :pcie_p2p, :host_staged, :none)
+        attr_reader :interconnect_type
+        # @return [Integer] Performance rank (0 = best)
+        attr_reader :performance_rank
+        # @return [Boolean] Whether P2P access is supported
+        attr_reader :p2p_supported
+        # @return [Boolean] Whether native atomics are supported
+        attr_reader :native_atomics
+        # @param src_device [Integer] Source GPU device ID
+        # @param dst_device [Integer] Destination GPU device ID
+        # @param interconnect_type [Symbol] Detected interconnect type
+        # @param performance_rank [Integer] Performance rank from CUDA
+        # @param p2p_supported [Boolean] P2P support status
+        # @param native_atomics [Boolean] Native atomic support
+        def initialize(src_device:, dst_device:, interconnect_type:,
+                       performance_rank:, p2p_supported:, native_atomics: false)
+          @src_device = src_device
+          @dst_device = dst_device
+          @interconnect_type = interconnect_type
+          @performance_rank = performance_rank
+          @p2p_supported = p2p_supported
+          @native_atomics = native_atomics
+        end
+        # Estimated bandwidth in GB/s
+        # @return [Float] Bandwidth estimate
+        def estimated_bandwidth
+          INTERCONNECT_TYPES.dig(@interconnect_type, :bandwidth_gbps) || 0
+        end
+        # Estimated latency in microseconds
+        # @return [Float] Latency estimate
+        def estimated_latency
+          INTERCONNECT_TYPES.dig(@interconnect_type, :latency_us) || Float::INFINITY
+        end
+        # @return [Boolean] Whether direct P2P is possible
+        def direct_access?
+          @p2p_supported && [:nvlink, :pcie_p2p].include?(@interconnect_type)
+        end
+        # @return [Boolean] Whether this path uses NVLink
+        def nvlink?
+          @interconnect_type == :nvlink
+        end
+        # @return [Boolean] Whether this path uses PCIe P2P
+        def pcie_p2p?
+          @interconnect_type == :pcie_p2p && @p2p_supported
+        end
+        # Alias for estimated_bandwidth for test compatibility
+        # @return [Float] Bandwidth in GB/s
+        def bandwidth_gbps
+          estimated_bandwidth
+        end
+        # @return [String] Human-readable description
+        def to_s
+          "Path[#{@src_device}→#{@dst_device}]: #{@interconnect_type} " \
+            "(rank=#{@performance_rank}, p2p=#{@p2p_supported})"
+        end
+      end
+      # Topology matrix for a set of GPUs
+      class Matrix
+        # @return [Array<Integer>] List of GPU device IDs
+        attr_reader :device_ids
+        # @return [Hash<Array<Integer>, Path>] Map of [src, dst] to Path
+        attr_reader :paths
+        # @param device_ids [Array<Integer>] GPU device IDs to analyze
+        def initialize(device_ids)
+          @device_ids = device_ids.dup.freeze
+          @paths = {}
+          build_matrix!
+        end
+        # Get path between two GPUs
+        # @param src [Integer] Source GPU
+        # @param dst [Integer] Destination GPU
+        # @return [Path, nil] Path object or nil if same device
+        def path(src, dst)
+          return nil if src == dst
+          @paths[[src, dst]]
+        end
+        # Get optimal ring order based on topology
+        # Minimizes total latency by placing NVLink-connected GPUs adjacent
+        # @return [Array<Integer>] Ordered device IDs for ring algorithm
+        def optimal_ring_order
+          return @device_ids.dup if @device_ids.size <= 2
+          # Greedy nearest-neighbor heuristic
+          remaining = @device_ids.dup
+          order = [remaining.shift]
+          until remaining.empty?
+            current = order.last
+            # Find GPU with best connection to current
+            best_next = remaining.min_by do |gpu|
+              path_obj = path(current, gpu)
+              path_obj ? path_obj.performance_rank : Float::INFINITY
+            end
+            order << best_next
+            remaining.delete(best_next)
+          end
+          order
+        end
+        # Get all paths with NVLink connectivity
+        # @return [Array<Path>] Paths with NVLink
+        def nvlink_paths
+          @paths.values.select { |p| p.interconnect_type == :nvlink }
+        end
+        # Get all paths with P2P support
+        # @return [Array<Path>] Paths with P2P
+        def p2p_paths
+          @paths.values.select(&:p2p_supported)
+        end
+        # Check if all GPUs have full P2P mesh
+        # @return [Boolean] True if all pairs have P2P
+        def full_p2p_mesh?
+          @paths.values.all?(&:p2p_supported)
+        end
+        # @return [String] Human-readable matrix representation
+        def to_s
+          header = "Topology Matrix (#{@device_ids.size} GPUs)\n"
+          rows = @device_ids.map do |src|
+            cols = @device_ids.map do |dst|
+              if src == dst
+                "  -  "
+              else
+                path_obj = path(src, dst)
+                type_abbr = path_obj.interconnect_type.to_s[0..3].upcase
+                "#{type_abbr.ljust(5)}"
+              end
+            end
+            "GPU#{src}: #{cols.join(' | ')}"
+          end
+          header + rows.join("\n")
+        end
+        private
+        def build_matrix!
+          P2PBindings.ensure_loaded!
+          @device_ids.each do |src|
+            @device_ids.each do |dst|
+              next if src == dst
+              @paths[[src, dst]] = detect_path(src, dst)
+            end
+          end
+        end
+        # Detect interconnect between two GPUs
+        # @param src [Integer] Source GPU
+        # @param dst [Integer] Destination GPU
+        # @return [Path] Detected path
+        def detect_path(src, dst)
+          # Check P2P accessibility
+          can_access_ptr = FFI::MemoryPointer.new(:int)
+          status = P2PBindings.cudaDeviceCanAccessPeer(can_access_ptr, src, dst)
+          P2PBindings.check_status!(status, "Check P2P access #{src}→#{dst}")
+          p2p_supported = can_access_ptr.read_int == 1
+          # Get performance rank (0 = NVLink, higher = PCIe)
+          performance_rank = 99
+          interconnect_type = :host_staged
+          if p2p_supported
+            perf_ptr = FFI::MemoryPointer.new(:int)
+            status = P2PBindings.cudaDeviceGetP2PAttribute(
+              perf_ptr,
+              P2PBindings::P2P_ATTR_PERFORMANCE_RANK,
+              src,
+              dst
+            )
+            if status.zero?
+              performance_rank = perf_ptr.read_int
+              # Performance rank 0 indicates NVLink (highest performance)
+              interconnect_type = if performance_rank.zero?
+                                    :nvlink
+                                  else
+                                    :pcie_p2p
+                                  end
+            else
+              # P2P supported but can't get rank - assume PCIe
+              interconnect_type = :pcie_p2p
+              performance_rank = 1
+            end
+          end
+          # Check native atomic support
+          native_atomics = false
+          if p2p_supported
+            atomic_ptr = FFI::MemoryPointer.new(:int)
+            status = P2PBindings.cudaDeviceGetP2PAttribute(
+              atomic_ptr,
+              P2PBindings::P2P_ATTR_NATIVE_ATOMIC_SUPPORTED,
+              src,
+              dst
+            )
+            native_atomics = status.zero? && atomic_ptr.read_int == 1
+          end
+          Path.new(
+            src_device: src,
+            dst_device: dst,
+            interconnect_type: interconnect_type,
+            performance_rank: performance_rank,
+            p2p_supported: p2p_supported,
+            native_atomics: native_atomics
+          )
+        end
+      end
+      # GPU topology detector - main entry point
+      class Detector
+        # @return [Matrix] Current topology matrix
+        attr_reader :matrix
+        # Detect topology for specified GPUs
+        # @param device_ids [Array<Integer>, nil] GPU IDs or nil for all GPUs
+        def initialize(device_ids: nil)
+          @device_ids = device_ids || all_device_ids
+          @matrix = Matrix.new(@device_ids)
+        end
+        # @return [Array<Integer>] All visible GPU device IDs
+        def all_device_ids
+          CUDA::Device.list.map(&:index)
+        end
+        # @return [Integer] Number of GPUs in this topology
+        def gpu_count
+          @device_ids.size
+        end
+        # Get interconnect type between two GPUs
+        # @param device_a [Integer] First GPU
+        # @param device_b [Integer] Second GPU
+        # @return [Symbol] Interconnect type
+        def interconnect_type(device_a, device_b)
+          path = @matrix.path(device_a, device_b)
+          path&.interconnect_type || :none
+        end
+        # Get optimal ring order for collective operations
+        # @return [Array<Integer>] Ordered GPU IDs
+        def optimal_ring_order
+          @matrix.optimal_ring_order
+        end
+        # Check if specific GPU pair has NVLink
+        # @param device_a [Integer] First GPU
+        # @param device_b [Integer] Second GPU
+        # @return [Boolean] True if NVLink connected
+        def nvlink_connected?(device_a, device_b)
+          path = @matrix.path(device_a, device_b)
+          path&.interconnect_type == :nvlink
+        end
+        # Check if P2P is available between GPUs
+        # @param device_a [Integer] First GPU
+        # @param device_b [Integer] Second GPU
+        # @return [Boolean] True if P2P available
+        def p2p_available?(device_a, device_b)
+          path = @matrix.path(device_a, device_b)
+          path&.p2p_supported || false
+        end
+        # Enable P2P access between all GPUs in the topology
+        # @return [Hash<Array<Integer>, Boolean>] Map of [src, dst] to success
+        def enable_all_p2p!
+          results = {}
+          @matrix.p2p_paths.each do |path|
+            src = path.src_device
+            dst = path.dst_device
+            # Set source device context
+            status = CUDA::RuntimeAPI.cudaSetDevice(src)
+            CUDA::RuntimeAPI.check_status!(status, "Set device #{src}")
+            # Enable peer access
+            status = P2PBindings.cudaDeviceEnablePeerAccess(dst, 0)
+            # Status 0 = success, 704 = already enabled
+            results[[src, dst]] = status.zero? || status == 704
+          end
+          results
+        end
+        # @return [String] Summary of detected topology
+        def to_s
+          nvlink_count = @matrix.nvlink_paths.size
+          p2p_count = @matrix.p2p_paths.size
+          total_pairs = @device_ids.size * (@device_ids.size - 1)
+          "Topology: #{@device_ids.size} GPUs, " \
+            "#{nvlink_count}/#{total_pairs} NVLink, " \
+            "#{p2p_count}/#{total_pairs} P2P"
+        end
+      end
+    end
+  end
+end