RubyGems - ignis-collective - Versions diffs - 0.0.1 - Mend

ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

checksums.yaml +7 -0
data/README.md +7 -0
data/lib/ignis-collective.rb +9 -0
data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
data/lib/nvruby/collective/algorithms/ring.rb +421 -0
data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
data/lib/nvruby/collective/algorithms/tree.rb +291 -0
data/lib/nvruby/collective/array_ops.rb +240 -0
data/lib/nvruby/collective/communicator.rb +633 -0
data/lib/nvruby/collective/communicator_healer.rb +276 -0
data/lib/nvruby/collective/device_manager.rb +216 -0
data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
data/lib/nvruby/collective/health_monitor.rb +333 -0
data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
data/lib/nvruby/collective/p2p_bindings.rb +121 -0
data/lib/nvruby/collective/resilient_transport.rb +296 -0
data/lib/nvruby/collective/topology.rb +347 -0
data/lib/nvruby/collective/transport/base.rb +138 -0
data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
data/lib/nvruby/collective/transport_selector.rb +200 -0
data/lib/nvruby/collective/vmm_bindings.rb +212 -0
data/lib/nvruby/collective.rb +156 -0
metadata +92 -0

data/lib/nvruby/collective/communicator_healer.rb ADDED Viewed

@@ -0,0 +1,276 @@
+# frozen_string_literal: true
+require_relative "health_monitor"
+require_relative "topology"
+require_relative "transport_selector"
+module Ignis
+  module Collective
+    # Communicator healing for dynamic reconfiguration on GPU failure
+    # Inspired by Universal Checkpointing (USENIX ATC 2025) patterns
+    #
+    # Enables recovery without full restart:
+    #   1. Detect failed GPUs via HealthMonitor
+    #   2. Exclude from active set
+    #   3. Rebuild topology and transports
+    #   4. Invalidate stale CUDA Graphs
+    #   5. Resume operations with reduced GPU count
+    #
+    # @example Usage with communicator
+    #   healer = CommunicatorHealer.new(communicator)
+    #   monitor.on_failure { |gpu| healer.heal!([gpu]) }
+    #
+    class CommunicatorHealer
+      # @return [Communicator] Parent communicator
+      attr_reader :communicator
+      # @return [Array<Integer>] Currently active GPU IDs
+      attr_reader :active_devices
+      # @return [Array<Integer>] Failed GPU IDs
+      attr_reader :failed_devices
+      # @return [Integer] Total heal operations performed
+      attr_reader :heal_count
+      # @return [Array<Hash>] Heal history
+      attr_reader :heal_history
+      # Create healer for a communicator
+      #
+      # @param communicator [Communicator] Parent communicator
+      def initialize(communicator)
+        @communicator = communicator
+        @active_devices = communicator.gpu_ids.dup
+        @failed_devices = []
+        @heal_count = 0
+        @heal_history = []
+        @callbacks = { pre_heal: [], post_heal: [] }
+        @cuda_graph_cache = []
+      end
+      # Perform healing operation - exclude failed GPUs and rebuild
+      #
+      # @param failed_gpu_ids [Array<Integer>] GPUs to exclude
+      # @return [Boolean] True if healing succeeded
+      def heal!(failed_gpu_ids)
+        return true if failed_gpu_ids.empty?
+        notify_pre_heal(failed_gpu_ids)
+        begin
+          # 1. Record failed devices
+          @failed_devices |= failed_gpu_ids
+          @active_devices -= failed_gpu_ids
+          # 2. Validate we have enough GPUs left
+          if @active_devices.size < minimum_gpu_count
+            raise HealingError, "Too few GPUs remaining: #{@active_devices.size}"
+          end
+          # 3. Invalidate CUDA Graphs (they reference old topology)
+          invalidate_cuda_graphs!
+          # 4. Rebuild topology for survivors
+          rebuild_topology!
+          # 5. Rebuild transports
+          rebuild_transports!
+          # 6. Update communicator state
+          update_communicator_state!
+          # 7. Record success
+          record_heal(failed_gpu_ids, :success)
+          notify_post_heal(failed_gpu_ids, :success)
+          true
+        rescue StandardError => e
+          record_heal(failed_gpu_ids, :failed, e.message)
+          notify_post_heal(failed_gpu_ids, :failed)
+          raise
+        end
+      end
+      # Attempt to recover a failed GPU
+      #
+      # @param gpu_id [Integer] GPU to recover
+      # @return [Boolean] True if recovery succeeded
+      def recover!(gpu_id)
+        return false unless @failed_devices.include?(gpu_id)
+        # Test if GPU is responsive
+        return false unless test_gpu_health(gpu_id)
+        # Reintegrate
+        @failed_devices.delete(gpu_id)
+        @active_devices << gpu_id
+        @active_devices.sort!
+        # Rebuild topology with recovered GPU
+        rebuild_topology!
+        rebuild_transports!
+        update_communicator_state!
+        record_heal([gpu_id], :recovered)
+        true
+      rescue StandardError
+        false
+      end
+      # Register CUDA Graph for invalidation on heal
+      #
+      # @param graph [CUDA::Graph, FFI::Pointer] Graph to track
+      # @return [void]
+      def register_cuda_graph(graph)
+        @cuda_graph_cache << graph unless @cuda_graph_cache.include?(graph)
+      end
+      # Unregister CUDA Graph
+      #
+      # @param graph [CUDA::Graph, FFI::Pointer] Graph to untrack
+      # @return [void]
+      def unregister_cuda_graph(graph)
+        @cuda_graph_cache.delete(graph)
+      end
+      # Get current world size (active GPUs)
+      # @return [Integer] Number of active GPUs
+      def world_size
+        @active_devices.size
+      end
+      # Check if any GPUs have failed
+      # @return [Boolean] True if degraded
+      def degraded?
+        @failed_devices.any?
+      end
+      # Get health summary
+      # @return [Hash] Health statistics
+      def health_summary
+        {
+          active_count: @active_devices.size,
+          failed_count: @failed_devices.size,
+          active_devices: @active_devices.dup,
+          failed_devices: @failed_devices.dup,
+          heal_count: @heal_count,
+          degraded: degraded?
+        }
+      end
+      # Register pre-heal callback
+      # @yield [failed_gpu_ids] Called before healing
+      def on_pre_heal(&block)
+        @callbacks[:pre_heal] << block
+      end
+      # Register post-heal callback
+      # @yield [failed_gpu_ids, status] Called after healing
+      def on_post_heal(&block)
+        @callbacks[:post_heal] << block
+      end
+      # @return [String] Human-readable status
+      def to_s
+        status = degraded? ? "degraded" : "healthy"
+        "CommunicatorHealer[#{@active_devices.size}/#{@communicator.gpu_ids.size} active, #{status}]"
+      end
+      private
+      def minimum_gpu_count
+        # Need at least 1 GPU for any operation
+        # Could be made configurable
+        1
+      end
+      def invalidate_cuda_graphs!
+        @cuda_graph_cache.each do |graph|
+          begin
+            if graph.respond_to?(:invalidate!)
+              graph.invalidate!
+            elsif graph.respond_to?(:destroy!)
+              graph.destroy!
+            end
+          rescue StandardError
+            # Best effort - graph may already be invalid
+          end
+        end
+        @cuda_graph_cache.clear
+      end
+      def rebuild_topology!
+        # Create new topology for active devices only
+        @new_topology = Topology::Detector.new(device_ids: @active_devices)
+      end
+      def rebuild_transports!
+        # Destroy old transports
+        if @communicator.respond_to?(:transport_selector)
+          @communicator.transport_selector&.destroy!
+        end
+        # Create new transport selector for active devices
+        @new_transport_selector = TransportSelector.new(@active_devices)
+        @new_transport_selector.initialize!
+      end
+      def update_communicator_state!
+        # Update communicator's internal state
+        # This requires the Communicator to expose update methods
+        if @communicator.respond_to?(:update_topology!)
+          @communicator.update_topology!(@new_topology)
+        end
+        if @communicator.respond_to?(:update_transport_selector!)
+          @communicator.update_transport_selector!(@new_transport_selector)
+        end
+        if @communicator.respond_to?(:update_device_ids!)
+          @communicator.update_device_ids!(@active_devices)
+        end
+        # Update ring order for collective algorithms
+        if @communicator.respond_to?(:update_ring_order!)
+          @communicator.update_ring_order!(@new_topology.optimal_ring_order)
+        end
+      end
+      def test_gpu_health(gpu_id)
+        CUDA::RuntimeAPI.ensure_loaded!
+        CUDA::RuntimeAPI.set_device(gpu_id)
+        CUDA::RuntimeAPI.device_synchronize
+        true
+      rescue StandardError
+        false
+      end
+      def record_heal(gpu_ids, status, error = nil)
+        @heal_count += 1
+        @heal_history << {
+          timestamp: Time.now,
+          gpu_ids: gpu_ids.dup,
+          status: status,
+          error: error,
+          active_after: @active_devices.size
+        }
+        # Keep history bounded
+        @heal_history.shift if @heal_history.size > 100
+      end
+      def notify_pre_heal(gpu_ids)
+        @callbacks[:pre_heal].each { |cb| cb.call(gpu_ids) }
+      end
+      def notify_post_heal(gpu_ids, status)
+        @callbacks[:post_heal].each { |cb| cb.call(gpu_ids, status) }
+      end
+    end
+    # Error during healing operation
+    class HealingError < StandardError; end
+  end
+end

data/lib/nvruby/collective/device_manager.rb ADDED Viewed

@@ -0,0 +1,216 @@
+# frozen_string_literal: true
+require_relative "p2p_bindings"
+require_relative "topology"
+module Ignis
+  module Collective
+    # Multi-GPU device manager
+    # Handles device enumeration, context management, and peer access configuration
+    class DeviceManager
+      # @return [Array<Integer>] Managed GPU device IDs
+      attr_reader :device_ids
+      # @return [Hash<Integer, CUDA::Device>] Device objects by ID
+      attr_reader :devices
+      # @return [Topology::Detector] Topology detector
+      attr_reader :topology
+      # @return [Hash<Array<Integer>, Boolean>] P2P access status
+      attr_reader :p2p_access_enabled
+      # Create device manager for specified GPUs
+      # @param device_ids [Array<Integer>, nil] GPUs to manage (nil = all)
+      def initialize(device_ids: nil)
+        @device_ids = (device_ids || all_device_ids).dup.freeze
+        @devices = {}
+        @topology = nil
+        @p2p_access_enabled = {}
+        @initialized = false
+        validate_devices!
+        create_device_objects!
+      end
+      # Initialize device manager and detect topology
+      # @return [void]
+      def initialize!
+        return if @initialized
+        detect_topology!
+        @initialized = true
+      end
+      # Detect GPU topology
+      # @return [Topology::Detector] Topology detector
+      def detect_topology!
+        @topology = Topology::Detector.new(device_ids: @device_ids)
+      end
+      # Enable P2P access between all GPU pairs where available
+      # @return [Hash<Array<Integer>, Boolean>] Map of (src, dst) to success status
+      def enable_all_p2p_access!
+        return @p2p_access_enabled unless @p2p_access_enabled.empty?
+        detect_topology! unless @topology
+        P2PBindings.ensure_loaded!
+        CUDA::RuntimeAPI.ensure_loaded!
+        @device_ids.each do |src|
+          @device_ids.each do |dst|
+            next if src == dst
+            # Check if P2P is possible
+            unless @topology.p2p_available?(src, dst)
+              @p2p_access_enabled[[src, dst]] = false
+              next
+            end
+            # Set source device context
+            status = CUDA::RuntimeAPI.cudaSetDevice(src)
+            CUDA::RuntimeAPI.check_status!(status, "Set device #{src}")
+            # Enable peer access
+            status = P2PBindings.cudaDeviceEnablePeerAccess(dst, 0)
+            # 0 = success, 704 = already enabled
+            @p2p_access_enabled[[src, dst]] = status.zero? || status == 704
+          end
+        end
+        @p2p_access_enabled
+      end
+      # Disable all P2P access
+      # @return [void]
+      def disable_all_p2p_access!
+        @p2p_access_enabled.each_key do |(src, dst)|
+          CUDA::RuntimeAPI.cudaSetDevice(src)
+          P2PBindings.cudaDeviceDisablePeerAccess(dst)
+        rescue StandardError
+          # Ignore errors during cleanup
+        end
+        @p2p_access_enabled.clear
+      end
+      # Get optimal ring order for collective operations
+      # @return [Array<Integer>] Ordered device IDs
+      def optimal_ring_order
+        detect_topology! unless @topology
+        @topology.optimal_ring_order
+      end
+      # Get device by ID
+      # @param device_id [Integer] GPU device ID
+      # @return [CUDA::Device, nil] Device object
+      def device(device_id)
+        @devices[device_id]
+      end
+      # Set current CUDA device
+      # @param device_id [Integer] GPU to activate
+      # @return [void]
+      def set_device!(device_id)
+        validate_device_id!(device_id)
+        @devices[device_id].set_current!
+      end
+      # Synchronize a device
+      # @param device_id [Integer] GPU to synchronize
+      # @return [void]
+      def synchronize!(device_id)
+        validate_device_id!(device_id)
+        @devices[device_id].synchronize
+      end
+      # Synchronize all managed devices
+      # @return [void]
+      def synchronize_all!
+        @device_ids.each { |id| synchronize!(id) }
+      end
+      # Get number of managed GPUs
+      # @return [Integer] GPU count
+      def size
+        @device_ids.size
+      end
+      # Check if fully initialized
+      # @return [Boolean] True if ready
+      def ready?
+        @initialized && @topology
+      end
+      # Get P2P capability summary
+      # @return [Hash] P2P statistics
+      def p2p_summary
+        return {} unless @topology
+        matrix = @topology.matrix
+        {
+          gpu_count: @device_ids.size,
+          total_paths: @device_ids.size * (@device_ids.size - 1),
+          p2p_enabled: @p2p_access_enabled.count { |_, v| v },
+          nvlink_paths: matrix.nvlink_paths.size,
+          full_mesh: matrix.full_p2p_mesh?,
+        }
+      end
+      # Clean up resources
+      # @return [void]
+      def destroy!
+        disable_all_p2p_access!
+        @devices.clear
+        @topology = nil
+        @initialized = false
+      end
+      # @return [String] Human-readable summary
+      def to_s
+        names = @devices.values.map { |d| "#{d.index}:#{d.name[0..15]}" }
+        "DeviceManager[#{names.join(', ')}]"
+      end
+      private
+      # Get all available GPU IDs
+      # @return [Array<Integer>] All device IDs
+      def all_device_ids
+        CUDA::Device.list.map(&:index)
+      end
+      # Validate requested device IDs exist
+      # @return [void]
+      # @raise [ArgumentError] If invalid device ID
+      def validate_devices!
+        all_ids = all_device_ids
+        @device_ids.each do |id|
+          next if all_ids.include?(id)
+          raise ArgumentError, "Invalid device ID #{id}. Available: #{all_ids}"
+        end
+      end
+      # Validate single device ID
+      # @param device_id [Integer] GPU ID
+      # @return [void]
+      # @raise [ArgumentError] If invalid
+      def validate_device_id!(device_id)
+        return if @device_ids.include?(device_id)
+        raise ArgumentError, "Device #{device_id} not managed by this DeviceManager"
+      end
+      # Create CUDA::Device objects for all managed GPUs
+      # @return [void]
+      def create_device_objects!
+        @device_ids.each do |id|
+          @devices[id] = CUDA::Device.new(id)
+        end
+      end
+    end
+  end
+end