ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +7 -0
  3. data/lib/ignis-collective.rb +9 -0
  4. data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
  5. data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
  6. data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
  7. data/lib/nvruby/collective/algorithms/ring.rb +421 -0
  8. data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
  9. data/lib/nvruby/collective/algorithms/tree.rb +291 -0
  10. data/lib/nvruby/collective/array_ops.rb +240 -0
  11. data/lib/nvruby/collective/communicator.rb +633 -0
  12. data/lib/nvruby/collective/communicator_healer.rb +276 -0
  13. data/lib/nvruby/collective/device_manager.rb +216 -0
  14. data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
  15. data/lib/nvruby/collective/health_monitor.rb +333 -0
  16. data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
  17. data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
  18. data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
  19. data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
  20. data/lib/nvruby/collective/p2p_bindings.rb +121 -0
  21. data/lib/nvruby/collective/resilient_transport.rb +296 -0
  22. data/lib/nvruby/collective/topology.rb +347 -0
  23. data/lib/nvruby/collective/transport/base.rb +138 -0
  24. data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
  25. data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
  26. data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
  27. data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
  28. data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
  29. data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
  30. data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
  31. data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
  32. data/lib/nvruby/collective/transport_selector.rb +200 -0
  33. data/lib/nvruby/collective/vmm_bindings.rb +212 -0
  34. data/lib/nvruby/collective.rb +156 -0
  35. metadata +92 -0
@@ -0,0 +1,156 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'collective/p2p_bindings'
4
+ require_relative 'collective/vmm_bindings'
5
+ require_relative 'collective/topology'
6
+ require_relative 'collective/device_manager'
7
+ require_relative 'collective/transport/base'
8
+ require_relative 'collective/transport/p2p_transport'
9
+ require_relative 'collective/transport/ipc_transport'
10
+ require_relative 'collective/transport/vmm_ipc_transport'
11
+ require_relative 'collective/transport/vmm_ipc_structs'
12
+ require_relative 'collective/transport/host_staged_transport'
13
+ require_relative 'collective/transport_selector'
14
+ require_relative 'collective/algorithms/reduction_ops'
15
+ require_relative 'collective/algorithms/ring'
16
+ require_relative 'collective/algorithms/tree'
17
+ require_relative 'collective/communicator'
18
+ require_relative 'collective/array_ops'
19
+ require_relative 'collective/nvarray_adapter'
20
+ require_relative 'collective/health_monitor'
21
+ require_relative 'collective/communicator_healer'
22
+ require_relative 'collective/resilient_transport'
23
+ require_relative 'collective/dynamic_optimizer'
24
+
25
+ require "ignis"
26
+
27
module Ignis
  # Entry point of the NvCCL collective-communication layer.
  #
  # Exposes module-level helpers for booting the layer, querying CUDA
  # devices, and constructing communicators / topology detectors.
  module Collective
    # NvCCL (Ignis Collective Communications Library) version.
    # Note: NvCCL is NOT NCCL — this is an original design.
    # NOTE(review): the gem specification shipped with this package declares
    # version 0.0.1 — confirm whether this constant is meant to track it.
    VERSION = '0.1.0'

    # Base class for all errors raised by the collective layer.
    class Error < StandardError; end

    # Topology-related failure.
    class TopologyError < Error; end

    # Transport-related failure.
    class TransportError < Error; end

    # Communicator-related failure.
    class CommunicatorError < Error; end

    # Set to true once boot! has run to completion.
    @booted = false

    class << self
      # Boot the NvCCL collective layer.
      #
      # Registers the RecoveryProtocol callbacks, subscribes the EventBus
      # handlers and instantiates the HealthMonitor, then logs the number of
      # detected GPUs to stderr. Calling it again after a successful boot is
      # a no-op. The booted flag is only set after every step succeeded, so
      # an exception in any step leaves the layer unbooted.
      #
      # @return [void]
      def boot!
        return if @booted

        register_recovery_callbacks!
        subscribe_event_bus!
        start_health_monitor!

        @booted = true
        $stderr.puts "[NvCCL] Booted — #{device_count} GPU(s) detected"
      end

      # @return [Boolean] whether boot! has completed successfully
      def booted?
        @booted
      end

      # Create a new communicator for the specified GPUs.
      #
      # @param gpu_ids [Array<Integer>] GPU device IDs to include
      # @param rank [Integer] Process rank (for multi-process, default 0)
      # @param world_size [Integer] Total processes (default 1)
      # @return [Communicator]
      def create_communicator(gpu_ids:, rank: 0, world_size: 1)
        Communicator.new(gpu_ids: gpu_ids, rank: rank, world_size: world_size)
      end

      # Build a topology detector.
      #
      # @return [Topology::Detector]
      def detect_topology
        Topology::Detector.new
      end

      # List the available GPUs.
      #
      # Best-effort: any StandardError raised while querying CUDA is
      # swallowed and reported as "no devices".
      #
      # @return [Array<CUDA::Device>] empty when the query fails
      def available_devices
        CUDA::Device.list
      rescue StandardError
        []
      end

      # Count the available GPUs.
      #
      # Best-effort: any StandardError raised while querying CUDA is
      # swallowed and reported as zero devices.
      #
      # @return [Integer] 0 when the query fails
      def device_count
        CUDA::Device.count
      rescue StandardError
        0
      end

      # Check if multi-GPU is available.
      #
      # @return [Boolean] true when more than one device is reported
      def multi_gpu_available?
        device_count > 1
      end

      private

      # Register NvCCL's RecoveryProtocol callbacks.
      #
      # Every callback accepts arbitrary keyword arguments; only :gpu_id
      # (degraded) and :recovered_gpus (healthy) are read.
      #
      # @return [void]
      def register_recovery_callbacks!
        Ignis::Shared::RecoveryProtocol.register(
          layer: :nvccl,
          on_degraded: ->(**kwargs) {
            $stderr.puts "[NvCCL] GPU #{kwargs[:gpu_id]} degraded — suspending collective operations"
          },
          on_recovering: ->(**_kwargs) {
            $stderr.puts "[NvCCL] Entering recovery — rebuilding communicator"
          },
          on_healthy: ->(**kwargs) {
            recovered = kwargs[:recovered_gpus] || []
            $stderr.puts "[NvCCL] Recovery complete — active GPUs: #{recovered}"
            # Announce the post-recovery GPU set on the event bus.
            Ignis::Shared::EventBus.publish(:topology_changed, payload: { active_gpus: recovered })
          },
          on_failed: ->(**_kwargs) {
            $stderr.puts "[NvCCL] CRITICAL — recovery failed, NvCCL entering FAILED state"
          }
        )
      end

      # Subscribe to EventBus events that NvCCL handles.
      #
      # @return [void]
      def subscribe_event_bus!
        # When compute is done, NvCCL may need to do an all-reduce.
        Ignis::Shared::EventBus.subscribe(:compute_done, handler_id: :nvccl_compute) do |_payload|
          # NvCCL will check if an all-reduce is needed based on training config.
          # For now the handler is intentionally a no-op.
        end

        # WNAIS backpressure signals
        Ignis::Shared::EventBus.subscribe(:backpressure_on, handler_id: :nvccl_bp_on) do |_payload|
          $stderr.puts "[NvCCL] Backpressure received — throttling collective operations"
        end

        Ignis::Shared::EventBus.subscribe(:backpressure_off, handler_id: :nvccl_bp_off) do |_payload|
          $stderr.puts "[NvCCL] Backpressure released — resuming collective operations"
        end
      end

      # Instantiate the HealthMonitor, when that class is loaded.
      #
      # NOTE(review): only HealthMonitor.new is called here; presumably the
      # constructor starts the background thread — confirm in HealthMonitor.
      #
      # @return [void]
      def start_health_monitor!
        return unless defined?(HealthMonitor)

        # HealthMonitor will publish :health_alert and :gpu_failed events
        @health_monitor = HealthMonitor.new
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ignis-collective
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - NNW / Ignis contributors
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: ignis
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - '='
17
+ - !ruby/object:Gem::Version
18
+ version: 0.0.1
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - '='
24
+ - !ruby/object:Gem::Version
25
+ version: 0.0.1
26
+ description: |
27
+ ignis-collective brings NCCL-style collective communication to Ruby: ring/tree
28
+ all-reduce, and P2P / IPC (VMM) / host-staged / TCP transports, on the Ignis
29
+ foundation. EXPERIMENTAL — the transports require multiple GPUs / nodes and
30
+ cannot be exercised on a single GPU, so this is shipped separately from the
31
+ verified single-GPU stack.
32
+ executables: []
33
+ extensions: []
34
+ extra_rdoc_files: []
35
+ files:
36
+ - README.md
37
+ - lib/ignis-collective.rb
38
+ - lib/nvruby/collective.rb
39
+ - lib/nvruby/collective/algorithms/double_binary_tree.rb
40
+ - lib/nvruby/collective/algorithms/pipeliner.rb
41
+ - lib/nvruby/collective/algorithms/reduction_ops.rb
42
+ - lib/nvruby/collective/algorithms/ring.rb
43
+ - lib/nvruby/collective/algorithms/topology_router.rb
44
+ - lib/nvruby/collective/algorithms/tree.rb
45
+ - lib/nvruby/collective/array_ops.rb
46
+ - lib/nvruby/collective/communicator.rb
47
+ - lib/nvruby/collective/communicator_healer.rb
48
+ - lib/nvruby/collective/device_manager.rb
49
+ - lib/nvruby/collective/dynamic_optimizer.rb
50
+ - lib/nvruby/collective/health_monitor.rb
51
+ - lib/nvruby/collective/net/nd_adapter.rb
52
+ - lib/nvruby/collective/net/nd_bindings.rb
53
+ - lib/nvruby/collective/net/rdma_transport.rb
54
+ - lib/nvruby/collective/nvarray_adapter.rb
55
+ - lib/nvruby/collective/p2p_bindings.rb
56
+ - lib/nvruby/collective/resilient_transport.rb
57
+ - lib/nvruby/collective/topology.rb
58
+ - lib/nvruby/collective/transport/base.rb
59
+ - lib/nvruby/collective/transport/host_staged_transport.rb
60
+ - lib/nvruby/collective/transport/ipc_transport.rb
61
+ - lib/nvruby/collective/transport/p2p_transport.rb
62
+ - lib/nvruby/collective/transport/rdma_transports.rb
63
+ - lib/nvruby/collective/transport/rio_transport.rb
64
+ - lib/nvruby/collective/transport/tcp_transport.rb
65
+ - lib/nvruby/collective/transport/vmm_ipc_structs.rb
66
+ - lib/nvruby/collective/transport/vmm_ipc_transport.rb
67
+ - lib/nvruby/collective/transport_selector.rb
68
+ - lib/nvruby/collective/vmm_bindings.rb
69
+ homepage: https://github.com/tigel-agm/Ignis
70
+ licenses:
71
+ - MIT
72
+ metadata:
73
+ source_code_uri: https://github.com/tigel-agm/Ignis
74
+ rubygems_mfa_required: 'true'
75
+ rdoc_options: []
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '3.1'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubygems_version: 3.6.9
90
+ specification_version: 4
91
+ summary: Multi-GPU collective communication (NvCCL) for Ruby — EXPERIMENTAL
92
+ test_files: []