ignis-collective 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +7 -0
- data/lib/ignis-collective.rb +9 -0
- data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
- data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
- data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
- data/lib/nvruby/collective/algorithms/ring.rb +421 -0
- data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
- data/lib/nvruby/collective/algorithms/tree.rb +291 -0
- data/lib/nvruby/collective/array_ops.rb +240 -0
- data/lib/nvruby/collective/communicator.rb +633 -0
- data/lib/nvruby/collective/communicator_healer.rb +276 -0
- data/lib/nvruby/collective/device_manager.rb +216 -0
- data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
- data/lib/nvruby/collective/health_monitor.rb +333 -0
- data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
- data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
- data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
- data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
- data/lib/nvruby/collective/p2p_bindings.rb +121 -0
- data/lib/nvruby/collective/resilient_transport.rb +296 -0
- data/lib/nvruby/collective/topology.rb +347 -0
- data/lib/nvruby/collective/transport/base.rb +138 -0
- data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
- data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
- data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
- data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
- data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
- data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
- data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
- data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
- data/lib/nvruby/collective/transport_selector.rb +200 -0
- data/lib/nvruby/collective/vmm_bindings.rb +212 -0
- data/lib/nvruby/collective.rb +156 -0
- metadata +92 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'collective/p2p_bindings'
|
|
4
|
+
require_relative 'collective/vmm_bindings'
|
|
5
|
+
require_relative 'collective/topology'
|
|
6
|
+
require_relative 'collective/device_manager'
|
|
7
|
+
require_relative 'collective/transport/base'
|
|
8
|
+
require_relative 'collective/transport/p2p_transport'
|
|
9
|
+
require_relative 'collective/transport/ipc_transport'
|
|
10
|
+
require_relative 'collective/transport/vmm_ipc_transport'
|
|
11
|
+
require_relative 'collective/transport/vmm_ipc_structs'
|
|
12
|
+
require_relative 'collective/transport/host_staged_transport'
|
|
13
|
+
require_relative 'collective/transport_selector'
|
|
14
|
+
require_relative 'collective/algorithms/reduction_ops'
|
|
15
|
+
require_relative 'collective/algorithms/ring'
|
|
16
|
+
require_relative 'collective/algorithms/tree'
|
|
17
|
+
require_relative 'collective/communicator'
|
|
18
|
+
require_relative 'collective/array_ops'
|
|
19
|
+
require_relative 'collective/nvarray_adapter'
|
|
20
|
+
require_relative 'collective/health_monitor'
|
|
21
|
+
require_relative 'collective/communicator_healer'
|
|
22
|
+
require_relative 'collective/resilient_transport'
|
|
23
|
+
require_relative 'collective/dynamic_optimizer'
|
|
24
|
+
|
|
25
|
+
require "ignis"
|
|
26
|
+
|
|
27
|
+
module Ignis
  # Top-level entry point of the NvCCL collective-communication layer.
  #
  # Exposes module-level helpers to boot the layer, build communicators,
  # and query the GPUs visible through CUDA::Device.
  module Collective
    # NvCCL (Ignis Collective Communications Library) version
    # Note: NvCCL is NOT NCCL — this is an original design.
    # NOTE(review): the published gem is versioned 0.0.1 while this constant
    # says 0.1.0 — confirm whether the two are meant to track each other.
    VERSION = '0.1.0'

    # Custom error classes; rescue Ignis::Collective::Error to catch them all.
    class Error < StandardError; end
    class TopologyError < Error; end
    class TransportError < Error; end
    class CommunicatorError < Error; end

    # Class-level instance variable: flipped to true by a successful boot!.
    @booted = false

    class << self
      # Boot the NvCCL collective layer.
      #
      # Registers the RecoveryProtocol callbacks, subscribes to EventBus
      # events, creates the HealthMonitor (when that class is loaded) and
      # finally logs the detected GPU count to stderr.
      #
      # Calling it again after a successful boot is a no-op. Topology
      # detection is NOT performed here; use {detect_topology} for that.
      #
      # NOTE: the booted flag is only set after every step succeeded, so if
      # a step raises, a later retry runs all steps again from the start.
      #
      # @return [void]
      def boot!
        return if @booted

        register_recovery_callbacks!
        subscribe_event_bus!
        start_health_monitor!

        @booted = true
        $stderr.puts "[NvCCL] Booted — #{device_count} GPU(s) detected"
      end

      # Whether boot! has completed successfully.
      #
      # @return [Boolean]
      def booted?
        @booted
      end

      # Create a new communicator for the specified GPUs.
      #
      # All arguments are forwarded unchanged to Communicator.new.
      #
      # @param gpu_ids [Array<Integer>] GPU device IDs to include
      # @param rank [Integer] Process rank (for multi-process, default 0)
      # @param world_size [Integer] Total processes (default 1)
      # @return [Communicator]
      def create_communicator(gpu_ids:, rank: 0, world_size: 1)
        Communicator.new(gpu_ids: gpu_ids, rank: rank, world_size: world_size)
      end

      # Build a fresh topology detector.
      #
      # NOTE(review): this only instantiates Topology::Detector; whether the
      # detection itself runs in the constructor or lazily is not visible
      # here — confirm against Topology::Detector.
      #
      # @return [Topology::Detector]
      def detect_topology
        Topology::Detector.new
      end

      # Get list of available GPUs.
      #
      # Best-effort: any StandardError raised while listing devices (which
      # includes the NameError raised when CUDA is not loaded) yields an
      # empty list instead of propagating.
      #
      # @return [Array<CUDA::Device>]
      def available_devices
        CUDA::Device.list
      rescue StandardError
        []
      end

      # Number of GPUs reported by CUDA::Device.count.
      #
      # Best-effort: any StandardError (including a missing CUDA constant)
      # is reported as zero devices.
      #
      # @return [Integer]
      def device_count
        CUDA::Device.count
      rescue StandardError
        0
      end

      # Check if multi-GPU is available, i.e. more than one device is counted.
      #
      # @return [Boolean]
      def multi_gpu_available?
        device_count > 1
      end

      private

      # Register NvCCL's RecoveryProtocol callbacks under the :nvccl layer.
      #
      # Every callback accepts arbitrary keyword arguments and logs to
      # stderr; on_healthy additionally publishes :topology_changed with
      # the recovered GPU list (empty when :recovered_gpus is absent).
      #
      # @return [void]
      def register_recovery_callbacks!
        Ignis::Shared::RecoveryProtocol.register(
          layer: :nvccl,
          on_degraded: ->(**kwargs) {
            $stderr.puts "[NvCCL] GPU #{kwargs[:gpu_id]} degraded — suspending collective operations"
          },
          on_recovering: ->(**_kwargs) {
            $stderr.puts "[NvCCL] Entering recovery — rebuilding communicator"
          },
          on_healthy: ->(**kwargs) {
            recovered = kwargs[:recovered_gpus] || []
            $stderr.puts "[NvCCL] Recovery complete — active GPUs: #{recovered}"
            Ignis::Shared::EventBus.publish(:topology_changed, payload: { active_gpus: recovered })
          },
          on_failed: ->(**_kwargs) {
            $stderr.puts "[NvCCL] CRITICAL — recovery failed, NvCCL entering FAILED state"
          }
        )
      end

      # Subscribe to EventBus events that NvCCL handles.
      #
      # @return [void]
      def subscribe_event_bus!
        # When compute is done, NvCCL may need to do an all-reduce.
        # The handler is intentionally empty for now: it is a placeholder
        # until the all-reduce decision based on training config exists.
        Ignis::Shared::EventBus.subscribe(:compute_done, handler_id: :nvccl_compute) do |_payload|
        end

        # WNAIS backpressure signals (currently only logged)
        Ignis::Shared::EventBus.subscribe(:backpressure_on, handler_id: :nvccl_bp_on) do |_payload|
          $stderr.puts "[NvCCL] Backpressure received — throttling collective operations"
        end

        Ignis::Shared::EventBus.subscribe(:backpressure_off, handler_id: :nvccl_bp_off) do |_payload|
          $stderr.puts "[NvCCL] Backpressure released — resuming collective operations"
        end
      end

      # Create the HealthMonitor and keep it in @health_monitor.
      #
      # Does nothing when the HealthMonitor constant is not loaded.
      #
      # NOTE(review): only HealthMonitor.new is called here; presumably the
      # constructor starts the background thread — confirm. Per the original
      # author's note, HealthMonitor will publish :health_alert and
      # :gpu_failed events.
      #
      # @return [void]
      def start_health_monitor!
        return unless defined?(HealthMonitor)

        @health_monitor = HealthMonitor.new
      end
    end
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: ignis-collective
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- NNW / Ignis contributors
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: ignis
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - '='
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: 0.0.1
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - '='
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: 0.0.1
|
|
26
|
+
description: |
|
|
27
|
+
ignis-collective brings NCCL-style collective communication to Ruby: ring/tree
|
|
28
|
+
all-reduce, and P2P / IPC (VMM) / host-staged / TCP transports, on the Ignis
|
|
29
|
+
foundation. EXPERIMENTAL — the transports require multiple GPUs / nodes and
|
|
30
|
+
cannot be exercised on a single GPU, so this is shipped separately from the
|
|
31
|
+
verified single-GPU stack.
|
|
32
|
+
executables: []
|
|
33
|
+
extensions: []
|
|
34
|
+
extra_rdoc_files: []
|
|
35
|
+
files:
|
|
36
|
+
- README.md
|
|
37
|
+
- lib/ignis-collective.rb
|
|
38
|
+
- lib/nvruby/collective.rb
|
|
39
|
+
- lib/nvruby/collective/algorithms/double_binary_tree.rb
|
|
40
|
+
- lib/nvruby/collective/algorithms/pipeliner.rb
|
|
41
|
+
- lib/nvruby/collective/algorithms/reduction_ops.rb
|
|
42
|
+
- lib/nvruby/collective/algorithms/ring.rb
|
|
43
|
+
- lib/nvruby/collective/algorithms/topology_router.rb
|
|
44
|
+
- lib/nvruby/collective/algorithms/tree.rb
|
|
45
|
+
- lib/nvruby/collective/array_ops.rb
|
|
46
|
+
- lib/nvruby/collective/communicator.rb
|
|
47
|
+
- lib/nvruby/collective/communicator_healer.rb
|
|
48
|
+
- lib/nvruby/collective/device_manager.rb
|
|
49
|
+
- lib/nvruby/collective/dynamic_optimizer.rb
|
|
50
|
+
- lib/nvruby/collective/health_monitor.rb
|
|
51
|
+
- lib/nvruby/collective/net/nd_adapter.rb
|
|
52
|
+
- lib/nvruby/collective/net/nd_bindings.rb
|
|
53
|
+
- lib/nvruby/collective/net/rdma_transport.rb
|
|
54
|
+
- lib/nvruby/collective/nvarray_adapter.rb
|
|
55
|
+
- lib/nvruby/collective/p2p_bindings.rb
|
|
56
|
+
- lib/nvruby/collective/resilient_transport.rb
|
|
57
|
+
- lib/nvruby/collective/topology.rb
|
|
58
|
+
- lib/nvruby/collective/transport/base.rb
|
|
59
|
+
- lib/nvruby/collective/transport/host_staged_transport.rb
|
|
60
|
+
- lib/nvruby/collective/transport/ipc_transport.rb
|
|
61
|
+
- lib/nvruby/collective/transport/p2p_transport.rb
|
|
62
|
+
- lib/nvruby/collective/transport/rdma_transports.rb
|
|
63
|
+
- lib/nvruby/collective/transport/rio_transport.rb
|
|
64
|
+
- lib/nvruby/collective/transport/tcp_transport.rb
|
|
65
|
+
- lib/nvruby/collective/transport/vmm_ipc_structs.rb
|
|
66
|
+
- lib/nvruby/collective/transport/vmm_ipc_transport.rb
|
|
67
|
+
- lib/nvruby/collective/transport_selector.rb
|
|
68
|
+
- lib/nvruby/collective/vmm_bindings.rb
|
|
69
|
+
homepage: https://github.com/tigel-agm/Ignis
|
|
70
|
+
licenses:
|
|
71
|
+
- MIT
|
|
72
|
+
metadata:
|
|
73
|
+
source_code_uri: https://github.com/tigel-agm/Ignis
|
|
74
|
+
rubygems_mfa_required: 'true'
|
|
75
|
+
rdoc_options: []
|
|
76
|
+
require_paths:
|
|
77
|
+
- lib
|
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - ">="
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '3.1'
|
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
84
|
+
requirements:
|
|
85
|
+
- - ">="
|
|
86
|
+
- !ruby/object:Gem::Version
|
|
87
|
+
version: '0'
|
|
88
|
+
requirements: []
|
|
89
|
+
rubygems_version: 3.6.9
|
|
90
|
+
specification_version: 4
|
|
91
|
+
summary: Multi-GPU collective communication (NvCCL) for Ruby — EXPERIMENTAL
|
|
92
|
+
test_files: []
|