torch-ddp 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -8
- data/ext/torch_ddp/distributed.cpp +27 -9
- data/ext/torch_ddp/extconf.rb +9 -18
- data/lib/torch/ddp/monkey_patch.rb +3 -1
- data/lib/torch/ddp/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 341ecc522b0ed689920f0841ef56512531aa68706fc90768ee32f0234100a3d6
+  data.tar.gz: f6ae0221f15cc9d793c3151865ed9d0189a74efba3122cff08b4f63aa829c754
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: afb85d047d017eb59f1001eac7bb73bd81a33001c0ec9351d4e578f898530acc015de83cb8fe9137580ba93399abe910652fefa1118c103724ee314f5f0cb2f4
+  data.tar.gz: 30e2153309523ef5aaab848317f2e60b2fce16ab90950e0c9f7327e726bf8e3c47af4c99dc52a7c80719aad3680dcc0cc2a79ed040654d11148d7b2a0de12dd9
data/README.md
CHANGED
@@ -6,16 +6,10 @@ Note: This gem has only seen testing across a narrow set of multi-GPU setups (li
 
 ## Installation
 
-Build LibTorch with distributed backends (Gloo for CPU, NCCL for CUDA). Point the extension at your LibTorch, CUDA, and optional Gloo includes:
+Build or download LibTorch with support for distributed backends (Gloo for CPU, NCCL for CUDA). Point the extension at your LibTorch, CUDA, and optional Gloo includes:
 
 ```sh
-bundle config build.torch-ddp --with-torch-dir=/path/to/libtorch --with-gloo-include=/path/to/gloo
-```
-
-If your CUDA or Gloo headers aren't in standard locations, extend the build config:
-
-```sh
-bundle config build.torch-ddp --with-torch-dir=/path/to/libtorch --with-cuda-include=/path/to/cuda/include --with-gloo-include=/path/to/gloo/repo
+bundle config build.torch-ddp --with-torch-dir=/path/to/libtorch --with-cuda-dir=/path/to/cuda --with-gloo-include=/path/to/gloo
 ```
 
 Add the gem next to `torch-rb`:
data/ext/torch_ddp/distributed.cpp
CHANGED

@@ -8,14 +8,14 @@
 #include <vector>
 
 #include <torch/torch.h>
-#if defined(USE_C10D) && defined(USE_C10D_NCCL)
-#include <torch/cuda.h>
-#include <c10/cuda/CUDAFunctions.h>
-#endif
 
 #include <rice/rice.hpp>
 #include <rice/stl.hpp>
 
+#if defined(USE_C10D) && defined(USE_C10D_NCCL)
+#include <cuda_runtime_api.h>
+#endif
+
 static_assert(
     TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 9,
     "Incompatible LibTorch version");
@@ -100,6 +100,24 @@ int reduce_op_from_int(int code) {
   return code;
 }
 
+#if defined(USE_C10D_NCCL)
+int cuda_device_count() {
+  int count = 0;
+  auto status = cudaGetDeviceCount(&count);
+  if (status != 0) {
+    rb_raise(rb_eRuntimeError, "cudaGetDeviceCount failed with code %d", status);
+  }
+  return count;
+}
+
+void ensure_cuda_device_set(int device_id) {
+  auto status = cudaSetDevice(device_id);
+  if (status != 0) {
+    rb_raise(rb_eRuntimeError, "cudaSetDevice(%d) failed with code %d", device_id, status);
+  }
+}
+#endif
+
 #endif
 
 } // namespace
@@ -212,18 +230,18 @@ void init_distributed(Rice::Module& m) {
 
   if (device_id >= 0 && backend_lower == "nccl") {
 #if defined(USE_C10D_NCCL)
-
+    auto device_count = cuda_device_count();
+    if (device_count <= 0) {
       rb_raise(rb_eRuntimeError, "CUDA is not available for NCCL backend");
     }
-
-    if (device_id >= static_cast<int>(device_count)) {
+    if (device_id >= device_count) {
       rb_raise(
           rb_eArgError,
           "Invalid device_id %d for NCCL backend (available devices: %d)",
           device_id,
-          static_cast<int>(device_count));
+          device_count);
     }
-
+    ensure_cuda_device_set(device_id);
     pg->setBoundDeviceId(c10::Device(c10::kCUDA, device_id));
 #endif
   }
data/ext/torch_ddp/extconf.rb
CHANGED
@@ -34,9 +34,11 @@ end
 
 cuda_inc, cuda_lib = dir_config("cuda")
 cuda_lib ||= "/usr/local/cuda/lib64"
+cuda_inc ||= "/usr/include"
 
 cudnn_inc, cudnn_lib = dir_config("cudnn")
 cudnn_lib ||= "/usr/local/cuda/lib"
+abort "cuda.h not found" unless find_header("cuda.h")
 
 gloo_inc, _ = dir_config("gloo")
 gloo_inc ||= "./vendor/gloo"
@@ -61,20 +63,6 @@ $INCFLAGS += " -I#{inc}/torch/csrc/api/include"
 CONFIG["CC"] = CONFIG["CXX"]
 $CFLAGS = $CXXFLAGS
 
-supports_c10_cuda = with_cuda && try_compile(<<~CPP)
-  #include <torch/torch.h>
-  #include <c10/cuda/CUDAFunctions.h>
-
-  int main() {
-    c10::cuda::set_device(0);
-    return 0;
-  }
-CPP
-
-if supports_c10_cuda
-  $defs << " -DHAVE_C10_CUDA"
-end
-
 $LDFLAGS += " -Wl,-rpath,#{lib}"
 if RbConfig::CONFIG["host_os"] =~ /darwin/i && RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i && Dir.exist?("/opt/homebrew/opt/libomp/lib")
   $LDFLAGS += ",-rpath,/opt/homebrew/opt/libomp/lib"
@@ -128,7 +116,7 @@ supports_c10d_gloo = supports_c10d && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_GL
   }
 CPP
 
-supports_c10d_nccl = with_cuda &&
+supports_c10d_nccl = with_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL")
   #include <torch/torch.h>
   #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
 
@@ -143,12 +131,15 @@ if supports_c10d_gloo
   $defs << "-DUSE_C10D_GLOO"
   puts "GLOO support detected"
 end
-unless supports_c10_cuda
-  puts "No c10 CUDA headers found. NCCL is unavailable"
-end
 if supports_c10d_nccl
   $defs << "-DUSE_C10D_NCCL"
   puts "NCCL support detected"
+elsif with_cuda
+  puts "NCCL support not detected; CUDA libraries found but headers may be unavailable"
+end
+
+unless supports_c10d_gloo || supports_c10d_nccl
+  abort "Neither Gloo nor NCCL support detected. Ensure LibTorch is built with distributed backends and provide Gloo or NCCL headers."
 end
 
 # create makefile
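For context, these backend checks use mkmf's standard probe pattern: compile (and for `try_link`, also link) a throwaway program, then gate a preprocessor define on the result; `find_header` works the same way for a single include. A standalone sketch of the idiom (illustrative only, not a drop-in extconf.rb):

```ruby
require "mkmf"

# try_link builds and links a small test program with the extra flags;
# it returns true only if both headers and libraries resolve.
supports_nccl = try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL")
  #include <torch/torch.h>
  #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>

  int main() {
    return 0;
  }
CPP

if supports_nccl
  $defs << "-DUSE_C10D_NCCL" # C++ sources can now #ifdef on this macro
end

create_makefile("demo/ext") # hypothetical extension name, for illustration
```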
data/lib/torch/ddp/monkey_patch.rb
CHANGED

@@ -100,8 +100,10 @@ module Torch
 
     def patch_device_helpers
       Torch::Device.class_eval do
+        alias_method :_torch_ddp_original_to_s, :to_s unless method_defined?(:_torch_ddp_original_to_s)
+
         define_method(:to_s) do
-          respond_to?(:_str) ? _str :
+          respond_to?(:_str) ? _str : _torch_ddp_original_to_s
         end
       end
 
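For context, the fix uses the alias-then-delegate idiom: capture the original method once (guarded so loading the file twice cannot alias the patched method to itself), then fall back to it when the fast path is unavailable. A self-contained sketch with a stand-in class:

```ruby
# Stand-in for Torch::Device; only the patching pattern matters here.
class Device
  def to_s
    "cpu"
  end
end

Device.class_eval do
  # Capture the original exactly once, even across repeated loads.
  alias_method :_original_to_s, :to_s unless method_defined?(:_original_to_s)

  define_method(:to_s) do
    # Prefer a native _str accessor when one exists; otherwise fall
    # back to the preserved original.
    respond_to?(:_str) ? _str : _original_to_s
  end
end

puts Device.new.to_s # => "cpu" (no _str defined, so it falls back)
```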
data/lib/torch/ddp/version.rb
CHANGED