torch-ddp 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 22fe592695d427f7b20bc8c2ac008e46759a3edd6d0ddad5019aee709e6f9c98
-  data.tar.gz: 6d01523fa38e56dd0de452476c6f67514d971eb43ae2eba78a1fb98f6ba0b4a1
+  metadata.gz: fc9265e095072dfefaedc2a0c8da3294a84f973bfb4db52deb526e61948efb5e
+  data.tar.gz: '05659da70316d5ea55113c7119d1af3ffac55c1032acc5c8e963956a3c60d744'
 SHA512:
-  metadata.gz: 151cf148508441bad8db78979200e0b83543040df57d01cd5c21bb36703862b32968b08a815212e806c5956420051df7057b9ed16717ec98c760036c4cae57bf
-  data.tar.gz: 5711b842e1a9bb4dce28bcd5026488319c4f8fe6f908d22728ce5f2a6aeeac99afd87cf61371ba231f120a15fb15a5af917fa6a30dcfa23728f0629bfa31be5a
+  metadata.gz: 13fdb5d3ab0ff19a2be1c1225cc95433ce9bf7bd3b9ec24918373a78ea57d2b5baf48c2a74649c0a6ee74370f343888a0340fdcee01ab6b82a47d4e1a0c6f850
+  data.tar.gz: 393a9e57b2c5377e47354bd20da205bd9ccfe83050e5cb5be5725446ea8401a0bc1c20af016fe26c536fca838a56f9b02a3295718a7e5bf79807d72b84a39ecc
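For anyone verifying a downloaded copy against these values: the checksums cover the `metadata.gz` and `data.tar.gz` entries inside the `.gem` archive, not the archive itself. A minimal Ruby sketch using only the standard library (file paths are illustrative; a `.gem` is a tar archive, so extract the entries first, e.g. `tar -xf torch-ddp-0.1.3.gem`):

```ruby
require "digest"

# Hash the two entries extracted from the .gem archive and compare the
# output against the SHA256/SHA512 values in checksums.yaml above.
%w[metadata.gz data.tar.gz].each do |entry|
  puts "#{entry} SHA256: #{Digest::SHA256.file(entry).hexdigest}"
  puts "#{entry} SHA512: #{Digest::SHA512.file(entry).hexdigest}"
end
```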
data/README.md CHANGED
@@ -6,16 +6,10 @@ Note: This gem has only seen testing across a narrow set of multi-GPU setups (li
 
 ## Installation
 
-Build LibTorch with distributed backends (Gloo for CPU, NCCL for CUDA). Point the extension at your LibTorch, CUDA, and optional Gloo includes:
+Build or download LibTorch with support for distributed backends (Gloo for CPU, NCCL for CUDA). Point the extension at your LibTorch, CUDA, and optional Gloo includes:
 
 ```sh
-bundle config build.torch-ddp --with-torch-dir=/path/to/libtorch --with-gloo-include=/path/to/gloo
-```
-
-If your CUDA or Gloo headers aren't in standard locations, extend the build config:
-
-```sh
-bundle config build.torch-ddp --with-torch-dir=/path/to/libtorch --with-cuda-include=/path/to/cuda/include --with-gloo-include=/path/to/gloo/repo
+bundle config build.torch-ddp --with-torch-dir=/path/to/libtorch --with-cuda-dir=/path/to/cuda --with-gloo-include=/path/to/gloo
 ```
 
 Add the gem next to `torch-rb`:
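The README change swaps `--with-cuda-include` for `--with-cuda-dir`, which lines up with how Ruby's stock mkmf resolves such flags. A minimal sketch of that standard behavior (paths are placeholders):

```ruby
require "mkmf"

# dir_config("cuda") consumes --with-cuda-dir=/path/to/cuda and yields
# ["/path/to/cuda/include", "/path/to/cuda/lib"]; the narrower
# --with-cuda-include= and --with-cuda-lib= flags override each half
# individually when set.
cuda_inc, cuda_lib = dir_config("cuda")
```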
@@ -8,14 +8,14 @@
 #include <vector>
 
 #include <torch/torch.h>
-#if defined(USE_C10D) && defined(USE_C10D_NCCL)
-#include <torch/cuda.h>
-#include <c10/cuda/CUDAFunctions.h>
-#endif
 
 #include <rice/rice.hpp>
 #include <rice/stl.hpp>
 
+#if defined(USE_C10D) && defined(USE_C10D_NCCL)
+#include <cuda_runtime_api.h>
+#endif
+
 static_assert(
     TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 9,
     "Incompatible LibTorch version");
@@ -100,6 +100,24 @@ int reduce_op_from_int(int code) {
   return code;
 }
 
+#if defined(USE_C10D_NCCL)
+int cuda_device_count() {
+  int count = 0;
+  auto status = cudaGetDeviceCount(&count);
+  if (status != 0) {
+    rb_raise(rb_eRuntimeError, "cudaGetDeviceCount failed with code %d", status);
+  }
+  return count;
+}
+
+void ensure_cuda_device_set(int device_id) {
+  auto status = cudaSetDevice(device_id);
+  if (status != 0) {
+    rb_raise(rb_eRuntimeError, "cudaSetDevice(%d) failed with code %d", device_id, status);
+  }
+}
+#endif
+
 #endif
 
 } // namespace
@@ -212,18 +230,18 @@ void init_distributed(Rice::Module& m) {
 
   if (device_id >= 0 && backend_lower == "nccl") {
 #if defined(USE_C10D_NCCL)
-    if (!torch::cuda::is_available()) {
+    auto device_count = cuda_device_count();
+    if (device_count <= 0) {
       rb_raise(rb_eRuntimeError, "CUDA is not available for NCCL backend");
     }
-    auto device_count = torch::cuda::device_count();
-    if (device_id >= static_cast<int>(device_count)) {
+    if (device_id >= device_count) {
       rb_raise(
         rb_eArgError,
         "Invalid device_id %d for NCCL backend (available devices: %d)",
         device_id,
-        static_cast<int>(device_count));
+        device_count);
     }
-    c10::cuda::set_device(device_id);
+    ensure_cuda_device_set(device_id);
     pg->setBoundDeviceId(c10::Device(c10::kCUDA, device_id));
 #endif
   }
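Taken together, the extension hunks drop the `torch/cuda.h` and `c10/cuda` dependencies in favor of the plain CUDA runtime API (`cudaGetDeviceCount`/`cudaSetDevice`), so the NCCL path no longer needs LibTorch's CUDA headers at build time. From Ruby, the new checks surface as ordinary exceptions; a hypothetical illustration (the initializer name is an assumption for this sketch, while the error strings come from the code above):

```ruby
begin
  # Hypothetical entry point; the gem's actual method name may differ.
  Torch::DDP.init_process_group(backend: "nccl", device_id: 99)
rescue ArgumentError => e
  e.message  # e.g. "Invalid device_id 99 for NCCL backend (available devices: 4)"
rescue RuntimeError => e
  e.message  # "CUDA is not available for NCCL backend"
end
```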
@@ -34,9 +34,11 @@ end
 
 cuda_inc, cuda_lib = dir_config("cuda")
 cuda_lib ||= "/usr/local/cuda/lib64"
+cuda_inc ||= "/usr/include"
 
 cudnn_inc, cudnn_lib = dir_config("cudnn")
 cudnn_lib ||= "/usr/local/cuda/lib"
+abort "cuda.h not found" unless find_header("cuda.h")
 
 gloo_inc, _ = dir_config("gloo")
 gloo_inc ||= "./vendor/gloo"
@@ -61,20 +63,6 @@ $INCFLAGS += " -I#{inc}/torch/csrc/api/include"
 CONFIG["CC"] = CONFIG["CXX"]
 $CFLAGS = $CXXFLAGS
 
-supports_c10_cuda = with_cuda && try_compile(<<~CPP)
-  #include <torch/torch.h>
-  #include <c10/cuda/CUDAFunctions.h>
-
-  int main() {
-    c10::cuda::set_device(0);
-    return 0;
-  }
-CPP
-
-if supports_c10_cuda
-  $defs << " -DHAVE_C10_CUDA"
-end
-
 $LDFLAGS += " -Wl,-rpath,#{lib}"
 if RbConfig::CONFIG["host_os"] =~ /darwin/i && RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i && Dir.exist?("/opt/homebrew/opt/libomp/lib")
   $LDFLAGS += ",-rpath,/opt/homebrew/opt/libomp/lib"
@@ -128,7 +116,7 @@ supports_c10d_gloo = supports_c10d && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_GL
   }
 CPP
 
-supports_c10d_nccl = with_cuda && supports_c10_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL")
+supports_c10d_nccl = with_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL")
   #include <torch/torch.h>
   #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
 
@@ -143,12 +131,15 @@ if supports_c10d_gloo
   $defs << "-DUSE_C10D_GLOO"
   puts "GLOO support detected"
 end
-unless supports_c10_cuda
-  puts "No c10 CUDA headers found. NCCL is unavailable"
-end
 if supports_c10d_nccl
   $defs << "-DUSE_C10D_NCCL"
   puts "NCCL support detected"
+elsif with_cuda
+  puts "NCCL support not detected; CUDA libraries found but headers may be unavailable"
+end
+
+unless supports_c10d_gloo || supports_c10d_nccl
+  abort "Neither Gloo nor NCCL support detected. Ensure LibTorch is built with distributed backends and provide Gloo or NCCL headers."
 end
 
 # create makefile
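Net effect in extconf.rb: the standalone `c10/cuda` probe and its `HAVE_C10_CUDA` define are gone, NCCL detection rides on a single `try_link` check, and configuration now aborts early instead of silently producing a build with no distributed backend. A generic sketch of this mkmf probe-and-define pattern (simplified, not the gem's exact extconf):

```ruby
require "mkmf"

# try_link compiles and links the snippet with the extra flags appended;
# it returns true only if headers, libraries, and toolchain all cooperate.
supports_backend = try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL")
  #include <torch/torch.h>
  int main() { return 0; }
CPP

if supports_backend
  $defs << "-DUSE_C10D_NCCL"  # lands on the compiler command line
else
  abort "no distributed backend detected"
end
```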
@@ -100,7 +100,9 @@ module Torch
 
     def patch_device_helpers
       Torch::Device.class_eval do
-        define_method(:to_s) { _str }
+        define_method(:to_s) do
+          respond_to?(:_str) ? _str : super()
+        end
       end
 
       unless Torch.const_defined?(:DeviceString)
@@ -121,7 +123,13 @@ module Torch
           Torch::Device.new(device.to_s)
         end
       end
-      super(@device._str)
+      device_str =
+        if @device.respond_to?(:_str)
+          @device._str
+        else
+          @device.to_s
+        end
+      super(device_str)
     end
 
     def type
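Both Ruby hunks apply the same defensive pattern: probe for torch-rb's `_str` helper before calling it and fall back to plain `to_s`, so the patch survives torch-rb releases that rename or drop that method. The pattern in isolation (generic names, not the gem's):

```ruby
# Generic illustration of the respond_to? fallback used above.
class DeviceLabel
  def initialize(device)
    @device = device
  end

  def to_str
    # Prefer the optional native helper; degrade gracefully without it.
    @device.respond_to?(:_str) ? @device._str : @device.to_s
  end
end
```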
@@ -1,5 +1,5 @@
 module Torch
   module DDP
-    VERSION = "0.1.1"
+    VERSION = "0.1.3"
   end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: torch-ddp
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.3
 platform: ruby
 authors:
 - Ivan Razuvaev