torch-ddp 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 22fe592695d427f7b20bc8c2ac008e46759a3edd6d0ddad5019aee709e6f9c98
-  data.tar.gz: 6d01523fa38e56dd0de452476c6f67514d971eb43ae2eba78a1fb98f6ba0b4a1
+  metadata.gz: fc9265e095072dfefaedc2a0c8da3294a84f973bfb4db52deb526e61948efb5e
+  data.tar.gz: '05659da70316d5ea55113c7119d1af3ffac55c1032acc5c8e963956a3c60d744'
 SHA512:
-  metadata.gz: 151cf148508441bad8db78979200e0b83543040df57d01cd5c21bb36703862b32968b08a815212e806c5956420051df7057b9ed16717ec98c760036c4cae57bf
-  data.tar.gz: 5711b842e1a9bb4dce28bcd5026488319c4f8fe6f908d22728ce5f2a6aeeac99afd87cf61371ba231f120a15fb15a5af917fa6a30dcfa23728f0629bfa31be5a
+  metadata.gz: 13fdb5d3ab0ff19a2be1c1225cc95433ce9bf7bd3b9ec24918373a78ea57d2b5baf48c2a74649c0a6ee74370f343888a0340fdcee01ab6b82a47d4e1a0c6f850
+  data.tar.gz: 393a9e57b2c5377e47354bd20da205bd9ccfe83050e5cb5be5725446ea8401a0bc1c20af016fe26c536fca838a56f9b02a3295718a7e5bf79807d72b84a39ecc
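For anyone verifying a downloaded copy against these values: the checksums cover the `metadata.gz` and `data.tar.gz` entries inside the `.gem` archive, not the archive itself. A minimal Ruby sketch using only the standard library (file paths are illustrative; a `.gem` is a tar archive, so extract the entries first, e.g. `tar -xf torch-ddp-0.1.3.gem`):

```ruby
require "digest"

# Hash the two entries extracted from the .gem archive and compare the
# output against the SHA256/SHA512 values in checksums.yaml above.
%w[metadata.gz data.tar.gz].each do |entry|
  puts "#{entry} SHA256: #{Digest::SHA256.file(entry).hexdigest}"
  puts "#{entry} SHA512: #{Digest::SHA512.file(entry).hexdigest}"
end
```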
data/README.md CHANGED
@@ -6,16 +6,10 @@ Note: This gem has only seen testing across a narrow set of multi-GPU setups (li
 
 ## Installation
 
-Build LibTorch with distributed backends (Gloo for CPU, NCCL for CUDA). Point the extension at your LibTorch, CUDA, and optional Gloo includes:
+Build or download LibTorch with support for distributed backends (Gloo for CPU, NCCL for CUDA). Point the extension at your LibTorch, CUDA, and optional Gloo includes:
 
 ```sh
-bundle config build.torch-ddp --with-torch-dir=/path/to/libtorch --with-gloo-include=/path/to/gloo
-```
-
-If your CUDA or Gloo headers aren't in standard locations, extend the build config:
-
-```sh
-bundle config build.torch-ddp --with-torch-dir=/path/to/libtorch --with-cuda-include=/path/to/cuda/include --with-gloo-include=/path/to/gloo/repo
+bundle config build.torch-ddp --with-torch-dir=/path/to/libtorch --with-cuda-dir=/path/to/cuda --with-gloo-include=/path/to/gloo
 ```
 
 Add the gem next to `torch-rb`:
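The README change swaps `--with-cuda-include` for `--with-cuda-dir`, which lines up with how Ruby's stock mkmf resolves such flags. A minimal sketch of that standard behavior (paths are placeholders):

```ruby
require "mkmf"

# dir_config("cuda") consumes --with-cuda-dir=/path/to/cuda and yields
# ["/path/to/cuda/include", "/path/to/cuda/lib"]; the narrower
# --with-cuda-include= and --with-cuda-lib= flags override each half
# individually when set.
cuda_inc, cuda_lib = dir_config("cuda")
```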
@@ -8,14 +8,14 @@
 #include <vector>
 
 #include <torch/torch.h>
-#if defined(USE_C10D) && defined(USE_C10D_NCCL)
-#include <torch/cuda.h>
-#include <c10/cuda/CUDAFunctions.h>
-#endif
 
 #include <rice/rice.hpp>
 #include <rice/stl.hpp>
 
+#if defined(USE_C10D) && defined(USE_C10D_NCCL)
+#include <cuda_runtime_api.h>
+#endif
+
 static_assert(
     TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 9,
     "Incompatible LibTorch version");
@@ -100,6 +100,24 @@ int reduce_op_from_int(int code) {
   return code;
 }
 
+#if defined(USE_C10D_NCCL)
+int cuda_device_count() {
+  int count = 0;
+  auto status = cudaGetDeviceCount(&count);
+  if (status != 0) {
+    rb_raise(rb_eRuntimeError, "cudaGetDeviceCount failed with code %d", status);
+  }
+  return count;
+}
+
+void ensure_cuda_device_set(int device_id) {
+  auto status = cudaSetDevice(device_id);
+  if (status != 0) {
+    rb_raise(rb_eRuntimeError, "cudaSetDevice(%d) failed with code %d", device_id, status);
+  }
+}
+#endif
+
 #endif
 
 } // namespace
@@ -212,18 +230,18 @@ void init_distributed(Rice::Module& m) {
 
   if (device_id >= 0 && backend_lower == "nccl") {
 #if defined(USE_C10D_NCCL)
-    if (!torch::cuda::is_available()) {
+    auto device_count = cuda_device_count();
+    if (device_count <= 0) {
       rb_raise(rb_eRuntimeError, "CUDA is not available for NCCL backend");
     }
-    auto device_count = torch::cuda::device_count();
-    if (device_id >= static_cast<int>(device_count)) {
+    if (device_id >= device_count) {
       rb_raise(
         rb_eArgError,
         "Invalid device_id %d for NCCL backend (available devices: %d)",
         device_id,
-        static_cast<int>(device_count));
+        device_count);
     }
-    c10::cuda::set_device(device_id);
+    ensure_cuda_device_set(device_id);
     pg->setBoundDeviceId(c10::Device(c10::kCUDA, device_id));
 #endif
   }
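Taken together, the extension hunks drop the `torch/cuda.h` and `c10/cuda` dependencies in favor of the plain CUDA runtime API (`cudaGetDeviceCount`/`cudaSetDevice`), so the NCCL path no longer needs LibTorch's CUDA headers at build time. From Ruby, the new checks surface as ordinary exceptions; a hypothetical illustration (the initializer name is an assumption for this sketch, while the error strings come from the code above):

```ruby
begin
  # Hypothetical entry point; the gem's actual method name may differ.
  Torch::DDP.init_process_group(backend: "nccl", device_id: 99)
rescue ArgumentError => e
  e.message  # e.g. "Invalid device_id 99 for NCCL backend (available devices: 4)"
rescue RuntimeError => e
  e.message  # "CUDA is not available for NCCL backend"
end
```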
@@ -34,9 +34,11 @@ end
 
 cuda_inc, cuda_lib = dir_config("cuda")
 cuda_lib ||= "/usr/local/cuda/lib64"
+cuda_inc ||= "/usr/include"
 
 cudnn_inc, cudnn_lib = dir_config("cudnn")
 cudnn_lib ||= "/usr/local/cuda/lib"
+abort "cuda.h not found" unless find_header("cuda.h")
 
 gloo_inc, _ = dir_config("gloo")
 gloo_inc ||= "./vendor/gloo"
@@ -61,20 +63,6 @@ $INCFLAGS += " -I#{inc}/torch/csrc/api/include"
 CONFIG["CC"] = CONFIG["CXX"]
 $CFLAGS = $CXXFLAGS
 
-supports_c10_cuda = with_cuda && try_compile(<<~CPP)
-  #include <torch/torch.h>
-  #include <c10/cuda/CUDAFunctions.h>
-
-  int main() {
-    c10::cuda::set_device(0);
-    return 0;
-  }
-CPP
-
-if supports_c10_cuda
-  $defs << " -DHAVE_C10_CUDA"
-end
-
 $LDFLAGS += " -Wl,-rpath,#{lib}"
 if RbConfig::CONFIG["host_os"] =~ /darwin/i && RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i && Dir.exist?("/opt/homebrew/opt/libomp/lib")
   $LDFLAGS += ",-rpath,/opt/homebrew/opt/libomp/lib"
@@ -128,7 +116,7 @@ supports_c10d_gloo = supports_c10d && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_GL
   }
 CPP
 
-supports_c10d_nccl = with_cuda && supports_c10_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL")
+supports_c10d_nccl = with_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL")
   #include <torch/torch.h>
   #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
 
@@ -143,12 +131,15 @@ if supports_c10d_gloo
   $defs << "-DUSE_C10D_GLOO"
   puts "GLOO support detected"
 end
-unless supports_c10_cuda
-  puts "No c10 CUDA headers found. NCCL is unavailable"
-end
 if supports_c10d_nccl
   $defs << "-DUSE_C10D_NCCL"
   puts "NCCL support detected"
+elsif with_cuda
+  puts "NCCL support not detected; CUDA libraries found but headers may be unavailable"
+end
+
+unless supports_c10d_gloo || supports_c10d_nccl
+  abort "Neither Gloo nor NCCL support detected. Ensure LibTorch is built with distributed backends and provide Gloo or NCCL headers."
 end
 
 # create makefile
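Net effect in extconf.rb: the standalone `c10/cuda` probe and its `HAVE_C10_CUDA` define are gone, NCCL detection rides on a single `try_link` check, and configuration now aborts early instead of silently producing a build with no distributed backend. A generic sketch of this mkmf probe-and-define pattern (simplified, not the gem's exact extconf):

```ruby
require "mkmf"

# try_link compiles and links the snippet with the extra flags appended;
# it returns true only if headers, libraries, and toolchain all cooperate.
supports_backend = try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL")
  #include <torch/torch.h>
  int main() { return 0; }
CPP

if supports_backend
  $defs << "-DUSE_C10D_NCCL"  # lands on the compiler command line
else
  abort "no distributed backend detected"
end
```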
@@ -100,7 +100,9 @@ module Torch
 
     def patch_device_helpers
       Torch::Device.class_eval do
-        define_method(:to_s) { _str }
+        define_method(:to_s) do
+          respond_to?(:_str) ? _str : super()
+        end
       end
 
       unless Torch.const_defined?(:DeviceString)
@@ -121,7 +123,13 @@ module Torch
           Torch::Device.new(device.to_s)
         end
       end
-      super(@device._str)
+      device_str =
+        if @device.respond_to?(:_str)
+          @device._str
+        else
+          @device.to_s
+        end
+      super(device_str)
     end
 
     def type
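Both Ruby hunks apply the same defensive pattern: probe for torch-rb's `_str` helper before calling it and fall back to plain `to_s`, so the patch survives torch-rb releases that rename or drop that method. The pattern in isolation (generic names, not the gem's):

```ruby
# Generic illustration of the respond_to? fallback used above.
class DeviceLabel
  def initialize(device)
    @device = device
  end

  def to_str
    # Prefer the optional native helper; degrade gracefully without it.
    @device.respond_to?(:_str) ? @device._str : @device.to_s
  end
end
```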
@@ -1,5 +1,5 @@
 module Torch
   module DDP
-    VERSION = "0.1.1"
+    VERSION = "0.1.3"
   end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: torch-ddp
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.3
 platform: ruby
 authors:
 - Ivan Razuvaev