torch-ddp 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -8
- data/ext/torch_ddp/distributed.cpp +27 -9
- data/ext/torch_ddp/extconf.rb +9 -18
- data/lib/torch/ddp/monkey_patch.rb +3 -1
- data/lib/torch/ddp/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 341ecc522b0ed689920f0841ef56512531aa68706fc90768ee32f0234100a3d6
+  data.tar.gz: f6ae0221f15cc9d793c3151865ed9d0189a74efba3122cff08b4f63aa829c754
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: afb85d047d017eb59f1001eac7bb73bd81a33001c0ec9351d4e578f898530acc015de83cb8fe9137580ba93399abe910652fefa1118c103724ee314f5f0cb2f4
+  data.tar.gz: 30e2153309523ef5aaab848317f2e60b2fce16ab90950e0c9f7327e726bf8e3c47af4c99dc52a7c80719aad3680dcc0cc2a79ed040654d11148d7b2a0de12dd9
data/README.md
CHANGED
@@ -6,16 +6,10 @@ Note: This gem has only seen testing across a narrow set of multi-GPU setups (li
 
 ## Installation
 
-Build LibTorch with distributed backends (Gloo for CPU, NCCL for CUDA). Point the extension at your LibTorch, CUDA, and optional Gloo includes:
+Build or download LibTorch with support for distributed backends (Gloo for CPU, NCCL for CUDA). Point the extension at your LibTorch, CUDA, and optional Gloo includes:
 
 ```sh
-bundle config build.torch-ddp --with-torch-dir=/path/to/libtorch --with-gloo-include=/path/to/gloo
-```
-
-If your CUDA or Gloo headers aren't in standard locations, extend the build config:
-
-```sh
-bundle config build.torch-ddp --with-torch-dir=/path/to/libtorch --with-cuda-include=/path/to/cuda/include --with-gloo-include=/path/to/gloo/repo
+bundle config build.torch-ddp --with-torch-dir=/path/to/libtorch --with-cuda-dir=/path/to/cuda --with-gloo-include=/path/to/gloo
 ```
 
 Add the gem next to `torch-rb`:
data/ext/torch_ddp/distributed.cpp
CHANGED

@@ -8,14 +8,14 @@
 #include <vector>
 
 #include <torch/torch.h>
-#if defined(USE_C10D) && defined(USE_C10D_NCCL)
-#include <torch/cuda.h>
-#include <c10/cuda/CUDAFunctions.h>
-#endif
 
 #include <rice/rice.hpp>
 #include <rice/stl.hpp>
 
+#if defined(USE_C10D) && defined(USE_C10D_NCCL)
+#include <cuda_runtime_api.h>
+#endif
+
 static_assert(
     TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 9,
     "Incompatible LibTorch version");
@@ -100,6 +100,24 @@ int reduce_op_from_int(int code) {
   return code;
 }
 
+#if defined(USE_C10D_NCCL)
+int cuda_device_count() {
+  int count = 0;
+  auto status = cudaGetDeviceCount(&count);
+  if (status != 0) {
+    rb_raise(rb_eRuntimeError, "cudaGetDeviceCount failed with code %d", status);
+  }
+  return count;
+}
+
+void ensure_cuda_device_set(int device_id) {
+  auto status = cudaSetDevice(device_id);
+  if (status != 0) {
+    rb_raise(rb_eRuntimeError, "cudaSetDevice(%d) failed with code %d", device_id, status);
+  }
+}
+#endif
+
 #endif
 
 } // namespace
@@ -212,18 +230,18 @@ void init_distributed(Rice::Module& m) {
 
   if (device_id >= 0 && backend_lower == "nccl") {
 #if defined(USE_C10D_NCCL)
-
+    auto device_count = cuda_device_count();
+    if (device_count <= 0) {
       rb_raise(rb_eRuntimeError, "CUDA is not available for NCCL backend");
     }
-
-    if (device_id >= static_cast<int>(device_count)) {
+    if (device_id >= device_count) {
       rb_raise(
           rb_eArgError,
           "Invalid device_id %d for NCCL backend (available devices: %d)",
           device_id,
-          static_cast<int>(device_count));
+          device_count);
     }
-
+    ensure_cuda_device_set(device_id);
     pg->setBoundDeviceId(c10::Device(c10::kCUDA, device_id));
 #endif
   }
data/ext/torch_ddp/extconf.rb
CHANGED
@@ -34,9 +34,11 @@ end
 
 cuda_inc, cuda_lib = dir_config("cuda")
 cuda_lib ||= "/usr/local/cuda/lib64"
+cuda_inc ||= "/usr/include"
 
 cudnn_inc, cudnn_lib = dir_config("cudnn")
 cudnn_lib ||= "/usr/local/cuda/lib"
+abort "cuda.h not found" unless find_header("cuda.h")
 
 gloo_inc, _ = dir_config("gloo")
 gloo_inc ||= "./vendor/gloo"
@@ -61,20 +63,6 @@ $INCFLAGS += " -I#{inc}/torch/csrc/api/include"
 CONFIG["CC"] = CONFIG["CXX"]
 $CFLAGS = $CXXFLAGS
 
-supports_c10_cuda = with_cuda && try_compile(<<~CPP)
-  #include <torch/torch.h>
-  #include <c10/cuda/CUDAFunctions.h>
-
-  int main() {
-    c10::cuda::set_device(0);
-    return 0;
-  }
-CPP
-
-if supports_c10_cuda
-  $defs << " -DHAVE_C10_CUDA"
-end
-
 $LDFLAGS += " -Wl,-rpath,#{lib}"
 if RbConfig::CONFIG["host_os"] =~ /darwin/i && RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i && Dir.exist?("/opt/homebrew/opt/libomp/lib")
   $LDFLAGS += ",-rpath,/opt/homebrew/opt/libomp/lib"
@@ -128,7 +116,7 @@ supports_c10d_gloo = supports_c10d && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_GL
   }
 CPP
 
-supports_c10d_nccl = with_cuda &&
+supports_c10d_nccl = with_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL")
   #include <torch/torch.h>
   #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
 
@@ -143,12 +131,15 @@ if supports_c10d_gloo
   $defs << "-DUSE_C10D_GLOO"
   puts "GLOO support detected"
 end
-unless supports_c10_cuda
-  puts "No c10 CUDA headers found. NCCL is unavailable"
-end
 if supports_c10d_nccl
   $defs << "-DUSE_C10D_NCCL"
   puts "NCCL support detected"
+elsif with_cuda
+  puts "NCCL support not detected; CUDA libraries found but headers may be unavailable"
+end
+
+unless supports_c10d_gloo || supports_c10d_nccl
+  abort "Neither Gloo nor NCCL support detected. Ensure LibTorch is built with distributed backends and provide Gloo or NCCL headers."
 end
 
 # create makefile
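For context, these backend checks use mkmf's standard probe pattern: compile (and for `try_link`, also link) a throwaway program, then gate a preprocessor define on the result; `find_header` works the same way for a single include. A standalone sketch of the idiom (illustrative only, not a drop-in extconf.rb):

```ruby
require "mkmf"

# try_link builds and links a small test program with the extra flags;
# it returns true only if both headers and libraries resolve.
supports_nccl = try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL")
  #include <torch/torch.h>
  #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>

  int main() {
    return 0;
  }
CPP

if supports_nccl
  $defs << "-DUSE_C10D_NCCL" # C++ sources can now #ifdef on this macro
end

create_makefile("demo/ext") # hypothetical extension name, for illustration
```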
data/lib/torch/ddp/monkey_patch.rb
CHANGED

@@ -100,8 +100,10 @@ module Torch
 
     def patch_device_helpers
       Torch::Device.class_eval do
+        alias_method :_torch_ddp_original_to_s, :to_s unless method_defined?(:_torch_ddp_original_to_s)
+
         define_method(:to_s) do
-          respond_to?(:_str) ? _str :
+          respond_to?(:_str) ? _str : _torch_ddp_original_to_s
         end
       end
 
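For context, the fix uses the alias-then-delegate idiom: capture the original method once (guarded so loading the file twice cannot alias the patched method to itself), then fall back to it when the fast path is unavailable. A self-contained sketch with a stand-in class:

```ruby
# Stand-in for Torch::Device; only the patching pattern matters here.
class Device
  def to_s
    "cpu"
  end
end

Device.class_eval do
  # Capture the original exactly once, even across repeated loads.
  alias_method :_original_to_s, :to_s unless method_defined?(:_original_to_s)

  define_method(:to_s) do
    # Prefer a native _str accessor when one exists; otherwise fall
    # back to the preserved original.
    respond_to?(:_str) ? _str : _original_to_s
  end
end

puts Device.new.to_s # => "cpu" (no _str defined, so it falls back)
```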
data/lib/torch/ddp/version.rb
CHANGED