RubyGems - torch-ddp - Versions diffs - 0.1.4 → 0.2.0 - Mend

torch-ddp 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml +4 -4
data/ext/torch_ddp/cuda.cpp +68 -0
data/ext/torch_ddp/ext.cpp +2 -0
data/ext/torch_ddp/extconf.rb +1 -0
data/lib/torch/ddp/monkey_patch.rb +21 -36
data/lib/torch/ddp/version.rb +1 -1
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 341ecc522b0ed689920f0841ef56512531aa68706fc90768ee32f0234100a3d6
-  data.tar.gz: f6ae0221f15cc9d793c3151865ed9d0189a74efba3122cff08b4f63aa829c754
+  metadata.gz: c04c1f358a671d251826b7bf9db798bd5d9f11e279b639cff382f9f8f07d4b5f
+  data.tar.gz: 14c9db6913aaf75f98242f06808db8a3b696f37e6d779e092c8ed59cd398a644
 SHA512:
-  metadata.gz: afb85d047d017eb59f1001eac7bb73bd81a33001c0ec9351d4e578f898530acc015de83cb8fe9137580ba93399abe910652fefa1118c103724ee314f5f0cb2f4
-  data.tar.gz: 30e2153309523ef5aaab848317f2e60b2fce16ab90950e0c9f7327e726bf8e3c47af4c99dc52a7c80719aad3680dcc0cc2a79ed040654d11148d7b2a0de12dd9
+  metadata.gz: 0c62affe04041abca2dc56d6e82cc511d0338c3a4460c076b2fde13f219b889903b376521c92a38a6df0f0447bbd058081075ec1fcd6d3fcfb914063a0338691
+  data.tar.gz: 11634e1c29274033be29c37f51eae9bcc62bab902afedb469bdfde39e74f9d9230a93295a1c0b5fb2ee8e303af0df20e659335ee5e21c3c9e6b89608c9600dde

data/ext/torch_ddp/cuda.cpp ADDED Viewed

@@ -0,0 +1,68 @@
+#include <torch/torch.h>
+#include <rice/rice.hpp>
+#if defined(WITH_CUDA)
+#include <cuda_runtime_api.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+#endif
+namespace { // anonymous namespace: register_cuda_helpers is local to this translation unit
+void register_cuda_helpers(Rice::Module& m) { // defines a "DDP" module under m and attaches two CUDA helper singleton functions to it
+  auto rb_mDDP = Rice::define_module_under(m, "DDP");
+  rb_mDDP.define_singleton_function(
+      "_cuda_set_device", // validates device_id against the device count, then calls cudaSetDevice; returns nil on success
+      [](int device_id) {
+#if defined(WITH_CUDA)
+        int count = 0;
+        auto status = cudaGetDeviceCount(&count);
+        if (status != cudaSuccess) {
+          rb_raise( // NOTE(review): rb_raise leaves this C++ lambda without a C++ throw; Rice's docs appear to recommend throwing Rice::Exception instead — confirm
+              rb_eRuntimeError,
+              "cudaGetDeviceCount failed with code %d",
+              static_cast<int>(status));
+        }
+        if (device_id < 0 || device_id >= count) { // reject ids outside [0, count) before touching the runtime
+          rb_raise(
+              rb_eArgError, // an out-of-range id is reported as ArgumentError; CUDA call failures use RuntimeError
+              "Invalid device_id %d for CUDA (available devices: %d)",
+              device_id,
+              count);
+        }
+        status = cudaSetDevice(device_id);
+        if (status != cudaSuccess) {
+          rb_raise(
+              rb_eRuntimeError,
+              "cudaSetDevice(%d) failed with code %d",
+              device_id,
+              static_cast<int>(status));
+        }
+#else
+        rb_raise( // compiled without WITH_CUDA: the helper exists but always raises
+            rb_eRuntimeError,
+            "Torch::DDP._cuda_set_device requires CUDA support");
+#endif
+        return Rice::Nil;
+      });
+  rb_mDDP.define_singleton_function(
+      "_cuda_empty_cache", // takes no arguments; calls the c10 caching allocator's emptyCache() and returns nil
+      []() {
+#if defined(WITH_CUDA)
+        c10::cuda::CUDACachingAllocator::emptyCache();
+#else
+        rb_raise( // compiled without WITH_CUDA: the helper exists but always raises
+            rb_eRuntimeError,
+            "Torch::DDP._cuda_empty_cache requires CUDA support");
+#endif
+        return Rice::Nil;
+      });
+}
+} // namespace
+void init_cuda_helpers(Rice::Module& m) { // externally linked entry point; forwards m to the file-local registrar
+  register_cuda_helpers(m);
+}

data/ext/torch_ddp/ext.cpp CHANGED Viewed

@@ -3,9 +3,11 @@
 #include <rice/rice.hpp>
 void init_distributed(Rice::Module& m);
+void init_cuda_helpers(Rice::Module& m);
 extern "C"
 void Init_ddp_ext() { // extension init function: obtains the Torch module and runs each registration routine on it
   auto m = Rice::define_module("Torch");
   init_distributed(m);
+  init_cuda_helpers(m); // added in this release; defined in cuda.cpp and declared above
 }

data/ext/torch_ddp/extconf.rb CHANGED Viewed

@@ -56,6 +56,7 @@ if Dir["#{lib}/*torch_cuda*"].any?
   $LDFLAGS += " -L#{cudnn_lib}" if Dir.exist?(cudnn_lib) && cudnn_lib != cuda_lib
   with_cuda = have_library("cuda") && have_library("cudnn")
 end
+$defs << "-DWITH_CUDA" if with_cuda # defines WITH_CUDA for the compiler when with_cuda is truthy; cuda.cpp guards its CUDA code with #if defined(WITH_CUDA)
 $INCFLAGS += " -I#{inc}"
 $INCFLAGS += " -I#{inc}/torch/csrc/api/include"

data/lib/torch/ddp/monkey_patch.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-require "fiddle"
 module Torch
   module DDP
     module MonkeyPatch
@@ -14,6 +12,7 @@ module Torch
           warn("#{WARNING_PREFIX} Applying torch compatibility patch for: #{missing.join(', ')}. Please upgrade the torch gem for native support.")
           patch_cuda_set_device if missing.include?(:cuda_set_device)
+          patch_cuda_empty_cache if missing.include?(:cuda_empty_cache)
           patch_device_helpers
           patch_load if missing.include?(:load_keywords)
           patch_tensor_item if missing.include?(:tensor_item_scalar)
@@ -25,6 +24,7 @@ module Torch
         def missing_features
           missing = []
           missing << :cuda_set_device unless Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:set_device)
+          missing << :cuda_empty_cache unless Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:empty_cache)
           missing << :load_keywords unless load_supports_map_location_and_weights_only?
           missing << :tensor_item_scalar unless tensor_item_returns_scalar?
           missing
@@ -56,48 +56,33 @@ module Torch
         end
         def cuda_set_device!(device_id) # selects a CUDA device through the native Torch::DDP._cuda_set_device helper
-          cuda_set_device_proc.call(Integer(device_id))
+          unless Torch.const_defined?(:DDP) && Torch::DDP.respond_to?(:_cuda_set_device) # guard: raise Torch::Error when the native helper is not defined
+            raise Torch::Error, "Torch::CUDA.set_device is unavailable; ensure torch is built with CUDA or upgrade torch."
+          end
+          Torch::DDP._cuda_set_device(Integer(device_id)) # device_id is coerced with Kernel#Integer before the native call
         end
         public :cuda_set_device!
-        def cuda_set_device_proc
-          @cuda_set_device_proc ||= begin
-            candidates = [
-              ENV["LIBCUDART_PATH"],
-              "/usr/local/cuda/lib64/libcudart.so",
-              "/usr/local/cuda/lib/libcudart.so",
-              "/usr/local/cuda/lib/libcudart.dylib",
-              "libcudart.so.12",
-              "libcudart.so.11",
-              "libcudart.so",
-              "libcudart.dylib"
-            ].compact
-            function = nil
-            candidates.each do |path|
-              begin
-                handle = Fiddle.dlopen(path)
-                function = Fiddle::Function.new(handle["cudaSetDevice"], [Fiddle::TYPE_INT], Fiddle::TYPE_INT)
-                break
-              rescue Fiddle::DLError
-                next
-              end
-            end
+        def patch_cuda_empty_cache # defines Torch::CUDA.empty_cache as a singleton method that delegates to cuda_empty_cache!
+          return unless Torch.const_defined?(:CUDA) # nothing to patch when Torch::CUDA is not defined
-            if function
-              ->(device_id) do
-                result = function.call(device_id)
-                raise Torch::Error, "cudaSetDevice(#{device_id}) failed with code #{result}" unless result.zero?
-                nil
-              end
-            else
-              ->(device_id) do
-                raise Torch::Error, "Torch::CUDA.set_device is unavailable; ensure torch is built with CUDA or upgrade torch."
-              end
+          Torch::CUDA.singleton_class.class_eval do # class_eval on the singleton class, so define_method creates a module-level method
+            define_method(:empty_cache) do
+              Torch::DDP::MonkeyPatch.cuda_empty_cache! # called via the fully qualified name because self here is Torch::CUDA
             end
           end
         end
+        def cuda_empty_cache! # calls the native Torch::DDP._cuda_empty_cache helper and returns its result
+          unless Torch.const_defined?(:DDP) && Torch::DDP.respond_to?(:_cuda_empty_cache) # guard: raise Torch::Error when the native helper is not defined
+            raise Torch::Error, "Torch::CUDA.empty_cache is unavailable; ensure torch is built with CUDA or upgrade torch."
+          end
+          Torch::DDP._cuda_empty_cache
+        end
+        public :cuda_empty_cache!
         def patch_device_helpers
           Torch::Device.class_eval do
             alias_method :_torch_ddp_original_to_s, :to_s unless method_defined?(:_torch_ddp_original_to_s)

data/lib/torch/ddp/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module Torch
   module DDP
-    VERSION = "0.1.4"
+    VERSION = "0.2.0" # gem version string; matches the version recorded in the gem metadata
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: torch-ddp
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.2.0
 platform: ruby
 authors:
 - Ivan Razuvaev
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-12-16 00:00:00.000000000 Z
+date: 2025-12-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: torch-rb
@@ -51,6 +51,7 @@ files:
 - bin/torchrun
 - examples/benchmark/training.rb
 - examples/mnist/distributed.rb
+- ext/torch_ddp/cuda.cpp
 - ext/torch_ddp/distributed.cpp
 - ext/torch_ddp/ext.cpp
 - ext/torch_ddp/extconf.rb