torch-ddp 0.1.0

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
@@ -0,0 +1,155 @@
+ require "mkmf-rice"
+
+ $CXXFLAGS += " -std=c++17 $(optflags)"
+
+ # change to 0 for Linux pre-cxx11 ABI version
+ $CXXFLAGS += " -D_GLIBCXX_USE_CXX11_ABI=1"
+
+ apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i
+
+ if apple_clang
+   # silence torch warnings
+   $CXXFLAGS += " -Wno-deprecated-declarations"
+ else
+   # silence rice warnings
+   $CXXFLAGS += " -Wno-noexcept-type"
+
+   # silence torch warnings
+   $CXXFLAGS += " -Wno-duplicated-cond -Wno-suggest-attribute=noreturn"
+ end
+
+ paths = [
+   "/usr/local",
+   "/opt/homebrew",
+   "/home/linuxbrew/.linuxbrew"
+ ]
+
+ inc, lib = dir_config("torch")
+ inc ||= paths.map { |v| "#{v}/include" }.find { |v| Dir.exist?("#{v}/torch") }
+ lib ||= paths.map { |v| "#{v}/lib" }.find { |v| Dir["#{v}/*torch_cpu*"].any? }
+
+ unless inc && lib
+   abort "LibTorch not found"
+ end
+
+ cuda_inc, cuda_lib = dir_config("cuda")
+ cuda_lib ||= "/usr/local/cuda/lib64"
+
+ cudnn_inc, cudnn_lib = dir_config("cudnn")
+ cudnn_lib ||= "/usr/local/cuda/lib"
+
+ gloo_inc, _ = dir_config("gloo")
+ gloo_inc ||= "./vendor/gloo"
+
+ $LDFLAGS += " -L#{lib}" if Dir.exist?(lib)
+ abort "LibTorch not found" unless have_library("torch")
+
+ have_library("mkldnn")
+ have_library("nnpack")
+
+ with_cuda = false
+ if Dir["#{lib}/*torch_cuda*"].any?
+   $LDFLAGS += " -L#{cuda_lib}" if Dir.exist?(cuda_lib)
+   $INCFLAGS += " -I#{cuda_inc}" if cuda_inc && Dir.exist?(cuda_inc)
+   $LDFLAGS += " -L#{cudnn_lib}" if Dir.exist?(cudnn_lib) && cudnn_lib != cuda_lib
+   with_cuda = have_library("cuda") && have_library("cudnn")
+ end
+
+ $INCFLAGS += " -I#{inc}"
+ $INCFLAGS += " -I#{inc}/torch/csrc/api/include"
+
+ CONFIG["CC"] = CONFIG["CXX"]
+ $CFLAGS = $CXXFLAGS
+
+ supports_c10_cuda = with_cuda && try_compile(<<~CPP)
+   #include <torch/torch.h>
+   #include <c10/cuda/CUDAFunctions.h>
+
+   int main() {
+     c10::cuda::set_device(0);
+     return 0;
+   }
+ CPP
+
+ if supports_c10_cuda
+   $defs << "-DHAVE_C10_CUDA"
+ end
+
+ $LDFLAGS += " -Wl,-rpath,#{lib}"
+ if RbConfig::CONFIG["host_os"] =~ /darwin/i && RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i && Dir.exist?("/opt/homebrew/opt/libomp/lib")
+   $LDFLAGS += ",-rpath,/opt/homebrew/opt/libomp/lib"
+ end
+ $LDFLAGS += ":#{cuda_lib}/stubs:#{cuda_lib}" if with_cuda
+
+ # https://github.com/pytorch/pytorch/blob/v2.9.0/torch/utils/cpp_extension.py#L1351-L1364
+ $LDFLAGS += " -lc10 -ltorch_cpu -ltorch"
+ if with_cuda
+   $LDFLAGS += " -lcuda -lnvrtc"
+   $LDFLAGS += " -lnvToolsExt" if File.exist?("#{cuda_lib}/libnvToolsExt.so")
+   $LDFLAGS += " -lcudart -lc10_cuda -ltorch_cuda -lcufft -lcurand -lcublas -lcudnn"
+   # TODO: figure out why this is needed
+   $LDFLAGS += " -Wl,--no-as-needed,#{lib}/libtorch.so"
+ end
+
+ supports_c10d = try_link(<<~CPP, "-DUSE_C10D")
+   #include <torch/torch.h>
+   #include <torch/csrc/distributed/c10d/FileStore.hpp>
+
+   int main() {
+     ::c10d::FileStore store("unused", 1);
+     return 0;
+   }
+ CPP
+
+ if supports_c10d
+   $defs << "-DUSE_C10D"
+   puts "Building with distributed support"
+
+   if find_header("gloo/algorithm.h", gloo_inc)
+     $INCFLAGS += " -I#{gloo_inc}"
+   else
+     puts "Gloo headers not found. Consider passing the --with-gloo-include option"
+   end
+ else
+   puts "Building without distributed support"
+ end
+
+ supports_c10d_gloo = supports_c10d && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_GLOO")
+   #include <torch/torch.h>
+   #include <torch/csrc/distributed/c10d/ProcessGroupGloo.hpp>
+   #include <torch/csrc/distributed/c10d/FileStore.hpp>
+
+   int main() {
+     auto store = c10::make_intrusive<::c10d::FileStore>("unused", 1);
+     auto opts = ::c10d::ProcessGroupGloo::Options::create();
+     opts->devices.push_back(::c10d::ProcessGroupGloo::createDefaultDevice());
+     ::c10d::ProcessGroupGloo pg(store, 0, 1, opts);
+     return static_cast<int>(pg.getRank());
+   }
+ CPP
+
+ supports_c10d_nccl = with_cuda && supports_c10_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL")
+   #include <torch/torch.h>
+   #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
+
+   int main() {
+     auto opts = c10::make_intrusive<::c10d::ProcessGroupNCCL::Options>();
+     opts->is_high_priority_stream = false;
+     return 0;
+   }
+ CPP
+
+ if supports_c10d_gloo
+   $defs << "-DUSE_C10D_GLOO"
+   puts "Gloo support detected"
+ end
+ unless supports_c10_cuda
+   puts "No c10 CUDA headers found. NCCL support is unavailable"
+ end
+ if supports_c10d_nccl
+   $defs << "-DUSE_C10D_NCCL"
+   puts "NCCL support detected"
+ end
+
+ # create makefile
+ create_makefile("torch/ddp_ext")
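
Note on configuration: the extconf above resolves LibTorch through mkmf's dir_config before falling back to the Homebrew-style path scan, so a build can be pointed at a custom LibTorch without editing the script. A minimal sketch of the standard mkmf mechanism (the /opt/libtorch prefix is illustrative):

    require "mkmf"

    # dir_config("torch") honors flags passed through gem install, e.g.
    #   gem install torch-ddp -- --with-torch-dir=/opt/libtorch
    # or the split --with-torch-include= / --with-torch-lib= variants.
    inc, lib = dir_config("torch")
    # => ["/opt/libtorch/include", "/opt/libtorch/lib"] when the flag is given,
    # => [nil, nil] otherwise, which triggers the fallback search above.

The same flags apply to dir_config("cuda"), dir_config("cudnn"), and dir_config("gloo").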
@@ -0,0 +1,325 @@
+ require "fiddle"
+
+ module Torch
+   module DDP
+     module MonkeyPatch
+       WARNING_PREFIX = "[torch-ddp]".freeze
+
+       class << self
+         def apply_if_needed
+           return if defined?(@applied) && @applied
+
+           missing = missing_features
+           return if missing.empty?
+
+           warn("#{WARNING_PREFIX} Applying torch compatibility patch for: #{missing.join(', ')}. Please upgrade the torch gem for native support.")
+           patch_cuda_set_device if missing.include?(:cuda_set_device)
+           patch_device_helpers
+           patch_load if missing.include?(:load_keywords)
+           patch_tensor_item if missing.include?(:tensor_item_scalar)
+           @applied = true
+         end
+
+         private
+
+         def missing_features
+           missing = []
+           missing << :cuda_set_device unless Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:set_device)
+           missing << :load_keywords unless load_supports_map_location_and_weights_only?
+           missing << :tensor_item_scalar unless tensor_item_returns_scalar?
+           missing
+         end
+
+         def load_supports_map_location_and_weights_only?
+           params = Torch.method(:load).parameters
+           keyword_names = params.select { |kind, _| [:key, :keyreq].include?(kind) }.map(&:last)
+           keyword_names.include?(:map_location) && keyword_names.include?(:weights_only)
+         rescue NameError
+           false
+         end
+
+         def tensor_item_returns_scalar?
+           value = Torch.tensor([[1]]).item
+           value.is_a?(Numeric) || value == true || value == false
+         rescue StandardError
+           true
+         end
+
+         def patch_cuda_set_device
+           return unless Torch.const_defined?(:CUDA)
+
+           Torch::CUDA.singleton_class.class_eval do
+             define_method(:set_device) do |device_id|
+               Torch::DDP::MonkeyPatch.cuda_set_device!(device_id)
+             end
+           end
+         end
+
+         def cuda_set_device!(device_id)
+           cuda_set_device_proc.call(Integer(device_id))
+         end
+         public :cuda_set_device!
+
+         def cuda_set_device_proc
+           @cuda_set_device_proc ||= begin
+             candidates = [
+               ENV["LIBCUDART_PATH"],
+               "/usr/local/cuda/lib64/libcudart.so",
+               "/usr/local/cuda/lib/libcudart.so",
+               "/usr/local/cuda/lib/libcudart.dylib",
+               "libcudart.so.12",
+               "libcudart.so.11",
+               "libcudart.so",
+               "libcudart.dylib"
+             ].compact
+
+             function = nil
+             candidates.each do |path|
+               begin
+                 handle = Fiddle.dlopen(path)
+                 function = Fiddle::Function.new(handle["cudaSetDevice"], [Fiddle::TYPE_INT], Fiddle::TYPE_INT)
+                 break
+               rescue Fiddle::DLError
+                 next
+               end
+             end
+
+             if function
+               ->(device_id) do
+                 result = function.call(device_id)
+                 raise Torch::Error, "cudaSetDevice(#{device_id}) failed with code #{result}" unless result.zero?
+                 nil
+               end
+             else
+               ->(_device_id) do
+                 raise Torch::Error, "Torch::CUDA.set_device is unavailable; ensure torch is built with CUDA or upgrade torch."
+               end
+             end
+           end
+         end
+
+         def patch_device_helpers
+           Torch::Device.class_eval do
+             define_method(:to_s) { _str }
+           end
+
+           unless Torch.const_defined?(:DeviceString)
+             Torch.const_set(
+               :DeviceString,
+               Class.new(String) do
+                 def initialize(device)
+                   @device = device
+                   super(device._str)
+                 end
+
+                 def type
+                   @device.type
+                 end
+
+                 def index
+                   @device.index
+                 end
+               end
+             )
+           end
+
+           Torch::Tensor.class_eval do
+             define_method(:device) { Torch::DeviceString.new(_device) }
+           end
+         end
+
+         def patch_tensor_item
+           Torch::Tensor.class_eval do
+             alias_method :_torch_ddp_original_item, :item unless method_defined?(:_torch_ddp_original_item)
+
+             def item
+               value = _torch_ddp_original_item
+               value.is_a?(Array) ? value.flatten.first : value
+             end
+           end
+         end
+
+         def patch_load
+           patch_load_helpers
+
+           Torch.singleton_class.class_eval do
+             alias_method :_torch_ddp_original_load, :load unless method_defined?(:_torch_ddp_original_load)
+
+             def load(filename, map_location: nil, weights_only: false)
+               load_device = map_location_device(map_location) if map_location
+               result =
+                 if load_device && respond_to?(:_load_with_device)
+                   Torch::DDP::MonkeyPatch.load_with_device(filename, load_device)
+                 else
+                   _torch_ddp_original_load(filename)
+                 end
+
+               ensure_weights_only_contents!(result) if weights_only
+               result = apply_map_location(result, map_location) if map_location
+               result
+             end
+           end
+         end
+
+         def patch_load_helpers
+           Torch.singleton_class.class_eval do
+             const_set(
+               :WEIGHTS_ONLY_PRIMITIVE_CLASSES,
+               [NilClass, TrueClass, FalseClass, Integer, Float, String].freeze
+             ) unless const_defined?(:WEIGHTS_ONLY_PRIMITIVE_CLASSES)
+
+             unless method_defined?(:ensure_weights_only_contents!)
+               def ensure_weights_only_contents!(obj)
+                 # The constant lives on Torch's singleton class; constants in a
+                 # class_eval block resolve lexically, so qualify the reference.
+                 case obj
+                 when *singleton_class::WEIGHTS_ONLY_PRIMITIVE_CLASSES, Tensor
+                   obj
+                 when Array
+                   obj.each { |value| ensure_weights_only_contents!(value) }
+                 when Hash
+                   obj.each do |key, value|
+                     ensure_weights_only_contents!(key)
+                     ensure_weights_only_contents!(value)
+                   end
+                 else
+                   raise Error, "weights_only load supports tensors, primitive Ruby types, arrays, and hashes (found #{obj.class.name})"
+                 end
+               end
+             end
+
+             unless method_defined?(:map_location_device)
+               def map_location_device(map_location)
+                 case map_location
+                 when Device, String, Symbol
+                   normalize_map_location_device(map_location)
+                 when Hash
+                   devices = map_location.values.filter_map do |value|
+                     begin
+                       normalize_map_location_device(value)
+                     rescue StandardError
+                       nil
+                     end
+                   end
+                   return nil if devices.empty?
+                   devices.uniq!
+                   devices.one? ? devices.first : nil
+                 else
+                   nil
+                 end
+               end
+             end
+
+             unless method_defined?(:apply_map_location)
+               def apply_map_location(obj, map_location)
+                 case obj
+                 when Tensor
+                   map_tensor_location(obj, map_location)
+                 when Array
+                   obj.map { |value| apply_map_location(value, map_location) }
+                 when Hash
+                   obj.each_with_object({}) do |(key, value), memo|
+                     memo[apply_map_location(key, map_location)] = apply_map_location(value, map_location)
+                   end
+                 else
+                   obj
+                 end
+               end
+             end
+
+             unless method_defined?(:map_tensor_location)
+               def map_tensor_location(tensor, map_location)
+                 case map_location
+                 when nil
+                   tensor
+                 when Hash
+                   target = lookup_map_location_target(map_location, tensor.device)
+                   return tensor if target.nil?
+                   map_tensor_location(tensor, target)
+                 else
+                   return map_tensor_location_callable(tensor, map_location) if map_location.respond_to?(:call)
+                   device = normalize_map_location_device(map_location)
+                   tensor.to(device)
+                 end
+               end
+             end
+
+             unless method_defined?(:map_tensor_location_callable)
+               def map_tensor_location_callable(tensor, callable)
+                 mapped = callable.call(tensor, map_location_device_tag(tensor.device))
+                 return tensor if mapped.nil?
+                 unless mapped.is_a?(Tensor)
+                   raise Error, "map_location callable must return a Tensor or nil (got #{mapped.class.name})"
+                 end
+                 mapped
+               end
+             end
+
+             unless method_defined?(:lookup_map_location_target)
+               def lookup_map_location_target(mapping, device)
+                 key = map_location_device_tag(device)
+                 mapping.each do |candidate, value|
+                   candidate_key =
+                     case candidate
+                     when Device
+                       map_location_device_tag(candidate)
+                     when String, Symbol
+                       candidate.to_s
+                     else
+                       candidate
+                     end
+                   return value if candidate_key == key
+                 end
+                 nil
+               end
+             end
+
+             unless method_defined?(:map_location_device_tag)
+               def map_location_device_tag(device)
+                 case device
+                 when Device
+                   tag = device.type
+                   tag += ":#{device.index}" unless device.index.nil?
+                   tag
+                 when String, Symbol
+                   device.to_s
+                 else
+                   raise Error, "Unknown device reference: #{device.inspect}"
+                 end
+               end
+             end
+
+             unless method_defined?(:normalize_map_location_device)
+               def normalize_map_location_device(location)
+                 case location
+                 when Device
+                   location
+                 when String, Symbol
+                   device(location.to_s)
+                 else
+                   raise Error, "Unsupported map_location: #{location.inspect}"
+                 end
+               end
+             end
+           end
+         end
+       end
+
+       module_function
+
+       def load_with_device(filename, device)
+         fallback_load =
+           if Torch.respond_to?(:_torch_ddp_original_load)
+             Torch.method(:_torch_ddp_original_load)
+           else
+             Torch.method(:load)
+           end
+
+         return fallback_load.call(filename) unless Torch.respond_to?(:_load_with_device)
+
+         device_str = device.respond_to?(:_str) ? device._str : device.to_s
+         Torch.send(:to_ruby, Torch._load_with_device(filename, device_str))
+       rescue StandardError
+         fallback_load.call(filename)
+       end
+     end
+   end
+ end
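
For orientation, a minimal usage sketch of the shim above, assuming the gem applies it at require time (the require path and checkpoint name are illustrative):

    require "torch"
    require "torch/ddp" # hypothetical entry point for this gem

    # Idempotent: patches only the features the installed torch gem lacks.
    Torch::DDP::MonkeyPatch.apply_if_needed

    # Backfilled Torch::CUDA.set_device, routed to cudaSetDevice via Fiddle.
    Torch::CUDA.set_device(0) if Torch::CUDA.available?

    # Backfilled keyword arguments on Torch.load.
    state = Torch.load("checkpoint.pt", map_location: "cpu", weights_only: true)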
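
The patched Torch.load accepts the three map_location forms handled by map_tensor_location above (filenames illustrative):

    # String, Symbol, or Torch::Device: move every tensor to that device.
    Torch.load("model.pt", map_location: "cpu")

    # Hash: remap by device tag; tensors on unlisted devices are left in place.
    Torch.load("model.pt", map_location: { "cuda:0" => "cpu" })

    # Callable: receives (tensor, device_tag) and returns a remapped tensor,
    # or nil to keep the original.
    Torch.load("model.pt", map_location: ->(t, tag) { tag.start_with?("cuda") ? t.to("cpu") : nil })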
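
Likewise, patch_device_helpers makes Tensor#device return Torch::DeviceString, a String subclass that keeps the structured accessors, so callers can treat the device as a plain string or as an object:

    device = Torch.tensor([1.0]).device # => Torch::DeviceString
    device == "cpu"                     # compares as a plain String
    device.type                         # => "cpu"
    device.index                        # => nil for an unindexed device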
@@ -0,0 +1,5 @@
+ module Torch
+   module DDP
+     VERSION = "0.1.0"
+   end
+ end