torch-ddp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,243 @@
1
+ require_relative "test_helper"
2
+ require "torch/distributed"
3
+ require "socket"
4
+ require "timeout"
5
+
6
+ class DistributedInitProcessGroupTest < Minitest::Test
7
+ def setup
8
+ skip "Distributed backend not available" unless Torch::Distributed.available?
9
+ skip "CUDA not available for NCCL backend" unless cuda_available?
10
+ end
11
+
12
+ def test_defaults_nccl_device_id_from_local_rank_env
13
+ calls = []
14
+ with_stubbed_init_process_group(calls) do
15
+ ENV["LOCAL_RANK"] = "2"
16
+ Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 5, world_size: 8)
17
+ ensure
18
+ ENV.delete("LOCAL_RANK")
19
+ end
20
+
21
+ assert_equal 1, calls.size
22
+ assert_equal 2, calls.first[:device_id]
23
+ end
24
+
25
+ def test_falls_back_to_local_world_size_modulo
26
+ calls = []
27
+ with_stubbed_init_process_group(calls) do
28
+ ENV["LOCAL_WORLD_SIZE"] = "2"
29
+ Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 3, world_size: 4)
30
+ ensure
31
+ ENV.delete("LOCAL_WORLD_SIZE")
32
+ end
33
+
34
+ assert_equal 1, calls.size
35
+ assert_equal 1, calls.first[:device_id]
36
+ end
37
+
38
+ def test_uses_world_size_when_env_missing
39
+ calls = []
40
+ with_stubbed_init_process_group(calls) do
41
+ Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 1, world_size: 2)
42
+ end
43
+
44
+ assert_equal 1, calls.size
45
+ assert_equal 1, calls.first[:device_id]
46
+ end
47
+
48
+ private
49
+
50
+ # Stub out low-level init to capture arguments without starting a real process group
51
+ # Used for upper-level tests that don't require actual process group spawning
52
+ def with_stubbed_init_process_group(calls)
53
+ original = Torch::Distributed.method(:_init_process_group)
54
+ Torch::Distributed.singleton_class.define_method(:_init_process_group) do |backend, store, rank, world_size, timeout_ms, device_id|
55
+ calls << {backend: backend, rank: rank, world_size: world_size, timeout_ms: timeout_ms, device_id: device_id}
56
+ :stub
57
+ end
58
+ yield
59
+ ensure
60
+ Torch::Distributed.singleton_class.define_method(:_init_process_group, original)
61
+ end
62
+
63
+ def cuda_available?
64
+ Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available?
65
+ end
66
+ end
67
+
68
+ class DistributedSpawnStartMethodTest < Minitest::Test
69
+ def test_spawn_worker_env_runs_block
70
+ reader, writer = IO.pipe
71
+ writer.close_on_exec = false
72
+
73
+ pid = fork do
74
+ reader.close
75
+ ENV[Torch::Distributed::SPAWN_ENV_KEY] = "1"
76
+ ENV[Torch::Distributed::SPAWN_RANK_ENV_KEY] = "0"
77
+ ENV[Torch::Distributed::SPAWN_WORLD_SIZE_ENV_KEY] = "1"
78
+ ENV[Torch::Distributed::SPAWN_PORT_ENV_KEY] = "1234"
79
+ ENV[Torch::Distributed::SPAWN_PIPE_ENV_KEY] = writer.fileno.to_s
80
+ Torch::Distributed.fork_world(1, start_method: :spawn) { |rank, port| [rank, port] }
81
+ end
82
+
83
+ writer.close
84
+ result = Marshal.load(reader)
85
+ reader.close
86
+
87
+ _pid, status = Process.wait2(pid)
88
+ assert status.success?
89
+ assert_equal [0, 1234], result
90
+ end
91
+ end
92
+
93
+ class DistributedBackendTest < Minitest::Test
94
+ BACKEND = nil
95
+
96
+ def setup
97
+ super
98
+ skip "Distributed backend not available" unless Torch::Distributed.available?
99
+ skip "No backend configured for test" unless backend
100
+ skip_unless_backend_available!
101
+ end
102
+
103
+ def backend
104
+ self.class::BACKEND
105
+ end
106
+
107
+ def tensor_options
108
+ {}
109
+ end
110
+
111
+ def skip_unless_backend_available!
112
+ skip "#{backend} backend not available" unless backend_available?
113
+ end
114
+
115
+ def backend_available?
116
+ timeout = distributed_timeout
117
+ port = Torch::Distributed.free_port
118
+ store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 1, true, wait_for_workers: false, timeout: timeout)
119
+ Torch::Distributed.init_process_group(backend, store: store, rank: 0, world_size: 1, timeout: timeout)
120
+ true
121
+ rescue StandardError => e
122
+ return false if e.message =~ /not available/i || e.message =~ /unsupported backend/i
123
+ raise
124
+ ensure
125
+ Torch::Distributed.destroy_process_group if Torch::Distributed.initialized?
126
+ end
127
+
128
+ def fork_with_backend(world_size: 2, start_method: :spawn)
129
+ timeout = distributed_timeout
130
+ original_filter = ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY]
131
+ original_script = ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY]
132
+ ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] = name if start_method == :spawn
133
+ ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] = File.expand_path(__FILE__) if start_method == :spawn
134
+ Timeout.timeout(timeout, Timeout::Error, "distributed test exceeded #{timeout}s") do
135
+ Torch::Distributed.fork_world(world_size, start_method: start_method) do |rank, port|
136
+ Timeout.timeout(timeout, Timeout::Error, "distributed worker #{rank} exceeded #{timeout}s") do
137
+ store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?, timeout: timeout)
138
+ Torch::Distributed.init_process_group(
139
+ backend,
140
+ store: store,
141
+ rank: rank,
142
+ world_size: world_size,
143
+ device_id: rank,
144
+ timeout: timeout
145
+ )
146
+ begin
147
+ yield(rank)
148
+ ensure
149
+ Torch::Distributed.destroy_process_group
150
+ end
151
+ end
152
+ end
153
+ end
154
+ ensure
155
+ ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] = original_filter
156
+ ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] = original_script
157
+ end
158
+
159
+ def test_all_reduce
160
+ results = fork_with_backend do |rank|
161
+ tensor = Torch.tensor([rank + 1.0], **tensor_options)
162
+ Torch::Distributed.all_reduce(tensor)
163
+ tensor.to_a
164
+ end
165
+
166
+ assert_equal [[3.0], [3.0]], results
167
+ end
168
+
169
+ def test_barrier
170
+ wait_times = fork_with_backend do |rank|
171
+ sleep 0.3 if rank.zero?
172
+ before = Process.clock_gettime(Process::CLOCK_MONOTONIC)
173
+ Torch::Distributed.barrier
174
+ after = Process.clock_gettime(Process::CLOCK_MONOTONIC)
175
+ after - before
176
+ end
177
+
178
+ assert_operator wait_times.first, :<, 0.1
179
+ assert_operator wait_times.last, :>=, 0.25
180
+ end
181
+
182
+ def test_broadcast
183
+ tensors = fork_with_backend do |rank|
184
+ tensor = Torch.tensor([rank + 1.0], **tensor_options)
185
+ Torch::Distributed.broadcast(tensor, src: 0)
186
+ tensor.to_a
187
+ end
188
+
189
+ assert_equal [[1.0], [1.0]], tensors
190
+ end
191
+
192
+ def test_ddp_gradient_sync
193
+ # autograd cannot run safely with fork-based multiprocessing; always use spawn here
194
+ grads = fork_with_backend(start_method: :spawn) do |rank|
195
+ device = tensor_options[:device]
196
+ model = Torch::NN::Linear.new(1, 1, bias: false)
197
+ model = model.to(device) if device
198
+ ddp = Torch::NN::Parallel::DistributedDataParallel.new(model)
199
+ input = Torch.tensor([[rank + 1.0]], **tensor_options)
200
+ output = ddp.call(input)
201
+ loss = output.sum
202
+ loss.backward
203
+
204
+ grad = model.parameters.first.grad
205
+ grad = grad.to("cpu") if device
206
+ grad.item
207
+ end
208
+
209
+ grads.each do |grad|
210
+ assert_in_delta 1.5, grad, 1e-6
211
+ end
212
+ end
213
+
214
+ def distributed_timeout
215
+ Integer(ENV.fetch("TORCH_DISTRIBUTED_TEST_TIMEOUT", "30"))
216
+ end
217
+ end
218
+
219
+ class DistributedGlooTest < DistributedBackendTest
220
+ BACKEND = "gloo"
221
+
222
+ def fork_with_backend(world_size: 2, start_method: :fork)
223
+ super(world_size: world_size, start_method: start_method)
224
+ end
225
+ end
226
+
227
+ class DistributedNcclTest < DistributedBackendTest
228
+ BACKEND = "nccl"
229
+
230
+ def setup
231
+ skip "CUDA not available for NCCL backend" unless Torch.const_defined?(:CUDA) && Torch::CUDA.available?
232
+ skip "Need at least 2 CUDA devices for NCCL tests" unless Torch::CUDA.device_count >= 2
233
+ super
234
+ end
235
+
236
+ def tensor_options
237
+ {device: "cuda"}
238
+ end
239
+
240
+ def fork_with_backend(world_size: 2, start_method: :spawn)
241
+ super(world_size: world_size, start_method: start_method)
242
+ end
243
+ end
@@ -0,0 +1,42 @@
1
+ class TestNet < Torch::NN::Module
2
+ def initialize
3
+ super()
4
+ @conv1 = Torch::NN::Conv2d.new(1, 6, 3)
5
+ @conv2 = Torch::NN::Conv2d.new(6, 16, 3)
6
+ @fc1 = Torch::NN::Linear.new(16 * 6 * 6, 120)
7
+ @fc2 = Torch::NN::Linear.new(120, 84)
8
+ @fc3 = Torch::NN::Linear.new(84, 10)
9
+ end
10
+
11
+ def forward(x)
12
+ x = Torch::NN::F.max_pool2d(Torch::NN::F.relu(@conv1.call(x)), [2, 2])
13
+ x = Torch::NN::F.max_pool2d(Torch::NN::F.relu(@conv2.call(x)), 2)
14
+ x = Torch.flatten(x, 1)
15
+ x = Torch::NN::F.relu(@fc1.call(x))
16
+ x = Torch::NN::F.relu(@fc2.call(x))
17
+ @fc3.call(x)
18
+ end
19
+ end
20
+
21
+ class SimpleResidualBlock < Torch::NN::Module
22
+ def initialize
23
+ super()
24
+
25
+ @relu = Torch::NN::ReLU.new
26
+
27
+ @seq = Torch::NN::Sequential.new(
28
+ Torch::NN::Conv2d.new(64, 128, 3, padding: 1, bias: false),
29
+ Torch::NN::BatchNorm2d.new(128),
30
+ @relu,
31
+ Torch::NN::Conv2d.new(128, 128, 3, padding: 1, bias: false),
32
+ Torch::NN::BatchNorm2d.new(128),
33
+ @relu,
34
+ Torch::NN::Conv2d.new(128, 64, 3, bias: false),
35
+ Torch::NN::BatchNorm2d.new(64)
36
+ )
37
+ end
38
+
39
+ def forward(x)
40
+ @relu.forward(@seq.forward(x) + x)
41
+ end
42
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ $stdout.sync = true
4
+ rank = ENV.fetch("RANK", "unknown")
5
+ local_rank = ENV.fetch("LOCAL_RANK", "unknown")
6
+ world_size = ENV.fetch("WORLD_SIZE", "unknown")
7
+ puts "RANK=#{rank} LOCAL_RANK=#{local_rank} WORLD_SIZE=#{world_size}"
Binary file
@@ -0,0 +1,71 @@
1
+ spawn_worker = ENV["TORCH_DISTRIBUTED_SPAWNED"] == "1"
2
+
3
+ # Spawned distributed workers shouldn't try to load minitest plugins from the
4
+ # parent test environment.
5
+ ENV["MT_NO_PLUGINS"] = "1" if spawn_worker
6
+
7
+ require "bundler/setup"
8
+ Bundler.require(:default)
9
+ require "torch-ddp"
10
+ require "minitest/autorun"
11
+
12
+ if spawn_worker
13
+ module TorchDistributedSpawnTest
14
+ module QuietSummaryReporter
15
+ def start # :nodoc:
16
+ Minitest::StatisticsReporter.instance_method(:start).bind(self).call
17
+ self.sync = io.respond_to?(:"sync=")
18
+ self.old_sync, io.sync = io.sync, true if self.sync
19
+ end
20
+
21
+ def report # :nodoc:
22
+ super
23
+ ensure
24
+ io.sync = self.old_sync if self.sync
25
+ end
26
+ end
27
+ end
28
+
29
+ Minitest::SummaryReporter.prepend(TorchDistributedSpawnTest::QuietSummaryReporter)
30
+ end
31
+
32
+ # support
33
+ require_relative "support/net"
34
+
35
+ class Minitest::Test
36
+ def assert_elements_in_delta(expected, actual)
37
+ assert_equal expected.size, actual.size
38
+ expected.zip(actual) do |exp, act|
39
+ if exp.finite?
40
+ assert_in_delta exp, act
41
+ else
42
+ assert_equal exp, act
43
+ end
44
+ end
45
+ end
46
+
47
+ def assert_tensor(expected, actual, dtype: nil)
48
+ assert_kind_of Torch::Tensor, actual
49
+ assert_equal actual.dtype, dtype if dtype
50
+ if (actual.floating_point? || actual.complex?) && actual.dim < 2
51
+ assert_elements_in_delta expected, actual.to_a
52
+ else
53
+ assert_equal expected, actual.to_a
54
+ end
55
+ end
56
+
57
+ def mac?
58
+ RbConfig::CONFIG["host_os"] =~ /darwin/i
59
+ end
60
+
61
+ def stress_gc
62
+ previous = GC.stress
63
+ begin
64
+ GC.stress = true
65
+ yield
66
+ ensure
67
+ GC.stress = previous
68
+ GC.start
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "test_helper"
4
+
5
+ require "open3"
6
+ require "rbconfig"
7
+
8
+ class TorchRunTest < Minitest::Test
9
+ def test_standalone_launches_multiple_workers
10
+ script = File.expand_path("support/scripts/show_ranks.rb", __dir__)
11
+ torchrun = File.expand_path("../bin/torchrun", __dir__)
12
+ stdout, stderr, status = Open3.capture3(
13
+ {"TORCHRUN_TEST" => "1"},
14
+ RbConfig.ruby,
15
+ torchrun,
16
+ "--standalone",
17
+ "--nproc-per-node=2",
18
+ script
19
+ )
20
+
21
+ assert status.success?, "torchrun failed: #{stderr}"
22
+
23
+ lines = stdout.lines.map(&:strip).select { |line| line.start_with?("RANK=") }
24
+ assert_equal 2, lines.size, "expected two worker outputs, got: #{lines.inspect}"
25
+ ranks = lines.map do |line|
26
+ match = line.match(/RANK=(\d+)\s+LOCAL_RANK=(\d+)\s+WORLD_SIZE=(\d+)/)
27
+ raise "unexpected output: #{line}" unless match
28
+
29
+ [match[1].to_i, match[2].to_i, match[3].to_i]
30
+ end
31
+ assert_equal [[0, 0, 2], [1, 1, 2]], ranks.sort
32
+ end
33
+ end
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: torch-ddp
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ivan Razuvaev
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2025-12-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: torch-rb
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.22.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.22.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: rice
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '4.7'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '4.7'
41
+ description:
42
+ email: i@orlando-labs.com
43
+ executables:
44
+ - torchrun
45
+ extensions:
46
+ - ext/torch_ddp/extconf.rb
47
+ extra_rdoc_files: []
48
+ files:
49
+ - LICENSE.txt
50
+ - README.md
51
+ - bin/torchrun
52
+ - examples/benchmark/training.rb
53
+ - examples/mnist/distributed.rb
54
+ - ext/torch_ddp/distributed.cpp
55
+ - ext/torch_ddp/ext.cpp
56
+ - ext/torch_ddp/extconf.rb
57
+ - lib/torch-ddp.rb
58
+ - lib/torch/ddp/monkey_patch.rb
59
+ - lib/torch/ddp/version.rb
60
+ - lib/torch/distributed.rb
61
+ - lib/torch/nn/parallel/distributed_data_parallel.rb
62
+ - lib/torch/torchrun.rb
63
+ - test/distributed_test.rb
64
+ - test/support/net.rb
65
+ - test/support/scripts/show_ranks.rb
66
+ - test/support/tensor.pth
67
+ - test/test_helper.rb
68
+ - test/torchrun_test.rb
69
+ homepage: https://github.com/ankane/torch.rb
70
+ licenses:
71
+ - BSD-3-Clause
72
+ metadata: {}
73
+ post_install_message:
74
+ rdoc_options: []
75
+ require_paths:
76
+ - lib
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: '3.2'
82
+ required_rubygems_version: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ version: '0'
87
+ requirements: []
88
+ rubygems_version: 3.5.22
89
+ signing_key:
90
+ specification_version: 4
91
+ summary: Distributed data parallel support for torch-rb
92
+ test_files: []