RubyGems - torchaudio - Versions diffs - 0.4.0 → 0.5.0 - Mend

torchaudio 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

checksums.yaml +4 -4
data/CHANGELOG.md +11 -0
data/LICENSE.txt +1 -1
data/README.md +3 -47
data/lib/torchaudio/transforms/mel_spectrogram.rb +12 -4
data/lib/torchaudio/transforms/spectrogram.rb +11 -4
data/lib/torchaudio/version.rb +1 -1
data/lib/torchaudio.rb +113 -99
metadata +5 -35
data/ext/torchaudio/csrc/register.cpp +0 -65
data/ext/torchaudio/csrc/sox.cpp +0 -361
data/ext/torchaudio/csrc/sox.h +0 -71
data/ext/torchaudio/csrc/sox_effects.cpp +0 -54
data/ext/torchaudio/csrc/sox_effects.h +0 -18
data/ext/torchaudio/csrc/sox_io.cpp +0 -170
data/ext/torchaudio/csrc/sox_io.h +0 -41
data/ext/torchaudio/csrc/sox_utils.cpp +0 -245
data/ext/torchaudio/csrc/sox_utils.h +0 -100
data/ext/torchaudio/ext.cpp +0 -33
data/ext/torchaudio/extconf.rb +0 -79

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 938d840ca33f543f9ae76e911fe9681448703dddaab35a1a9dc05703a19d42c0
-  data.tar.gz: 5bac1684448eac1c520d2f4e5f17ac0ac7948afc7a2e46bb4bc5cd721a66ef26
+  metadata.gz: 689c84e4854288639090d826e4f3844a7e3cabf6ec68cf623b609678a8934ff6
+  data.tar.gz: 07afae5c2e18256d62b31362acf7b1347cff2a332fe0de3659af463abbaa2dd4
 SHA512:
-  metadata.gz: 01f602344f23934a80470c72a2608f64209fd3649c614d107f6a87757774a6930a14e83fc421cb4f35a629d41f4321b1967849c0f05aa3c6a580b69814bfe637
-  data.tar.gz: 908bb885c92f1748c6f38609ea02b44fdf187bace3fafffd4764348516d5cf590fdb9aafb3871de5a9feacdcb9bd96cefead9753092b8c5d8711a1a63ae57aef
+  metadata.gz: c62e2dcbc6daa4d1e574954101b889f391018fb078a89f200e35a70efff80e7b85eabb1fd7322212c5199644f0a4e8a16057e6863e6501321b46ebba5ea95070
+  data.tar.gz: a764b0585aaee7b6f7c50cc65f3348c8a48cba317ffe15b6c455422db4d30c1fc26784e58574ce238d749e849a7a283054158850ef35e142154141a63ddb987e

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,14 @@
+## 0.5.0 (2026-02-27)
+- Added support for CUDA 12.9+
+- Switched to TorchCodec for `load` and `save` methods
+- Removed `load_wav` method
+- Dropped support for Ruby < 3.2
+## 0.4.1 (2025-06-26)
+- Improved SoX detection for Homebrew
 ## 0.4.0 (2024-08-02)
 - Dropped support for Ruby < 3.1

data/LICENSE.txt CHANGED Viewed

@@ -1,7 +1,7 @@
 BSD 2-Clause License
 Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
-Copyright (c) 2020-2024 Andrew Kane,
+Copyright (c) 2020-2026 Andrew Kane,
 All rights reserved.
 Redistribution and use in source and binary forms, with or without

data/README.md CHANGED Viewed

@@ -6,12 +6,6 @@
 ## Installation
-First, [install SoX](#sox-installation). For Homebrew, use:
-```sh
-brew install sox
-```
 Add this line to your application’s Gemfile:
 ```ruby
@@ -22,22 +16,15 @@ gem "torchaudio"
 This library follows the [Python API](https://pytorch.org/audio/). Many methods and options are missing at the moment. PRs welcome!
-## Tutorial
-- [PyTorch tutorial](https://pytorch.org/tutorials/beginner/audio_preprocessing_tutorial.html)
-- [Ruby code](examples/tutorial.rb)
-Download the [audio file](https://github.com/pytorch/tutorials/raw/master/_static/img/steam-train-whistle-daniel_simon-converted-from-mp3.wav) and install the [matplotlib](https://github.com/mrkn/matplotlib.rb) gem first.
 ## Basics
-Load a file
+Load a file (requires [torchcodec](https://github.com/ankane/torchcodec-ruby))
 ```ruby
 waveform, sample_rate = TorchAudio.load("file.wav")
 ```
-Save a file
+Save a file (requires [torchcodec](https://github.com/ankane/torchcodec-ruby))
 ```ruby
 TorchAudio.save("new.wave", waveform, sample_rate)
@@ -93,7 +80,7 @@ TorchAudio::Datasets::YESNO.new(".", download: true)
 Supported datasets are:
-- [YESNO](http://www.openslr.org/1/)
+- [YESNO](https://www.openslr.org/1/)
 ## Disclaimer
@@ -101,37 +88,6 @@ This library downloads and prepares public datasets. We don’t host any dataset
 If you’re a dataset owner and wish to update any details or remove it from this project, let us know.
-## SoX Installation
-### Mac
-```sh
-brew install sox
-```
-### Windows
-todo
-### Ubuntu
-```sh
-sudo apt install sox libsox-dev libsox-fmt-all
-```
-### Travis CI
-Add to `.travis.yml`:
-```yml
-addons:
-  apt:
-    packages:
-      - sox
-      - libsox-dev
-      - libsox-fmt-all
-```
 ## History
 View the [changelog](https://github.com/ankane/torchaudio-ruby/blob/master/CHANGELOG.md)

data/lib/torchaudio/transforms/mel_spectrogram.rb CHANGED Viewed

@@ -4,11 +4,19 @@ module TorchAudio
       attr_reader :n_mels
       def initialize(
-        sample_rate: 16000, n_fft: 400, win_length: nil, hop_length: nil, f_min: 0.0,
-        f_max: nil, pad: 0, n_mels: 128, window_fn: Torch.method(:hann_window),
-        power: 2.0, normalized: false, wkwargs: nil
+        sample_rate: 16000,
+        n_fft: 400,
+        win_length: nil,
+        hop_length: nil,
+        f_min: 0.0,
+        f_max: nil,
+        pad: 0,
+        n_mels: 128,
+        window_fn: Torch.method(:hann_window),
+        power: 2.0,
+        normalized: false,
+        wkwargs: nil
       )
         super()
         @sample_rate = sample_rate
         @n_fft = n_fft

data/lib/torchaudio/transforms/spectrogram.rb CHANGED Viewed

@@ -2,11 +2,18 @@ module TorchAudio
   module Transforms
     class Spectrogram < Torch::NN::Module
       def initialize(
-        n_fft: 400, win_length: nil, hop_length: nil, pad: 0,
-        window_fn: Torch.method(:hann_window), power: 2.0, normalized: false, wkwargs: nil,
-        center: true, pad_mode: "reflect", onesided: true
+        n_fft: 400,
+        win_length: nil,
+        hop_length: nil,
+        pad: 0,
+        window_fn: Torch.method(:hann_window),
+        power: 2.0,
+        normalized: false,
+        wkwargs: nil,
+        center: true,
+        pad_mode: "reflect",
+        onesided: true
       )
         super()
         @n_fft = n_fft
         # number of FFT bins. the returned STFT result will have n_fft // 2 + 1

data/lib/torchaudio/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module TorchAudio
-  VERSION = "0.4.0"
+  VERSION = "0.5.0"
 end

data/lib/torchaudio.rb CHANGED Viewed

@@ -1,9 +1,6 @@
 # dependencies
 require "torch"
-# ext
-require "torchaudio/ext"
 # stdlib
 require "digest"
 require "fileutils"
@@ -30,122 +27,139 @@ module TorchAudio
   class Error < StandardError; end
   class << self
-    # TODO remove filetype in 0.4.0
     def load(
-      filepath, out: nil, normalization: true, channels_first: true, num_frames: 0,
-      offset: 0, signalinfo: nil, encodinginfo: nil, filetype: nil, format: nil
+      uri,
+      frame_offset: 0,
+      num_frames: -1,
+      channels_first: true
     )
-      filepath = filepath.to_s
-      # check if valid file
-      unless File.exist?(filepath)
-        raise ArgumentError, "#{filepath} not found or is a directory"
+      begin
+        require "torchcodec"
+      rescue LoadError
+        raise LoadError, "TorchCodec is required for load. Please install torchcodec to use this function."
       end
-      # initialize output tensor
-      if !out.nil?
-        check_input(out)
-      else
-        out = Torch::FloatTensor.new
+      begin
+        decoder = TorchCodec::Decoders::AudioDecoder.new(uri)
+      rescue => e
+        raise RuntimeError, "Failed to create AudioDecoder for #{uri}: #{e}"
       end
-      if num_frames < -1
-        raise ArgumentError, "Expected value for num_samples -1 (entire file) or >=0"
+      # Get sample rate from metadata
+      sample_rate = decoder.metadata[:sample_rate]
+      if sample_rate.nil?
+        raise RuntimeError, "Unable to determine sample rate from audio metadata"
       end
-      if offset < 0
-        raise ArgumentError, "Expected positive offset value"
+      # Decode the entire file first, then subsample manually
+      # This is the simplest approach since torchcodec uses time-based indexing
+      begin
+        audio_samples = decoder.get_all_samples
+      rescue => e
+        raise RuntimeError, "Failed to decode audio samples: #{e}"
       end
-      # same logic as C++
-      # could also make read_audio_file work with nil
-      format ||= filetype || File.extname(filepath)[1..-1]
+      data = audio_samples[:data]
+      # Apply frame_offset and num_frames (which are actually sample offsets)
+      if frame_offset > 0
+        if frame_offset >= data.shape[1]
+          # Return empty tensor if offset is beyond available data
+          empty_shape = channels_first ? [data.shape[0], 0] : [0, data.shape[0]]
+          return [Torch.zeros(empty_shape, dtype: Torch.float32), sample_rate]
+        end
+        data = data[0.., frame_offset..]
+      end
-      sample_rate =
-        Ext.read_audio_file(
-          filepath,
-          out,
-          channels_first,
-          num_frames,
-          offset,
-          signalinfo,
-          encodinginfo,
-          format
-        )
+      if num_frames == 0
+        # Return empty tensor if num_frames is 0
+        empty_shape = channels_first ? [data.shape[0], 0] : [0, data.shape[0]]
+        return [Torch.zeros(empty_shape, dtype: Torch.float32), sample_rate]
+      elsif num_frames > 0
+        data = data[0.., 0...num_frames]
+      end
-      # normalize if needed
-      normalize_audio(out, normalization)
+      # TorchCodec returns data in [channel, time] format by default
+      # Handle channels_first parameter
+      if !channels_first
+        data = data.transpose(0, 1)  # [channel, time] -> [time, channel]
+      end
-      [out, sample_rate]
+      [data, sample_rate]
     end
-    def load_wav(filepath, **kwargs)
-      kwargs[:normalization] = 1 << 16
-      load(filepath, **kwargs)
+    def load_wav(path, channels_first: true)
+      load(path, channels_first: channels_first)
     end
-    def save(filepath, src, sample_rate, precision: 16, channels_first: true)
-      si = Ext::SignalInfo.new
-      ch_idx = channels_first ? 0 : 1
-      si.rate = sample_rate
-      si.channels = src.dim == 1 ? 1 : src.size(ch_idx)
-      si.length = src.numel
-      si.precision = precision
-      save_encinfo(filepath, src, channels_first: channels_first, signalinfo: si)
-    end
+    def save(
+      uri,
+      src,
+      sample_rate,
+      channels_first: true,
+      compression: nil
+    )
+      begin
+        require "torchcodec"
+      rescue LoadError
+        raise LoadError, "TorchCodec is required for save. Please install torchcodec to use this function."
+      end
-    def save_encinfo(filepath, src, channels_first: true, signalinfo: nil, encodinginfo: nil, filetype: nil)
-      ch_idx, _len_idx = channels_first ? [0, 1] : [1, 0]
-      # check if save directory exists
-      abs_dirpath = File.dirname(File.expand_path(filepath))
-      unless Dir.exist?(abs_dirpath)
-        raise "Directory does not exist: #{abs_dirpath}"
-      end
-      # check that src is a CPU tensor
-      check_input(src)
-      # Check/Fix shape of source data
-      if src.dim == 1
-        # 1d tensors as assumed to be mono signals
-        src.unsqueeze!(ch_idx)
-      elsif src.dim > 2 || src.size(ch_idx) > 16
-        # assumes num_channels < 16
-        raise ArgumentError, "Expected format where C < 16, but found #{src.size}"
-      end
-      # sox stores the sample rate as a float, though practically sample rates are almost always integers
-      # convert integers to floats
-      if signalinfo
-        if signalinfo.rate && !signalinfo.rate.is_a?(Float)
-          if signalinfo.rate.to_f == signalinfo.rate
-            signalinfo.rate = signalinfo.rate.to_f
-          else
-            raise ArgumentError, "Sample rate should be a float or int"
-          end
+      # Input validation
+      if !src.is_a?(Torch::Tensor)
+        raise ArgumentError, "Expected src to be a torch.Tensor, got #{src.class.name}"
+      end
+      if src.dtype != Torch.float32
+        src = src.float
+      end
+      if sample_rate <= 0
+        raise ArgumentError, "sample_rate must be positive, got #{sample_rate}"
+      end
+      # Handle tensor shape and channels_first
+      if src.ndim == 1
+        # Convert to 2D: [1, time] for channels_first: true
+        if channels_first
+          data = src.unsqueeze(0)  # [1, time]
+        else
+          # For channels_first: false, input is [time] -> reshape to [time, 1] -> transpose to [1, time]
+          data = src.unsqueeze(1).transpose(0, 1)  # [time, 1] -> [1, time]
         end
-        # check if the bit precision (i.e. bits per sample) is an integer
-        if signalinfo.precision && ! signalinfo.precision.is_a?(Integer)
-          if signalinfo.precision.to_i == signalinfo.precision
-            signalinfo.precision = signalinfo.precision.to_i
-          else
-            raise ArgumentError, "Bit precision should be an integer"
-          end
+      elsif src.ndim == 2
+        if channels_first
+          data = src  # Already [channel, time]
+        else
+          data = src.transpose(0, 1)  # [time, channel] -> [channel, time]
         end
+      else
+        raise ArgumentError, "Expected 1D or 2D tensor, got #{src.ndim}D tensor"
+      end
+      # Create AudioEncoder
+      begin
+        encoder = TorchCodec::Encoders::AudioEncoder.new(data, sample_rate: sample_rate)
+      rescue => e
+        raise RuntimeError, "Failed to create AudioEncoder: #{e}"
+      end
+      # Determine bit_rate from compression parameter
+      bit_rate = nil
+      if !compression.nil?
+        if compression.is_a?(Integer) || compression.is_a?(Float)
+          bit_rate = compression.to_i
+        else
+          warn "Unsupported compression type #{compression.class.name}."
+        end
+      end
+      # Save to file
+      begin
+        encoder.to_file(uri, bit_rate: bit_rate)
+      rescue => e
+        raise RuntimeError, "Failed to save audio to #{uri}: #{e}"
       end
-      # programs such as librosa normalize the signal, unnormalize if detected
-      if src.min >= -1.0 && src.max <= 1.0
-        src = src * (1 << 31)
-        src = src.long
-      end
-      # set filetype and allow for files with no extensions
-      extension = File.extname(filepath)
-      filetype = extension.length > 0 ? extension[1..-1] : filetype
-      # transpose from C x L -> L x C
-      if channels_first
-        src = src.transpose(1, 0)
-      end
-      # save data to file
-      src = src.contiguous
-      Ext.write_audio_file(filepath, src, signalinfo, encodinginfo, filetype)
     end
     private

metadata CHANGED Viewed

@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: torchaudio
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Andrew Kane
-autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-08-03 00:00:00.000000000 Z
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: torch-rb
@@ -24,41 +23,14 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0.13'
-- !ruby/object:Gem::Dependency
-  name: rice
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '4.3'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '4.3'
-description:
 email: andrew@ankane.org
 executables: []
-extensions:
-- ext/torchaudio/extconf.rb
+extensions: []
 extra_rdoc_files: []
 files:
 - CHANGELOG.md
 - LICENSE.txt
 - README.md
-- ext/torchaudio/csrc/register.cpp
-- ext/torchaudio/csrc/sox.cpp
-- ext/torchaudio/csrc/sox.h
-- ext/torchaudio/csrc/sox_effects.cpp
-- ext/torchaudio/csrc/sox_effects.h
-- ext/torchaudio/csrc/sox_io.cpp
-- ext/torchaudio/csrc/sox_io.h
-- ext/torchaudio/csrc/sox_utils.cpp
-- ext/torchaudio/csrc/sox_utils.h
-- ext/torchaudio/ext.cpp
-- ext/torchaudio/extconf.rb
 - lib/torchaudio.rb
 - lib/torchaudio/datasets/utils.rb
 - lib/torchaudio/datasets/yesno.rb
@@ -78,7 +50,6 @@ homepage: https://github.com/ankane/torchaudio-ruby
 licenses:
 - BSD-2-Clause
 metadata: {}
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -86,15 +57,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '3.1'
+      version: '3.2'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.11
-signing_key:
+rubygems_version: 4.0.3
 specification_version: 4
 summary: Data manipulation and transformation for audio signal processing
 test_files: []

data/ext/torchaudio/csrc/register.cpp DELETED Viewed

@@ -1,65 +0,0 @@
-#ifndef TORCHAUDIO_REGISTER_H
-#define TORCHAUDIO_REGISTER_H
-#include <torchaudio/csrc/sox_effects.h>
-#include <torchaudio/csrc/sox_io.h>
-#include <torchaudio/csrc/sox_utils.h>
-namespace torchaudio {
-namespace {
-////////////////////////////////////////////////////////////////////////////////
-// sox_utils.h
-////////////////////////////////////////////////////////////////////////////////
-static auto registerTensorSignal =
-    torch::class_<sox_utils::TensorSignal>("torchaudio", "TensorSignal")
-        .def(torch::init<torch::Tensor, int64_t, bool>())
-        .def("get_tensor", &sox_utils::TensorSignal::getTensor)
-        .def("get_sample_rate", &sox_utils::TensorSignal::getSampleRate)
-        .def("get_channels_first", &sox_utils::TensorSignal::getChannelsFirst);
-////////////////////////////////////////////////////////////////////////////////
-// sox_io.h
-////////////////////////////////////////////////////////////////////////////////
-static auto registerSignalInfo =
-    torch::class_<sox_io::SignalInfo>("torchaudio", "SignalInfo")
-        .def("get_sample_rate", &sox_io::SignalInfo::getSampleRate)
-        .def("get_num_channels", &sox_io::SignalInfo::getNumChannels)
-        .def("get_num_frames", &sox_io::SignalInfo::getNumFrames);
-static auto registerGetInfo = torch::RegisterOperators().op(
-    torch::RegisterOperators::options()
-        .schema(
-            "torchaudio::sox_io_get_info(str path) -> __torch__.torch.classes.torchaudio.SignalInfo info")
-        .catchAllKernel<decltype(sox_io::get_info), &sox_io::get_info>());
-static auto registerLoadAudioFile = torch::RegisterOperators().op(
-    torch::RegisterOperators::options()
-        .schema(
-            "torchaudio::sox_io_load_audio_file(str path, int frame_offset, int num_frames, bool normalize, bool channels_first) -> __torch__.torch.classes.torchaudio.TensorSignal signal")
-        .catchAllKernel<
-            decltype(sox_io::load_audio_file),
-            &sox_io::load_audio_file>());
-static auto registerSaveAudioFile = torch::RegisterOperators().op(
-    torch::RegisterOperators::options()
-        .schema(
-            "torchaudio::sox_io_save_audio_file(str path, __torch__.torch.classes.torchaudio.TensorSignal signal, float compression) -> ()")
-        .catchAllKernel<
-            decltype(sox_io::save_audio_file),
-            &sox_io::save_audio_file>());
-////////////////////////////////////////////////////////////////////////////////
-// sox_effects.h
-////////////////////////////////////////////////////////////////////////////////
-static auto registerSoxEffects =
-    torch::RegisterOperators(
-        "torchaudio::sox_effects_initialize_sox_effects",
-        &sox_effects::initialize_sox_effects)
-        .op("torchaudio::sox_effects_shutdown_sox_effects",
-            &sox_effects::shutdown_sox_effects)
-        .op("torchaudio::sox_effects_list_effects", &sox_effects::list_effects);
-} // namespace
-} // namespace torchaudio
-#endif