torchaudio 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 938d840ca33f543f9ae76e911fe9681448703dddaab35a1a9dc05703a19d42c0
4
- data.tar.gz: 5bac1684448eac1c520d2f4e5f17ac0ac7948afc7a2e46bb4bc5cd721a66ef26
3
+ metadata.gz: 689c84e4854288639090d826e4f3844a7e3cabf6ec68cf623b609678a8934ff6
4
+ data.tar.gz: 07afae5c2e18256d62b31362acf7b1347cff2a332fe0de3659af463abbaa2dd4
5
5
  SHA512:
6
- metadata.gz: 01f602344f23934a80470c72a2608f64209fd3649c614d107f6a87757774a6930a14e83fc421cb4f35a629d41f4321b1967849c0f05aa3c6a580b69814bfe637
7
- data.tar.gz: 908bb885c92f1748c6f38609ea02b44fdf187bace3fafffd4764348516d5cf590fdb9aafb3871de5a9feacdcb9bd96cefead9753092b8c5d8711a1a63ae57aef
6
+ metadata.gz: c62e2dcbc6daa4d1e574954101b889f391018fb078a89f200e35a70efff80e7b85eabb1fd7322212c5199644f0a4e8a16057e6863e6501321b46ebba5ea95070
7
+ data.tar.gz: a764b0585aaee7b6f7c50cc65f3348c8a48cba317ffe15b6c455422db4d30c1fc26784e58574ce238d749e849a7a283054158850ef35e142154141a63ddb987e
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
1
+ ## 0.5.0 (2026-02-27)
2
+
3
+ - Added support for CUDA 12.9+
4
+ - Switched to TorchCodec for `load` and `save` methods
5
+ - Removed `load_wav` method
6
+ - Dropped support for Ruby < 3.2
7
+
8
+ ## 0.4.1 (2025-06-26)
9
+
10
+ - Improved SoX detection for Homebrew
11
+
1
12
  ## 0.4.0 (2024-08-02)
2
13
 
3
14
  - Dropped support for Ruby < 3.1
data/LICENSE.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  BSD 2-Clause License
2
2
 
3
3
  Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
4
- Copyright (c) 2020-2024 Andrew Kane,
4
+ Copyright (c) 2020-2026 Andrew Kane,
5
5
  All rights reserved.
6
6
 
7
7
  Redistribution and use in source and binary forms, with or without
data/README.md CHANGED
@@ -6,12 +6,6 @@
6
6
 
7
7
  ## Installation
8
8
 
9
- First, [install SoX](#sox-installation). For Homebrew, use:
10
-
11
- ```sh
12
- brew install sox
13
- ```
14
-
15
9
  Add this line to your application’s Gemfile:
16
10
 
17
11
  ```ruby
@@ -22,22 +16,15 @@ gem "torchaudio"
22
16
 
23
17
  This library follows the [Python API](https://pytorch.org/audio/). Many methods and options are missing at the moment. PRs welcome!
24
18
 
25
- ## Tutorial
26
-
27
- - [PyTorch tutorial](https://pytorch.org/tutorials/beginner/audio_preprocessing_tutorial.html)
28
- - [Ruby code](examples/tutorial.rb)
29
-
30
- Download the [audio file](https://github.com/pytorch/tutorials/raw/master/_static/img/steam-train-whistle-daniel_simon-converted-from-mp3.wav) and install the [matplotlib](https://github.com/mrkn/matplotlib.rb) gem first.
31
-
32
19
  ## Basics
33
20
 
34
- Load a file
21
+ Load a file (requires [torchcodec](https://github.com/ankane/torchcodec-ruby))
35
22
 
36
23
  ```ruby
37
24
  waveform, sample_rate = TorchAudio.load("file.wav")
38
25
  ```
39
26
 
40
- Save a file
27
+ Save a file (requires [torchcodec](https://github.com/ankane/torchcodec-ruby))
41
28
 
42
29
  ```ruby
43
30
  TorchAudio.save("new.wave", waveform, sample_rate)
@@ -93,7 +80,7 @@ TorchAudio::Datasets::YESNO.new(".", download: true)
93
80
 
94
81
  Supported datasets are:
95
82
 
96
- - [YESNO](http://www.openslr.org/1/)
83
+ - [YESNO](https://www.openslr.org/1/)
97
84
 
98
85
  ## Disclaimer
99
86
 
@@ -101,37 +88,6 @@ This library downloads and prepares public datasets. We don’t host any dataset
101
88
 
102
89
  If you’re a dataset owner and wish to update any details or remove it from this project, let us know.
103
90
 
104
- ## SoX Installation
105
-
106
- ### Mac
107
-
108
- ```sh
109
- brew install sox
110
- ```
111
-
112
- ### Windows
113
-
114
- todo
115
-
116
- ### Ubuntu
117
-
118
- ```sh
119
- sudo apt install sox libsox-dev libsox-fmt-all
120
- ```
121
-
122
- ### Travis CI
123
-
124
- Add to `.travis.yml`:
125
-
126
- ```yml
127
- addons:
128
- apt:
129
- packages:
130
- - sox
131
- - libsox-dev
132
- - libsox-fmt-all
133
- ```
134
-
135
91
  ## History
136
92
 
137
93
  View the [changelog](https://github.com/ankane/torchaudio-ruby/blob/master/CHANGELOG.md)
@@ -4,11 +4,19 @@ module TorchAudio
4
4
  attr_reader :n_mels
5
5
 
6
6
  def initialize(
7
- sample_rate: 16000, n_fft: 400, win_length: nil, hop_length: nil, f_min: 0.0,
8
- f_max: nil, pad: 0, n_mels: 128, window_fn: Torch.method(:hann_window),
9
- power: 2.0, normalized: false, wkwargs: nil
7
+ sample_rate: 16000,
8
+ n_fft: 400,
9
+ win_length: nil,
10
+ hop_length: nil,
11
+ f_min: 0.0,
12
+ f_max: nil,
13
+ pad: 0,
14
+ n_mels: 128,
15
+ window_fn: Torch.method(:hann_window),
16
+ power: 2.0,
17
+ normalized: false,
18
+ wkwargs: nil
10
19
  )
11
-
12
20
  super()
13
21
  @sample_rate = sample_rate
14
22
  @n_fft = n_fft
@@ -2,11 +2,18 @@ module TorchAudio
2
2
  module Transforms
3
3
  class Spectrogram < Torch::NN::Module
4
4
  def initialize(
5
- n_fft: 400, win_length: nil, hop_length: nil, pad: 0,
6
- window_fn: Torch.method(:hann_window), power: 2.0, normalized: false, wkwargs: nil,
7
- center: true, pad_mode: "reflect", onesided: true
5
+ n_fft: 400,
6
+ win_length: nil,
7
+ hop_length: nil,
8
+ pad: 0,
9
+ window_fn: Torch.method(:hann_window),
10
+ power: 2.0,
11
+ normalized: false,
12
+ wkwargs: nil,
13
+ center: true,
14
+ pad_mode: "reflect",
15
+ onesided: true
8
16
  )
9
-
10
17
  super()
11
18
  @n_fft = n_fft
12
19
  # number of FFT bins. the returned STFT result will have n_fft // 2 + 1
@@ -1,3 +1,3 @@
1
1
  module TorchAudio
2
- VERSION = "0.4.0"
2
+ VERSION = "0.5.0"
3
3
  end
data/lib/torchaudio.rb CHANGED
@@ -1,9 +1,6 @@
1
1
  # dependencies
2
2
  require "torch"
3
3
 
4
- # ext
5
- require "torchaudio/ext"
6
-
7
4
  # stdlib
8
5
  require "digest"
9
6
  require "fileutils"
@@ -30,122 +27,139 @@ module TorchAudio
30
27
  class Error < StandardError; end
31
28
 
32
29
  class << self
33
- # TODO remove filetype in 0.4.0
34
30
  def load(
35
- filepath, out: nil, normalization: true, channels_first: true, num_frames: 0,
36
- offset: 0, signalinfo: nil, encodinginfo: nil, filetype: nil, format: nil
31
+ uri,
32
+ frame_offset: 0,
33
+ num_frames: -1,
34
+ channels_first: true
37
35
  )
38
-
39
- filepath = filepath.to_s
40
-
41
- # check if valid file
42
- unless File.exist?(filepath)
43
- raise ArgumentError, "#{filepath} not found or is a directory"
36
+ begin
37
+ require "torchcodec"
38
+ rescue LoadError
39
+ raise LoadError, "TorchCodec is required for load. Please install torchcodec to use this function."
44
40
  end
45
41
 
46
- # initialize output tensor
47
- if !out.nil?
48
- check_input(out)
49
- else
50
- out = Torch::FloatTensor.new
42
+ begin
43
+ decoder = TorchCodec::Decoders::AudioDecoder.new(uri)
44
+ rescue => e
45
+ raise RuntimeError, "Failed to create AudioDecoder for #{uri}: #{e}"
51
46
  end
52
47
 
53
- if num_frames < -1
54
- raise ArgumentError, "Expected value for num_samples -1 (entire file) or >=0"
48
+ # Get sample rate from metadata
49
+ sample_rate = decoder.metadata[:sample_rate]
50
+ if sample_rate.nil?
51
+ raise RuntimeError, "Unable to determine sample rate from audio metadata"
55
52
  end
56
- if offset < 0
57
- raise ArgumentError, "Expected positive offset value"
53
+
54
+ # Decode the entire file first, then subsample manually
55
+ # This is the simplest approach since torchcodec uses time-based indexing
56
+ begin
57
+ audio_samples = decoder.get_all_samples
58
+ rescue => e
59
+ raise RuntimeError, "Failed to decode audio samples: #{e}"
58
60
  end
59
61
 
60
- # same logic as C++
61
- # could also make read_audio_file work with nil
62
- format ||= filetype || File.extname(filepath)[1..-1]
62
+ data = audio_samples[:data]
63
+
64
+ # Apply frame_offset and num_frames (which are actually sample offsets)
65
+ if frame_offset > 0
66
+ if frame_offset >= data.shape[1]
67
+ # Return empty tensor if offset is beyond available data
68
+ empty_shape = channels_first ? [data.shape[0], 0] : [0, data.shape[0]]
69
+ return [Torch.zeros(empty_shape, dtype: Torch.float32), sample_rate]
70
+ end
71
+ data = data[0.., frame_offset..]
72
+ end
63
73
 
64
- sample_rate =
65
- Ext.read_audio_file(
66
- filepath,
67
- out,
68
- channels_first,
69
- num_frames,
70
- offset,
71
- signalinfo,
72
- encodinginfo,
73
- format
74
- )
74
+ if num_frames == 0
75
+ # Return empty tensor if num_frames is 0
76
+ empty_shape = channels_first ? [data.shape[0], 0] : [0, data.shape[0]]
77
+ return [Torch.zeros(empty_shape, dtype: Torch.float32), sample_rate]
78
+ elsif num_frames > 0
79
+ data = data[0.., 0...num_frames]
80
+ end
75
81
 
76
- # normalize if needed
77
- normalize_audio(out, normalization)
82
+ # TorchCodec returns data in [channel, time] format by default
83
+ # Handle channels_first parameter
84
+ if !channels_first
85
+ data = data.transpose(0, 1) # [channel, time] -> [time, channel]
86
+ end
78
87
 
79
- [out, sample_rate]
88
+ [data, sample_rate]
80
89
  end
81
90
 
82
- def load_wav(filepath, **kwargs)
83
- kwargs[:normalization] = 1 << 16
84
- load(filepath, **kwargs)
91
+ def load_wav(path, channels_first: true)
92
+ load(path, channels_first: channels_first)
85
93
  end
86
94
 
87
- def save(filepath, src, sample_rate, precision: 16, channels_first: true)
88
- si = Ext::SignalInfo.new
89
- ch_idx = channels_first ? 0 : 1
90
- si.rate = sample_rate
91
- si.channels = src.dim == 1 ? 1 : src.size(ch_idx)
92
- si.length = src.numel
93
- si.precision = precision
94
- save_encinfo(filepath, src, channels_first: channels_first, signalinfo: si)
95
- end
95
+ def save(
96
+ uri,
97
+ src,
98
+ sample_rate,
99
+ channels_first: true,
100
+ compression: nil
101
+ )
102
+ begin
103
+ require "torchcodec"
104
+ rescue LoadError
105
+ raise LoadError, "TorchCodec is required for save. Please install torchcodec to use this function."
106
+ end
96
107
 
97
- def save_encinfo(filepath, src, channels_first: true, signalinfo: nil, encodinginfo: nil, filetype: nil)
98
- ch_idx, _len_idx = channels_first ? [0, 1] : [1, 0]
99
-
100
- # check if save directory exists
101
- abs_dirpath = File.dirname(File.expand_path(filepath))
102
- unless Dir.exist?(abs_dirpath)
103
- raise "Directory does not exist: #{abs_dirpath}"
104
- end
105
- # check that src is a CPU tensor
106
- check_input(src)
107
- # Check/Fix shape of source data
108
- if src.dim == 1
109
- # 1d tensors as assumed to be mono signals
110
- src.unsqueeze!(ch_idx)
111
- elsif src.dim > 2 || src.size(ch_idx) > 16
112
- # assumes num_channels < 16
113
- raise ArgumentError, "Expected format where C < 16, but found #{src.size}"
114
- end
115
- # sox stores the sample rate as a float, though practically sample rates are almost always integers
116
- # convert integers to floats
117
- if signalinfo
118
- if signalinfo.rate && !signalinfo.rate.is_a?(Float)
119
- if signalinfo.rate.to_f == signalinfo.rate
120
- signalinfo.rate = signalinfo.rate.to_f
121
- else
122
- raise ArgumentError, "Sample rate should be a float or int"
123
- end
108
+ # Input validation
109
+ if !src.is_a?(Torch::Tensor)
110
+ raise ArgumentError, "Expected src to be a torch.Tensor, got #{src.class.name}"
111
+ end
112
+
113
+ if src.dtype != Torch.float32
114
+ src = src.float
115
+ end
116
+
117
+ if sample_rate <= 0
118
+ raise ArgumentError, "sample_rate must be positive, got #{sample_rate}"
119
+ end
120
+
121
+ # Handle tensor shape and channels_first
122
+ if src.ndim == 1
123
+ # Convert to 2D: [1, time] for channels_first: true
124
+ if channels_first
125
+ data = src.unsqueeze(0) # [1, time]
126
+ else
127
+ # For channels_first: false, input is [time] -> reshape to [time, 1] -> transpose to [1, time]
128
+ data = src.unsqueeze(1).transpose(0, 1) # [time, 1] -> [1, time]
124
129
  end
125
- # check if the bit precision (i.e. bits per sample) is an integer
126
- if signalinfo.precision && ! signalinfo.precision.is_a?(Integer)
127
- if signalinfo.precision.to_i == signalinfo.precision
128
- signalinfo.precision = signalinfo.precision.to_i
129
- else
130
- raise ArgumentError, "Bit precision should be an integer"
131
- end
130
+ elsif src.ndim == 2
131
+ if channels_first
132
+ data = src # Already [channel, time]
133
+ else
134
+ data = src.transpose(0, 1) # [time, channel] -> [channel, time]
132
135
  end
136
+ else
137
+ raise ArgumentError, "Expected 1D or 2D tensor, got #{src.ndim}D tensor"
138
+ end
139
+
140
+ # Create AudioEncoder
141
+ begin
142
+ encoder = TorchCodec::Encoders::AudioEncoder.new(data, sample_rate: sample_rate)
143
+ rescue => e
144
+ raise RuntimeError, "Failed to create AudioEncoder: #{e}"
145
+ end
146
+
147
+ # Determine bit_rate from compression parameter
148
+ bit_rate = nil
149
+ if !compression.nil?
150
+ if compression.is_a?(Integer) || compression.is_a?(Float)
151
+ bit_rate = compression.to_i
152
+ else
153
+ warn "Unsupported compression type #{compression.class.name}."
154
+ end
155
+ end
156
+
157
+ # Save to file
158
+ begin
159
+ encoder.to_file(uri, bit_rate: bit_rate)
160
+ rescue => e
161
+ raise RuntimeError, "Failed to save audio to #{uri}: #{e}"
133
162
  end
134
- # programs such as librosa normalize the signal, unnormalize if detected
135
- if src.min >= -1.0 && src.max <= 1.0
136
- src = src * (1 << 31)
137
- src = src.long
138
- end
139
- # set filetype and allow for files with no extensions
140
- extension = File.extname(filepath)
141
- filetype = extension.length > 0 ? extension[1..-1] : filetype
142
- # transpose from C x L -> L x C
143
- if channels_first
144
- src = src.transpose(1, 0)
145
- end
146
- # save data to file
147
- src = src.contiguous
148
- Ext.write_audio_file(filepath, src, signalinfo, encodinginfo, filetype)
149
163
  end
150
164
 
151
165
  private
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: torchaudio
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2024-08-03 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: torch-rb
@@ -24,41 +23,14 @@ dependencies:
24
23
  - - ">="
25
24
  - !ruby/object:Gem::Version
26
25
  version: '0.13'
27
- - !ruby/object:Gem::Dependency
28
- name: rice
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '4.3'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '4.3'
41
- description:
42
26
  email: andrew@ankane.org
43
27
  executables: []
44
- extensions:
45
- - ext/torchaudio/extconf.rb
28
+ extensions: []
46
29
  extra_rdoc_files: []
47
30
  files:
48
31
  - CHANGELOG.md
49
32
  - LICENSE.txt
50
33
  - README.md
51
- - ext/torchaudio/csrc/register.cpp
52
- - ext/torchaudio/csrc/sox.cpp
53
- - ext/torchaudio/csrc/sox.h
54
- - ext/torchaudio/csrc/sox_effects.cpp
55
- - ext/torchaudio/csrc/sox_effects.h
56
- - ext/torchaudio/csrc/sox_io.cpp
57
- - ext/torchaudio/csrc/sox_io.h
58
- - ext/torchaudio/csrc/sox_utils.cpp
59
- - ext/torchaudio/csrc/sox_utils.h
60
- - ext/torchaudio/ext.cpp
61
- - ext/torchaudio/extconf.rb
62
34
  - lib/torchaudio.rb
63
35
  - lib/torchaudio/datasets/utils.rb
64
36
  - lib/torchaudio/datasets/yesno.rb
@@ -78,7 +50,6 @@ homepage: https://github.com/ankane/torchaudio-ruby
78
50
  licenses:
79
51
  - BSD-2-Clause
80
52
  metadata: {}
81
- post_install_message:
82
53
  rdoc_options: []
83
54
  require_paths:
84
55
  - lib
@@ -86,15 +57,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
86
57
  requirements:
87
58
  - - ">="
88
59
  - !ruby/object:Gem::Version
89
- version: '3.1'
60
+ version: '3.2'
90
61
  required_rubygems_version: !ruby/object:Gem::Requirement
91
62
  requirements:
92
63
  - - ">="
93
64
  - !ruby/object:Gem::Version
94
65
  version: '0'
95
66
  requirements: []
96
- rubygems_version: 3.5.11
97
- signing_key:
67
+ rubygems_version: 4.0.3
98
68
  specification_version: 4
99
69
  summary: Data manipulation and transformation for audio signal processing
100
70
  test_files: []
@@ -1,65 +0,0 @@
1
- #ifndef TORCHAUDIO_REGISTER_H
2
- #define TORCHAUDIO_REGISTER_H
3
-
4
- #include <torchaudio/csrc/sox_effects.h>
5
- #include <torchaudio/csrc/sox_io.h>
6
- #include <torchaudio/csrc/sox_utils.h>
7
-
8
- namespace torchaudio {
9
- namespace {
10
-
11
- ////////////////////////////////////////////////////////////////////////////////
12
- // sox_utils.h
13
- ////////////////////////////////////////////////////////////////////////////////
14
- static auto registerTensorSignal =
15
- torch::class_<sox_utils::TensorSignal>("torchaudio", "TensorSignal")
16
- .def(torch::init<torch::Tensor, int64_t, bool>())
17
- .def("get_tensor", &sox_utils::TensorSignal::getTensor)
18
- .def("get_sample_rate", &sox_utils::TensorSignal::getSampleRate)
19
- .def("get_channels_first", &sox_utils::TensorSignal::getChannelsFirst);
20
-
21
- ////////////////////////////////////////////////////////////////////////////////
22
- // sox_io.h
23
- ////////////////////////////////////////////////////////////////////////////////
24
- static auto registerSignalInfo =
25
- torch::class_<sox_io::SignalInfo>("torchaudio", "SignalInfo")
26
- .def("get_sample_rate", &sox_io::SignalInfo::getSampleRate)
27
- .def("get_num_channels", &sox_io::SignalInfo::getNumChannels)
28
- .def("get_num_frames", &sox_io::SignalInfo::getNumFrames);
29
-
30
- static auto registerGetInfo = torch::RegisterOperators().op(
31
- torch::RegisterOperators::options()
32
- .schema(
33
- "torchaudio::sox_io_get_info(str path) -> __torch__.torch.classes.torchaudio.SignalInfo info")
34
- .catchAllKernel<decltype(sox_io::get_info), &sox_io::get_info>());
35
-
36
- static auto registerLoadAudioFile = torch::RegisterOperators().op(
37
- torch::RegisterOperators::options()
38
- .schema(
39
- "torchaudio::sox_io_load_audio_file(str path, int frame_offset, int num_frames, bool normalize, bool channels_first) -> __torch__.torch.classes.torchaudio.TensorSignal signal")
40
- .catchAllKernel<
41
- decltype(sox_io::load_audio_file),
42
- &sox_io::load_audio_file>());
43
-
44
- static auto registerSaveAudioFile = torch::RegisterOperators().op(
45
- torch::RegisterOperators::options()
46
- .schema(
47
- "torchaudio::sox_io_save_audio_file(str path, __torch__.torch.classes.torchaudio.TensorSignal signal, float compression) -> ()")
48
- .catchAllKernel<
49
- decltype(sox_io::save_audio_file),
50
- &sox_io::save_audio_file>());
51
-
52
- ////////////////////////////////////////////////////////////////////////////////
53
- // sox_effects.h
54
- ////////////////////////////////////////////////////////////////////////////////
55
- static auto registerSoxEffects =
56
- torch::RegisterOperators(
57
- "torchaudio::sox_effects_initialize_sox_effects",
58
- &sox_effects::initialize_sox_effects)
59
- .op("torchaudio::sox_effects_shutdown_sox_effects",
60
- &sox_effects::shutdown_sox_effects)
61
- .op("torchaudio::sox_effects_list_effects", &sox_effects::list_effects);
62
-
63
- } // namespace
64
- } // namespace torchaudio
65
- #endif