torchaudio 0.4.1 → 0.5.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1d221dc6cf08b75a5084dcd0142067168c3a03302e5dbd638f0f68ed7882ad31
4
- data.tar.gz: acc470a5f1ab004ec0a42e9229bbfc5321434328496f165f932a217a3d106220
3
+ metadata.gz: 689c84e4854288639090d826e4f3844a7e3cabf6ec68cf623b609678a8934ff6
4
+ data.tar.gz: 07afae5c2e18256d62b31362acf7b1347cff2a332fe0de3659af463abbaa2dd4
5
5
  SHA512:
6
- metadata.gz: e49dec118b6886295f44aa01452ffc6f3f0dedae8debbb6c094a33dcb8ca388c3336581af53ceb0fbe25e0689ac7ceba1bee0c3b1b172019405fc7a4240a32b0
7
- data.tar.gz: 0b1fc5854e251137cba56279219d9c307b98811083f127be282f8229c46d80e667145773967d75afedf0b33a4ecf6af39f437c83b9844a11fab435d25b265df3
6
+ metadata.gz: c62e2dcbc6daa4d1e574954101b889f391018fb078a89f200e35a70efff80e7b85eabb1fd7322212c5199644f0a4e8a16057e6863e6501321b46ebba5ea95070
7
+ data.tar.gz: a764b0585aaee7b6f7c50cc65f3348c8a48cba317ffe15b6c455422db4d30c1fc26784e58574ce238d749e849a7a283054158850ef35e142154141a63ddb987e
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ ## 0.5.0 (2026-02-27)
2
+
3
+ - Added support for CUDA 12.9+
4
+ - Switched to TorchCodec for `load` and `save` methods
5
+ - Removed `load_wav` method
6
+ - Dropped support for Ruby < 3.2
7
+
1
8
  ## 0.4.1 (2025-06-26)
2
9
 
3
10
  - Improved SoX detection for Homebrew
data/LICENSE.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  BSD 2-Clause License
2
2
 
3
3
  Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
4
- Copyright (c) 2020-2025 Andrew Kane,
4
+ Copyright (c) 2020-2026 Andrew Kane,
5
5
  All rights reserved.
6
6
 
7
7
  Redistribution and use in source and binary forms, with or without
data/README.md CHANGED
@@ -6,12 +6,6 @@
6
6
 
7
7
  ## Installation
8
8
 
9
- First, [install SoX](#sox-installation). For Homebrew, use:
10
-
11
- ```sh
12
- brew install sox
13
- ```
14
-
15
9
  Add this line to your application’s Gemfile:
16
10
 
17
11
  ```ruby
@@ -22,22 +16,15 @@ gem "torchaudio"
22
16
 
23
17
  This library follows the [Python API](https://pytorch.org/audio/). Many methods and options are missing at the moment. PRs welcome!
24
18
 
25
- ## Tutorial
26
-
27
- - [PyTorch tutorial](https://pytorch.org/tutorials/beginner/audio_preprocessing_tutorial.html)
28
- - [Ruby code](examples/tutorial.rb)
29
-
30
- Download the [audio file](https://github.com/pytorch/tutorials/raw/master/_static/img/steam-train-whistle-daniel_simon-converted-from-mp3.wav) and install the [matplotlib](https://github.com/mrkn/matplotlib.rb) gem first.
31
-
32
19
  ## Basics
33
20
 
34
- Load a file
21
+ Load a file (requires [torchcodec](https://github.com/ankane/torchcodec-ruby))
35
22
 
36
23
  ```ruby
37
24
  waveform, sample_rate = TorchAudio.load("file.wav")
38
25
  ```
39
26
 
40
- Save a file
27
+ Save a file (requires [torchcodec](https://github.com/ankane/torchcodec-ruby))
41
28
 
42
29
  ```ruby
43
30
  TorchAudio.save("new.wave", waveform, sample_rate)
@@ -101,37 +88,6 @@ This library downloads and prepares public datasets. We don’t host any dataset
101
88
 
102
89
  If you’re a dataset owner and wish to update any details or remove it from this project, let us know.
103
90
 
104
- ## SoX Installation
105
-
106
- ### Mac
107
-
108
- ```sh
109
- brew install sox
110
- ```
111
-
112
- ### Windows
113
-
114
- todo
115
-
116
- ### Ubuntu
117
-
118
- ```sh
119
- sudo apt install sox libsox-dev libsox-fmt-all
120
- ```
121
-
122
- ### Travis CI
123
-
124
- Add to `.travis.yml`:
125
-
126
- ```yml
127
- addons:
128
- apt:
129
- packages:
130
- - sox
131
- - libsox-dev
132
- - libsox-fmt-all
133
- ```
134
-
135
91
  ## History
136
92
 
137
93
  View the [changelog](https://github.com/ankane/torchaudio-ruby/blob/master/CHANGELOG.md)
@@ -1,3 +1,3 @@
1
1
  module TorchAudio
2
- VERSION = "0.4.1"
2
+ VERSION = "0.5.0"
3
3
  end
data/lib/torchaudio.rb CHANGED
@@ -1,9 +1,6 @@
1
1
  # dependencies
2
2
  require "torch"
3
3
 
4
- # ext
5
- require "torchaudio/ext"
6
-
7
4
  # stdlib
8
5
  require "digest"
9
6
  require "fileutils"
@@ -30,129 +27,139 @@ module TorchAudio
30
27
  class Error < StandardError; end
31
28
 
32
29
  class << self
33
- # TODO remove filetype in 0.4.0
34
30
  def load(
35
- filepath,
36
- out: nil,
37
- normalization: true,
38
- channels_first: true,
39
- num_frames: 0,
40
- offset: 0,
41
- signalinfo: nil,
42
- encodinginfo: nil,
43
- filetype: nil,
44
- format: nil
31
+ uri,
32
+ frame_offset: 0,
33
+ num_frames: -1,
34
+ channels_first: true
45
35
  )
46
- filepath = filepath.to_s
47
-
48
- # check if valid file
49
- unless File.exist?(filepath)
50
- raise ArgumentError, "#{filepath} not found or is a directory"
36
+ begin
37
+ require "torchcodec"
38
+ rescue LoadError
39
+ raise LoadError, "TorchCodec is required for load. Please install torchcodec to use this function."
51
40
  end
52
41
 
53
- # initialize output tensor
54
- if !out.nil?
55
- check_input(out)
56
- else
57
- out = Torch::FloatTensor.new
42
+ begin
43
+ decoder = TorchCodec::Decoders::AudioDecoder.new(uri)
44
+ rescue => e
45
+ raise RuntimeError, "Failed to create AudioDecoder for #{uri}: #{e}"
58
46
  end
59
47
 
60
- if num_frames < -1
61
- raise ArgumentError, "Expected value for num_samples -1 (entire file) or >=0"
48
+ # Get sample rate from metadata
49
+ sample_rate = decoder.metadata[:sample_rate]
50
+ if sample_rate.nil?
51
+ raise RuntimeError, "Unable to determine sample rate from audio metadata"
62
52
  end
63
- if offset < 0
64
- raise ArgumentError, "Expected positive offset value"
53
+
54
+ # Decode the entire file first, then subsample manually
55
+ # This is the simplest approach since torchcodec uses time-based indexing
56
+ begin
57
+ audio_samples = decoder.get_all_samples
58
+ rescue => e
59
+ raise RuntimeError, "Failed to decode audio samples: #{e}"
65
60
  end
66
61
 
67
- # same logic as C++
68
- # could also make read_audio_file work with nil
69
- format ||= filetype || File.extname(filepath)[1..-1]
62
+ data = audio_samples[:data]
63
+
64
+ # Apply frame_offset and num_frames (which are actually sample offsets)
65
+ if frame_offset > 0
66
+ if frame_offset >= data.shape[1]
67
+ # Return empty tensor if offset is beyond available data
68
+ empty_shape = channels_first ? [data.shape[0], 0] : [0, data.shape[0]]
69
+ return [Torch.zeros(empty_shape, dtype: Torch.float32), sample_rate]
70
+ end
71
+ data = data[0.., frame_offset..]
72
+ end
70
73
 
71
- sample_rate =
72
- Ext.read_audio_file(
73
- filepath,
74
- out,
75
- channels_first,
76
- num_frames,
77
- offset,
78
- signalinfo,
79
- encodinginfo,
80
- format
81
- )
74
+ if num_frames == 0
75
+ # Return empty tensor if num_frames is 0
76
+ empty_shape = channels_first ? [data.shape[0], 0] : [0, data.shape[0]]
77
+ return [Torch.zeros(empty_shape, dtype: Torch.float32), sample_rate]
78
+ elsif num_frames > 0
79
+ data = data[0.., 0...num_frames]
80
+ end
82
81
 
83
- # normalize if needed
84
- normalize_audio(out, normalization)
82
+ # TorchCodec returns data in [channel, time] format by default
83
+ # Handle channels_first parameter
84
+ if !channels_first
85
+ data = data.transpose(0, 1) # [channel, time] -> [time, channel]
86
+ end
85
87
 
86
- [out, sample_rate]
88
+ [data, sample_rate]
87
89
  end
88
90
 
89
- def load_wav(filepath, **kwargs)
90
- kwargs[:normalization] = 1 << 16
91
- load(filepath, **kwargs)
91
+ def load_wav(path, channels_first: true)
92
+ load(path, channels_first: channels_first)
92
93
  end
93
94
 
94
- def save(filepath, src, sample_rate, precision: 16, channels_first: true)
95
- si = Ext::SignalInfo.new
96
- ch_idx = channels_first ? 0 : 1
97
- si.rate = sample_rate
98
- si.channels = src.dim == 1 ? 1 : src.size(ch_idx)
99
- si.length = src.numel
100
- si.precision = precision
101
- save_encinfo(filepath, src, channels_first: channels_first, signalinfo: si)
102
- end
95
+ def save(
96
+ uri,
97
+ src,
98
+ sample_rate,
99
+ channels_first: true,
100
+ compression: nil
101
+ )
102
+ begin
103
+ require "torchcodec"
104
+ rescue LoadError
105
+ raise LoadError, "TorchCodec is required for save. Please install torchcodec to use this function."
106
+ end
103
107
 
104
- def save_encinfo(filepath, src, channels_first: true, signalinfo: nil, encodinginfo: nil, filetype: nil)
105
- ch_idx, _len_idx = channels_first ? [0, 1] : [1, 0]
106
-
107
- # check if save directory exists
108
- abs_dirpath = File.dirname(File.expand_path(filepath))
109
- unless Dir.exist?(abs_dirpath)
110
- raise "Directory does not exist: #{abs_dirpath}"
111
- end
112
- # check that src is a CPU tensor
113
- check_input(src)
114
- # Check/Fix shape of source data
115
- if src.dim == 1
116
- # 1d tensors as assumed to be mono signals
117
- src.unsqueeze!(ch_idx)
118
- elsif src.dim > 2 || src.size(ch_idx) > 16
119
- # assumes num_channels < 16
120
- raise ArgumentError, "Expected format where C < 16, but found #{src.size}"
121
- end
122
- # sox stores the sample rate as a float, though practically sample rates are almost always integers
123
- # convert integers to floats
124
- if signalinfo
125
- if signalinfo.rate && !signalinfo.rate.is_a?(Float)
126
- if signalinfo.rate.to_f == signalinfo.rate
127
- signalinfo.rate = signalinfo.rate.to_f
128
- else
129
- raise ArgumentError, "Sample rate should be a float or int"
130
- end
108
+ # Input validation
109
+ if !src.is_a?(Torch::Tensor)
110
+ raise ArgumentError, "Expected src to be a torch.Tensor, got #{src.class.name}"
111
+ end
112
+
113
+ if src.dtype != Torch.float32
114
+ src = src.float
115
+ end
116
+
117
+ if sample_rate <= 0
118
+ raise ArgumentError, "sample_rate must be positive, got #{sample_rate}"
119
+ end
120
+
121
+ # Handle tensor shape and channels_first
122
+ if src.ndim == 1
123
+ # Convert to 2D: [1, time] for channels_first: true
124
+ if channels_first
125
+ data = src.unsqueeze(0) # [1, time]
126
+ else
127
+ # For channels_first: false, input is [time] -> reshape to [time, 1] -> transpose to [1, time]
128
+ data = src.unsqueeze(1).transpose(0, 1) # [time, 1] -> [1, time]
129
+ end
130
+ elsif src.ndim == 2
131
+ if channels_first
132
+ data = src # Already [channel, time]
133
+ else
134
+ data = src.transpose(0, 1) # [time, channel] -> [channel, time]
131
135
  end
132
- # check if the bit precision (i.e. bits per sample) is an integer
133
- if signalinfo.precision && ! signalinfo.precision.is_a?(Integer)
134
- if signalinfo.precision.to_i == signalinfo.precision
135
- signalinfo.precision = signalinfo.precision.to_i
136
- else
137
- raise ArgumentError, "Bit precision should be an integer"
138
- end
136
+ else
137
+ raise ArgumentError, "Expected 1D or 2D tensor, got #{src.ndim}D tensor"
138
+ end
139
+
140
+ # Create AudioEncoder
141
+ begin
142
+ encoder = TorchCodec::Encoders::AudioEncoder.new(data, sample_rate: sample_rate)
143
+ rescue => e
144
+ raise RuntimeError, "Failed to create AudioEncoder: #{e}"
145
+ end
146
+
147
+ # Determine bit_rate from compression parameter
148
+ bit_rate = nil
149
+ if !compression.nil?
150
+ if compression.is_a?(Integer) || compression.is_a?(Float)
151
+ bit_rate = compression.to_i
152
+ else
153
+ warn "Unsupported compression type #{compression.class.name}."
139
154
  end
140
155
  end
141
- # programs such as librosa normalize the signal, unnormalize if detected
142
- if src.min >= -1.0 && src.max <= 1.0
143
- src = src * (1 << 31)
144
- src = src.long
145
- end
146
- # set filetype and allow for files with no extensions
147
- extension = File.extname(filepath)
148
- filetype = extension.length > 0 ? extension[1..-1] : filetype
149
- # transpose from C x L -> L x C
150
- if channels_first
151
- src = src.transpose(1, 0)
152
- end
153
- # save data to file
154
- src = src.contiguous
155
- Ext.write_audio_file(filepath, src, signalinfo, encodinginfo, filetype)
156
+
157
+ # Save to file
158
+ begin
159
+ encoder.to_file(uri, bit_rate: bit_rate)
160
+ rescue => e
161
+ raise RuntimeError, "Failed to save audio to #{uri}: #{e}"
162
+ end
156
163
  end
157
164
 
158
165
  private
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: torchaudio
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
@@ -23,40 +23,14 @@ dependencies:
23
23
  - - ">="
24
24
  - !ruby/object:Gem::Version
25
25
  version: '0.13'
26
- - !ruby/object:Gem::Dependency
27
- name: rice
28
- requirement: !ruby/object:Gem::Requirement
29
- requirements:
30
- - - ">="
31
- - !ruby/object:Gem::Version
32
- version: 4.3.3
33
- type: :runtime
34
- prerelease: false
35
- version_requirements: !ruby/object:Gem::Requirement
36
- requirements:
37
- - - ">="
38
- - !ruby/object:Gem::Version
39
- version: 4.3.3
40
26
  email: andrew@ankane.org
41
27
  executables: []
42
- extensions:
43
- - ext/torchaudio/extconf.rb
28
+ extensions: []
44
29
  extra_rdoc_files: []
45
30
  files:
46
31
  - CHANGELOG.md
47
32
  - LICENSE.txt
48
33
  - README.md
49
- - ext/torchaudio/csrc/register.cpp
50
- - ext/torchaudio/csrc/sox.cpp
51
- - ext/torchaudio/csrc/sox.h
52
- - ext/torchaudio/csrc/sox_effects.cpp
53
- - ext/torchaudio/csrc/sox_effects.h
54
- - ext/torchaudio/csrc/sox_io.cpp
55
- - ext/torchaudio/csrc/sox_io.h
56
- - ext/torchaudio/csrc/sox_utils.cpp
57
- - ext/torchaudio/csrc/sox_utils.h
58
- - ext/torchaudio/ext.cpp
59
- - ext/torchaudio/extconf.rb
60
34
  - lib/torchaudio.rb
61
35
  - lib/torchaudio/datasets/utils.rb
62
36
  - lib/torchaudio/datasets/yesno.rb
@@ -83,14 +57,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
83
57
  requirements:
84
58
  - - ">="
85
59
  - !ruby/object:Gem::Version
86
- version: '3.1'
60
+ version: '3.2'
87
61
  required_rubygems_version: !ruby/object:Gem::Requirement
88
62
  requirements:
89
63
  - - ">="
90
64
  - !ruby/object:Gem::Version
91
65
  version: '0'
92
66
  requirements: []
93
- rubygems_version: 3.6.7
67
+ rubygems_version: 4.0.3
94
68
  specification_version: 4
95
69
  summary: Data manipulation and transformation for audio signal processing
96
70
  test_files: []
@@ -1,65 +0,0 @@
1
- #ifndef TORCHAUDIO_REGISTER_H
2
- #define TORCHAUDIO_REGISTER_H
3
-
4
- #include <torchaudio/csrc/sox_effects.h>
5
- #include <torchaudio/csrc/sox_io.h>
6
- #include <torchaudio/csrc/sox_utils.h>
7
-
8
- namespace torchaudio {
9
- namespace {
10
-
11
- ////////////////////////////////////////////////////////////////////////////////
12
- // sox_utils.h
13
- ////////////////////////////////////////////////////////////////////////////////
14
- static auto registerTensorSignal =
15
- torch::class_<sox_utils::TensorSignal>("torchaudio", "TensorSignal")
16
- .def(torch::init<torch::Tensor, int64_t, bool>())
17
- .def("get_tensor", &sox_utils::TensorSignal::getTensor)
18
- .def("get_sample_rate", &sox_utils::TensorSignal::getSampleRate)
19
- .def("get_channels_first", &sox_utils::TensorSignal::getChannelsFirst);
20
-
21
- ////////////////////////////////////////////////////////////////////////////////
22
- // sox_io.h
23
- ////////////////////////////////////////////////////////////////////////////////
24
- static auto registerSignalInfo =
25
- torch::class_<sox_io::SignalInfo>("torchaudio", "SignalInfo")
26
- .def("get_sample_rate", &sox_io::SignalInfo::getSampleRate)
27
- .def("get_num_channels", &sox_io::SignalInfo::getNumChannels)
28
- .def("get_num_frames", &sox_io::SignalInfo::getNumFrames);
29
-
30
- static auto registerGetInfo = torch::RegisterOperators().op(
31
- torch::RegisterOperators::options()
32
- .schema(
33
- "torchaudio::sox_io_get_info(str path) -> __torch__.torch.classes.torchaudio.SignalInfo info")
34
- .catchAllKernel<decltype(sox_io::get_info), &sox_io::get_info>());
35
-
36
- static auto registerLoadAudioFile = torch::RegisterOperators().op(
37
- torch::RegisterOperators::options()
38
- .schema(
39
- "torchaudio::sox_io_load_audio_file(str path, int frame_offset, int num_frames, bool normalize, bool channels_first) -> __torch__.torch.classes.torchaudio.TensorSignal signal")
40
- .catchAllKernel<
41
- decltype(sox_io::load_audio_file),
42
- &sox_io::load_audio_file>());
43
-
44
- static auto registerSaveAudioFile = torch::RegisterOperators().op(
45
- torch::RegisterOperators::options()
46
- .schema(
47
- "torchaudio::sox_io_save_audio_file(str path, __torch__.torch.classes.torchaudio.TensorSignal signal, float compression) -> ()")
48
- .catchAllKernel<
49
- decltype(sox_io::save_audio_file),
50
- &sox_io::save_audio_file>());
51
-
52
- ////////////////////////////////////////////////////////////////////////////////
53
- // sox_effects.h
54
- ////////////////////////////////////////////////////////////////////////////////
55
- static auto registerSoxEffects =
56
- torch::RegisterOperators(
57
- "torchaudio::sox_effects_initialize_sox_effects",
58
- &sox_effects::initialize_sox_effects)
59
- .op("torchaudio::sox_effects_shutdown_sox_effects",
60
- &sox_effects::shutdown_sox_effects)
61
- .op("torchaudio::sox_effects_list_effects", &sox_effects::list_effects);
62
-
63
- } // namespace
64
- } // namespace torchaudio
65
- #endif