torchaudio 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: b527976494325cc12e81342c25d318204d2d7c75bfba7036be4296769cdb30a0
4
+ data.tar.gz: 2cfde7bd1b0e7a1628818d5bd74657cfbfba6dfa83ef42897f3ad0f98e77f739
5
+ SHA512:
6
+ metadata.gz: 8e6f34b014340b5ace3193ab589dae75ed0869ab7606402bd4b09de6042299e6f3a118d439dd381491f489ce9552bca4376a7d5b4693dddc3d1c5f5b26540900
7
+ data.tar.gz: d651c46f5185ceb70ae3d9c90154c77afe29a5c35854d1a9d98913096b7ab9ba39a745242dd268548ca87f9e109b56c96dee9dc5539cf066f9ad0f773eddbdcd
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (2020-08-24)
2
+
3
+ - First release
@@ -0,0 +1,26 @@
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
4
+ Copyright (c) 2020 Andrew Kane,
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
21
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,93 @@
1
+ # TorchAudio
2
+
3
+ :fire: An audio library for Torch.rb
4
+
5
+ ## Installation
6
+
7
+ First, [install SoX](#sox-installation). For Homebrew, use:
8
+
9
+ ```sh
10
+ brew install sox
11
+ ```
12
+
13
+ Add this line to your application’s Gemfile:
14
+
15
+ ```ruby
16
+ gem 'torchaudio'
17
+ ```
18
+
19
+ ## Getting Started
20
+
21
+ This library follows the [Python API](https://pytorch.org/audio/). Many methods and options are missing at the moment. PRs welcome!
22
+
23
+ ## Datasets
24
+
25
+ Load a dataset
26
+
27
+ ```ruby
28
+ TorchAudio::Datasets::YESNO.new(".", download: true)
29
+ ```
30
+
31
+ Supported datasets are:
32
+
33
+ - [YESNO](http://www.openslr.org/1/)
34
+
35
+ ## Disclaimer
36
+
37
+ This library downloads and prepares public datasets. We don’t host any datasets. Be sure to adhere to the license for each dataset.
38
+
39
+ If you’re a dataset owner and wish to update any details or remove it from this project, let us know.
40
+
41
+ ## SoX Installation
42
+
43
+ ### Mac
44
+
45
+ ```sh
46
+ brew install sox
47
+ ```
48
+
49
+ ### Windows
50
+
51
+ TODO: Windows installation instructions have not been written yet.
52
+
53
+ ### Ubuntu
54
+
55
+ ```sh
56
+ sudo apt install sox libsox-dev libsox-fmt-all
57
+ ```
58
+
59
+ ### Travis CI
60
+
61
+ Add to `.travis.yml`:
62
+
63
+ ```yml
64
+ addons:
65
+ apt:
66
+ packages:
67
+ - sox
68
+ - libsox-dev
69
+ - libsox-fmt-all
70
+ ```
71
+
72
+ ## History
73
+
74
+ View the [changelog](https://github.com/ankane/torchaudio/blob/master/CHANGELOG.md).
75
+
76
+ ## Contributing
77
+
78
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
79
+
80
+ - [Report bugs](https://github.com/ankane/torchaudio/issues)
81
+ - Fix bugs and [submit pull requests](https://github.com/ankane/torchaudio/pulls)
82
+ - Write, clarify, or fix documentation
83
+ - Suggest or add new features
84
+
85
+ To get started with development:
86
+
87
+ ```sh
88
+ git clone https://github.com/ankane/torchaudio.git
89
+ cd torchaudio
90
+ bundle install
91
+ bundle exec rake compile
92
+ bundle exec rake test
93
+ ```
@@ -0,0 +1,65 @@
1
+ #ifndef TORCHAUDIO_REGISTER_H
2
+ #define TORCHAUDIO_REGISTER_H
3
+
4
+ #include <torchaudio/csrc/sox_effects.h>
5
+ #include <torchaudio/csrc/sox_io.h>
6
+ #include <torchaudio/csrc/sox_utils.h>
7
+
8
+ namespace torchaudio {
9
+ namespace {
10
+
11
+ ////////////////////////////////////////////////////////////////////////////////
12
+ // sox_utils.h
13
+ ////////////////////////////////////////////////////////////////////////////////
14
+ static auto registerTensorSignal =
15
+ torch::class_<sox_utils::TensorSignal>("torchaudio", "TensorSignal")
16
+ .def(torch::init<torch::Tensor, int64_t, bool>())
17
+ .def("get_tensor", &sox_utils::TensorSignal::getTensor)
18
+ .def("get_sample_rate", &sox_utils::TensorSignal::getSampleRate)
19
+ .def("get_channels_first", &sox_utils::TensorSignal::getChannelsFirst);
20
+
21
+ ////////////////////////////////////////////////////////////////////////////////
22
+ // sox_io.h
23
+ ////////////////////////////////////////////////////////////////////////////////
24
+ static auto registerSignalInfo =
25
+ torch::class_<sox_io::SignalInfo>("torchaudio", "SignalInfo")
26
+ .def("get_sample_rate", &sox_io::SignalInfo::getSampleRate)
27
+ .def("get_num_channels", &sox_io::SignalInfo::getNumChannels)
28
+ .def("get_num_frames", &sox_io::SignalInfo::getNumFrames);
29
+
30
+ static auto registerGetInfo = torch::RegisterOperators().op(
31
+ torch::RegisterOperators::options()
32
+ .schema(
33
+ "torchaudio::sox_io_get_info(str path) -> __torch__.torch.classes.torchaudio.SignalInfo info")
34
+ .catchAllKernel<decltype(sox_io::get_info), &sox_io::get_info>());
35
+
36
+ static auto registerLoadAudioFile = torch::RegisterOperators().op(
37
+ torch::RegisterOperators::options()
38
+ .schema(
39
+ "torchaudio::sox_io_load_audio_file(str path, int frame_offset, int num_frames, bool normalize, bool channels_first) -> __torch__.torch.classes.torchaudio.TensorSignal signal")
40
+ .catchAllKernel<
41
+ decltype(sox_io::load_audio_file),
42
+ &sox_io::load_audio_file>());
43
+
44
+ static auto registerSaveAudioFile = torch::RegisterOperators().op(
45
+ torch::RegisterOperators::options()
46
+ .schema(
47
+ "torchaudio::sox_io_save_audio_file(str path, __torch__.torch.classes.torchaudio.TensorSignal signal, float compression) -> ()")
48
+ .catchAllKernel<
49
+ decltype(sox_io::save_audio_file),
50
+ &sox_io::save_audio_file>());
51
+
52
+ ////////////////////////////////////////////////////////////////////////////////
53
+ // sox_effects.h
54
+ ////////////////////////////////////////////////////////////////////////////////
55
+ static auto registerSoxEffects =
56
+ torch::RegisterOperators(
57
+ "torchaudio::sox_effects_initialize_sox_effects",
58
+ &sox_effects::initialize_sox_effects)
59
+ .op("torchaudio::sox_effects_shutdown_sox_effects",
60
+ &sox_effects::shutdown_sox_effects)
61
+ .op("torchaudio::sox_effects_list_effects", &sox_effects::list_effects);
62
+
63
+ } // namespace
64
+ } // namespace torchaudio
65
+ #endif
@@ -0,0 +1,361 @@
1
+ #include <torchaudio/csrc/sox.h>
2
+
3
+ #include <algorithm>
4
+ #include <cstdint>
5
+ #include <stdexcept>
6
+ #include <vector>
7
+
8
+ namespace torch {
9
+ namespace audio {
10
+ namespace {
11
+ /// Helper struct to safely close the sox_format_t descriptor.
12
+ struct SoxDescriptor {
13
+ explicit SoxDescriptor(sox_format_t* fd) noexcept : fd_(fd) {}
14
+ SoxDescriptor(const SoxDescriptor& other) = delete;
15
+ SoxDescriptor(SoxDescriptor&& other) = delete;
16
+ SoxDescriptor& operator=(const SoxDescriptor& other) = delete;
17
+ SoxDescriptor& operator=(SoxDescriptor&& other) = delete;
18
+ ~SoxDescriptor() {
19
+ if (fd_ != nullptr) {
20
+ sox_close(fd_);
21
+ }
22
+ }
23
+ sox_format_t* operator->() noexcept {
24
+ return fd_;
25
+ }
26
+ sox_format_t* get() noexcept {
27
+ return fd_;
28
+ }
29
+
30
+ private:
31
+ sox_format_t* fd_;
32
+ };
33
+
34
+ int64_t write_audio(SoxDescriptor& fd, at::Tensor tensor) {
35
+ std::vector<sox_sample_t> buffer(tensor.numel());
36
+
37
+ AT_DISPATCH_ALL_TYPES(tensor.scalar_type(), "write_audio_buffer", [&] {
38
+ auto* data = tensor.data_ptr<scalar_t>();
39
+ std::copy(data, data + tensor.numel(), buffer.begin());
40
+ });
41
+
42
+ const auto samples_written =
43
+ sox_write(fd.get(), buffer.data(), buffer.size());
44
+
45
+ return samples_written;
46
+ }
47
+
48
+ void read_audio(
49
+ SoxDescriptor& fd,
50
+ at::Tensor output,
51
+ int64_t buffer_length) {
52
+ std::vector<sox_sample_t> buffer(buffer_length);
53
+
54
+ int number_of_channels = fd->signal.channels;
55
+ const int64_t samples_read = sox_read(fd.get(), buffer.data(), buffer_length);
56
+ if (samples_read == 0) {
57
+ throw std::runtime_error(
58
+ "Error reading audio file: empty file or read failed in sox_read");
59
+ }
60
+
61
+ output.resize_({samples_read / number_of_channels, number_of_channels});
62
+ output = output.contiguous();
63
+
64
+ AT_DISPATCH_ALL_TYPES(output.scalar_type(), "read_audio_buffer", [&] {
65
+ auto* data = output.data_ptr<scalar_t>();
66
+ std::copy(buffer.begin(), buffer.begin() + samples_read, data);
67
+ });
68
+ }
69
+ } // namespace
70
+
71
+ std::tuple<sox_signalinfo_t, sox_encodinginfo_t> get_info(
72
+ const std::string& file_name
73
+ ) {
74
+ SoxDescriptor fd(sox_open_read(
75
+ file_name.c_str(),
76
+ /*signal=*/nullptr,
77
+ /*encoding=*/nullptr,
78
+ /*filetype=*/nullptr));
79
+ if (fd.get() == nullptr) {
80
+ throw std::runtime_error("Error opening audio file");
81
+ }
82
+ return std::make_tuple(fd->signal, fd->encoding);
83
+ }
84
+
85
+ int read_audio_file(
86
+ const std::string& file_name,
87
+ at::Tensor output,
88
+ bool ch_first,
89
+ int64_t nframes,
90
+ int64_t offset,
91
+ sox_signalinfo_t* si,
92
+ sox_encodinginfo_t* ei,
93
+ const char* ft) {
94
+
95
+ SoxDescriptor fd(sox_open_read(file_name.c_str(), si, ei, ft));
96
+ if (fd.get() == nullptr) {
97
+ throw std::runtime_error("Error opening audio file");
98
+ }
99
+
100
+ // signal info
101
+
102
+ const int number_of_channels = fd->signal.channels;
103
+ const int sample_rate = fd->signal.rate;
104
+ const int64_t total_length = fd->signal.length;
105
+
106
+ // multiply offset and number of frames by number of channels
107
+ offset *= number_of_channels;
108
+ nframes *= number_of_channels;
109
+
110
+ if (total_length == 0) {
111
+ throw std::runtime_error("Error reading audio file: unknown length");
112
+ }
113
+ if (offset > total_length) {
114
+ throw std::runtime_error("Offset past EOF");
115
+ }
116
+
117
+ // calculate buffer length
118
+ int64_t buffer_length = total_length;
119
+ if (offset > 0) {
120
+ buffer_length -= offset;
121
+ }
122
+ if (nframes > 0 && buffer_length > nframes) {
123
+ buffer_length = nframes;
124
+ }
125
+
126
+ // seek to offset point before reading data
127
+ if (sox_seek(fd.get(), offset, 0) == SOX_EOF) {
128
+ throw std::runtime_error("sox_seek reached EOF, try reducing offset or num_samples");
129
+ }
130
+
131
+ // read data and fill output tensor
132
+ read_audio(fd, output, buffer_length);
133
+
134
+ // L x C -> C x L, if desired
135
+ if (ch_first) {
136
+ output.transpose_(1, 0);
137
+ }
138
+
139
+ return sample_rate;
140
+ }
141
+
142
+ void write_audio_file(
143
+ const std::string& file_name,
144
+ const at::Tensor& tensor,
145
+ sox_signalinfo_t* si,
146
+ sox_encodinginfo_t* ei,
147
+ const char* file_type) {
148
+ if (!tensor.is_contiguous()) {
149
+ throw std::runtime_error(
150
+ "Error writing audio file: input tensor must be contiguous");
151
+ }
152
+
153
+ #if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
154
+ si->mult = nullptr;
155
+ #endif
156
+
157
+ SoxDescriptor fd(sox_open_write(
158
+ file_name.c_str(),
159
+ si,
160
+ ei,
161
+ file_type,
162
+ /*oob=*/nullptr,
163
+ /*overwrite=*/nullptr));
164
+
165
+ if (fd.get() == nullptr) {
166
+ throw std::runtime_error(
167
+ "Error writing audio file: could not open file for writing");
168
+ }
169
+
170
+ const auto samples_written = write_audio(fd, tensor);
171
+
172
+ if (samples_written != tensor.numel()) {
173
+ throw std::runtime_error(
174
+ "Error writing audio file: could not write entire buffer");
175
+ }
176
+ }
177
+
178
+ int build_flow_effects(const std::string& file_name,
179
+ at::Tensor otensor,
180
+ bool ch_first,
181
+ sox_signalinfo_t* target_signal,
182
+ sox_encodinginfo_t* target_encoding,
183
+ const char* file_type,
184
+ std::vector<SoxEffect> pyeffs,
185
+ int max_num_eopts) {
186
+
187
+ /* This function builds an effects flow and puts the results into a tensor.
188
+ It can also be used to re-encode audio using any of the available encoding
189
+ options in SoX including sample rate and channel re-encoding. */
190
+
191
+ // open input
192
+ sox_format_t* input = sox_open_read(file_name.c_str(), nullptr, nullptr, nullptr);
193
+ if (input == nullptr) {
194
+ throw std::runtime_error("Error opening audio file");
195
+ }
196
+
197
+ // only used if target signal or encoding are null
198
+ sox_signalinfo_t empty_signal;
199
+ sox_encodinginfo_t empty_encoding;
200
+
201
+ // set signalinfo and encodinginfo if blank
202
+ if(target_signal == nullptr) {
203
+ target_signal = &empty_signal;
204
+ target_signal->rate = input->signal.rate;
205
+ target_signal->channels = input->signal.channels;
206
+ target_signal->length = SOX_UNSPEC;
207
+ target_signal->precision = input->signal.precision;
208
+ #if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
209
+ target_signal->mult = nullptr;
210
+ #endif
211
+ }
212
+ if(target_encoding == nullptr) {
213
+ target_encoding = &empty_encoding;
214
+ target_encoding->encoding = SOX_ENCODING_SIGN2; // Sample format
215
+ target_encoding->bits_per_sample = input->signal.precision; // Bits per sample
216
+ target_encoding->compression = 0.0; // Compression factor
217
+ target_encoding->reverse_bytes = sox_option_default; // Should bytes be reversed
218
+ target_encoding->reverse_nibbles = sox_option_default; // Should nibbles be reversed
219
+ target_encoding->reverse_bits = sox_option_default; // Should bits be reversed (pairs of bits?)
220
+ target_encoding->opposite_endian = sox_false; // Reverse endianness
221
+ }
222
+
223
+ // check for rate or channels effect and change the output signalinfo accordingly
224
+ for (SoxEffect se : pyeffs) {
225
+ if (se.ename == "rate") {
226
+ target_signal->rate = std::stod(se.eopts[0]);
227
+ } else if (se.ename == "channels") {
228
+ target_signal->channels = std::stoi(se.eopts[0]);
229
+ }
230
+ }
231
+
232
+ // create interm_signal for effects, intermediate steps change this in-place
233
+ sox_signalinfo_t interm_signal = input->signal;
234
+
235
+ #ifdef __APPLE__
236
+ // According to Mozilla Deepspeech sox_open_memstream_write doesn't work
237
+ // with OSX
238
+ char tmp_name[] = "/tmp/fileXXXXXX";
239
+ int tmp_fd = mkstemp(tmp_name);
240
+ close(tmp_fd);
241
+ sox_format_t* output = sox_open_write(tmp_name, target_signal,
242
+ target_encoding, "wav", nullptr, nullptr);
243
+ #else
244
+ // create buffer and buffer_size for output in memwrite
245
+ char* buffer;
246
+ size_t buffer_size;
247
+ // in-memory descriptor (this may not work for OSX)
248
+ sox_format_t* output = sox_open_memstream_write(&buffer,
249
+ &buffer_size,
250
+ target_signal,
251
+ target_encoding,
252
+ file_type, nullptr);
253
+ #endif
254
+ if (output == nullptr) {
255
+ throw std::runtime_error("Error opening output memstream/temporary file");
256
+ }
257
+ // Setup the effects chain to decode/resample
258
+ sox_effects_chain_t* chain =
259
+ sox_create_effects_chain(&input->encoding, &output->encoding);
260
+
261
+ sox_effect_t* e = sox_create_effect(sox_find_effect("input"));
262
+ char* io_args[1];
263
+ io_args[0] = (char*)input;
264
+ sox_effect_options(e, 1, io_args);
265
+ sox_add_effect(chain, e, &interm_signal, &input->signal);
266
+ free(e);
267
+
268
+ for(SoxEffect tae : pyeffs) {
269
+ if(tae.ename == "no_effects") break;
270
+ e = sox_create_effect(sox_find_effect(tae.ename.c_str()));
271
+ e->global_info->global_info->verbosity = 1;
272
+ if(tae.eopts[0] == "") {
273
+ sox_effect_options(e, 0, nullptr);
274
+ } else {
275
+ int num_opts = tae.eopts.size();
276
+ char* sox_args[max_num_eopts];
277
+ for(std::vector<std::string>::size_type i = 0; i != tae.eopts.size(); i++) {
278
+ sox_args[i] = (char*) tae.eopts[i].c_str();
279
+ }
280
+ if(sox_effect_options(e, num_opts, sox_args) != SOX_SUCCESS) {
281
+ #ifdef __APPLE__
282
+ unlink(tmp_name);
283
+ #endif
284
+ throw std::runtime_error("invalid effect options, see SoX docs for details");
285
+ }
286
+ }
287
+ sox_add_effect(chain, e, &interm_signal, &output->signal);
288
+ free(e);
289
+ }
290
+
291
+ e = sox_create_effect(sox_find_effect("output"));
292
+ io_args[0] = (char*)output;
293
+ sox_effect_options(e, 1, io_args);
294
+ sox_add_effect(chain, e, &interm_signal, &output->signal);
295
+ free(e);
296
+
297
+ // Finally run the effects chain
298
+ sox_flow_effects(chain, nullptr, nullptr);
299
+ sox_delete_effects_chain(chain);
300
+
301
+ // Close sox handles, buffer does not get properly sized until these are closed
302
+ sox_close(output);
303
+ sox_close(input);
304
+
305
+ int sr;
306
+ // Read the in-memory audio buffer or temp file that we just wrote.
307
+ #ifdef __APPLE__
308
+ /*
309
+ Temporary filetype must have a valid header. Wav seems to work here while
310
+ raw does not. Certain effects like chorus caused strange behavior on the mac.
311
+ */
312
+ // read_audio_file reads the temporary file and returns the sr and otensor
313
+ sr = read_audio_file(tmp_name, otensor, ch_first, 0, 0,
314
+ target_signal, target_encoding, "wav");
315
+ // delete temporary audio file
316
+ unlink(tmp_name);
317
+ #else
318
+ // Resize output tensor to desired dimensions, different effects result in output->signal.length,
319
+ // interm_signal.length and buffer size being inconsistent with the result of the file output.
320
+ // We prioritize in the order: output->signal.length > interm_signal.length > buffer_size
321
+ // Could be related to: https://sourceforge.net/p/sox/bugs/314/
322
+ int nc, ns;
323
+ if (output->signal.length == 0) {
324
+ // sometimes interm_signal length is extremely large, but the buffer_size
325
+ // is double the length of the output signal
326
+ if (interm_signal.length > (buffer_size * 10)) {
327
+ ns = buffer_size / 2;
328
+ } else {
329
+ ns = interm_signal.length;
330
+ }
331
+ nc = interm_signal.channels;
332
+ } else {
333
+ nc = output->signal.channels;
334
+ ns = output->signal.length;
335
+ }
336
+ otensor.resize_({ns/nc, nc});
337
+ otensor = otensor.contiguous();
338
+
339
+ input = sox_open_mem_read(buffer, buffer_size, target_signal, target_encoding, file_type);
340
+ std::vector<sox_sample_t> samples(buffer_size);
341
+ const int64_t samples_read = sox_read(input, samples.data(), buffer_size);
342
+ assert(samples_read != nc * ns && samples_read != 0);
343
+ AT_DISPATCH_ALL_TYPES(otensor.scalar_type(), "effects_buffer", [&] {
344
+ auto* data = otensor.data_ptr<scalar_t>();
345
+ std::copy(samples.begin(), samples.begin() + samples_read, data);
346
+ });
347
+ // free buffer and close mem_read
348
+ sox_close(input);
349
+ free(buffer);
350
+
351
+ if (ch_first) {
352
+ otensor.transpose_(1, 0);
353
+ }
354
+ sr = target_signal->rate;
355
+
356
+ #endif
357
+ // return sample rate, output tensor modified in-place
358
+ return sr;
359
+ }
360
+ } // namespace audio
361
+ } // namespace torch