torchaudio 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: b527976494325cc12e81342c25d318204d2d7c75bfba7036be4296769cdb30a0
4
+ data.tar.gz: 2cfde7bd1b0e7a1628818d5bd74657cfbfba6dfa83ef42897f3ad0f98e77f739
5
+ SHA512:
6
+ metadata.gz: 8e6f34b014340b5ace3193ab589dae75ed0869ab7606402bd4b09de6042299e6f3a118d439dd381491f489ce9552bca4376a7d5b4693dddc3d1c5f5b26540900
7
+ data.tar.gz: d651c46f5185ceb70ae3d9c90154c77afe29a5c35854d1a9d98913096b7ab9ba39a745242dd268548ca87f9e109b56c96dee9dc5539cf066f9ad0f773eddbdcd
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (2020-08-24)
2
+
3
+ - First release
@@ -0,0 +1,26 @@
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
4
+ Copyright (c) 2020 Andrew Kane,
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
21
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,93 @@
1
+ # TorchAudio
2
+
3
+ :fire: An audio library for Torch.rb
4
+
5
+ ## Installation
6
+
7
+ First, [install SoX](#sox-installation). For Homebrew, use:
8
+
9
+ ```sh
10
+ brew install sox
11
+ ```
12
+
13
+ Add this line to your application’s Gemfile:
14
+
15
+ ```ruby
16
+ gem 'torchaudio'
17
+ ```
18
+
19
+ ## Getting Started
20
+
21
+ This library follows the [Python API](https://pytorch.org/audio/). Many methods and options are missing at the moment. PRs welcome!
22
+
23
+ ## Datasets
24
+
25
+ Load a dataset
26
+
27
+ ```ruby
28
+ TorchAudio::Datasets::YESNO.new(".", download: true)
29
+ ```
30
+
31
+ Supported datasets are:
32
+
33
+ - [YESNO](http://www.openslr.org/1/)
34
+
35
+ ## Disclaimer
36
+
37
+ This library downloads and prepares public datasets. We don’t host any datasets. Be sure to adhere to the license for each dataset.
38
+
39
+ If you’re a dataset owner and wish to update any details or remove it from this project, let us know.
40
+
41
+ ## SoX Installation
42
+
43
+ ### Mac
44
+
45
+ ```sh
46
+ brew install sox
47
+ ```
48
+
49
+ ### Windows
50
+
51
+ Windows installation instructions have not been written yet (TODO).
52
+
53
+ ### Ubuntu
54
+
55
+ ```sh
56
+ sudo apt install sox libsox-dev libsox-fmt-all
57
+ ```
58
+
59
+ ### Travis CI
60
+
61
+ Add to `.travis.yml`:
62
+
63
+ ```yml
64
+ addons:
65
+ apt:
66
+ packages:
67
+ - sox
68
+ - libsox-dev
69
+ - libsox-fmt-all
70
+ ```
71
+
72
+ ## History
73
+
74
+ View the [changelog](https://github.com/ankane/torchaudio/blob/master/CHANGELOG.md)
75
+
76
+ ## Contributing
77
+
78
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
79
+
80
+ - [Report bugs](https://github.com/ankane/torchaudio/issues)
81
+ - Fix bugs and [submit pull requests](https://github.com/ankane/torchaudio/pulls)
82
+ - Write, clarify, or fix documentation
83
+ - Suggest or add new features
84
+
85
+ To get started with development:
86
+
87
+ ```sh
88
+ git clone https://github.com/ankane/torchaudio.git
89
+ cd torchaudio
90
+ bundle install
91
+ bundle exec rake compile
92
+ bundle exec rake test
93
+ ```
@@ -0,0 +1,65 @@
1
+ #ifndef TORCHAUDIO_REGISTER_H
2
+ #define TORCHAUDIO_REGISTER_H
3
+
4
+ #include <torchaudio/csrc/sox_effects.h>
5
+ #include <torchaudio/csrc/sox_io.h>
6
+ #include <torchaudio/csrc/sox_utils.h>
7
+
8
+ namespace torchaudio {
9
+ namespace {
10
+
11
+ ////////////////////////////////////////////////////////////////////////////////
12
+ // sox_utils.h
13
+ ////////////////////////////////////////////////////////////////////////////////
14
// Registers sox_utils::TensorSignal as the custom class "TensorSignal" under
// the "torchaudio" name. The binding exposes a constructor taking
// (torch::Tensor, int64_t, bool) and three accessors, each forwarded to the
// C++ getter of the matching name.
static auto registerTensorSignal =
    torch::class_<sox_utils::TensorSignal>("torchaudio", "TensorSignal")
        .def(torch::init<torch::Tensor, int64_t, bool>())
        .def("get_tensor", &sox_utils::TensorSignal::getTensor)
        .def("get_sample_rate", &sox_utils::TensorSignal::getSampleRate)
        .def("get_channels_first", &sox_utils::TensorSignal::getChannelsFirst);
20
+
21
+ ////////////////////////////////////////////////////////////////////////////////
22
+ // sox_io.h
23
+ ////////////////////////////////////////////////////////////////////////////////
24
// Registers sox_io::SignalInfo as the custom class "SignalInfo" under the
// "torchaudio" name. Only three read accessors are bound; no constructor is
// registered here.
static auto registerSignalInfo =
    torch::class_<sox_io::SignalInfo>("torchaudio", "SignalInfo")
        .def("get_sample_rate", &sox_io::SignalInfo::getSampleRate)
        .def("get_num_channels", &sox_io::SignalInfo::getNumChannels)
        .def("get_num_frames", &sox_io::SignalInfo::getNumFrames);
29
+
30
// Registers the operator `torchaudio::sox_io_get_info`, implemented by
// sox_io::get_info and installed as a catch-all kernel. The schema takes a
// path string and returns the registered `SignalInfo` custom class.
// NOTE(review): the schema names the SignalInfo class by its qualified name,
// so that class presumably has to be registered before this operator - confirm.
static auto registerGetInfo = torch::RegisterOperators().op(
    torch::RegisterOperators::options()
        .schema(
            "torchaudio::sox_io_get_info(str path) -> __torch__.torch.classes.torchaudio.SignalInfo info")
        .catchAllKernel<decltype(sox_io::get_info), &sox_io::get_info>());
35
+
36
// Registers the operator `torchaudio::sox_io_load_audio_file`, implemented by
// sox_io::load_audio_file and installed as a catch-all kernel. The schema
// takes (path, frame_offset, num_frames, normalize, channels_first) and
// returns the registered `TensorSignal` custom class.
static auto registerLoadAudioFile = torch::RegisterOperators().op(
    torch::RegisterOperators::options()
        .schema(
            "torchaudio::sox_io_load_audio_file(str path, int frame_offset, int num_frames, bool normalize, bool channels_first) -> __torch__.torch.classes.torchaudio.TensorSignal signal")
        .catchAllKernel<
            decltype(sox_io::load_audio_file),
            &sox_io::load_audio_file>());
43
+
44
// Registers the operator `torchaudio::sox_io_save_audio_file`, implemented by
// sox_io::save_audio_file and installed as a catch-all kernel. The schema
// takes (path, TensorSignal signal, float compression) and returns nothing.
static auto registerSaveAudioFile = torch::RegisterOperators().op(
    torch::RegisterOperators::options()
        .schema(
            "torchaudio::sox_io_save_audio_file(str path, __torch__.torch.classes.torchaudio.TensorSignal signal, float compression) -> ()")
        .catchAllKernel<
            decltype(sox_io::save_audio_file),
            &sox_io::save_audio_file>());
51
+
52
+ ////////////////////////////////////////////////////////////////////////////////
53
+ // sox_effects.h
54
+ ////////////////////////////////////////////////////////////////////////////////
55
// Registers three operators in a single chained RegisterOperators object,
// each bound directly to the sox_effects function of the matching name:
// initialize_sox_effects, shutdown_sox_effects and list_effects. No explicit
// schema string is given for any of them.
static auto registerSoxEffects =
    torch::RegisterOperators(
        "torchaudio::sox_effects_initialize_sox_effects",
        &sox_effects::initialize_sox_effects)
        .op("torchaudio::sox_effects_shutdown_sox_effects",
            &sox_effects::shutdown_sox_effects)
        .op("torchaudio::sox_effects_list_effects", &sox_effects::list_effects);
62
+
63
+ } // namespace
64
+ } // namespace torchaudio
65
+ #endif
@@ -0,0 +1,361 @@
1
+ #include <torchaudio/csrc/sox.h>
2
+
3
+ #include <algorithm>
4
+ #include <cstdint>
5
+ #include <stdexcept>
6
+ #include <vector>
7
+
8
+ namespace torch {
9
+ namespace audio {
10
+ namespace {
11
+ /// Helper struct to safely close the sox_format_t descriptor.
12
+ struct SoxDescriptor {
13
+ explicit SoxDescriptor(sox_format_t* fd) noexcept : fd_(fd) {}
14
+ SoxDescriptor(const SoxDescriptor& other) = delete;
15
+ SoxDescriptor(SoxDescriptor&& other) = delete;
16
+ SoxDescriptor& operator=(const SoxDescriptor& other) = delete;
17
+ SoxDescriptor& operator=(SoxDescriptor&& other) = delete;
18
+ ~SoxDescriptor() {
19
+ if (fd_ != nullptr) {
20
+ sox_close(fd_);
21
+ }
22
+ }
23
+ sox_format_t* operator->() noexcept {
24
+ return fd_;
25
+ }
26
+ sox_format_t* get() noexcept {
27
+ return fd_;
28
+ }
29
+
30
+ private:
31
+ sox_format_t* fd_;
32
+ };
33
+
34
+ int64_t write_audio(SoxDescriptor& fd, at::Tensor tensor) {
35
+ std::vector<sox_sample_t> buffer(tensor.numel());
36
+
37
+ AT_DISPATCH_ALL_TYPES(tensor.scalar_type(), "write_audio_buffer", [&] {
38
+ auto* data = tensor.data_ptr<scalar_t>();
39
+ std::copy(data, data + tensor.numel(), buffer.begin());
40
+ });
41
+
42
+ const auto samples_written =
43
+ sox_write(fd.get(), buffer.data(), buffer.size());
44
+
45
+ return samples_written;
46
+ }
47
+
48
+ void read_audio(
49
+ SoxDescriptor& fd,
50
+ at::Tensor output,
51
+ int64_t buffer_length) {
52
+ std::vector<sox_sample_t> buffer(buffer_length);
53
+
54
+ int number_of_channels = fd->signal.channels;
55
+ const int64_t samples_read = sox_read(fd.get(), buffer.data(), buffer_length);
56
+ if (samples_read == 0) {
57
+ throw std::runtime_error(
58
+ "Error reading audio file: empty file or read failed in sox_read");
59
+ }
60
+
61
+ output.resize_({samples_read / number_of_channels, number_of_channels});
62
+ output = output.contiguous();
63
+
64
+ AT_DISPATCH_ALL_TYPES(output.scalar_type(), "read_audio_buffer", [&] {
65
+ auto* data = output.data_ptr<scalar_t>();
66
+ std::copy(buffer.begin(), buffer.begin() + samples_read, data);
67
+ });
68
+ }
69
+ } // namespace
70
+
71
+ std::tuple<sox_signalinfo_t, sox_encodinginfo_t> get_info(
72
+ const std::string& file_name
73
+ ) {
74
+ SoxDescriptor fd(sox_open_read(
75
+ file_name.c_str(),
76
+ /*signal=*/nullptr,
77
+ /*encoding=*/nullptr,
78
+ /*filetype=*/nullptr));
79
+ if (fd.get() == nullptr) {
80
+ throw std::runtime_error("Error opening audio file");
81
+ }
82
+ return std::make_tuple(fd->signal, fd->encoding);
83
+ }
84
+
85
+ int read_audio_file(
86
+ const std::string& file_name,
87
+ at::Tensor output,
88
+ bool ch_first,
89
+ int64_t nframes,
90
+ int64_t offset,
91
+ sox_signalinfo_t* si,
92
+ sox_encodinginfo_t* ei,
93
+ const char* ft) {
94
+
95
+ SoxDescriptor fd(sox_open_read(file_name.c_str(), si, ei, ft));
96
+ if (fd.get() == nullptr) {
97
+ throw std::runtime_error("Error opening audio file");
98
+ }
99
+
100
+ // signal info
101
+
102
+ const int number_of_channels = fd->signal.channels;
103
+ const int sample_rate = fd->signal.rate;
104
+ const int64_t total_length = fd->signal.length;
105
+
106
+ // multiply offset and number of frames by number of channels
107
+ offset *= number_of_channels;
108
+ nframes *= number_of_channels;
109
+
110
+ if (total_length == 0) {
111
+ throw std::runtime_error("Error reading audio file: unknown length");
112
+ }
113
+ if (offset > total_length) {
114
+ throw std::runtime_error("Offset past EOF");
115
+ }
116
+
117
+ // calculate buffer length
118
+ int64_t buffer_length = total_length;
119
+ if (offset > 0) {
120
+ buffer_length -= offset;
121
+ }
122
+ if (nframes > 0 && buffer_length > nframes) {
123
+ buffer_length = nframes;
124
+ }
125
+
126
+ // seek to offset point before reading data
127
+ if (sox_seek(fd.get(), offset, 0) == SOX_EOF) {
128
+ throw std::runtime_error("sox_seek reached EOF, try reducing offset or num_samples");
129
+ }
130
+
131
+ // read data and fill output tensor
132
+ read_audio(fd, output, buffer_length);
133
+
134
+ // L x C -> C x L, if desired
135
+ if (ch_first) {
136
+ output.transpose_(1, 0);
137
+ }
138
+
139
+ return sample_rate;
140
+ }
141
+
142
+ void write_audio_file(
143
+ const std::string& file_name,
144
+ const at::Tensor& tensor,
145
+ sox_signalinfo_t* si,
146
+ sox_encodinginfo_t* ei,
147
+ const char* file_type) {
148
+ if (!tensor.is_contiguous()) {
149
+ throw std::runtime_error(
150
+ "Error writing audio file: input tensor must be contiguous");
151
+ }
152
+
153
+ #if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
154
+ si->mult = nullptr;
155
+ #endif
156
+
157
+ SoxDescriptor fd(sox_open_write(
158
+ file_name.c_str(),
159
+ si,
160
+ ei,
161
+ file_type,
162
+ /*oob=*/nullptr,
163
+ /*overwrite=*/nullptr));
164
+
165
+ if (fd.get() == nullptr) {
166
+ throw std::runtime_error(
167
+ "Error writing audio file: could not open file for writing");
168
+ }
169
+
170
+ const auto samples_written = write_audio(fd, tensor);
171
+
172
+ if (samples_written != tensor.numel()) {
173
+ throw std::runtime_error(
174
+ "Error writing audio file: could not write entire buffer");
175
+ }
176
+ }
177
+
178
+ int build_flow_effects(const std::string& file_name,
179
+ at::Tensor otensor,
180
+ bool ch_first,
181
+ sox_signalinfo_t* target_signal,
182
+ sox_encodinginfo_t* target_encoding,
183
+ const char* file_type,
184
+ std::vector<SoxEffect> pyeffs,
185
+ int max_num_eopts) {
186
+
187
+ /* This function builds an effects flow and puts the results into a tensor.
188
+ It can also be used to re-encode audio using any of the available encoding
189
+ options in SoX including sample rate and channel re-encoding. */
190
+
191
+ // open input
192
+ sox_format_t* input = sox_open_read(file_name.c_str(), nullptr, nullptr, nullptr);
193
+ if (input == nullptr) {
194
+ throw std::runtime_error("Error opening audio file");
195
+ }
196
+
197
+ // only used if target signal or encoding are null
198
+ sox_signalinfo_t empty_signal;
199
+ sox_encodinginfo_t empty_encoding;
200
+
201
+ // set signalinfo and encodinginfo if blank
202
+ if(target_signal == nullptr) {
203
+ target_signal = &empty_signal;
204
+ target_signal->rate = input->signal.rate;
205
+ target_signal->channels = input->signal.channels;
206
+ target_signal->length = SOX_UNSPEC;
207
+ target_signal->precision = input->signal.precision;
208
+ #if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
209
+ target_signal->mult = nullptr;
210
+ #endif
211
+ }
212
+ if(target_encoding == nullptr) {
213
+ target_encoding = &empty_encoding;
214
+ target_encoding->encoding = SOX_ENCODING_SIGN2; // Sample format
215
+ target_encoding->bits_per_sample = input->signal.precision; // Bits per sample
216
+ target_encoding->compression = 0.0; // Compression factor
217
+ target_encoding->reverse_bytes = sox_option_default; // Should bytes be reversed
218
+ target_encoding->reverse_nibbles = sox_option_default; // Should nibbles be reversed
219
+ target_encoding->reverse_bits = sox_option_default; // Should bits be reversed (pairs of bits?)
220
+ target_encoding->opposite_endian = sox_false; // Reverse endianness
221
+ }
222
+
223
+ // check for rate or channels effect and change the output signalinfo accordingly
224
+ for (SoxEffect se : pyeffs) {
225
+ if (se.ename == "rate") {
226
+ target_signal->rate = std::stod(se.eopts[0]);
227
+ } else if (se.ename == "channels") {
228
+ target_signal->channels = std::stoi(se.eopts[0]);
229
+ }
230
+ }
231
+
232
+ // create interm_signal for effects, intermediate steps change this in-place
233
+ sox_signalinfo_t interm_signal = input->signal;
234
+
235
+ #ifdef __APPLE__
236
+ // According to Mozilla Deepspeech sox_open_memstream_write doesn't work
237
+ // with OSX
238
+ char tmp_name[] = "/tmp/fileXXXXXX";
239
+ int tmp_fd = mkstemp(tmp_name);
240
+ close(tmp_fd);
241
+ sox_format_t* output = sox_open_write(tmp_name, target_signal,
242
+ target_encoding, "wav", nullptr, nullptr);
243
+ #else
244
+ // create buffer and buffer_size for output in memwrite
245
+ char* buffer;
246
+ size_t buffer_size;
247
+ // in-memory descriptor (this may not work for OSX)
248
+ sox_format_t* output = sox_open_memstream_write(&buffer,
249
+ &buffer_size,
250
+ target_signal,
251
+ target_encoding,
252
+ file_type, nullptr);
253
+ #endif
254
+ if (output == nullptr) {
255
+ throw std::runtime_error("Error opening output memstream/temporary file");
256
+ }
257
+ // Setup the effects chain to decode/resample
258
+ sox_effects_chain_t* chain =
259
+ sox_create_effects_chain(&input->encoding, &output->encoding);
260
+
261
+ sox_effect_t* e = sox_create_effect(sox_find_effect("input"));
262
+ char* io_args[1];
263
+ io_args[0] = (char*)input;
264
+ sox_effect_options(e, 1, io_args);
265
+ sox_add_effect(chain, e, &interm_signal, &input->signal);
266
+ free(e);
267
+
268
+ for(SoxEffect tae : pyeffs) {
269
+ if(tae.ename == "no_effects") break;
270
+ e = sox_create_effect(sox_find_effect(tae.ename.c_str()));
271
+ e->global_info->global_info->verbosity = 1;
272
+ if(tae.eopts[0] == "") {
273
+ sox_effect_options(e, 0, nullptr);
274
+ } else {
275
+ int num_opts = tae.eopts.size();
276
+ char* sox_args[max_num_eopts];
277
+ for(std::vector<std::string>::size_type i = 0; i != tae.eopts.size(); i++) {
278
+ sox_args[i] = (char*) tae.eopts[i].c_str();
279
+ }
280
+ if(sox_effect_options(e, num_opts, sox_args) != SOX_SUCCESS) {
281
+ #ifdef __APPLE__
282
+ unlink(tmp_name);
283
+ #endif
284
+ throw std::runtime_error("invalid effect options, see SoX docs for details");
285
+ }
286
+ }
287
+ sox_add_effect(chain, e, &interm_signal, &output->signal);
288
+ free(e);
289
+ }
290
+
291
+ e = sox_create_effect(sox_find_effect("output"));
292
+ io_args[0] = (char*)output;
293
+ sox_effect_options(e, 1, io_args);
294
+ sox_add_effect(chain, e, &interm_signal, &output->signal);
295
+ free(e);
296
+
297
+ // Finally run the effects chain
298
+ sox_flow_effects(chain, nullptr, nullptr);
299
+ sox_delete_effects_chain(chain);
300
+
301
+ // Close sox handles, buffer does not get properly sized until these are closed
302
+ sox_close(output);
303
+ sox_close(input);
304
+
305
+ int sr;
306
+ // Read the in-memory audio buffer or temp file that we just wrote.
307
+ #ifdef __APPLE__
308
+ /*
309
+ Temporary filetype must have a valid header. Wav seems to work here while
310
+ raw does not. Certain effects like chorus caused strange behavior on the mac.
311
+ */
312
+ // read_audio_file reads the temporary file and returns the sr and otensor
313
+ sr = read_audio_file(tmp_name, otensor, ch_first, 0, 0,
314
+ target_signal, target_encoding, "wav");
315
+ // delete temporary audio file
316
+ unlink(tmp_name);
317
+ #else
318
+ // Resize output tensor to desired dimensions, different effects result in output->signal.length,
319
+ // interm_signal.length and buffer size being inconsistent with the result of the file output.
320
+ // We prioritize in the order: output->signal.length > interm_signal.length > buffer_size
321
+ // Could be related to: https://sourceforge.net/p/sox/bugs/314/
322
+ int nc, ns;
323
+ if (output->signal.length == 0) {
324
+ // sometimes interm_signal length is extremely large, but the buffer_size
325
+ // is double the length of the output signal
326
+ if (interm_signal.length > (buffer_size * 10)) {
327
+ ns = buffer_size / 2;
328
+ } else {
329
+ ns = interm_signal.length;
330
+ }
331
+ nc = interm_signal.channels;
332
+ } else {
333
+ nc = output->signal.channels;
334
+ ns = output->signal.length;
335
+ }
336
+ otensor.resize_({ns/nc, nc});
337
+ otensor = otensor.contiguous();
338
+
339
+ input = sox_open_mem_read(buffer, buffer_size, target_signal, target_encoding, file_type);
340
+ std::vector<sox_sample_t> samples(buffer_size);
341
+ const int64_t samples_read = sox_read(input, samples.data(), buffer_size);
342
+ assert(samples_read != nc * ns && samples_read != 0);
343
+ AT_DISPATCH_ALL_TYPES(otensor.scalar_type(), "effects_buffer", [&] {
344
+ auto* data = otensor.data_ptr<scalar_t>();
345
+ std::copy(samples.begin(), samples.begin() + samples_read, data);
346
+ });
347
+ // free buffer and close mem_read
348
+ sox_close(input);
349
+ free(buffer);
350
+
351
+ if (ch_first) {
352
+ otensor.transpose_(1, 0);
353
+ }
354
+ sr = target_signal->rate;
355
+
356
+ #endif
357
+ // return sample rate, output tensor modified in-place
358
+ return sr;
359
+ }
360
+ } // namespace audio
361
+ } // namespace torch