torchaudio 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/LICENSE.txt +1 -1
- data/README.md +2 -46
- data/lib/torchaudio/version.rb +1 -1
- data/lib/torchaudio.rb +113 -106
- metadata +4 -30
- data/ext/torchaudio/csrc/register.cpp +0 -65
- data/ext/torchaudio/csrc/sox.cpp +0 -361
- data/ext/torchaudio/csrc/sox.h +0 -71
- data/ext/torchaudio/csrc/sox_effects.cpp +0 -54
- data/ext/torchaudio/csrc/sox_effects.h +0 -18
- data/ext/torchaudio/csrc/sox_io.cpp +0 -170
- data/ext/torchaudio/csrc/sox_io.h +0 -41
- data/ext/torchaudio/csrc/sox_utils.cpp +0 -245
- data/ext/torchaudio/csrc/sox_utils.h +0 -100
- data/ext/torchaudio/ext.cpp +0 -34
- data/ext/torchaudio/extconf.rb +0 -85
data/ext/torchaudio/csrc/sox.cpp
DELETED
|
@@ -1,361 +0,0 @@
|
|
|
1
|
-
#include <torchaudio/csrc/sox.h>
|
|
2
|
-
|
|
3
|
-
#include <algorithm>
|
|
4
|
-
#include <cstdint>
|
|
5
|
-
#include <stdexcept>
|
|
6
|
-
#include <vector>
|
|
7
|
-
|
|
8
|
-
namespace torch {
|
|
9
|
-
namespace audio {
|
|
10
|
-
namespace {
|
|
11
|
-
/// Helper struct to safely close the sox_format_t descriptor.
|
|
12
|
-
struct SoxDescriptor {
|
|
13
|
-
explicit SoxDescriptor(sox_format_t* fd) noexcept : fd_(fd) {}
|
|
14
|
-
SoxDescriptor(const SoxDescriptor& other) = delete;
|
|
15
|
-
SoxDescriptor(SoxDescriptor&& other) = delete;
|
|
16
|
-
SoxDescriptor& operator=(const SoxDescriptor& other) = delete;
|
|
17
|
-
SoxDescriptor& operator=(SoxDescriptor&& other) = delete;
|
|
18
|
-
~SoxDescriptor() {
|
|
19
|
-
if (fd_ != nullptr) {
|
|
20
|
-
sox_close(fd_);
|
|
21
|
-
}
|
|
22
|
-
}
|
|
23
|
-
sox_format_t* operator->() noexcept {
|
|
24
|
-
return fd_;
|
|
25
|
-
}
|
|
26
|
-
sox_format_t* get() noexcept {
|
|
27
|
-
return fd_;
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
private:
|
|
31
|
-
sox_format_t* fd_;
|
|
32
|
-
};
|
|
33
|
-
|
|
34
|
-
/// Converts every element of `tensor` to `sox_sample_t` and hands the
/// resulting samples to `sox_write` in a single call.
/// Returns the sample count reported by `sox_write`.
int64_t write_audio(SoxDescriptor& fd, at::Tensor tensor) {
  const auto total = tensor.numel();
  std::vector<sox_sample_t> samples(total);

  // Element-wise conversion from the tensor's scalar type to sox_sample_t.
  AT_DISPATCH_ALL_TYPES(tensor.scalar_type(), "write_audio_buffer", [&] {
    auto* first = tensor.data_ptr<scalar_t>();
    auto* last = first + total;
    std::copy(first, last, samples.begin());
  });

  return sox_write(fd.get(), samples.data(), samples.size());
}
|
|
47
|
-
|
|
48
|
-
/// Reads up to `buffer_length` samples from `fd` and stores them in `output`
/// as a (frames x channels) tensor.
///
/// Throws `std::runtime_error` when the descriptor reports zero channels or
/// when `sox_read` yields no samples (empty file or read failure).
void read_audio(
    SoxDescriptor& fd,
    at::Tensor output,
    int64_t buffer_length) {
  const int64_t number_of_channels = fd->signal.channels;
  if (number_of_channels == 0) {
    // The frame count below divides by the channel count.
    throw std::runtime_error(
        "Error reading audio file: unknown number of channels");
  }
  if (buffer_length <= 0) {
    // Nothing can be read; report it the same way as a zero-sample read
    // instead of constructing a vector with a non-positive size.
    throw std::runtime_error(
        "Error reading audio file: empty file or read failed in sox_read");
  }

  std::vector<sox_sample_t> buffer(buffer_length);

  const int64_t samples_read = sox_read(fd.get(), buffer.data(), buffer_length);
  if (samples_read == 0) {
    throw std::runtime_error(
        "Error reading audio file: empty file or read failed in sox_read");
  }

  // Only whole frames fit the (frames x channels) layout. A trailing partial
  // frame is dropped so the copy never writes past the resized tensor.
  const int64_t frames = samples_read / number_of_channels;
  const int64_t samples_to_copy = frames * number_of_channels;

  output.resize_({frames, number_of_channels});
  // NOTE(review): `output` is a by-value handle; if `contiguous()` ever had
  // to allocate, the copy below would fill a new tensor rather than the
  // caller's one — confirm callers always pass a contiguous tensor.
  output = output.contiguous();

  AT_DISPATCH_ALL_TYPES(output.scalar_type(), "read_audio_buffer", [&] {
    auto* data = output.data_ptr<scalar_t>();
    std::copy(buffer.begin(), buffer.begin() + samples_to_copy, data);
  });
}
|
|
69
|
-
} // namespace
|
|
70
|
-
|
|
71
|
-
std::tuple<sox_signalinfo_t, sox_encodinginfo_t> get_info(
|
|
72
|
-
const std::string& file_name
|
|
73
|
-
) {
|
|
74
|
-
SoxDescriptor fd(sox_open_read(
|
|
75
|
-
file_name.c_str(),
|
|
76
|
-
/*signal=*/nullptr,
|
|
77
|
-
/*encoding=*/nullptr,
|
|
78
|
-
/*filetype=*/nullptr));
|
|
79
|
-
if (fd.get() == nullptr) {
|
|
80
|
-
throw std::runtime_error("Error opening audio file");
|
|
81
|
-
}
|
|
82
|
-
return std::make_tuple(fd->signal, fd->encoding);
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
int read_audio_file(
|
|
86
|
-
const std::string& file_name,
|
|
87
|
-
at::Tensor output,
|
|
88
|
-
bool ch_first,
|
|
89
|
-
int64_t nframes,
|
|
90
|
-
int64_t offset,
|
|
91
|
-
sox_signalinfo_t* si,
|
|
92
|
-
sox_encodinginfo_t* ei,
|
|
93
|
-
const char* ft) {
|
|
94
|
-
|
|
95
|
-
SoxDescriptor fd(sox_open_read(file_name.c_str(), si, ei, ft));
|
|
96
|
-
if (fd.get() == nullptr) {
|
|
97
|
-
throw std::runtime_error("Error opening audio file");
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
// signal info
|
|
101
|
-
|
|
102
|
-
const int number_of_channels = fd->signal.channels;
|
|
103
|
-
const int sample_rate = fd->signal.rate;
|
|
104
|
-
const int64_t total_length = fd->signal.length;
|
|
105
|
-
|
|
106
|
-
// multiply offset and number of frames by number of channels
|
|
107
|
-
offset *= number_of_channels;
|
|
108
|
-
nframes *= number_of_channels;
|
|
109
|
-
|
|
110
|
-
if (total_length == 0) {
|
|
111
|
-
throw std::runtime_error("Error reading audio file: unknown length");
|
|
112
|
-
}
|
|
113
|
-
if (offset > total_length) {
|
|
114
|
-
throw std::runtime_error("Offset past EOF");
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
// calculate buffer length
|
|
118
|
-
int64_t buffer_length = total_length;
|
|
119
|
-
if (offset > 0) {
|
|
120
|
-
buffer_length -= offset;
|
|
121
|
-
}
|
|
122
|
-
if (nframes > 0 && buffer_length > nframes) {
|
|
123
|
-
buffer_length = nframes;
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
// seek to offset point before reading data
|
|
127
|
-
if (sox_seek(fd.get(), offset, 0) == SOX_EOF) {
|
|
128
|
-
throw std::runtime_error("sox_seek reached EOF, try reducing offset or num_samples");
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
// read data and fill output tensor
|
|
132
|
-
read_audio(fd, output, buffer_length);
|
|
133
|
-
|
|
134
|
-
// L x C -> C x L, if desired
|
|
135
|
-
if (ch_first) {
|
|
136
|
-
output.transpose_(1, 0);
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
return sample_rate;
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
void write_audio_file(
|
|
143
|
-
const std::string& file_name,
|
|
144
|
-
const at::Tensor& tensor,
|
|
145
|
-
sox_signalinfo_t* si,
|
|
146
|
-
sox_encodinginfo_t* ei,
|
|
147
|
-
const char* file_type) {
|
|
148
|
-
if (!tensor.is_contiguous()) {
|
|
149
|
-
throw std::runtime_error(
|
|
150
|
-
"Error writing audio file: input tensor must be contiguous");
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
#if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
|
|
154
|
-
si->mult = nullptr;
|
|
155
|
-
#endif
|
|
156
|
-
|
|
157
|
-
SoxDescriptor fd(sox_open_write(
|
|
158
|
-
file_name.c_str(),
|
|
159
|
-
si,
|
|
160
|
-
ei,
|
|
161
|
-
file_type,
|
|
162
|
-
/*oob=*/nullptr,
|
|
163
|
-
/*overwrite=*/nullptr));
|
|
164
|
-
|
|
165
|
-
if (fd.get() == nullptr) {
|
|
166
|
-
throw std::runtime_error(
|
|
167
|
-
"Error writing audio file: could not open file for writing");
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
const auto samples_written = write_audio(fd, tensor);
|
|
171
|
-
|
|
172
|
-
if (samples_written != tensor.numel()) {
|
|
173
|
-
throw std::runtime_error(
|
|
174
|
-
"Error writing audio file: could not write entire buffer");
|
|
175
|
-
}
|
|
176
|
-
}
|
|
177
|
-
|
|
178
|
-
int build_flow_effects(const std::string& file_name,
|
|
179
|
-
at::Tensor otensor,
|
|
180
|
-
bool ch_first,
|
|
181
|
-
sox_signalinfo_t* target_signal,
|
|
182
|
-
sox_encodinginfo_t* target_encoding,
|
|
183
|
-
const char* file_type,
|
|
184
|
-
std::vector<SoxEffect> pyeffs,
|
|
185
|
-
int max_num_eopts) {
|
|
186
|
-
|
|
187
|
-
/* This function builds an effects flow and puts the results into a tensor.
|
|
188
|
-
It can also be used to re-encode audio using any of the available encoding
|
|
189
|
-
options in SoX including sample rate and channel re-encoding. */
|
|
190
|
-
|
|
191
|
-
// open input
|
|
192
|
-
sox_format_t* input = sox_open_read(file_name.c_str(), nullptr, nullptr, nullptr);
|
|
193
|
-
if (input == nullptr) {
|
|
194
|
-
throw std::runtime_error("Error opening audio file");
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
// only used if target signal or encoding are null
|
|
198
|
-
sox_signalinfo_t empty_signal;
|
|
199
|
-
sox_encodinginfo_t empty_encoding;
|
|
200
|
-
|
|
201
|
-
// set signalinfo and encodinginfo if blank
|
|
202
|
-
if(target_signal == nullptr) {
|
|
203
|
-
target_signal = &empty_signal;
|
|
204
|
-
target_signal->rate = input->signal.rate;
|
|
205
|
-
target_signal->channels = input->signal.channels;
|
|
206
|
-
target_signal->length = SOX_UNSPEC;
|
|
207
|
-
target_signal->precision = input->signal.precision;
|
|
208
|
-
#if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
|
|
209
|
-
target_signal->mult = nullptr;
|
|
210
|
-
#endif
|
|
211
|
-
}
|
|
212
|
-
if(target_encoding == nullptr) {
|
|
213
|
-
target_encoding = &empty_encoding;
|
|
214
|
-
target_encoding->encoding = SOX_ENCODING_SIGN2; // Sample format
|
|
215
|
-
target_encoding->bits_per_sample = input->signal.precision; // Bits per sample
|
|
216
|
-
target_encoding->compression = 0.0; // Compression factor
|
|
217
|
-
target_encoding->reverse_bytes = sox_option_default; // Should bytes be reversed
|
|
218
|
-
target_encoding->reverse_nibbles = sox_option_default; // Should nibbles be reversed
|
|
219
|
-
target_encoding->reverse_bits = sox_option_default; // Should bits be reversed (pairs of bits?)
|
|
220
|
-
target_encoding->opposite_endian = sox_false; // Reverse endianness
|
|
221
|
-
}
|
|
222
|
-
|
|
223
|
-
// check for rate or channels effect and change the output signalinfo accordingly
|
|
224
|
-
for (SoxEffect se : pyeffs) {
|
|
225
|
-
if (se.ename == "rate") {
|
|
226
|
-
target_signal->rate = std::stod(se.eopts[0]);
|
|
227
|
-
} else if (se.ename == "channels") {
|
|
228
|
-
target_signal->channels = std::stoi(se.eopts[0]);
|
|
229
|
-
}
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
// create interm_signal for effects, intermediate steps change this in-place
|
|
233
|
-
sox_signalinfo_t interm_signal = input->signal;
|
|
234
|
-
|
|
235
|
-
#ifdef __APPLE__
|
|
236
|
-
// According to Mozilla Deepspeech sox_open_memstream_write doesn't work
|
|
237
|
-
// with OSX
|
|
238
|
-
char tmp_name[] = "/tmp/fileXXXXXX";
|
|
239
|
-
int tmp_fd = mkstemp(tmp_name);
|
|
240
|
-
close(tmp_fd);
|
|
241
|
-
sox_format_t* output = sox_open_write(tmp_name, target_signal,
|
|
242
|
-
target_encoding, "wav", nullptr, nullptr);
|
|
243
|
-
#else
|
|
244
|
-
// create buffer and buffer_size for output in memwrite
|
|
245
|
-
char* buffer;
|
|
246
|
-
size_t buffer_size;
|
|
247
|
-
// in-memory descriptor (this may not work for OSX)
|
|
248
|
-
sox_format_t* output = sox_open_memstream_write(&buffer,
|
|
249
|
-
&buffer_size,
|
|
250
|
-
target_signal,
|
|
251
|
-
target_encoding,
|
|
252
|
-
file_type, nullptr);
|
|
253
|
-
#endif
|
|
254
|
-
if (output == nullptr) {
|
|
255
|
-
throw std::runtime_error("Error opening output memstream/temporary file");
|
|
256
|
-
}
|
|
257
|
-
// Setup the effects chain to decode/resample
|
|
258
|
-
sox_effects_chain_t* chain =
|
|
259
|
-
sox_create_effects_chain(&input->encoding, &output->encoding);
|
|
260
|
-
|
|
261
|
-
sox_effect_t* e = sox_create_effect(sox_find_effect("input"));
|
|
262
|
-
char* io_args[1];
|
|
263
|
-
io_args[0] = (char*)input;
|
|
264
|
-
sox_effect_options(e, 1, io_args);
|
|
265
|
-
sox_add_effect(chain, e, &interm_signal, &input->signal);
|
|
266
|
-
free(e);
|
|
267
|
-
|
|
268
|
-
for(SoxEffect tae : pyeffs) {
|
|
269
|
-
if(tae.ename == "no_effects") break;
|
|
270
|
-
e = sox_create_effect(sox_find_effect(tae.ename.c_str()));
|
|
271
|
-
e->global_info->global_info->verbosity = 1;
|
|
272
|
-
if(tae.eopts[0] == "") {
|
|
273
|
-
sox_effect_options(e, 0, nullptr);
|
|
274
|
-
} else {
|
|
275
|
-
int num_opts = tae.eopts.size();
|
|
276
|
-
char* sox_args[max_num_eopts];
|
|
277
|
-
for(std::vector<std::string>::size_type i = 0; i != tae.eopts.size(); i++) {
|
|
278
|
-
sox_args[i] = (char*) tae.eopts[i].c_str();
|
|
279
|
-
}
|
|
280
|
-
if(sox_effect_options(e, num_opts, sox_args) != SOX_SUCCESS) {
|
|
281
|
-
#ifdef __APPLE__
|
|
282
|
-
unlink(tmp_name);
|
|
283
|
-
#endif
|
|
284
|
-
throw std::runtime_error("invalid effect options, see SoX docs for details");
|
|
285
|
-
}
|
|
286
|
-
}
|
|
287
|
-
sox_add_effect(chain, e, &interm_signal, &output->signal);
|
|
288
|
-
free(e);
|
|
289
|
-
}
|
|
290
|
-
|
|
291
|
-
e = sox_create_effect(sox_find_effect("output"));
|
|
292
|
-
io_args[0] = (char*)output;
|
|
293
|
-
sox_effect_options(e, 1, io_args);
|
|
294
|
-
sox_add_effect(chain, e, &interm_signal, &output->signal);
|
|
295
|
-
free(e);
|
|
296
|
-
|
|
297
|
-
// Finally run the effects chain
|
|
298
|
-
sox_flow_effects(chain, nullptr, nullptr);
|
|
299
|
-
sox_delete_effects_chain(chain);
|
|
300
|
-
|
|
301
|
-
// Close sox handles, buffer does not get properly sized until these are closed
|
|
302
|
-
sox_close(output);
|
|
303
|
-
sox_close(input);
|
|
304
|
-
|
|
305
|
-
int sr;
|
|
306
|
-
// Read the in-memory audio buffer or temp file that we just wrote.
|
|
307
|
-
#ifdef __APPLE__
|
|
308
|
-
/*
|
|
309
|
-
Temporary filetype must have a valid header. Wav seems to work here while
|
|
310
|
-
raw does not. Certain effects like chorus caused strange behavior on the mac.
|
|
311
|
-
*/
|
|
312
|
-
// read_audio_file reads the temporary file and returns the sr and otensor
|
|
313
|
-
sr = read_audio_file(tmp_name, otensor, ch_first, 0, 0,
|
|
314
|
-
target_signal, target_encoding, "wav");
|
|
315
|
-
// delete temporary audio file
|
|
316
|
-
unlink(tmp_name);
|
|
317
|
-
#else
|
|
318
|
-
// Resize output tensor to desired dimensions, different effects result in output->signal.length,
|
|
319
|
-
// interm_signal.length and buffer size being inconsistent with the result of the file output.
|
|
320
|
-
// We prioritize in the order: output->signal.length > interm_signal.length > buffer_size
|
|
321
|
-
// Could be related to: https://sourceforge.net/p/sox/bugs/314/
|
|
322
|
-
int nc, ns;
|
|
323
|
-
if (output->signal.length == 0) {
|
|
324
|
-
// sometimes interm_signal length is extremely large, but the buffer_size
|
|
325
|
-
// is double the length of the output signal
|
|
326
|
-
if (interm_signal.length > (buffer_size * 10)) {
|
|
327
|
-
ns = buffer_size / 2;
|
|
328
|
-
} else {
|
|
329
|
-
ns = interm_signal.length;
|
|
330
|
-
}
|
|
331
|
-
nc = interm_signal.channels;
|
|
332
|
-
} else {
|
|
333
|
-
nc = output->signal.channels;
|
|
334
|
-
ns = output->signal.length;
|
|
335
|
-
}
|
|
336
|
-
otensor.resize_({ns/nc, nc});
|
|
337
|
-
otensor = otensor.contiguous();
|
|
338
|
-
|
|
339
|
-
input = sox_open_mem_read(buffer, buffer_size, target_signal, target_encoding, file_type);
|
|
340
|
-
std::vector<sox_sample_t> samples(buffer_size);
|
|
341
|
-
const int64_t samples_read = sox_read(input, samples.data(), buffer_size);
|
|
342
|
-
assert(samples_read != nc * ns && samples_read != 0);
|
|
343
|
-
AT_DISPATCH_ALL_TYPES(otensor.scalar_type(), "effects_buffer", [&] {
|
|
344
|
-
auto* data = otensor.data_ptr<scalar_t>();
|
|
345
|
-
std::copy(samples.begin(), samples.begin() + samples_read, data);
|
|
346
|
-
});
|
|
347
|
-
// free buffer and close mem_read
|
|
348
|
-
sox_close(input);
|
|
349
|
-
free(buffer);
|
|
350
|
-
|
|
351
|
-
if (ch_first) {
|
|
352
|
-
otensor.transpose_(1, 0);
|
|
353
|
-
}
|
|
354
|
-
sr = target_signal->rate;
|
|
355
|
-
|
|
356
|
-
#endif
|
|
357
|
-
// return sample rate, output tensor modified in-place
|
|
358
|
-
return sr;
|
|
359
|
-
}
|
|
360
|
-
} // namespace audio
|
|
361
|
-
} // namespace torch
|
data/ext/torchaudio/csrc/sox.h
DELETED
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
#include <sox.h>
|
|
2
|
-
|
|
3
|
-
#include <string>
|
|
4
|
-
#include <tuple>
|
|
5
|
-
#include <vector>
|
|
6
|
-
#include <unistd.h>
|
|
7
|
-
|
|
8
|
-
// same as <torch/extension.h> without <torch/python.h>
|
|
9
|
-
#include <torch/all.h>
|
|
10
|
-
|
|
11
|
-
namespace at {
|
|
12
|
-
struct Tensor;
|
|
13
|
-
} // namespace at
|
|
14
|
-
|
|
15
|
-
namespace torch { namespace audio {
|
|
16
|
-
|
|
17
|
-
/// Reads an audio file from the given `path` into the `output` `Tensor` and
|
|
18
|
-
/// returns the sample rate of the audio file.
|
|
19
|
-
/// Throws `std::runtime_error` if the audio file could not be opened, or an
|
|
20
|
-
/// error occurred during reading of the audio data.
|
|
21
|
-
int read_audio_file(
|
|
22
|
-
const std::string& file_name,
|
|
23
|
-
at::Tensor output,
|
|
24
|
-
bool ch_first,
|
|
25
|
-
int64_t nframes,
|
|
26
|
-
int64_t offset,
|
|
27
|
-
sox_signalinfo_t* si,
|
|
28
|
-
sox_encodinginfo_t* ei,
|
|
29
|
-
const char* ft);
|
|
30
|
-
|
|
31
|
-
/// Writes the data of a `Tensor` into an audio file at the given `path`, with
|
|
32
|
-
/// a certain extension (e.g. `wav`or `mp3`) and sample rate.
|
|
33
|
-
/// Throws `std::runtime_error` when the audio file could not be opened for
|
|
34
|
-
/// writing, or an error occurred during writing of the audio data.
|
|
35
|
-
void write_audio_file(
|
|
36
|
-
const std::string& file_name,
|
|
37
|
-
const at::Tensor& tensor,
|
|
38
|
-
sox_signalinfo_t* si,
|
|
39
|
-
sox_encodinginfo_t* ei,
|
|
40
|
-
const char* file_type);
|
|
41
|
-
|
|
42
|
-
/// Reads an audio file from the given `path` and returns a tuple of
|
|
43
|
-
/// sox_signalinfo_t and sox_encodinginfo_t, which contain information about
|
|
44
|
-
/// the audio file such as sample rate, length, bit precision, encoding and more.
|
|
45
|
-
/// Throws `std::runtime_error` if the audio file could not be opened, or an
|
|
46
|
-
/// error occurred during reading of the audio data.
|
|
47
|
-
std::tuple<sox_signalinfo_t, sox_encodinginfo_t> get_info(
|
|
48
|
-
const std::string& file_name);
|
|
49
|
-
|
|
50
|
-
// One entry of the effects list passed to build_flow_effects: the effect's
// name plus its option strings. A default-constructed entry has an empty name
// and a single empty option string.
struct SoxEffect {
  SoxEffect() : ename(), eopts(1, std::string()) {}

  std::string ename;              // effect name
  std::vector<std::string> eopts; // effect options, one string per argument
};
|
|
56
|
-
|
|
57
|
-
/// Build a SoX chain, flow the effects, and capture the results in a tensor.
|
|
58
|
-
/// An audio file from the given `path` flows through an effects chain given
|
|
59
|
-
/// by a list of effects and effect options to an output buffer which is encoded
|
|
60
|
-
/// into memory to a target signal type and target signal encoding. The resulting
|
|
61
|
-
/// buffer is then placed into a tensor. This function returns the output tensor
|
|
62
|
-
/// and the sample rate of the output tensor.
|
|
63
|
-
int build_flow_effects(const std::string& file_name,
|
|
64
|
-
at::Tensor otensor,
|
|
65
|
-
bool ch_first,
|
|
66
|
-
sox_signalinfo_t* target_signal,
|
|
67
|
-
sox_encodinginfo_t* target_encoding,
|
|
68
|
-
const char* file_type,
|
|
69
|
-
std::vector<SoxEffect> pyeffs,
|
|
70
|
-
int max_num_eopts);
|
|
71
|
-
}} // namespace torch::audio
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
#include <sox.h>
|
|
2
|
-
#include <torchaudio/csrc/sox_effects.h>
|
|
3
|
-
|
|
4
|
-
using namespace torch::indexing;
|
|
5
|
-
|
|
6
|
-
namespace torchaudio {
|
|
7
|
-
namespace sox_effects {
|
|
8
|
-
|
|
9
|
-
namespace {
|
|
10
|
-
|
|
11
|
-
// States tracked for the SoX effects resources managed by
// initialize_sox_effects / shutdown_sox_effects.
enum SoxEffectsResourceState {
  NotInitialized,
  Initialized,
  ShutDown
};

// Current state; starts out as NotInitialized.
SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;
|
|
13
|
-
|
|
14
|
-
} // namespace
|
|
15
|
-
|
|
16
|
-
void initialize_sox_effects() {
|
|
17
|
-
if (SOX_RESOURCE_STATE == ShutDown) {
|
|
18
|
-
throw std::runtime_error(
|
|
19
|
-
"SoX Effects has been shut down. Cannot initialize again.");
|
|
20
|
-
}
|
|
21
|
-
if (SOX_RESOURCE_STATE == NotInitialized) {
|
|
22
|
-
if (sox_init() != SOX_SUCCESS) {
|
|
23
|
-
throw std::runtime_error("Failed to initialize sox effects.");
|
|
24
|
-
};
|
|
25
|
-
SOX_RESOURCE_STATE = Initialized;
|
|
26
|
-
}
|
|
27
|
-
};
|
|
28
|
-
|
|
29
|
-
void shutdown_sox_effects() {
|
|
30
|
-
if (SOX_RESOURCE_STATE == NotInitialized) {
|
|
31
|
-
throw std::runtime_error(
|
|
32
|
-
"SoX Effects is not initialized. Cannot shutdown.");
|
|
33
|
-
}
|
|
34
|
-
if (SOX_RESOURCE_STATE == Initialized) {
|
|
35
|
-
if (sox_quit() != SOX_SUCCESS) {
|
|
36
|
-
throw std::runtime_error("Failed to initialize sox effects.");
|
|
37
|
-
};
|
|
38
|
-
SOX_RESOURCE_STATE = ShutDown;
|
|
39
|
-
}
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
std::vector<std::string> list_effects() {
|
|
43
|
-
std::vector<std::string> names;
|
|
44
|
-
const sox_effect_fn_t* fns = sox_get_effect_fns();
|
|
45
|
-
for (int i = 0; fns[i]; ++i) {
|
|
46
|
-
const sox_effect_handler_t* handler = fns[i]();
|
|
47
|
-
if (handler && handler->name)
|
|
48
|
-
names.push_back(handler->name);
|
|
49
|
-
}
|
|
50
|
-
return names;
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
} // namespace sox_effects
|
|
54
|
-
} // namespace torchaudio
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
#ifndef TORCHAUDIO_SOX_EFFECTS_H
|
|
2
|
-
#define TORCHAUDIO_SOX_EFFECTS_H
|
|
3
|
-
|
|
4
|
-
#include <torch/script.h>
|
|
5
|
-
|
|
6
|
-
namespace torchaudio {
|
|
7
|
-
namespace sox_effects {
|
|
8
|
-
|
|
9
|
-
void initialize_sox_effects();
|
|
10
|
-
|
|
11
|
-
void shutdown_sox_effects();
|
|
12
|
-
|
|
13
|
-
std::vector<std::string> list_effects();
|
|
14
|
-
|
|
15
|
-
} // namespace sox_effects
|
|
16
|
-
} // namespace torchaudio
|
|
17
|
-
|
|
18
|
-
#endif
|
|
@@ -1,170 +0,0 @@
|
|
|
1
|
-
#include <sox.h>
|
|
2
|
-
#include <torchaudio/csrc/sox_io.h>
|
|
3
|
-
#include <torchaudio/csrc/sox_utils.h>
|
|
4
|
-
|
|
5
|
-
using namespace torch::indexing;
|
|
6
|
-
using namespace torchaudio::sox_utils;
|
|
7
|
-
|
|
8
|
-
namespace torchaudio {
|
|
9
|
-
namespace sox_io {
|
|
10
|
-
|
|
11
|
-
SignalInfo::SignalInfo(
|
|
12
|
-
const int64_t sample_rate_,
|
|
13
|
-
const int64_t num_channels_,
|
|
14
|
-
const int64_t num_frames_)
|
|
15
|
-
: sample_rate(sample_rate_),
|
|
16
|
-
num_channels(num_channels_),
|
|
17
|
-
num_frames(num_frames_){};
|
|
18
|
-
|
|
19
|
-
int64_t SignalInfo::getSampleRate() const {
|
|
20
|
-
return sample_rate;
|
|
21
|
-
}
|
|
22
|
-
|
|
23
|
-
int64_t SignalInfo::getNumChannels() const {
|
|
24
|
-
return num_channels;
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
int64_t SignalInfo::getNumFrames() const {
|
|
28
|
-
return num_frames;
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
/// Opens `path` for reading and returns its sample rate, channel count and
/// frame count (total samples divided by channels).
/// Throws `std::runtime_error` when the file cannot be opened.
c10::intrusive_ptr<SignalInfo> get_info(const std::string& path) {
  SoxFormat sf(sox_open_read(
      path.c_str(),
      /*signal=*/nullptr,
      /*encoding=*/nullptr,
      /*filetype=*/nullptr));

  if (static_cast<sox_format_t*>(sf) == nullptr) {
    throw std::runtime_error("Error opening audio file");
  }

  const auto sample_rate = static_cast<int64_t>(sf->signal.rate);
  const auto channels = static_cast<int64_t>(sf->signal.channels);
  const auto frames =
      static_cast<int64_t>(sf->signal.length / sf->signal.channels);

  return c10::make_intrusive<SignalInfo>(sample_rate, channels, frames);
}
|
|
47
|
-
|
|
48
|
-
/// Loads audio from `path` into a tensor.
///
/// Reading starts `frame_offset` frames into the file. `num_frames` is either
/// -1 (read to the end) or a positive frame count; a count that extends past
/// the end of the file is clamped to the available samples. `normalize` and
/// `channels_first` are forwarded to `convert_to_tensor`.
///
/// Throws `std::runtime_error` for invalid arguments, when the offset lies
/// past the end of the file, or when no samples can be read; any error raised
/// by `validate_input_file` propagates as well.
c10::intrusive_ptr<TensorSignal> load_audio_file(
    const std::string& path,
    const int64_t frame_offset,
    const int64_t num_frames,
    const bool normalize,
    const bool channels_first) {
  if (frame_offset < 0) {
    throw std::runtime_error(
        "Invalid argument: frame_offset must be non-negative.");
  }
  if (num_frames == 0 || num_frames < -1) {
    throw std::runtime_error(
        "Invalid argument: num_frames must be -1 or greater than 0.");
  }

  SoxFormat sf(sox_open_read(
      path.c_str(),
      /*signal=*/nullptr,
      /*encoding=*/nullptr,
      /*filetype=*/nullptr));

  validate_input_file(sf);

  const int64_t num_channels = sf->signal.channels;
  const int64_t num_total_samples = sf->signal.length;
  const int64_t sample_start = sf->signal.channels * frame_offset;

  if (sox_seek(sf, sample_start, 0) == SOX_EOF) {
    throw std::runtime_error("Error reading audio file: offset past EOF.");
  }

  const int64_t sample_end = [&]() {
    if (num_frames == -1)
      return num_total_samples;
    const int64_t sample_end_ = num_channels * num_frames + sample_start;
    if (num_total_samples < sample_end_) {
      // For lossy encoding, it is difficult to predict exact size of buffer for
      // reading the number of samples required.
      // So we allocate buffer size of given `num_frames` and ask sox to read as
      // much as possible. For lossless format, sox reads exact number of
      // samples, but for lossy encoding, sox can end up reading less. (i.e.
      // mp3) For the consistent behavior specification between lossy/lossless
      // format, we allow users to provide `num_frames` value that exceeds #of
      // available samples, and we adjust it here.
      return num_total_samples;
    }
    return sample_end_;
  }();

  const int64_t max_samples = sample_end - sample_start;
  if (max_samples < 0) {
    // The requested start lies beyond the reported sample count; a negative
    // size must never reach the buffer allocation.
    throw std::runtime_error("Error reading audio file: offset past EOF.");
  }

  // Read samples into buffer. The vector is sized (not merely reserved) so
  // that sox_read writes into elements that actually exist.
  std::vector<sox_sample_t> buffer(max_samples);
  const int64_t num_samples = sox_read(sf, buffer.data(), max_samples);
  if (num_samples == 0) {
    throw std::runtime_error(
        "Error reading audio file: empty file or read operation failed.");
  }
  // NOTE: num_samples may be smaller than max_samples if the input
  // format is compressed (i.e. mp3).

  // Convert to Tensor
  auto tensor = convert_to_tensor(
      buffer.data(),
      num_samples,
      num_channels,
      get_dtype(sf->encoding.encoding, sf->signal.precision),
      normalize,
      channels_first);

  return c10::make_intrusive<TensorSignal>(
      tensor, static_cast<int64_t>(sf->signal.rate), channels_first);
}
|
|
122
|
-
|
|
123
|
-
void save_audio_file(
|
|
124
|
-
const std::string& file_name,
|
|
125
|
-
const c10::intrusive_ptr<TensorSignal>& signal,
|
|
126
|
-
const double compression) {
|
|
127
|
-
const auto tensor = signal->getTensor();
|
|
128
|
-
const auto sample_rate = signal->getSampleRate();
|
|
129
|
-
const auto channels_first = signal->getChannelsFirst();
|
|
130
|
-
|
|
131
|
-
validate_input_tensor(tensor);
|
|
132
|
-
|
|
133
|
-
const auto filetype = get_filetype(file_name);
|
|
134
|
-
const auto signal_info =
|
|
135
|
-
get_signalinfo(tensor, sample_rate, channels_first, filetype);
|
|
136
|
-
const auto encoding_info =
|
|
137
|
-
get_encodinginfo(filetype, tensor.dtype(), compression);
|
|
138
|
-
|
|
139
|
-
SoxFormat sf(sox_open_write(
|
|
140
|
-
file_name.c_str(),
|
|
141
|
-
&signal_info,
|
|
142
|
-
&encoding_info,
|
|
143
|
-
/*filetype=*/filetype.c_str(),
|
|
144
|
-
/*oob=*/nullptr,
|
|
145
|
-
/*overwrite_permitted=*/nullptr));
|
|
146
|
-
|
|
147
|
-
if (static_cast<sox_format_t*>(sf) == nullptr) {
|
|
148
|
-
throw std::runtime_error("Error saving audio file: failed to open file.");
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
auto tensor_ = tensor;
|
|
152
|
-
if (channels_first) {
|
|
153
|
-
tensor_ = tensor_.t();
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
const int64_t frames_per_chunk = 65536;
|
|
157
|
-
for (int64_t i = 0; i < tensor_.size(0); i += frames_per_chunk) {
|
|
158
|
-
auto chunk = tensor_.index({Slice(i, i + frames_per_chunk), Slice()});
|
|
159
|
-
chunk = unnormalize_wav(chunk).contiguous();
|
|
160
|
-
|
|
161
|
-
const size_t numel = chunk.numel();
|
|
162
|
-
if (sox_write(sf, chunk.data_ptr<int32_t>(), numel) != numel) {
|
|
163
|
-
throw std::runtime_error(
|
|
164
|
-
"Error saving audio file: failed to write the entier buffer.");
|
|
165
|
-
}
|
|
166
|
-
}
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
} // namespace sox_io
|
|
170
|
-
} // namespace torchaudio
|