RubyGems - torchaudio - Versions diffs - 0.1.0 - Mend

torchaudio 0.1.0

Files changed (20) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +3 -0
data/LICENSE.txt +26 -0
data/README.md +93 -0
data/ext/torchaudio/csrc/register.cpp +65 -0
data/ext/torchaudio/csrc/sox.cpp +361 -0
data/ext/torchaudio/csrc/sox.h +71 -0
data/ext/torchaudio/csrc/sox_effects.cpp +54 -0
data/ext/torchaudio/csrc/sox_effects.h +18 -0
data/ext/torchaudio/csrc/sox_io.cpp +170 -0
data/ext/torchaudio/csrc/sox_io.h +41 -0
data/ext/torchaudio/csrc/sox_utils.cpp +245 -0
data/ext/torchaudio/csrc/sox_utils.h +100 -0
data/ext/torchaudio/ext.cpp +33 -0
data/ext/torchaudio/extconf.rb +81 -0
data/lib/torchaudio.rb +95 -0
data/lib/torchaudio/datasets/utils.rb +92 -0
data/lib/torchaudio/datasets/yesno.rb +59 -0
data/lib/torchaudio/version.rb +3 -0
metadata +145 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: b527976494325cc12e81342c25d318204d2d7c75bfba7036be4296769cdb30a0
+  data.tar.gz: 2cfde7bd1b0e7a1628818d5bd74657cfbfba6dfa83ef42897f3ad0f98e77f739
+SHA512:
+  metadata.gz: 8e6f34b014340b5ace3193ab589dae75ed0869ab7606402bd4b09de6042299e6f3a118d439dd381491f489ce9552bca4376a7d5b4693dddc3d1c5f5b26540900
+  data.tar.gz: d651c46f5185ceb70ae3d9c90154c77afe29a5c35854d1a9d98913096b7ab9ba39a745242dd268548ca87f9e109b56c96dee9dc5539cf066f9ad0f773eddbdcd

data/CHANGELOG.md ADDED

@@ -0,0 +1,3 @@
+## 0.1.0 (2020-08-24)
+- First release

data/LICENSE.txt ADDED

@@ -0,0 +1,26 @@
+BSD 2-Clause License
+Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
+Copyright (c) 2020 Andrew Kane,
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

data/README.md ADDED

@@ -0,0 +1,93 @@
+# TorchAudio
+:fire: An audio library for Torch.rb
+## Installation
+First, [install SoX](#sox-installation). For Homebrew, use:
+```sh
+brew install sox
+```
+Add this line to your application’s Gemfile:
+```ruby
+gem 'torchaudio'
+```
+## Getting Started
+This library follows the [Python API](https://pytorch.org/audio/). Many methods and options are missing at the moment. PRs welcome!
+## Datasets
+Load a dataset
+```ruby
+TorchAudio::Datasets::YESNO.new(".", download: true)
+```
+Supported datasets are:
+- [YESNO](http://www.openslr.org/1/)
+## Disclaimer
+This library downloads and prepares public datasets. We don’t host any datasets. Be sure to adhere to the license for each dataset.
+If you’re a dataset owner and wish to update any details about your dataset or have it removed from this project, let us know.
+## SoX Installation
+### Mac
+```sh
+brew install sox
+```
+### Windows
+TODO: Windows installation instructions have not been written yet.
+### Ubuntu
+```sh
+sudo apt install sox libsox-dev libsox-fmt-all
+```
+### Travis CI
+Add to `.travis.yml`:
+```yml
+addons:
+  apt:
+    packages:
+      - sox
+      - libsox-dev
+      - libsox-fmt-all
+```
+## History
+View the [changelog](https://github.com/ankane/torchaudio/blob/master/CHANGELOG.md)
+## Contributing
+Everyone is encouraged to help improve this project. Here are a few ways you can help:
+- [Report bugs](https://github.com/ankane/torchaudio/issues)
+- Fix bugs and [submit pull requests](https://github.com/ankane/torchaudio/pulls)
+- Write, clarify, or fix documentation
+- Suggest or add new features
+To get started with development:
+```sh
+git clone https://github.com/ankane/torchaudio.git
+cd torchaudio
+bundle install
+bundle exec rake compile
+bundle exec rake test
+```

data/ext/torchaudio/csrc/register.cpp ADDED

@@ -0,0 +1,65 @@
+#ifndef TORCHAUDIO_REGISTER_H
+#define TORCHAUDIO_REGISTER_H
+#include <torchaudio/csrc/sox_effects.h>
+#include <torchaudio/csrc/sox_io.h>
+#include <torchaudio/csrc/sox_utils.h>
+namespace torchaudio {
+namespace {
+// Everything in this unnamed namespace is a file-local static object whose
+// initializer performs a registration with the torch operator/class registry.
+// The objects are never referenced by name; they exist only for that effect.
+////////////////////////////////////////////////////////////////////////////////
+// sox_utils.h
+////////////////////////////////////////////////////////////////////////////////
+// Registers sox_utils::TensorSignal as the custom class
+// "torchaudio"/"TensorSignal" with a (Tensor, int64_t, bool) constructor and
+// three getters.
+static auto registerTensorSignal =
+    torch::class_<sox_utils::TensorSignal>("torchaudio", "TensorSignal")
+        .def(torch::init<torch::Tensor, int64_t, bool>())
+        .def("get_tensor", &sox_utils::TensorSignal::getTensor)
+        .def("get_sample_rate", &sox_utils::TensorSignal::getSampleRate)
+        .def("get_channels_first", &sox_utils::TensorSignal::getChannelsFirst);
+////////////////////////////////////////////////////////////////////////////////
+// sox_io.h
+////////////////////////////////////////////////////////////////////////////////
+// Registers sox_io::SignalInfo as the custom class "torchaudio"/"SignalInfo".
+// Only getters are exposed; no constructor is registered here.
+static auto registerSignalInfo =
+    torch::class_<sox_io::SignalInfo>("torchaudio", "SignalInfo")
+        .def("get_sample_rate", &sox_io::SignalInfo::getSampleRate)
+        .def("get_num_channels", &sox_io::SignalInfo::getNumChannels)
+        .def("get_num_frames", &sox_io::SignalInfo::getNumFrames);
+// Operator torchaudio::sox_io_get_info, bound to sox_io::get_info through
+// catchAllKernel. The schema string spells out the argument and return types.
+static auto registerGetInfo = torch::RegisterOperators().op(
+    torch::RegisterOperators::options()
+        .schema(
+            "torchaudio::sox_io_get_info(str path) -> __torch__.torch.classes.torchaudio.SignalInfo info")
+        .catchAllKernel<decltype(sox_io::get_info), &sox_io::get_info>());
+// Operator torchaudio::sox_io_load_audio_file, bound to
+// sox_io::load_audio_file; returns the registered TensorSignal class.
+static auto registerLoadAudioFile = torch::RegisterOperators().op(
+    torch::RegisterOperators::options()
+        .schema(
+            "torchaudio::sox_io_load_audio_file(str path, int frame_offset, int num_frames, bool normalize, bool channels_first) -> __torch__.torch.classes.torchaudio.TensorSignal signal")
+        .catchAllKernel<
+            decltype(sox_io::load_audio_file),
+            &sox_io::load_audio_file>());
+// Operator torchaudio::sox_io_save_audio_file, bound to
+// sox_io::save_audio_file; the schema declares no return value.
+static auto registerSaveAudioFile = torch::RegisterOperators().op(
+    torch::RegisterOperators::options()
+        .schema(
+            "torchaudio::sox_io_save_audio_file(str path, __torch__.torch.classes.torchaudio.TensorSignal signal, float compression) -> ()")
+        .catchAllKernel<
+            decltype(sox_io::save_audio_file),
+            &sox_io::save_audio_file>());
+////////////////////////////////////////////////////////////////////////////////
+// sox_effects.h
+////////////////////////////////////////////////////////////////////////////////
+// Registers three sox_effects functions in one chained RegisterOperators
+// object. No explicit schema is given for these operators.
+static auto registerSoxEffects =
+    torch::RegisterOperators(
+        "torchaudio::sox_effects_initialize_sox_effects",
+        &sox_effects::initialize_sox_effects)
+        .op("torchaudio::sox_effects_shutdown_sox_effects",
+            &sox_effects::shutdown_sox_effects)
+        .op("torchaudio::sox_effects_list_effects", &sox_effects::list_effects);
+} // namespace
+} // namespace torchaudio
+#endif

data/ext/torchaudio/csrc/sox.cpp ADDED

@@ -0,0 +1,361 @@
+#include <torchaudio/csrc/sox.h>
+#include <algorithm>
+#include <cstdint>
+#include <stdexcept>
+#include <vector>
+namespace torch {
+namespace audio {
+namespace {
+/// Sole owner of a sox_format_t handle.
+///
+/// The handle given to the constructor is passed to sox_close() when this
+/// object is destroyed, unless it is null. Copy and move operations are
+/// deleted, so a handle can never end up with two owners.
+struct SoxDescriptor {
+  explicit SoxDescriptor(sox_format_t* fd) noexcept : handle_(fd) {}
+  ~SoxDescriptor() {
+    if (handle_ == nullptr) {
+      return;
+    }
+    sox_close(handle_);
+  }
+  // Neither copyable nor movable.
+  SoxDescriptor(const SoxDescriptor& other) = delete;
+  SoxDescriptor& operator=(const SoxDescriptor& other) = delete;
+  SoxDescriptor(SoxDescriptor&& other) = delete;
+  SoxDescriptor& operator=(SoxDescriptor&& other) = delete;
+  /// Returns the wrapped handle (possibly null) without giving up ownership.
+  sox_format_t* get() noexcept {
+    return handle_;
+  }
+  /// Member access on the wrapped handle; the handle must not be null.
+  sox_format_t* operator->() noexcept {
+    return get();
+  }
+ private:
+  sox_format_t* handle_;
+};
+/// Converts every element of `tensor` to sox_sample_t and writes the samples
+/// to the open SoX handle `fd`.
+///
+/// The tensor storage is walked linearly through data_ptr(), so the caller is
+/// expected to pass a contiguous tensor.
+///
+/// @return Number of samples sox_write() reports as written.
+int64_t write_audio(SoxDescriptor& fd, at::Tensor tensor) {
+  const auto sample_count = tensor.numel();
+  std::vector<sox_sample_t> samples(sample_count);
+  AT_DISPATCH_ALL_TYPES(tensor.scalar_type(), "write_audio_buffer", [&] {
+    const scalar_t* first = tensor.data_ptr<scalar_t>();
+    const scalar_t* last = first + sample_count;
+    std::copy(first, last, samples.begin());
+  });
+  return sox_write(fd.get(), samples.data(), samples.size());
+}
+/// Reads up to `buffer_length` samples from the open SoX handle `fd` into
+/// `output`, which is resized in place to (frames, channels).
+///
+/// Only whole frames are copied: if sox_read() returns a sample count that is
+/// not a multiple of the channel count, the trailing partial frame is dropped
+/// so the copy can never run past the end of the resized tensor.
+///
+/// @param fd             Handle opened for reading.
+/// @param output         Tensor that receives the samples.
+/// @param buffer_length  Maximum number of samples (not frames) to read.
+/// @throws std::runtime_error if the stream reports zero channels or if
+///         sox_read() returns no samples.
+void read_audio(
+    SoxDescriptor& fd,
+    at::Tensor output,
+    int64_t buffer_length) {
+  const int64_t number_of_channels = fd->signal.channels;
+  if (number_of_channels == 0) {
+    // Guard the division below; a zero channel count cannot be laid out.
+    throw std::runtime_error(
+        "Error reading audio file: stream reports zero channels");
+  }
+  std::vector<sox_sample_t> buffer(buffer_length);
+  const int64_t samples_read = sox_read(fd.get(), buffer.data(), buffer_length);
+  if (samples_read == 0) {
+    throw std::runtime_error(
+        "Error reading audio file: empty file or read failed in sox_read");
+  }
+  const int64_t frames_read = samples_read / number_of_channels;
+  output.resize_({frames_read, number_of_channels});
+  output = output.contiguous();
+  // Exactly as many samples as the tensor holds after the resize.
+  const int64_t samples_to_copy = frames_read * number_of_channels;
+  AT_DISPATCH_ALL_TYPES(output.scalar_type(), "read_audio_buffer", [&] {
+    auto* data = output.data_ptr<scalar_t>();
+    std::copy(buffer.begin(), buffer.begin() + samples_to_copy, data);
+  });
+}
+} // namespace
+/// Opens `file_name` for reading and returns copies of the signal and encoding
+/// information SoX reports for it. The file is closed again before returning.
+///
+/// @throws std::runtime_error if the file cannot be opened.
+std::tuple<sox_signalinfo_t, sox_encodinginfo_t> get_info(
+    const std::string& file_name) {
+  sox_format_t* const raw_handle = sox_open_read(
+      file_name.c_str(),
+      /*signal=*/nullptr,
+      /*encoding=*/nullptr,
+      /*filetype=*/nullptr);
+  // Hand the handle to the owner first so it is closed on every path.
+  SoxDescriptor reader(raw_handle);
+  if (raw_handle == nullptr) {
+    throw std::runtime_error("Error opening audio file");
+  }
+  const sox_signalinfo_t& signal_info = reader->signal;
+  const sox_encodinginfo_t& encoding_info = reader->encoding;
+  return std::make_tuple(signal_info, encoding_info);
+}
+/// Reads all or part of an audio file into `output` and returns its sample
+/// rate.
+///
+/// @param file_name Path of the file to open.
+/// @param output    Tensor that receives the samples; resized in place.
+/// @param ch_first  When true the result is transposed from L x C to C x L.
+/// @param nframes   Number of frames to read; values <= 0 read to the end.
+/// @param offset    Number of frames to skip from the start; must be >= 0.
+/// @param si        Signal info forwarded to sox_open_read().
+/// @param ei        Encoding info forwarded to sox_open_read().
+/// @param ft        File type forwarded to sox_open_read().
+/// @return Sample rate reported by SoX, truncated to int.
+/// @throws std::runtime_error if the file cannot be opened, its length is
+///         unknown, the offset is negative or past the end, or seeking fails.
+int read_audio_file(
+    const std::string& file_name,
+    at::Tensor output,
+    bool ch_first,
+    int64_t nframes,
+    int64_t offset,
+    sox_signalinfo_t* si,
+    sox_encodinginfo_t* ei,
+    const char* ft) {
+  SoxDescriptor fd(sox_open_read(file_name.c_str(), si, ei, ft));
+  if (fd.get() == nullptr) {
+    throw std::runtime_error("Error opening audio file");
+  }
+  // signal info
+  const int number_of_channels = fd->signal.channels;
+  const int sample_rate = static_cast<int>(fd->signal.rate);
+  const int64_t total_length = fd->signal.length;
+  if (total_length == 0) {
+    throw std::runtime_error("Error reading audio file: unknown length");
+  }
+  if (offset < 0) {
+    // sox_seek() takes an unsigned offset; a negative value would silently
+    // turn into a huge position instead of being reported.
+    throw std::runtime_error("Offset must be non-negative");
+  }
+  // multiply offset and number of frames by number of channels
+  offset *= number_of_channels;
+  nframes *= number_of_channels;
+  if (offset > total_length) {
+    throw std::runtime_error("Offset past EOF");
+  }
+  // calculate buffer length: everything after the offset, capped by nframes
+  int64_t buffer_length = total_length - offset;
+  if (nframes > 0 && buffer_length > nframes) {
+    buffer_length = nframes;
+  }
+  // seek to offset point before reading data
+  if (sox_seek(fd.get(), offset, 0) == SOX_EOF) {
+    throw std::runtime_error("sox_seek reached EOF, try reducing offset or num_samples");
+  }
+  // read data and fill output tensor
+  read_audio(fd, output, buffer_length);
+  // L x C -> C x L, if desired
+  if (ch_first) {
+    output.transpose_(1, 0);
+  }
+  return sample_rate;
+}
+/// Writes every element of `tensor` to `file_name` through SoX.
+///
+/// @param file_name Destination path.
+/// @param tensor    Samples to write; must be contiguous.
+/// @param si        Signal info forwarded to sox_open_write(). When non-null,
+///                  its `mult` field is reset to null (SoX >= 14.3.0).
+/// @param ei        Encoding info forwarded to sox_open_write().
+/// @param file_type File type forwarded to sox_open_write().
+/// @throws std::runtime_error if the tensor is not contiguous, the file cannot
+///         be opened for writing, or fewer samples than requested are written.
+void write_audio_file(
+    const std::string& file_name,
+    const at::Tensor& tensor,
+    sox_signalinfo_t* si,
+    sox_encodinginfo_t* ei,
+    const char* file_type) {
+  if (!tensor.is_contiguous()) {
+    throw std::runtime_error(
+        "Error writing audio file: input tensor must be contiguous");
+  }
+#if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
+  // Only touch the struct when the caller supplied one; a null pointer is
+  // passed through so sox_open_write() can accept or reject it.
+  if (si != nullptr) {
+    si->mult = nullptr;
+  }
+#endif
+  SoxDescriptor fd(sox_open_write(
+      file_name.c_str(),
+      si,
+      ei,
+      file_type,
+      /*oob=*/nullptr,
+      /*overwrite=*/nullptr));
+  if (fd.get() == nullptr) {
+    throw std::runtime_error(
+        "Error writing audio file: could not open file for writing");
+  }
+  const auto samples_written = write_audio(fd, tensor);
+  if (samples_written != tensor.numel()) {
+    throw std::runtime_error(
+        "Error writing audio file: could not write entire buffer");
+  }
+}
+/// Decodes `file_name`, runs it through the SoX effects listed in `pyeffs` and
+/// stores the processed samples in `otensor`.
+///
+/// The processed audio is first written to an intermediate sink (a temporary
+/// WAV file when __APPLE__ is defined, an in-memory stream otherwise) and then
+/// read back into the tensor. It can also be used to re-encode audio using the
+/// encoding options of SoX, including sample rate and channel re-encoding.
+///
+/// All SoX handles, the effects chain, the in-memory buffer and the temporary
+/// file are released on every exit path, including exceptions.
+///
+/// @param file_name       Path of the audio file to read.
+/// @param otensor         Tensor resized in place and filled with the result.
+/// @param ch_first        When true the result is transposed to C x L.
+/// @param target_signal   Desired output signal info; when null it is derived
+///                        from the input file. Its rate/channels fields are
+///                        overwritten by a "rate"/"channels" effect.
+/// @param target_encoding Desired output encoding; when null,
+///                        SOX_ENCODING_SIGN2 at the input precision is used.
+/// @param file_type       File type used for the in-memory writer and reader
+///                        (the temporary-file path always uses "wav").
+/// @param pyeffs          Effects applied in order; an entry named
+///                        "no_effects" ends the list.
+/// @param max_num_eopts   Unused; kept so existing callers still compile.
+///                        Option arrays are sized per effect instead.
+/// @return Sample rate of the produced audio.
+/// @throws std::runtime_error if a file or stream cannot be opened, an effect
+///         name is unknown, effect options are rejected, or nothing can be
+///         read back.
+int build_flow_effects(const std::string& file_name,
+                       at::Tensor otensor,
+                       bool ch_first,
+                       sox_signalinfo_t* target_signal,
+                       sox_encodinginfo_t* target_encoding,
+                       const char* file_type,
+                       std::vector<SoxEffect> pyeffs,
+                       int max_num_eopts) {
+  (void)max_num_eopts;
+  // Scope guards. They are declared in acquisition order so that, if an
+  // exception unwinds the stack, resources are released in reverse order:
+  // effects chain, output handle, buffer / temporary file, input handle.
+  struct FormatGuard {
+    explicit FormatGuard(sox_format_t* f) : ft(f) {}
+    ~FormatGuard() {
+      if (ft != nullptr) {
+        sox_close(ft);
+      }
+    }
+    sox_format_t* ft;
+  };
+  struct ChainGuard {
+    explicit ChainGuard(sox_effects_chain_t* c) : chain(c) {}
+    ~ChainGuard() {
+      if (chain != nullptr) {
+        sox_delete_effects_chain(chain);
+      }
+    }
+    sox_effects_chain_t* chain;
+  };
+  // open input
+  sox_format_t* input = sox_open_read(file_name.c_str(), nullptr, nullptr, nullptr);
+  if (input == nullptr) {
+    throw std::runtime_error("Error opening audio file");
+  }
+  FormatGuard input_guard(input);
+  // only used if target signal or encoding are null
+  sox_signalinfo_t empty_signal;
+  sox_encodinginfo_t empty_encoding;
+  // set signalinfo and encodinginfo if blank
+  if (target_signal == nullptr) {
+    target_signal = &empty_signal;
+    target_signal->rate = input->signal.rate;
+    target_signal->channels = input->signal.channels;
+    target_signal->length = SOX_UNSPEC;
+    target_signal->precision = input->signal.precision;
+#if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
+    target_signal->mult = nullptr;
+#endif
+  }
+  if (target_encoding == nullptr) {
+    target_encoding = &empty_encoding;
+    target_encoding->encoding = SOX_ENCODING_SIGN2; // Sample format
+    target_encoding->bits_per_sample = input->signal.precision; // Bits per sample
+    target_encoding->compression = 0.0; // Compression factor
+    target_encoding->reverse_bytes = sox_option_default; // Should bytes be reversed
+    target_encoding->reverse_nibbles = sox_option_default; // Should nibbles be reversed
+    target_encoding->reverse_bits = sox_option_default; // Should bits be reversed (pairs of bits?)
+    target_encoding->opposite_endian = sox_false; // Reverse endianness
+  }
+  // check for rate or channels effect and change the output signalinfo accordingly
+  for (const SoxEffect& se : pyeffs) {
+    if (se.eopts.empty()) {
+      // Nothing to parse; indexing eopts[0] here would be out of bounds.
+      continue;
+    }
+    if (se.ename == "rate") {
+      target_signal->rate = std::stod(se.eopts[0]);
+    } else if (se.ename == "channels") {
+      target_signal->channels = std::stoi(se.eopts[0]);
+    }
+  }
+  // create interm_signal for effects, intermediate steps change this in-place
+  sox_signalinfo_t interm_signal = input->signal;
+#ifdef __APPLE__
+  // According to Mozilla Deepspeech sox_open_memstream_write doesn't work
+  // with OSX
+  char tmp_name[] = "/tmp/fileXXXXXX";
+  int tmp_fd = mkstemp(tmp_name);
+  if (tmp_fd == -1) {
+    throw std::runtime_error("Error opening output memstream/temporary file");
+  }
+  close(tmp_fd);
+  // Removes the temporary file on every exit path.
+  struct TempFileGuard {
+    explicit TempFileGuard(const char* p) : path(p) {}
+    ~TempFileGuard() {
+      unlink(path);
+    }
+    const char* path;
+  };
+  TempFileGuard tmp_guard(tmp_name);
+  sox_format_t* output = sox_open_write(tmp_name, target_signal,
+                                        target_encoding, "wav", nullptr, nullptr);
+#else
+  // create buffer and buffer_size for output in memwrite
+  char* buffer = nullptr;
+  size_t buffer_size = 0;
+  // Frees whatever `buffer` points to when the function exits. The guard keeps
+  // the address of the variable because the memstream updates it on close.
+  struct BufferGuard {
+    explicit BufferGuard(char** b) : buf(b) {}
+    ~BufferGuard() {
+      free(*buf);
+    }
+    char** buf;
+  };
+  BufferGuard buffer_guard(&buffer);
+  // in-memory descriptor (this may not work for OSX)
+  sox_format_t* output = sox_open_memstream_write(&buffer,
+                                                  &buffer_size,
+                                                  target_signal,
+                                                  target_encoding,
+                                                  file_type, nullptr);
+#endif
+  if (output == nullptr) {
+    throw std::runtime_error("Error opening output memstream/temporary file");
+  }
+  FormatGuard output_guard(output);
+  // Setup the effects chain to decode/resample
+  sox_effects_chain_t* chain =
+    sox_create_effects_chain(&input->encoding, &output->encoding);
+  ChainGuard chain_guard(chain);
+  sox_effect_t* e = sox_create_effect(sox_find_effect("input"));
+  char* io_args[1];
+  io_args[0] = reinterpret_cast<char*>(input);
+  sox_effect_options(e, 1, io_args);
+  sox_add_effect(chain, e, &interm_signal, &input->signal);
+  free(e);
+  for (const SoxEffect& tae : pyeffs) {
+    if (tae.ename == "no_effects") break;
+    const auto* handler = sox_find_effect(tae.ename.c_str());
+    if (handler == nullptr) {
+      // sox_create_effect() would dereference the null handler.
+      throw std::runtime_error("unknown effect name, see SoX docs for details");
+    }
+    e = sox_create_effect(handler);
+    e->global_info->global_info->verbosity = 1;
+    if (tae.eopts.empty() || tae.eopts[0].empty()) {
+      sox_effect_options(e, 0, nullptr);
+    } else {
+      // One slot per option, so the array can never be too small.
+      std::vector<char*> sox_args;
+      sox_args.reserve(tae.eopts.size());
+      for (const std::string& opt : tae.eopts) {
+        sox_args.push_back(const_cast<char*>(opt.c_str()));
+      }
+      const int num_opts = static_cast<int>(sox_args.size());
+      if (sox_effect_options(e, num_opts, sox_args.data()) != SOX_SUCCESS) {
+        free(e);
+        throw std::runtime_error("invalid effect options, see SoX docs for details");
+      }
+    }
+    sox_add_effect(chain, e, &interm_signal, &output->signal);
+    free(e);
+  }
+  e = sox_create_effect(sox_find_effect("output"));
+  io_args[0] = reinterpret_cast<char*>(output);
+  sox_effect_options(e, 1, io_args);
+  sox_add_effect(chain, e, &interm_signal, &output->signal);
+  free(e);
+  // Finally run the effects chain
+  sox_flow_effects(chain, nullptr, nullptr);
+  sox_delete_effects_chain(chain);
+  chain_guard.chain = nullptr;
+#ifndef __APPLE__
+  // Copy what is needed from the output handle now: sox_close() releases the
+  // handle, so it must not be read afterwards.
+  const int64_t output_length = output->signal.length;
+  const int64_t output_channels = output->signal.channels;
+#endif
+  // Close sox handles, buffer does not get properly sized until these are closed
+  sox_close(output);
+  output_guard.ft = nullptr;
+  sox_close(input);
+  input_guard.ft = nullptr;
+  int sr;
+  // Read the in-memory audio buffer or temp file that we just wrote.
+#ifdef __APPLE__
+  /*
+     Temporary filetype must have a valid header.  Wav seems to work here while
+     raw does not.  Certain effects like chorus caused strange behavior on the mac.
+  */
+  // read_audio_file reads the temporary file and returns the sr and otensor;
+  // the temporary file itself is removed by tmp_guard.
+  sr = read_audio_file(tmp_name, otensor, ch_first, 0, 0,
+                       target_signal, target_encoding, "wav");
+#else
+  // Resize output tensor to desired dimensions, different effects result in output->signal.length,
+  // interm_signal.length and buffer size being inconsistent with the result of the file output.
+  // We prioritize in the order: output->signal.length > interm_signal.length > buffer_size
+  // Could be related to: https://sourceforge.net/p/sox/bugs/314/
+  int64_t nc = 0;
+  int64_t ns = 0;
+  if (output_length == 0) {
+    // sometimes interm_signal length is extremely large, but the buffer_size
+    // is double the length of the output signal
+    if (interm_signal.length > (buffer_size * 10)) {
+      ns = buffer_size / 2;
+    } else {
+      ns = interm_signal.length;
+    }
+    nc = interm_signal.channels;
+  } else {
+    nc = output_channels;
+    ns = output_length;
+  }
+  if (nc == 0) {
+    // Guard the division below.
+    throw std::runtime_error(
+        "Error reading audio file: stream reports zero channels");
+  }
+  otensor.resize_({ns / nc, nc});
+  otensor = otensor.contiguous();
+  sox_format_t* mem_input = sox_open_mem_read(
+      buffer, buffer_size, target_signal, target_encoding, file_type);
+  if (mem_input == nullptr) {
+    throw std::runtime_error("Error opening audio file");
+  }
+  // Declared after buffer_guard, so the reader is closed before the buffer
+  // it reads from is freed.
+  FormatGuard mem_input_guard(mem_input);
+  std::vector<sox_sample_t> samples(buffer_size);
+  const int64_t samples_read = sox_read(mem_input, samples.data(), buffer_size);
+  if (samples_read == 0) {
+    throw std::runtime_error(
+        "Error reading audio file: empty file or read failed in sox_read");
+  }
+  // Never copy more samples than the resized tensor can hold.
+  const int64_t samples_to_copy = std::min<int64_t>(samples_read, otensor.numel());
+  AT_DISPATCH_ALL_TYPES(otensor.scalar_type(), "effects_buffer", [&] {
+    auto* data = otensor.data_ptr<scalar_t>();
+    std::copy(samples.begin(), samples.begin() + samples_to_copy, data);
+  });
+  if (ch_first) {
+    otensor.transpose_(1, 0);
+  }
+  sr = static_cast<int>(target_signal->rate);
+#endif
+  // return sample rate, output tensor modified in-place
+  return sr;
+}
+} // namespace audio
+} // namespace torch