RubyGems - torchaudio - Versions diffs - 0.1.0 - Mend

torchaudio 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +3 -0
data/LICENSE.txt +26 -0
data/README.md +93 -0
data/ext/torchaudio/csrc/register.cpp +65 -0
data/ext/torchaudio/csrc/sox.cpp +361 -0
data/ext/torchaudio/csrc/sox.h +71 -0
data/ext/torchaudio/csrc/sox_effects.cpp +54 -0
data/ext/torchaudio/csrc/sox_effects.h +18 -0
data/ext/torchaudio/csrc/sox_io.cpp +170 -0
data/ext/torchaudio/csrc/sox_io.h +41 -0
data/ext/torchaudio/csrc/sox_utils.cpp +245 -0
data/ext/torchaudio/csrc/sox_utils.h +100 -0
data/ext/torchaudio/ext.cpp +33 -0
data/ext/torchaudio/extconf.rb +81 -0
data/lib/torchaudio.rb +95 -0
data/lib/torchaudio/datasets/utils.rb +92 -0
data/lib/torchaudio/datasets/yesno.rb +59 -0
data/lib/torchaudio/version.rb +3 -0
metadata +145 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: b527976494325cc12e81342c25d318204d2d7c75bfba7036be4296769cdb30a0
+  data.tar.gz: 2cfde7bd1b0e7a1628818d5bd74657cfbfba6dfa83ef42897f3ad0f98e77f739
+SHA512:
+  metadata.gz: 8e6f34b014340b5ace3193ab589dae75ed0869ab7606402bd4b09de6042299e6f3a118d439dd381491f489ce9552bca4376a7d5b4693dddc3d1c5f5b26540900
+  data.tar.gz: d651c46f5185ceb70ae3d9c90154c77afe29a5c35854d1a9d98913096b7ab9ba39a745242dd268548ca87f9e109b56c96dee9dc5539cf066f9ad0f773eddbdcd

data/CHANGELOG.md ADDED

@@ -0,0 +1,3 @@
+## 0.1.0 (2020-08-24)
+- First release

data/LICENSE.txt ADDED

@@ -0,0 +1,26 @@
+BSD 2-Clause License
+Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
+Copyright (c) 2020 Andrew Kane,
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

data/README.md ADDED

@@ -0,0 +1,93 @@
+# TorchAudio
+:fire: An audio library for Torch.rb
+## Installation
+First, [install SoX](#sox-installation). For Homebrew, use:
+```sh
+brew install sox
+```
+Add this line to your application’s Gemfile:
+```ruby
+gem 'torchaudio'
+```
+## Getting Started
+This library follows the [Python API](https://pytorch.org/audio/). Many methods and options are missing at the moment. PRs welcome!
+## Datasets
+Load a dataset
+```ruby
+TorchAudio::Datasets::YESNO.new(".", download: true)
+```
+Supported datasets are:
+- [YESNO](http://www.openslr.org/1/)
+## Disclaimer
+This library downloads and prepares public datasets. We don’t host any datasets. Be sure to adhere to the license for each dataset.
+If you’re a dataset owner and wish to update any details or remove it from this project, let us know.
+## SoX Installation
+### Mac
+```sh
+brew install sox
+```
+### Windows
+TODO: Windows installation instructions have not been written yet.
+### Ubuntu
+```sh
+sudo apt install sox libsox-dev libsox-fmt-all
+```
+### Travis CI
+Add to `.travis.yml`:
+```yml
+addons:
+  apt:
+    packages:
+      - sox
+      - libsox-dev
+      - libsox-fmt-all
+```
+## History
+View the [changelog](https://github.com/ankane/torchaudio/blob/master/CHANGELOG.md)
+## Contributing
+Everyone is encouraged to help improve this project. Here are a few ways you can help:
+- [Report bugs](https://github.com/ankane/torchaudio/issues)
+- Fix bugs and [submit pull requests](https://github.com/ankane/torchaudio/pulls)
+- Write, clarify, or fix documentation
+- Suggest or add new features
+To get started with development:
+```sh
+git clone https://github.com/ankane/torchaudio.git
+cd torchaudio
+bundle install
+bundle exec rake compile
+bundle exec rake test
+```

data/ext/torchaudio/csrc/register.cpp ADDED

@@ -0,0 +1,65 @@
+#ifndef TORCHAUDIO_REGISTER_H
+#define TORCHAUDIO_REGISTER_H
+#include <torchaudio/csrc/sox_effects.h>
+#include <torchaudio/csrc/sox_io.h>
+#include <torchaudio/csrc/sox_utils.h>
+namespace torchaudio {
+namespace {
+////////////////////////////////////////////////////////////////////////////////
+// sox_utils.h -- registers sox_utils::TensorSignal as custom class "TensorSignal" in the "torchaudio" namespace: a (Tensor, int64_t, bool) constructor plus three getters.
+////////////////////////////////////////////////////////////////////////////////
+static auto registerTensorSignal =
+    torch::class_<sox_utils::TensorSignal>("torchaudio", "TensorSignal")
+        .def(torch::init<torch::Tensor, int64_t, bool>())
+        .def("get_tensor", &sox_utils::TensorSignal::getTensor)
+        .def("get_sample_rate", &sox_utils::TensorSignal::getSampleRate)
+        .def("get_channels_first", &sox_utils::TensorSignal::getChannelsFirst);
+////////////////////////////////////////////////////////////////////////////////
+// sox_io.h -- registers custom class "SignalInfo" (three getters, no constructor exposed) and the sox_io_get_info / sox_io_load_audio_file / sox_io_save_audio_file operators; the operator schemas below refer to the registered classes by their __torch__.torch.classes.torchaudio.* names.
+////////////////////////////////////////////////////////////////////////////////
+static auto registerSignalInfo =
+    torch::class_<sox_io::SignalInfo>("torchaudio", "SignalInfo")
+        .def("get_sample_rate", &sox_io::SignalInfo::getSampleRate)
+        .def("get_num_channels", &sox_io::SignalInfo::getNumChannels)
+        .def("get_num_frames", &sox_io::SignalInfo::getNumFrames);
+static auto registerGetInfo = torch::RegisterOperators().op(
+    torch::RegisterOperators::options()
+        .schema(
+            "torchaudio::sox_io_get_info(str path) -> __torch__.torch.classes.torchaudio.SignalInfo info")
+        .catchAllKernel<decltype(sox_io::get_info), &sox_io::get_info>());
+static auto registerLoadAudioFile = torch::RegisterOperators().op(
+    torch::RegisterOperators::options()
+        .schema(
+            "torchaudio::sox_io_load_audio_file(str path, int frame_offset, int num_frames, bool normalize, bool channels_first) -> __torch__.torch.classes.torchaudio.TensorSignal signal")
+        .catchAllKernel<
+            decltype(sox_io::load_audio_file),
+            &sox_io::load_audio_file>());
+static auto registerSaveAudioFile = torch::RegisterOperators().op(
+    torch::RegisterOperators::options()
+        .schema(
+            "torchaudio::sox_io_save_audio_file(str path, __torch__.torch.classes.torchaudio.TensorSignal signal, float compression) -> ()")
+        .catchAllKernel<
+            decltype(sox_io::save_audio_file),
+            &sox_io::save_audio_file>());
+////////////////////////////////////////////////////////////////////////////////
+// sox_effects.h -- registers three operators in one chained RegisterOperators object: sox_effects_initialize_sox_effects, sox_effects_shutdown_sox_effects and sox_effects_list_effects.
+////////////////////////////////////////////////////////////////////////////////
+static auto registerSoxEffects =
+    torch::RegisterOperators(
+        "torchaudio::sox_effects_initialize_sox_effects",
+        &sox_effects::initialize_sox_effects)
+        .op("torchaudio::sox_effects_shutdown_sox_effects",
+            &sox_effects::shutdown_sox_effects)
+        .op("torchaudio::sox_effects_list_effects", &sox_effects::list_effects);
+} // namespace
+} // namespace torchaudio
+#endif

data/ext/torchaudio/csrc/sox.cpp ADDED

@@ -0,0 +1,361 @@
+#include <torchaudio/csrc/sox.h>
+#include <algorithm>
+#include <cstdint>
+#include <stdexcept>
+#include <vector>
+namespace torch {
+namespace audio {
+namespace {
+/// Owning wrapper around a sox_format_t handle: the wrapped pointer is passed
+/// to sox_close() when the wrapper is destroyed, unless it is null.
+/// Copying and moving are both disabled.
+struct SoxDescriptor {
+  explicit SoxDescriptor(sox_format_t* handle) noexcept : fd_(handle) {}
+  SoxDescriptor(const SoxDescriptor&) = delete;
+  SoxDescriptor& operator=(const SoxDescriptor&) = delete;
+  SoxDescriptor(SoxDescriptor&&) = delete;
+  SoxDescriptor& operator=(SoxDescriptor&&) = delete;
+  ~SoxDescriptor() {
+    if (fd_) {
+      sox_close(fd_);
+    }
+  }
+  /// Returns the wrapped handle; it is null when a null pointer was wrapped.
+  sox_format_t* get() noexcept {
+    return fd_;
+  }
+  /// Member access on the wrapped handle; no null check is performed here.
+  sox_format_t* operator->() noexcept {
+    return fd_;
+  }
+ private:
+  sox_format_t* fd_;
+};
+/// Copies every element of `tensor` into a temporary sox_sample_t buffer and
+/// hands that buffer to sox_write on `fd`.
+/// Returns the count reported by sox_write.
+int64_t write_audio(SoxDescriptor& fd, at::Tensor tensor) {
+  const auto element_count = tensor.numel();
+  std::vector<sox_sample_t> samples(element_count);
+  AT_DISPATCH_ALL_TYPES(tensor.scalar_type(), "write_audio_buffer", [&] {
+    auto* src = tensor.data_ptr<scalar_t>();
+    std::copy(src, src + element_count, samples.begin());
+  });
+  return sox_write(fd.get(), samples.data(), samples.size());
+}
+/// Reads up to `buffer_length` samples from `fd` with sox_read, resizes
+/// `output` to (samples_read / channels) x channels and copies the samples
+/// into it.
+/// Throws std::runtime_error when sox_read reports zero samples.
+void read_audio(
+    SoxDescriptor& fd,
+    at::Tensor output,
+    int64_t buffer_length) {
+  std::vector<sox_sample_t> samples(buffer_length);
+  int channels = fd->signal.channels;
+  const int64_t got = sox_read(fd.get(), samples.data(), buffer_length);
+  if (got == 0) {
+    throw std::runtime_error(
+        "Error reading audio file: empty file or read failed in sox_read");
+  }
+  // Shape the destination as frames x channels before filling it.
+  output.resize_({got / channels, channels});
+  output = output.contiguous();
+  AT_DISPATCH_ALL_TYPES(output.scalar_type(), "read_audio_buffer", [&] {
+    auto* dst = output.data_ptr<scalar_t>();
+    const auto first = samples.begin();
+    std::copy(first, first + got, dst);
+  });
+}
+} // namespace
+/// Opens `file_name` for reading without any signal, encoding or file-type
+/// hints and returns copies of the signal and encoding info found on the
+/// opened handle.
+/// Throws std::runtime_error if sox_open_read returns null.
+std::tuple<sox_signalinfo_t, sox_encodinginfo_t> get_info(
+    const std::string& file_name
+  ) {
+  SoxDescriptor reader(
+      sox_open_read(file_name.c_str(), nullptr, nullptr, nullptr));
+  if (!reader.get()) {
+    throw std::runtime_error("Error opening audio file");
+  }
+  return std::tuple<sox_signalinfo_t, sox_encodinginfo_t>(
+      reader->signal, reader->encoding);
+}
+/// Loads audio from `file_name` into `output` and returns the sample rate
+/// found on the opened handle.
+///
+/// `offset` and `nframes` are multiplied by the channel count before use;
+/// a non-positive `nframes` puts no cap on the amount read. `si`, `ei` and
+/// `ft` are forwarded unchanged to sox_open_read. When `ch_first` is set the
+/// tensor is transposed in place after reading.
+/// Throws std::runtime_error if the file cannot be opened, its reported
+/// length is zero, the scaled offset exceeds that length, or sox_seek
+/// returns SOX_EOF.
+int read_audio_file(
+    const std::string& file_name,
+    at::Tensor output,
+    bool ch_first,
+    int64_t nframes,
+    int64_t offset,
+    sox_signalinfo_t* si,
+    sox_encodinginfo_t* ei,
+    const char* ft) {
+  SoxDescriptor fd(sox_open_read(file_name.c_str(), si, ei, ft));
+  if (!fd.get()) {
+    throw std::runtime_error("Error opening audio file");
+  }
+  // Snapshot the stream properties reported on the handle.
+  const int channels = fd->signal.channels;
+  const int rate = fd->signal.rate;
+  const int64_t total_samples = fd->signal.length;
+  // Scale the frame-based arguments by the channel count.
+  offset *= channels;
+  nframes *= channels;
+  if (total_samples == 0) {
+    throw std::runtime_error("Error reading audio file: unknown length");
+  }
+  if (offset > total_samples) {
+    throw std::runtime_error("Offset past EOF");
+  }
+  // Whatever remains after the offset, optionally capped by nframes.
+  int64_t to_read = (offset > 0) ? total_samples - offset : total_samples;
+  if (nframes > 0) {
+    to_read = std::min(to_read, nframes);
+  }
+  // Position the stream at the offset before any data is read.
+  if (sox_seek(fd.get(), offset, 0) == SOX_EOF) {
+    throw std::runtime_error("sox_seek reached EOF, try reducing offset or num_samples");
+  }
+  read_audio(fd, output, to_read);
+  // L x C -> C x L, if desired
+  if (ch_first) {
+    output.transpose_(1, 0);
+  }
+  return rate;
+}
+/// Writes the contents of `tensor` to `file_name` through sox_open_write,
+/// forwarding `si`, `ei` and `file_type` to it.
+/// On SoX >= 14.3.0 `si->mult` is reset to null before the file is opened.
+/// Throws std::runtime_error if the tensor is not contiguous, the file cannot
+/// be opened for writing, or fewer samples than tensor.numel() were written.
+void write_audio_file(
+    const std::string& file_name,
+    const at::Tensor& tensor,
+    sox_signalinfo_t* si,
+    sox_encodinginfo_t* ei,
+    const char* file_type) {
+  if (!tensor.is_contiguous()) {
+    throw std::runtime_error(
+        "Error writing audio file: input tensor must be contiguous");
+  }
+#if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
+  si->mult = nullptr;
+#endif
+  SoxDescriptor writer(
+      sox_open_write(file_name.c_str(), si, ei, file_type, nullptr, nullptr));
+  if (!writer.get()) {
+    throw std::runtime_error(
+        "Error writing audio file: could not open file for writing");
+  }
+  if (write_audio(writer, tensor) != tensor.numel()) {
+    throw std::runtime_error(
+        "Error writing audio file: could not write entire buffer");
+  }
+}
+/// Runs the audio in `file_name` through a SoX effects chain, stores the
+/// processed samples in `otensor` and returns the resulting sample rate.
+///
+/// @param file_name       Path of the audio file to read.
+/// @param otensor         Output tensor; resized and filled in place.
+/// @param ch_first        When true the filled tensor is transposed in place.
+/// @param target_signal   Signal info for the output; when null it is derived
+///                        from the input file.
+/// @param target_encoding Encoding info for the output; when null a signed
+///                        PCM encoding at the input's precision is used.
+/// @param file_type       File type passed to the in-memory writer/reader
+///                        (the temporary file used on Apple is always "wav").
+/// @param pyeffs          Effects to apply, in order; processing stops at an
+///                        effect named "no_effects".
+/// @param max_num_eopts   Unused; option arrays are sized from each effect's
+///                        own option list. Kept for interface compatibility.
+/// @throws std::runtime_error if a file/stream cannot be opened, an effect is
+///         unknown or has invalid options, or no samples could be read back.
+int build_flow_effects(const std::string& file_name,
+                       at::Tensor otensor,
+                       bool ch_first,
+                       sox_signalinfo_t* target_signal,
+                       sox_encodinginfo_t* target_encoding,
+                       const char* file_type,
+                       std::vector<SoxEffect> pyeffs,
+                       int max_num_eopts) {
+  /* This function builds an effects flow and puts the results into a tensor.
+     It can also be used to re-encode audio using any of the available encoding
+     options in SoX including sample rate and channel re-encoding.              */
+  (void)max_num_eopts;
+  // Deletes the effects chain on every exit path, including exceptions.
+  struct ChainGuard {
+    sox_effects_chain_t* ptr;
+    ~ChainGuard() {
+      if (ptr != nullptr) {
+        sox_delete_effects_chain(ptr);
+      }
+    }
+  };
+  // Frees the sox_effect_t returned by sox_create_effect once it has been
+  // handed to sox_add_effect (or when an exception is thrown first).
+  struct EffectGuard {
+    sox_effect_t* ptr;
+    ~EffectGuard() {
+      free(ptr);
+    }
+  };
+  // Looks an effect up by name and instantiates it, throwing instead of
+  // passing a null handler on to SoX.
+  const auto create_effect = [](const std::string& name) {
+    const sox_effect_handler_t* handler = sox_find_effect(name.c_str());
+    if (handler == nullptr) {
+      throw std::runtime_error("unknown SoX effect: " + name);
+    }
+    sox_effect_t* effect = sox_create_effect(handler);
+    if (effect == nullptr) {
+      throw std::runtime_error("could not create SoX effect: " + name);
+    }
+    return effect;
+  };
+#ifdef __APPLE__
+  // According to Mozilla Deepspeech sox_open_memstream_write doesn't work
+  // with OSX
+  char tmp_name[] = "/tmp/fileXXXXXX";
+  const int tmp_fd = mkstemp(tmp_name);
+  if (tmp_fd == -1) {
+    throw std::runtime_error("Error opening output memstream/temporary file");
+  }
+  close(tmp_fd);
+  // Removes the temporary file on every exit path.
+  struct TmpFileGuard {
+    const char* path;
+    ~TmpFileGuard() {
+      unlink(path);
+    }
+  } tmp_guard{tmp_name};
+#else
+  // create buffer and buffer_size for output in memwrite
+  char* buffer = nullptr;
+  size_t buffer_size = 0;
+  // Frees the memstream buffer on every exit path; declared before any handle
+  // that uses the buffer so that it is released last.
+  struct BufferGuard {
+    char*& ptr;
+    ~BufferGuard() {
+      free(ptr);
+    }
+  } buffer_guard{buffer};
+  // Copy of the output handle's signal info, taken before the handle is
+  // closed (the handle must not be touched after sox_close).
+  sox_signalinfo_t output_signal{};
+#endif
+  // only used if target signal or encoding are null
+  sox_signalinfo_t empty_signal{};
+  sox_encodinginfo_t empty_encoding{};
+  // signal info threaded through the chain, intermediate steps change this in-place
+  sox_signalinfo_t interm_signal{};
+  {
+    // open input
+    SoxDescriptor input(
+        sox_open_read(file_name.c_str(), nullptr, nullptr, nullptr));
+    if (input.get() == nullptr) {
+      throw std::runtime_error("Error opening audio file");
+    }
+    // set signalinfo and encodinginfo if blank
+    if (target_signal == nullptr) {
+      target_signal = &empty_signal;
+      target_signal->rate = input->signal.rate;
+      target_signal->channels = input->signal.channels;
+      target_signal->length = SOX_UNSPEC;
+      target_signal->precision = input->signal.precision;
+#if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
+      target_signal->mult = nullptr;
+#endif
+    }
+    if (target_encoding == nullptr) {
+      target_encoding = &empty_encoding;
+      target_encoding->encoding = SOX_ENCODING_SIGN2; // Sample format
+      target_encoding->bits_per_sample = input->signal.precision; // Bits per sample
+      target_encoding->compression = 0.0; // Compression factor
+      target_encoding->reverse_bytes = sox_option_default; // Should bytes be reversed
+      target_encoding->reverse_nibbles = sox_option_default; // Should nibbles be reversed
+      target_encoding->reverse_bits = sox_option_default; // Should bits be reversed (pairs of bits?)
+      target_encoding->opposite_endian = sox_false; // Reverse endianness
+    }
+    // check for rate or channels effect and change the output signalinfo accordingly
+    for (const SoxEffect& se : pyeffs) {
+      if (se.eopts.empty()) {
+        continue; // nothing to parse; avoids indexing an empty option list
+      }
+      if (se.ename == "rate") {
+        target_signal->rate = std::stod(se.eopts[0]);
+      } else if (se.ename == "channels") {
+        target_signal->channels = std::stoi(se.eopts[0]);
+      }
+    }
+    interm_signal = input->signal;
+#ifdef __APPLE__
+    SoxDescriptor output(sox_open_write(
+        tmp_name, target_signal, target_encoding, "wav", nullptr, nullptr));
+#else
+    // in-memory descriptor (this may not work for OSX)
+    SoxDescriptor output(sox_open_memstream_write(
+        &buffer,
+        &buffer_size,
+        target_signal,
+        target_encoding,
+        file_type,
+        nullptr));
+#endif
+    if (output.get() == nullptr) {
+      throw std::runtime_error("Error opening output memstream/temporary file");
+    }
+    {
+      // Setup the effects chain to decode/resample
+      ChainGuard chain{
+          sox_create_effects_chain(&input->encoding, &output->encoding)};
+      if (chain.ptr == nullptr) {
+        throw std::runtime_error("could not create SoX effects chain");
+      }
+      char* io_args[1];
+      {
+        EffectGuard e{create_effect("input")};
+        io_args[0] = reinterpret_cast<char*>(input.get());
+        sox_effect_options(e.ptr, 1, io_args);
+        sox_add_effect(chain.ptr, e.ptr, &interm_signal, &input->signal);
+      }
+      for (const SoxEffect& tae : pyeffs) {
+        if (tae.ename == "no_effects") {
+          break;
+        }
+        EffectGuard e{create_effect(tae.ename)};
+        e.ptr->global_info->global_info->verbosity = 1;
+        if (tae.eopts.empty() || tae.eopts[0] == "") {
+          sox_effect_options(e.ptr, 0, nullptr);
+        } else {
+          // One slot per option, so the array can never be overrun.
+          std::vector<char*> sox_args;
+          sox_args.reserve(tae.eopts.size());
+          for (const std::string& opt : tae.eopts) {
+            sox_args.push_back(const_cast<char*>(opt.c_str()));
+          }
+          if (sox_effect_options(
+                  e.ptr,
+                  static_cast<int>(sox_args.size()),
+                  sox_args.data()) != SOX_SUCCESS) {
+            throw std::runtime_error("invalid effect options, see SoX docs for details");
+          }
+        }
+        sox_add_effect(chain.ptr, e.ptr, &interm_signal, &output->signal);
+      }
+      {
+        EffectGuard e{create_effect("output")};
+        io_args[0] = reinterpret_cast<char*>(output.get());
+        sox_effect_options(e.ptr, 1, io_args);
+        sox_add_effect(chain.ptr, e.ptr, &interm_signal, &output->signal);
+      }
+      // Finally run the effects chain
+      sox_flow_effects(chain.ptr, nullptr, nullptr);
+    } // effects chain deleted here
+#ifndef __APPLE__
+    output_signal = output->signal;
+#endif
+    // Leaving this scope closes output first, then input; the buffer does not
+    // get properly sized until these are closed.
+  }
+  int sr;
+  // Read the in-memory audio buffer or temp file that we just wrote.
+#ifdef __APPLE__
+  /*
+     Temporary filetype must have a valid header.  Wav seems to work here while
+     raw does not.  Certain effects like chorus caused strange behavior on the mac.
+  */
+  // read_audio_file reads the temporary file and returns the sr and otensor;
+  // the temporary file itself is removed by tmp_guard on return.
+  sr = read_audio_file(tmp_name, otensor, ch_first, 0, 0,
+                       target_signal, target_encoding, "wav");
+#else
+  // Resize output tensor to desired dimensions, different effects result in output signal length,
+  // interm_signal.length and buffer size being inconsistent with the result of the file output.
+  // We prioritize in the order: output signal length > interm_signal.length > buffer_size
+  // Could be related to: https://sourceforge.net/p/sox/bugs/314/
+  int nc, ns;
+  if (output_signal.length == 0) {
+    // sometimes interm_signal length is extremely large, but the buffer_size
+    // is double the length of the output signal
+    if (interm_signal.length > (buffer_size * 10)) {
+      ns = buffer_size / 2;
+    } else {
+      ns = interm_signal.length;
+    }
+    nc = interm_signal.channels;
+  } else {
+    nc = output_signal.channels;
+    ns = output_signal.length;
+  }
+  if (nc <= 0) {
+    throw std::runtime_error("Error reading audio file: invalid channel count");
+  }
+  otensor.resize_({ns/nc, nc});
+  otensor = otensor.contiguous();
+  SoxDescriptor mem_input(sox_open_mem_read(
+      buffer, buffer_size, target_signal, target_encoding, file_type));
+  if (mem_input.get() == nullptr) {
+    throw std::runtime_error("Error opening audio file");
+  }
+  std::vector<sox_sample_t> samples(buffer_size);
+  const int64_t samples_read =
+      sox_read(mem_input.get(), samples.data(), buffer_size);
+  // Same failure mode as read_audio, which the temporary-file path goes through.
+  if (samples_read == 0) {
+    throw std::runtime_error(
+        "Error reading audio file: empty file or read failed in sox_read");
+  }
+  // Never copy more samples than the resized tensor can hold.
+  const int64_t samples_to_copy =
+      std::min<int64_t>(samples_read, otensor.numel());
+  AT_DISPATCH_ALL_TYPES(otensor.scalar_type(), "effects_buffer", [&] {
+    auto* data = otensor.data_ptr<scalar_t>();
+    std::copy(samples.begin(), samples.begin() + samples_to_copy, data);
+  });
+  if (ch_first) {
+    otensor.transpose_(1, 0);
+  }
+  sr = target_signal->rate;
+  // mem_input is closed and then the buffer is freed when the guards unwind.
+#endif
+  // return sample rate, output tensor modified in-place
+  return sr;
+}
+} // namespace audio
+} // namespace torch