RubyGems - torchaudio - Versions diffs - 0.1.0 - Mend

torchaudio 0.1.0

Files changed (20) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +3 -0
data/LICENSE.txt +26 -0
data/README.md +93 -0
data/ext/torchaudio/csrc/register.cpp +65 -0
data/ext/torchaudio/csrc/sox.cpp +361 -0
data/ext/torchaudio/csrc/sox.h +71 -0
data/ext/torchaudio/csrc/sox_effects.cpp +54 -0
data/ext/torchaudio/csrc/sox_effects.h +18 -0
data/ext/torchaudio/csrc/sox_io.cpp +170 -0
data/ext/torchaudio/csrc/sox_io.h +41 -0
data/ext/torchaudio/csrc/sox_utils.cpp +245 -0
data/ext/torchaudio/csrc/sox_utils.h +100 -0
data/ext/torchaudio/ext.cpp +33 -0
data/ext/torchaudio/extconf.rb +81 -0
data/lib/torchaudio.rb +95 -0
data/lib/torchaudio/datasets/utils.rb +92 -0
data/lib/torchaudio/datasets/yesno.rb +59 -0
data/lib/torchaudio/version.rb +3 -0
metadata +145 -0

@@ -0,0 +1,71 @@
+#include <sox.h>
+#include <string>
+#include <tuple>
+#include <vector>
+#include <unistd.h>
+// same as <torch/extension.h> without <torch/python.h>
+#include <torch/all.h>
+namespace at {
+struct Tensor;
+} // namespace at
+namespace torch { namespace audio {
+/// Reads an audio file from the given `file_name` into the `output` `Tensor`
+/// and returns the sample rate of the audio file.
+/// Throws `std::runtime_error` if the audio file could not be opened, or an
+/// error occurred during reading of the audio data.
+/// NOTE(review): the definition is not part of this header. `ch_first`,
+/// `nframes` and `offset` presumably select a channels-first layout, the number
+/// of frames to read and the first frame to read; `si`, `ei` and `ft`
+/// presumably are passed on to SoX as signal info, encoding info and file
+/// type -- confirm against sox.cpp.
+int read_audio_file(
+    const std::string& file_name,
+    at::Tensor output,
+    bool ch_first,
+    int64_t nframes,
+    int64_t offset,
+    sox_signalinfo_t* si,
+    sox_encodinginfo_t* ei,
+    const char* ft);
+/// Writes the data of a `Tensor` into an audio file at the given `file_name`,
+/// with a certain file type (e.g. `wav` or `mp3`) and sample rate.
+/// Throws `std::runtime_error` when the audio file could not be opened for
+/// writing, or an error occurred during writing of the audio data.
+/// NOTE(review): the sample rate is presumably carried by `si` rather than by
+/// a dedicated parameter -- confirm against sox.cpp.
+void write_audio_file(
+    const std::string& file_name,
+    const at::Tensor& tensor,
+    sox_signalinfo_t* si,
+    sox_encodinginfo_t* ei,
+    const char* file_type);
+/// Reads an audio file from the given `file_name` and returns a tuple of
+/// `sox_signalinfo_t` and `sox_encodinginfo_t`, which contain information
+/// about the audio file such as sample rate, length, bit precision, encoding
+/// and more.
+/// Throws `std::runtime_error` if the audio file could not be opened, or an
+/// error occurred during reading of the audio data.
+std::tuple<sox_signalinfo_t, sox_encodinginfo_t> get_info(
+    const std::string& file_name);
+/// One entry of an effects chain handed to `build_flow_effects`: the effect
+/// name plus its option strings.
+struct SoxEffect {
+  /// Effect name; empty after default construction.
+  std::string ename;
+  /// Effect options; default construction yields exactly one empty string.
+  std::vector<std::string> eopts;
+  SoxEffect() : ename(), eopts(1) {}
+};
+/// Build a SoX chain, flow the effects, and capture the results in a tensor.
+/// An audio file from the given `file_name` flows through an effects chain
+/// given by a list of effects and effect options (`pyeffs`) to an output
+/// buffer which is encoded into memory to a target signal type
+/// (`target_signal`) and target signal encoding (`target_encoding`). The
+/// resulting buffer is then placed into a tensor (`otensor`). This function
+/// returns the sample rate of the output tensor.
+/// NOTE(review): `max_num_eopts` presumably caps the number of options per
+/// effect -- confirm against sox.cpp, which is not visible here.
+int build_flow_effects(const std::string& file_name,
+                       at::Tensor otensor,
+                       bool ch_first,
+                       sox_signalinfo_t* target_signal,
+                       sox_encodinginfo_t* target_encoding,
+                       const char* file_type,
+                       std::vector<SoxEffect> pyeffs,
+                       int max_num_eopts);
+}} // namespace torch::audio

data/ext/torchaudio/csrc/sox_effects.cpp ADDED

@@ -0,0 +1,54 @@
+#include <sox.h>
+#include <torchaudio/csrc/sox_effects.h>
+using namespace torch::indexing;
+namespace torchaudio {
+namespace sox_effects {
+namespace {
+// Lifecycle of the SoX effects library as tracked by this translation unit.
+enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown };
+// Current lifecycle state; starts as NotInitialized and is advanced by
+// initialize_sox_effects() / shutdown_sox_effects().
+SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;
+} // namespace
+/// Initializes the SoX effects library via `sox_init()`.
+/// Calling it again while already initialized is a no-op.
+/// Throws `std::runtime_error` if the library was shut down before, or if
+/// `sox_init()` does not report success.
+void initialize_sox_effects() {
+  if (SOX_RESOURCE_STATE == ShutDown) {
+    throw std::runtime_error(
+        "SoX Effects has been shut down. Cannot initialize again.");
+  }
+  // Anything other than NotInitialized means there is nothing left to do.
+  if (SOX_RESOURCE_STATE != NotInitialized) {
+    return;
+  }
+  if (sox_init() != SOX_SUCCESS) {
+    throw std::runtime_error("Failed to initialize sox effects.");
+  }
+  SOX_RESOURCE_STATE = Initialized;
+}
+/// Shuts down the SoX effects library via `sox_quit()`.
+/// Calling it again after a successful shutdown is a no-op.
+/// Throws `std::runtime_error` if the library was never initialized, or if
+/// `sox_quit()` does not report success.
+void shutdown_sox_effects() {
+  if (SOX_RESOURCE_STATE == NotInitialized) {
+    throw std::runtime_error(
+        "SoX Effects is not initialized. Cannot shutdown.");
+  }
+  if (SOX_RESOURCE_STATE == Initialized) {
+    if (sox_quit() != SOX_SUCCESS) {
+      // The message names the operation that actually failed (sox_quit).
+      throw std::runtime_error("Failed to shut down sox effects.");
+    }
+    SOX_RESOURCE_STATE = ShutDown;
+  }
+}
+/// Returns the names of all effects reported by `sox_get_effect_fns()`.
+/// Entries whose handler or handler name is null are skipped.
+std::vector<std::string> list_effects() {
+  std::vector<std::string> effect_names;
+  // The function table is terminated by a null entry.
+  for (const sox_effect_fn_t* fn = sox_get_effect_fns(); *fn; ++fn) {
+    const sox_effect_handler_t* info = (*fn)();
+    if (!info || !info->name) {
+      continue;
+    }
+    effect_names.push_back(info->name);
+  }
+  return effect_names;
+}
+} // namespace sox_effects
+} // namespace torchaudio

data/ext/torchaudio/csrc/sox_effects.h ADDED

@@ -0,0 +1,18 @@
+#ifndef TORCHAUDIO_SOX_EFFECTS_H
+#define TORCHAUDIO_SOX_EFFECTS_H
+#include <torch/script.h>
+namespace torchaudio {
+namespace sox_effects {
+/// Initializes the SoX effects library; throws `std::runtime_error` if it was
+/// already shut down or if initialization fails.
+void initialize_sox_effects();
+/// Shuts down the SoX effects library; throws `std::runtime_error` if it was
+/// never initialized or if shutdown fails.
+void shutdown_sox_effects();
+/// Returns the names of the effects known to SoX.
+std::vector<std::string> list_effects();
+} // namespace sox_effects
+} // namespace torchaudio
+#endif

data/ext/torchaudio/csrc/sox_io.cpp ADDED

@@ -0,0 +1,170 @@
+#include <sox.h>
+#include <torchaudio/csrc/sox_io.h>
+#include <torchaudio/csrc/sox_utils.h>
+using namespace torch::indexing;
+using namespace torchaudio::sox_utils;
+namespace torchaudio {
+namespace sox_io {
+SignalInfo::SignalInfo(
+    const int64_t sample_rate_,
+    const int64_t num_channels_,
+    const int64_t num_frames_)
+    : sample_rate(sample_rate_),
+      num_channels(num_channels_),
+      num_frames(num_frames_){};
+/// Returns the stored sample rate.
+int64_t SignalInfo::getSampleRate() const {
+  return sample_rate;
+}
+/// Returns the stored number of channels.
+int64_t SignalInfo::getNumChannels() const {
+  return num_channels;
+}
+/// Returns the stored number of frames.
+int64_t SignalInfo::getNumFrames() const {
+  return num_frames;
+}
+/// Opens the audio file at `path` and returns its sample rate, number of
+/// channels and number of frames (total samples divided by channels).
+/// Throws `std::runtime_error` if the file cannot be opened or if SoX reports
+/// no channel count for it.
+c10::intrusive_ptr<SignalInfo> get_info(const std::string& path) {
+  SoxFormat sf(sox_open_read(
+      path.c_str(),
+      /*signal=*/nullptr,
+      /*encoding=*/nullptr,
+      /*filetype=*/nullptr));
+  if (static_cast<sox_format_t*>(sf) == nullptr) {
+    throw std::runtime_error("Error opening audio file");
+  }
+  // libsox uses 0 channels for "unknown"; dividing by it below would be an
+  // integer division by zero.
+  if (sf->signal.channels == 0) {
+    throw std::runtime_error(
+        "Error opening audio file: unknown number of channels.");
+  }
+  return c10::make_intrusive<SignalInfo>(
+      static_cast<int64_t>(sf->signal.rate),
+      static_cast<int64_t>(sf->signal.channels),
+      static_cast<int64_t>(sf->signal.length / sf->signal.channels));
+}
+/// Loads (part of) the audio file at `path` into a tensor.
+///
+/// `frame_offset` is the first frame to read and must be non-negative.
+/// `num_frames` is the number of frames to read, or -1 to read up to the end
+/// of the file; a value that exceeds the available frames is clamped.
+/// `normalize` and `channels_first` are forwarded to `convert_to_tensor`.
+///
+/// Returns a `TensorSignal` holding the tensor, the file's sample rate and
+/// `channels_first`.
+/// Throws `std::runtime_error` on invalid arguments, if the file fails
+/// validation, if seeking fails or the offset lies beyond the reported
+/// length, or if no sample could be read.
+c10::intrusive_ptr<TensorSignal> load_audio_file(
+    const std::string& path,
+    const int64_t frame_offset,
+    const int64_t num_frames,
+    const bool normalize,
+    const bool channels_first) {
+  if (frame_offset < 0) {
+    throw std::runtime_error(
+        "Invalid argument: frame_offset must be non-negative.");
+  }
+  if (num_frames == 0 || num_frames < -1) {
+    throw std::runtime_error(
+        "Invalid argument: num_frames must be -1 or greater than 0.");
+  }
+  SoxFormat sf(sox_open_read(
+      path.c_str(),
+      /*signal=*/nullptr,
+      /*encoding=*/nullptr,
+      /*filetype=*/nullptr));
+  validate_input_file(sf);
+  const int64_t num_channels = sf->signal.channels;
+  const int64_t num_total_samples = sf->signal.length;
+  const int64_t sample_start = sf->signal.channels * frame_offset;
+  if (sox_seek(sf, sample_start, 0) == SOX_EOF) {
+    throw std::runtime_error("Error reading audio file: offset past EOF.");
+  }
+  const int64_t sample_end = [&]() {
+    if (num_frames == -1)
+      return num_total_samples;
+    const int64_t sample_end_ = num_channels * num_frames + sample_start;
+    if (num_total_samples < sample_end_) {
+      // For lossy encoding, it is difficult to predict the exact buffer size
+      // needed for the requested number of samples, so we allocate for the
+      // given `num_frames` and ask sox to read as much as possible. For
+      // lossless formats sox reads the exact number of samples, but for lossy
+      // encoding (i.e. mp3) it can end up reading less. To keep the behavior
+      // consistent between the two, `num_frames` may exceed the number of
+      // available samples and is adjusted here.
+      return num_total_samples;
+    }
+    return sample_end_;
+  }();
+  const int64_t max_samples = sample_end - sample_start;
+  // A start position beyond the reported length would otherwise turn into a
+  // huge unsigned buffer size below.
+  if (max_samples < 0) {
+    throw std::runtime_error("Error reading audio file: offset past EOF.");
+  }
+  // Read samples into buffer. The buffer must actually contain `max_samples`
+  // elements: reserve() only changes capacity, so writing through data()
+  // after a reserve() would be out of bounds.
+  std::vector<sox_sample_t> buffer(static_cast<size_t>(max_samples));
+  const int64_t num_samples = sox_read(sf, buffer.data(), max_samples);
+  if (num_samples == 0) {
+    throw std::runtime_error(
+        "Error reading audio file: empty file or read operation failed.");
+  }
+  // NOTE: num_samples may be smaller than max_samples if the input
+  // format is compressed (i.e. mp3).
+  // Convert to Tensor
+  auto tensor = convert_to_tensor(
+      buffer.data(),
+      num_samples,
+      num_channels,
+      get_dtype(sf->encoding.encoding, sf->signal.precision),
+      normalize,
+      channels_first);
+  return c10::make_intrusive<TensorSignal>(
+      tensor, static_cast<int64_t>(sf->signal.rate), channels_first);
+}
+/// Writes `signal` to the file `file_name`; the file type is taken from the
+/// file name's extension and `compression` is forwarded to the encoding info.
+/// The tensor is written in chunks of 65536 frames after conversion to 32-bit
+/// integer samples.
+/// Throws `std::runtime_error` if the output file cannot be opened or a chunk
+/// is not written completely; errors from tensor validation and from
+/// signal/encoding info construction propagate as well.
+void save_audio_file(
+    const std::string& file_name,
+    const c10::intrusive_ptr<TensorSignal>& signal,
+    const double compression) {
+  const auto tensor = signal->getTensor();
+  const auto sample_rate = signal->getSampleRate();
+  const auto channels_first = signal->getChannelsFirst();
+
+  validate_input_tensor(tensor);
+
+  const auto filetype = get_filetype(file_name);
+  const auto signal_info =
+      get_signalinfo(tensor, sample_rate, channels_first, filetype);
+  const auto encoding_info =
+      get_encodinginfo(filetype, tensor.dtype(), compression);
+
+  SoxFormat sf(sox_open_write(
+      file_name.c_str(),
+      &signal_info,
+      &encoding_info,
+      /*filetype=*/filetype.c_str(),
+      /*oob=*/nullptr,
+      /*overwrite_permitted=*/nullptr));
+  if (static_cast<sox_format_t*>(sf) == nullptr) {
+    throw std::runtime_error("Error saving audio file: failed to open file.");
+  }
+
+  // Samples are written frame by frame, so use a (frames, channels) view.
+  const auto frame_major = channels_first ? tensor.t() : tensor;
+  const int64_t total_frames = frame_major.size(0);
+  const int64_t frames_per_chunk = 65536;
+
+  int64_t first_frame = 0;
+  while (first_frame < total_frames) {
+    const int64_t past_last_frame = first_frame + frames_per_chunk;
+    const auto samples = unnormalize_wav(
+        frame_major.index({Slice(first_frame, past_last_frame), Slice()}))
+                             .contiguous();
+    const size_t numel = samples.numel();
+    if (sox_write(sf, samples.data_ptr<int32_t>(), numel) != numel) {
+      throw std::runtime_error(
+          "Error saving audio file: failed to write the entier buffer.");
+    }
+    first_frame = past_last_frame;
+  }
+}
+} // namespace sox_io
+} // namespace torchaudio

data/ext/torchaudio/csrc/sox_io.h ADDED

@@ -0,0 +1,41 @@
+#ifndef TORCHAUDIO_SOX_IO_H
+#define TORCHAUDIO_SOX_IO_H
+#include <torch/script.h>
+#include <torchaudio/csrc/sox_utils.h>
+namespace torchaudio {
+namespace sox_io {
+/// Basic properties of an audio signal, held as a torch custom class.
+struct SignalInfo : torch::CustomClassHolder {
+  // Sample rate of the signal.
+  int64_t sample_rate;
+  // Number of channels.
+  int64_t num_channels;
+  // Number of frames.
+  int64_t num_frames;
+  /// Stores the three values as given.
+  SignalInfo(
+      const int64_t sample_rate_,
+      const int64_t num_channels_,
+      const int64_t num_frames_);
+  /// Accessors for the stored values.
+  int64_t getSampleRate() const;
+  int64_t getNumChannels() const;
+  int64_t getNumFrames() const;
+};
+/// Returns the signal information of the audio file at `path`.
+c10::intrusive_ptr<SignalInfo> get_info(const std::string& path);
+/// Loads the audio file at `path` into a `TensorSignal`.
+/// By default reads from the first frame (`frame_offset = 0`) up to the end
+/// of the file (`num_frames = -1`), with `normalize` and `channels_first`
+/// both enabled.
+c10::intrusive_ptr<torchaudio::sox_utils::TensorSignal> load_audio_file(
+    const std::string& path,
+    const int64_t frame_offset = 0,
+    const int64_t num_frames = -1,
+    const bool normalize = true,
+    const bool channels_first = true);
+/// Saves `signal` to the file `file_name`; `compression` defaults to 0.
+void save_audio_file(
+    const std::string& file_name,
+    const c10::intrusive_ptr<torchaudio::sox_utils::TensorSignal>& signal,
+    const double compression = 0.);
+} // namespace sox_io
+} // namespace torchaudio
+#endif

data/ext/torchaudio/csrc/sox_utils.cpp ADDED

@@ -0,0 +1,245 @@
+#include <c10/core/ScalarType.h>
+#include <sox.h>
+#include <torchaudio/csrc/sox_utils.h>
+namespace torchaudio {
+namespace sox_utils {
+/// Bundles a tensor with its sample rate and its channel layout flag.
+TensorSignal::TensorSignal(
+    torch::Tensor tensor_,
+    int64_t sample_rate_,
+    bool channels_first_)
+    : tensor(tensor_), sample_rate(sample_rate_), channels_first(channels_first_) {
+}
+/// Returns the stored tensor.
+torch::Tensor TensorSignal::getTensor() const {
+  return tensor;
+}
+/// Returns the stored sample rate.
+int64_t TensorSignal::getSampleRate() const {
+  return sample_rate;
+}
+/// Returns the stored channels-first flag.
+bool TensorSignal::getChannelsFirst() const {
+  return channels_first;
+}
+/// Wraps the given `sox_format_t*`; `fd` may be null (e.g. a failed open).
+SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {}
+/// Closes the wrapped handle with `sox_close` unless it is null.
+SoxFormat::~SoxFormat() {
+  if (fd_ != nullptr) {
+    sox_close(fd_);
+  }
+}
+/// Member access on the wrapped handle; callers must check for null first.
+sox_format_t* SoxFormat::operator->() const noexcept {
+  return fd_;
+}
+/// Conversion to the wrapped raw pointer, e.g. for passing to SoX functions.
+SoxFormat::operator sox_format_t*() const noexcept {
+  return fd_;
+}
+/// Checks that `sf` refers to a successfully opened file with a known
+/// encoding and a non-zero sample count.
+/// Throws `std::runtime_error` describing the first check that fails.
+void validate_input_file(const SoxFormat& sf) {
+  const sox_format_t* const handle = static_cast<sox_format_t*>(sf);
+  if (handle == nullptr) {
+    throw std::runtime_error("Error loading audio file: failed to open file.");
+  }
+  const bool encoding_known = handle->encoding.encoding != SOX_ENCODING_UNKNOWN;
+  if (!encoding_known) {
+    throw std::runtime_error("Error loading audio file: unknown encoding.");
+  }
+  const bool has_samples = handle->signal.length != 0;
+  if (!has_samples) {
+    throw std::runtime_error("Error reading audio file: unkown length.");
+  }
+}
+/// Checks that `tensor` can be written out: it must live on the CPU, be
+/// two-dimensional, and have dtype float32, int32, int16 or uint8.
+/// Throws `std::runtime_error` describing the first check that fails.
+void validate_input_tensor(const torch::Tensor tensor) {
+  const bool on_cpu = tensor.device().is_cpu();
+  if (!on_cpu) {
+    throw std::runtime_error("Input tensor has to be on CPU.");
+  }
+  const bool is_2d = tensor.ndimension() == 2;
+  if (!is_2d) {
+    throw std::runtime_error("Input tensor has to be 2D.");
+  }
+  const auto dtype = tensor.dtype();
+  const bool supported_dtype = dtype == torch::kFloat32 ||
+      dtype == torch::kInt32 || dtype == torch::kInt16 ||
+      dtype == torch::kUInt8;
+  if (!supported_dtype) {
+    throw std::runtime_error(
+        "Input tensor has to be one of float32, int32, int16 or uint8 type.");
+  }
+}
+/// Maps a SoX encoding and bit precision to the tensor dtype used for it:
+/// unsigned PCM -> uint8, signed PCM -> int16 or int32 depending on
+/// `precision`, everything else -> float32.
+/// Throws `std::runtime_error` for signed PCM with a precision other than 16
+/// or 32 bits.
+caffe2::TypeMeta get_dtype(
+    const sox_encoding_t encoding,
+    const unsigned precision) {
+  // 8-bit PCM WAV
+  if (encoding == SOX_ENCODING_UNSIGNED) {
+    return c10::scalarTypeToTypeMeta(torch::kUInt8);
+  }
+  // 16-bit or 32-bit PCM WAV
+  if (encoding == SOX_ENCODING_SIGN2) {
+    if (precision == 16) {
+      return c10::scalarTypeToTypeMeta(torch::kInt16);
+    }
+    if (precision == 32) {
+      return c10::scalarTypeToTypeMeta(torch::kInt32);
+    }
+    throw std::runtime_error(
+        "Only 16 and 32 bits are supported for signed PCM.");
+  }
+  // Every other format (32-bit floating-point WAV, MP3, FLAC, VORBIS, ...)
+  // is represented as float32.
+  return c10::scalarTypeToTypeMeta(torch::kFloat32);
+}
+/// Converts `num_samples` interleaved 32-bit SoX samples in `buffer` into a
+/// tensor.
+///
+/// The samples are first viewed as an int32 tensor of shape
+/// (num_samples / num_channels, num_channels). If `normalize` is set or
+/// `dtype` is float32, the result is float32 with positive values divided by
+/// 2147483647 and negative values by 2147483648. Otherwise the values are
+/// converted to the integer `dtype` (int32, int16 or uint8).
+/// With `channels_first` the result is transposed to (channels, frames).
+/// Throws `std::runtime_error` for any other `dtype`.
+torch::Tensor convert_to_tensor(
+    sox_sample_t* buffer,
+    const int32_t num_samples,
+    const int32_t num_channels,
+    const caffe2::TypeMeta dtype,
+    const bool normalize,
+    const bool channels_first) {
+  auto t = torch::from_blob(
+      buffer, {num_samples / num_channels, num_channels}, torch::kInt32);
+  // Note: a Tensor created with from_blob does not own the data but borrows
+  // it, so make sure to create a new copy after processing samples.
+  if (normalize || dtype == torch::kFloat32) {
+    t = t.to(torch::kFloat32);
+    // Sign-dependent scale factor; zeros are left untouched (factor 0).
+    t *= (t > 0) / 2147483647. + (t < 0) / 2147483648.;
+  } else if (dtype == torch::kInt32) {
+    t = t.clone();
+  } else if (dtype == torch::kInt16) {
+    // In-place division acts on the borrowed `buffer`; keeps the top 16 bits.
+    t.floor_divide_(1 << 16);
+    t = t.to(torch::kInt16);
+  } else if (dtype == torch::kUInt8) {
+    // In-place on the borrowed `buffer`: keep the top 8 bits, then shift the
+    // signed range by 128 before converting to unsigned.
+    t.floor_divide_(1 << 24);
+    t += 128;
+    t = t.to(torch::kUInt8);
+  } else {
+    throw std::runtime_error("Unsupported dtype.");
+  }
+  if (channels_first) {
+    t = t.transpose(1, 0);
+  }
+  return t.contiguous();
+}
+/// Converts `input_tensor` to 32-bit integer samples.
+///
+/// float32 values are scaled by 2147483647 (positive) or 2147483648
+/// (negative), clamped to the int32 range and converted; int32 is returned
+/// as is; int16 is multiplied by 65536; uint8 is shifted by -128 and
+/// multiplied by 16777216.
+/// Throws `std::runtime_error` for any other dtype.
+torch::Tensor unnormalize_wav(const torch::Tensor input_tensor) {
+  const auto dtype = input_tensor.dtype();
+  auto tensor = input_tensor;
+  if (dtype == torch::kFloat32) {
+    double multi_pos = 2147483647.;
+    double multi_neg = -2147483648.;
+    // multi_neg is negative, so subtracting it yields a positive factor for
+    // negative samples as well.
+    auto mult = (tensor > 0) * multi_pos - (tensor < 0) * multi_neg;
+    // `tensor *=` below acts on the float64 copy made here.
+    tensor = tensor.to(torch::dtype(torch::kFloat64));
+    tensor *= mult;
+    tensor.clamp_(multi_neg, multi_pos);
+    tensor = tensor.to(torch::dtype(torch::kInt32));
+  } else if (dtype == torch::kInt32) {
+    // already denormalized
+  } else if (dtype == torch::kInt16) {
+    tensor = tensor.to(torch::dtype(torch::kInt32));
+    tensor *= ((tensor != 0) * 65536);
+  } else if (dtype == torch::kUInt8) {
+    tensor = tensor.to(torch::dtype(torch::kInt32));
+    tensor -= 128;
+    tensor *= 16777216;
+  } else {
+    throw std::runtime_error("Unexpected dtype.");
+  }
+  return tensor;
+}
+/// Returns the lower-cased text after the last '.' in `path`.
+/// If `path` contains no '.', the whole path is returned lower-cased.
+const std::string get_filetype(const std::string path) {
+  std::string ext = path.substr(path.find_last_of(".") + 1);
+  // tolower() is only defined for values representable as unsigned char (or
+  // EOF); a plain char may be negative for non-ASCII bytes, so go through
+  // unsigned char.
+  std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) {
+    return static_cast<char>(::tolower(c));
+  });
+  return ext;
+}
+/// Picks the SoX encoding for writing a file of type `filetype` from a tensor
+/// of type `dtype`. Only "wav" depends on `dtype`.
+/// Throws `std::runtime_error` for unsupported file types, or for "wav" with
+/// an unsupported dtype.
+sox_encoding_t get_encoding(
+    const std::string filetype,
+    const caffe2::TypeMeta dtype) {
+  if (filetype == "wav") {
+    if (dtype == torch::kUInt8) {
+      return SOX_ENCODING_UNSIGNED;
+    }
+    if (dtype == torch::kInt16 || dtype == torch::kInt32) {
+      return SOX_ENCODING_SIGN2;
+    }
+    if (dtype == torch::kFloat32) {
+      return SOX_ENCODING_FLOAT;
+    }
+    throw std::runtime_error("Unsupported dtype.");
+  }
+  if (filetype == "mp3") {
+    return SOX_ENCODING_MP3;
+  }
+  if (filetype == "flac") {
+    return SOX_ENCODING_FLAC;
+  }
+  if (filetype == "ogg" || filetype == "vorbis") {
+    return SOX_ENCODING_VORBIS;
+  }
+  throw std::runtime_error("Unsupported file type.");
+}
+/// Returns the bit precision to use when writing a file of type `filetype`
+/// from a tensor of type `dtype`: unspecified for mp3/ogg/vorbis, 24 for
+/// flac, and the dtype's bit width for wav.
+/// Throws `std::runtime_error` for unsupported file types, or for "wav" with
+/// an unsupported dtype.
+unsigned get_precision(
+    const std::string filetype,
+    const caffe2::TypeMeta dtype) {
+  if (filetype == "wav") {
+    if (dtype == torch::kUInt8) {
+      return 8;
+    }
+    if (dtype == torch::kInt16) {
+      return 16;
+    }
+    if (dtype == torch::kInt32 || dtype == torch::kFloat32) {
+      return 32;
+    }
+    throw std::runtime_error("Unsupported dtype.");
+  }
+  if (filetype == "flac") {
+    return 24;
+  }
+  if (filetype == "mp3" || filetype == "ogg" || filetype == "vorbis") {
+    return SOX_UNSPEC;
+  }
+  throw std::runtime_error("Unsupported file type.");
+}
+/// Builds the SoX signal description for writing `tensor`: the sample rate,
+/// the channel count (dimension 0 if `channels_first`, else dimension 1), the
+/// precision chosen by `get_precision`, and the total number of elements as
+/// the length.
+sox_signalinfo_t get_signalinfo(
+    const torch::Tensor& tensor,
+    const int64_t sample_rate,
+    const bool channels_first,
+    const std::string filetype) {
+  const int64_t channel_dim = channels_first ? 0 : 1;
+  const auto rate = static_cast<sox_rate_t>(sample_rate);
+  const auto channels = static_cast<unsigned>(tensor.size(channel_dim));
+  const unsigned precision = get_precision(filetype, tensor.dtype());
+  const auto length = static_cast<uint64_t>(tensor.numel());
+  return sox_signalinfo_t{rate, channels, precision, length};
+}
+/// Builds the SoX encoding description for writing a file of type `filetype`
+/// from a tensor of type `dtype`. `compression` is passed through for mp3,
+/// flac, ogg and vorbis, and replaced by 0 for wav.
+/// Throws `std::runtime_error` for unsupported file types or dtypes.
+sox_encodinginfo_t get_encodinginfo(
+    const std::string filetype,
+    const caffe2::TypeMeta dtype,
+    const double compression) {
+  const bool takes_compression = filetype == "mp3" || filetype == "flac" ||
+      filetype == "ogg" || filetype == "vorbis";
+  // Reject unknown file types before looking at encoding or precision.
+  if (!takes_compression && filetype != "wav") {
+    throw std::runtime_error("Unsupported file type.");
+  }
+  const double effective_compression = takes_compression ? compression : 0.;
+  return sox_encodinginfo_t{/*encoding=*/get_encoding(filetype, dtype),
+                            /*bits_per_sample=*/get_precision(filetype, dtype),
+                            /*compression=*/effective_compression,
+                            /*reverse_bytes=*/sox_option_default,
+                            /*reverse_nibbles=*/sox_option_default,
+                            /*reverse_bits=*/sox_option_default,
+                            /*opposite_endian=*/sox_false};
+}
+} // namespace sox_utils
+} // namespace torchaudio