npm - react-native-executorch - Versions diffs - 0.5.15 → 0.6.0 - Mend

react-native-executorch 0.5.15 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (277) hide show

package/common/rnexecutorch/models/text_to_image/Encoder.h ADDED Viewed

@@ -0,0 +1,32 @@
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include <ReactCommon/CallInvoker.h>
+#include <jsi/jsi.h>
+#include <rnexecutorch/jsi/OwningArrayBuffer.h>
+#include <rnexecutorch/models/embeddings/text/TextEmbeddings.h>
+namespace rnexecutorch {
+namespace models::text_to_image {
+using namespace facebook;
+/// Text-encoding stage of the text-to-image pipeline.
+///
+/// Holds an embeddings::TextEmbeddings model and the CallInvoker handed to the
+/// constructor. Only declarations are visible in this header.
+class Encoder final {
+public:
+  /// @param tokenizerSource identifies the tokenizer (presumably a path —
+  ///        confirm against the implementation)
+  /// @param encoderSource   identifies the encoder model (presumably a path —
+  ///        confirm against the implementation)
+  /// @param callInvoker     React Native call invoker
+  explicit Encoder(const std::string &tokenizerSource,
+                   const std::string &encoderSource,
+                   std::shared_ptr<react::CallInvoker> callInvoker);
+  /// Encodes `input` and returns the result as a flat vector of floats.
+  /// NOTE(review): the layout of the returned data is not visible from this
+  /// header — check the implementation before relying on it.
+  std::vector<float> generate(std::string input);
+  /// Unloads the encoder; never throws.
+  void unload() noexcept;
+  /// Returns a lower bound on the memory used by the encoder; never throws.
+  size_t getMemoryLowerBound() const noexcept;
+private:
+  std::shared_ptr<react::CallInvoker> callInvoker;
+  std::unique_ptr<embeddings::TextEmbeddings> encoder;
+};
+} // namespace models::text_to_image
+} // namespace rnexecutorch

package/common/rnexecutorch/models/text_to_image/Scheduler.cpp ADDED Viewed

@@ -0,0 +1,152 @@
+// The implementation of the PNDMScheduler class from the diffusers library
+// https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_pndm.py
+#include "Scheduler.h"
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+namespace rnexecutorch::models::text_to_image {
+using namespace facebook;
+// Precomputes the noise schedule for `numTrainTimesteps` training steps.
+//
+// sqrt(beta) is interpolated linearly from sqrt(betaStart) to sqrt(betaEnd)
+// and squared to obtain betas[t]; alphas[t] = 1 - betas[t]; alphasCumprod[t]
+// is the running product of the alphas up to and including t.
+//
+// `callInvoker` is accepted for signature parity with the other pipeline
+// stages but is not used by the scheduler.
+Scheduler::Scheduler(float betaStart, float betaEnd, int32_t numTrainTimesteps,
+                     int32_t stepsOffset,
+                     std::shared_ptr<react::CallInvoker> callInvoker)
+    : numTrainTimesteps(numTrainTimesteps), stepsOffset(stepsOffset) {
+  const float start = std::sqrt(betaStart);
+  const float end = std::sqrt(betaEnd);
+  // A single training step leaves no interval to interpolate over; dividing by
+  // (numTrainTimesteps - 1) == 0 would fill the schedule with inf/NaN.
+  const float step =
+      numTrainTimesteps > 1
+          ? (end - start) / static_cast<float>(numTrainTimesteps - 1)
+          : 0.0f;
+  float runningProduct = 1.0f;
+  alphas.reserve(numTrainTimesteps);
+  // alphasCumprod[t] — fraction of the signal remaining after t steps
+  alphasCumprod.reserve(numTrainTimesteps);
+  // betas[t] — amount of noise injected at timestep t
+  betas.reserve(numTrainTimesteps);
+  for (int32_t i = 0; i < numTrainTimesteps; ++i) {
+    const float value = start + step * i;
+    const float beta = value * value;
+    betas.push_back(beta);
+    const float alpha = 1.0f - beta;
+    alphas.push_back(alpha);
+    runningProduct *= alpha;
+    alphasCumprod.push_back(runningProduct);
+  }
+  // finalAlphaCumprod — signal at the first training step (highest
+  // signal-to-noise ratio) used as reference at the end of diffusion process.
+  // With an empty schedule the in-class default is kept.
+  if (!alphasCumprod.empty()) {
+    finalAlphaCumprod = alphasCumprod[0];
+  }
+}
+// Chooses the timesteps visited during inference and clears the stored noise
+// history (`ets`).
+//
+// For numInferenceSteps >= 2 the result holds numInferenceSteps + 1 entries in
+// descending order: evenly spaced timesteps shifted by `stepsOffset`, with the
+// second-to-last one (in ascending order) duplicated. For fewer than two steps
+// the schedule is the single timestep 1.
+void Scheduler::setTimesteps(size_t numInferenceSteps) {
+  this->numInferenceSteps = numInferenceSteps;
+  ets.clear();
+  if (numInferenceSteps < 2) {
+    timesteps = {1};
+    return;
+  }
+  const float numStepsRatio =
+      static_cast<float>(numTrainTimesteps) / numInferenceSteps;
+  timesteps.clear();
+  timesteps.reserve(numInferenceSteps + 1);
+  for (size_t idx = 0; idx < numInferenceSteps; ++idx) {
+    const auto spaced = static_cast<int32_t>(std::round(idx * numStepsRatio));
+    timesteps.push_back(spaced + stepsOffset);
+  }
+  // Duplicate the timestep to provide enough points for the solver
+  const int32_t duplicated = timesteps[numInferenceSteps - 2];
+  timesteps.insert(timesteps.end() - 1, duplicated);
+  std::ranges::reverse(timesteps);
+}
+// Advances the sample by one scheduler step.
+//
+// `noise` is the model output for `sample` at `timestep`. The function keeps a
+// history of up to four noise predictions (`ets`) and combines them with
+// linear multistep coefficients before delegating to getPrevSample():
+//   * 1st call after setTimesteps(): the noise is used as is and `sample` is
+//     remembered in `tempFirstSample`;
+//   * 2nd call: the noise is averaged with the stored one and applied to the
+//     remembered first sample (`sample` itself is not read);
+//   * later calls: 2-, 3- or 4-point multistep combination.
+//
+// Throws std::runtime_error if setTimesteps() has not set a non-zero step
+// count, or if `noise` differs in size from the stored history.
+std::vector<float> Scheduler::step(const std::vector<float> &sample,
+                                   const std::vector<float> &noise,
+                                   int32_t timestep) {
+  if (numInferenceSteps == 0) {
+    throw std::runtime_error(
+        "Number of inference steps is not set. Call `set_timesteps` first.");
+  }
+  const size_t noiseSize = noise.size();
+  const float numStepsRatio =
+      static_cast<float>(numTrainTimesteps) / numInferenceSteps;
+  // Truncated towards zero, exactly as the implicit float -> int32_t
+  // conversion at the getPrevSample() call used to do.
+  const auto timestepPrev = static_cast<int32_t>(timestep - numStepsRatio);
+  if (ets.empty()) {
+    ets.push_back(noise);
+    tempFirstSample = sample;
+    return getPrevSample(sample, noise, timestep, timestepPrev);
+  }
+  // Every stored prediction is indexed with i < noiseSize below.
+  if (ets.back().size() != noiseSize) {
+    throw std::runtime_error(
+        "Noise size does not match the previous scheduler steps.");
+  }
+  std::vector<float> etsOutput(noiseSize);
+  // Use the previous sample as the estimate requires at least 2 points
+  if (ets.size() == 1 && !tempFirstSample.empty()) {
+    for (size_t i = 0; i < noiseSize; i++) {
+      etsOutput[i] = (noise[i] + ets[0][i]) / 2;
+    }
+    const auto timestepNext = static_cast<int32_t>(timestep + numStepsRatio);
+    auto prevSample =
+        getPrevSample(tempFirstSample, etsOutput, timestepNext, timestep);
+    tempFirstSample.clear();
+    return prevSample;
+  }
+  // Coefficients come from the linear multistep method
+  // https://en.wikipedia.org/wiki/Linear_multistep_method
+  ets.push_back(noise);
+  // Keep only the four most recent predictions. erase() is used because
+  // vector::assign() must not be given iterators into the vector itself.
+  if (ets.size() > 4) {
+    ets.erase(ets.begin(), ets.end() - 4);
+  }
+  if (ets.size() == 2) {
+    for (size_t i = 0; i < noiseSize; i++) {
+      etsOutput[i] = (ets[1][i] * 3 - ets[0][i]) / 2;
+    }
+  } else if (ets.size() == 3) {
+    for (size_t i = 0; i < noiseSize; i++) {
+      etsOutput[i] = ((ets[2][i] * 23 - ets[1][i] * 16) + ets[0][i] * 5) / 12;
+    }
+  } else {
+    for (size_t i = 0; i < noiseSize; i++) {
+      etsOutput[i] =
+          (ets[3][i] * 55 - ets[2][i] * 59 + ets[1][i] * 37 - ets[0][i] * 9) /
+          24;
+    }
+  }
+  return getPrevSample(sample, etsOutput, timestep, timestepPrev);
+}
+// Computes the sample at `timestepPrev` from the sample at `timestep`.
+//
+// `noise` is first combined with `sample` as
+//   noise * sqrt(alpha) + sample * sqrt(beta)
+// and the result is scaled by `noiseCoeff` and subtracted from the rescaled
+// sample. A negative `timestepPrev` selects `finalAlphaCumprod`.
+//
+// Returns a vector with noise.size() elements.
+// Throws std::runtime_error if a timestep lies outside the precomputed
+// schedule or if `sample` has fewer elements than `noise`.
+std::vector<float> Scheduler::getPrevSample(const std::vector<float> &sample,
+                                            const std::vector<float> &noise,
+                                            int32_t timestep,
+                                            int32_t timestepPrev) const {
+  const size_t scheduleSize = alphasCumprod.size();
+  const bool timestepValid =
+      timestep >= 0 && static_cast<size_t>(timestep) < scheduleSize;
+  const bool timestepPrevValid =
+      timestepPrev < 0 || static_cast<size_t>(timestepPrev) < scheduleSize;
+  if (!timestepValid || !timestepPrevValid) {
+    throw std::runtime_error(
+        "Timestep is outside of the scheduler's training range.");
+  }
+  const size_t noiseSize = noise.size();
+  if (sample.size() < noiseSize) {
+    throw std::runtime_error("Sample is smaller than the noise prediction.");
+  }
+  const float alpha = alphasCumprod[timestep];
+  const float alphaPrev =
+      timestepPrev >= 0 ? alphasCumprod[timestepPrev] : finalAlphaCumprod;
+  const float beta = 1 - alpha;
+  const float betaPrev = 1 - alphaPrev;
+  const float noiseCoeff =
+      (alphaPrev - alpha) /
+      (alpha * std::sqrt(betaPrev) + std::sqrt(alpha * beta * alphaPrev));
+  const float sampleCoeff = std::sqrt(alphaPrev / alpha);
+  // Loop-invariant factors, hoisted out of the per-element loop.
+  const float sqrtAlpha = std::sqrt(alpha);
+  const float sqrtBeta = std::sqrt(beta);
+  std::vector<float> samplePrev;
+  samplePrev.reserve(noiseSize);
+  for (size_t i = 0; i < noiseSize; i++) {
+    const float noiseTerm =
+        (noise[i] * sqrtAlpha + sample[i] * sqrtBeta) * noiseCoeff;
+    samplePrev.push_back(sample[i] * sampleCoeff - noiseTerm);
+  }
+  return samplePrev;
+}
+} // namespace rnexecutorch::models::text_to_image

package/common/rnexecutorch/models/text_to_image/Scheduler.h ADDED Viewed

@@ -0,0 +1,41 @@
+#pragma once
+#include <memory>
+#include <vector>
+#include <ReactCommon/CallInvoker.h>
+namespace rnexecutorch::models::text_to_image {
+using namespace facebook;
+/// Multistep noise scheduler used by the text-to-image pipeline.
+///
+/// Usage: construct once, call setTimesteps() before each generation, then
+/// call step() once per entry of `timesteps`.
+class Scheduler final {
+public:
+  /// @param betaStart         first beta of the training schedule
+  /// @param betaEnd           last beta of the training schedule
+  /// @param numTrainTimesteps number of entries in the training schedule
+  /// @param stepsOffset       value added to every inference timestep
+  /// @param callInvoker       React Native call invoker
+  explicit Scheduler(float betaStart, float betaEnd, int32_t numTrainTimesteps,
+                     int32_t stepsOffset,
+                     std::shared_ptr<react::CallInvoker> callInvoker);
+  /// Fills `timesteps` for `numInferenceSteps` steps and resets the history.
+  void setTimesteps(size_t numInferenceSteps);
+  /// Returns the next sample given the current `sample`, the model's `noise`
+  /// prediction and the current `timestep`.
+  std::vector<float> step(const std::vector<float> &sample,
+                          const std::vector<float> &noise, int32_t timestep);
+  /// Timesteps to iterate over, written by setTimesteps().
+  std::vector<int32_t> timesteps;
+private:
+  int32_t numTrainTimesteps;
+  int32_t stepsOffset;
+  std::vector<float> betas;
+  std::vector<float> alphas;
+  std::vector<float> alphasCumprod;
+  /// Sample passed to the first step(); consumed by the second one.
+  std::vector<float> tempFirstSample;
+  /// History of noise predictions used by the multistep formulas.
+  std::vector<std::vector<float>> ets;
+  float finalAlphaCumprod{1.0f};
+  /// 0 until setTimesteps() is called with a non-zero value.
+  size_t numInferenceSteps{0};
+  std::vector<float> getPrevSample(const std::vector<float> &sample,
+                                   const std::vector<float> &noise,
+                                   int32_t timestep,
+                                   int32_t prevTimestep) const;
+};
+} // namespace rnexecutorch::models::text_to_image

package/common/rnexecutorch/models/text_to_image/TextToImage.cpp ADDED Viewed

@@ -0,0 +1,141 @@
+#include "TextToImage.h"
+#include <cmath>
+#include <random>
+#include <span>
+#include <executorch/extension/tensor/tensor.h>
+#include <rnexecutorch/Log.h>
+#include <rnexecutorch/models/text_to_image/Constants.h>
+namespace rnexecutorch::models::text_to_image {
+using namespace executorch::extension;
+TextToImage::TextToImage(const std::string &tokenizerSource,
+                         const std::string &encoderSource,
+                         const std::string &unetSource,
+                         const std::string &decoderSource,
+                         float schedulerBetaStart, float schedulerBetaEnd,
+                         int32_t schedulerNumTrainTimesteps,
+                         int32_t schedulerStepsOffset,
+                         std::shared_ptr<react::CallInvoker> callInvoker)
+    : callInvoker(callInvoker),
+      scheduler(std::make_unique<Scheduler>(
+          schedulerBetaStart, schedulerBetaEnd, schedulerNumTrainTimesteps,
+          schedulerStepsOffset, callInvoker)),
+      encoder(std::make_unique<Encoder>(tokenizerSource, encoderSource,
+                                        callInvoker)),
+      unet(std::make_unique<UNet>(unetSource, callInvoker)),
+      decoder(std::make_unique<Decoder>(decoderSource, callInvoker)) {}
+// Stores the requested output size and propagates the derived latent size
+// (imageSize / 8) to the UNet and the decoder.
+//
+// Throws std::runtime_error unless `imageSize` is a positive multiple of 32.
+void TextToImage::setImageSize(int32_t imageSize) {
+  // 0 and negative multiples of 32 pass the modulo test below but would lead
+  // to empty or negative latent dimensions.
+  if (imageSize <= 0) {
+    throw std::runtime_error("Image size must be positive.");
+  }
+  if (imageSize % 32 != 0) {
+    throw std::runtime_error("Image size must be a multiple of 32.");
+  }
+  this->imageSize = imageSize;
+  constexpr int32_t latentDownsample = 8;
+  // Integer division already truncates; no floating-point floor is needed.
+  latentImageSize = imageSize / latentDownsample;
+  unet->latentImageSize = latentImageSize;
+  decoder->latentImageSize = latentImageSize;
+}
+// Leaves a non-negative `seed` untouched; replaces a negative one with a value
+// drawn from std::random_device. The drawn value is converted to int32_t and
+// may therefore itself be negative.
+void TextToImage::setSeed(int32_t &seed) {
+  const bool seedProvided = seed >= 0;
+  if (!seedProvided) {
+    std::random_device entropySource;
+    seed = entropySource();
+  }
+}
+// Runs the whole pipeline for `input`:
+//   1. encode the prompt,
+//   2. draw seeded Gaussian latents,
+//   3. for every scheduler timestep: predict noise with the UNet, mix the two
+//      halves of the prediction with `guidanceScale`, advance the latents and
+//      report the step index through `callback`,
+//   4. divide the latents by `latentsScale`, decode and convert to RGBA.
+//
+// A negative `seed` is replaced by a random one. If interrupt() was called,
+// the loop stops, the flag is reset and an empty buffer is returned.
+// Throws std::runtime_error on an invalid image size, on an encoder output
+// smaller than the embeddings tensor, or when a stage fails.
+std::shared_ptr<OwningArrayBuffer>
+TextToImage::generate(std::string input, int32_t imageSize,
+                      size_t numInferenceSteps, int32_t seed,
+                      std::shared_ptr<jsi::Function> callback) {
+  setImageSize(imageSize);
+  setSeed(seed);
+  std::vector<float> embeddings = encoder->generate(input);
+  std::vector<int32_t> embeddingsShape = {2, 77, 768};
+  // The tensor below only wraps `embeddings`; make sure the buffer really
+  // holds as many values as the shape announces.
+  size_t embeddingsCount = 1;
+  for (const int32_t dim : embeddingsShape) {
+    embeddingsCount *= static_cast<size_t>(dim);
+  }
+  if (embeddings.size() < embeddingsCount) {
+    throw std::runtime_error(
+        "Encoder output is smaller than the expected embeddings tensor.");
+  }
+  auto embeddingsTensor =
+      make_tensor_ptr(embeddingsShape, embeddings.data(), ScalarType::Float);
+  int32_t latentsSize = numChannels * latentImageSize * latentImageSize;
+  std::vector<float> latents(latentsSize);
+  std::mt19937 gen(seed);
+  std::normal_distribution<float> dist(0.0, 1.0);
+  for (auto &val : latents) {
+    val = dist(gen);
+  }
+  scheduler->setTimesteps(numInferenceSteps);
+  // step() never modifies the schedule, so a reference is sufficient.
+  const std::vector<int32_t> &timesteps = scheduler->timesteps;
+  auto nativeCallback = [this, callback](size_t stepIdx) {
+    if (!callback) {
+      return;
+    }
+    this->callInvoker->invokeAsync([callback, stepIdx](jsi::Runtime &runtime) {
+      callback->call(runtime, jsi::Value(static_cast<int32_t>(stepIdx)));
+    });
+  };
+  // Iterate over the schedule itself: it has numInferenceSteps + 1 entries
+  // for two or more steps, but only one entry otherwise, so a fixed
+  // `numInferenceSteps + 1` bound would read past its end.
+  for (size_t t = 0; t < timesteps.size() && !interrupted; t++) {
+    log(LOG_LEVEL::Debug, "Step:", t, "/", numInferenceSteps);
+    std::vector<float> noisePred =
+        unet->generate(latents, timesteps[t], embeddingsTensor);
+    // First half: first batch entry, second half: second batch entry.
+    size_t noiseSize = noisePred.size() / 2;
+    std::span<const float> noisePredSpan{noisePred};
+    std::span<const float> noiseUncond = noisePredSpan.subspan(0, noiseSize);
+    std::span<const float> noiseText =
+        noisePredSpan.subspan(noiseSize, noiseSize);
+    std::vector<float> noise(noiseSize);
+    for (size_t i = 0; i < noiseSize; i++) {
+      noise[i] =
+          noiseUncond[i] * (1 - guidanceScale) + noiseText[i] * guidanceScale;
+    }
+    latents = scheduler->step(latents, noise, timesteps[t]);
+    nativeCallback(t);
+  }
+  if (interrupted) {
+    interrupted = false;
+    return std::make_shared<OwningArrayBuffer>(0);
+  }
+  for (auto &val : latents) {
+    val /= latentsScale;
+  }
+  std::vector<float> output = decoder->generate(latents);
+  return postprocess(output);
+}
+// Converts the decoder output to an RGBA byte buffer of imageSize * imageSize
+// pixels. `output` is read as three consecutive floats per pixel; alpha is
+// always 255.
+//
+// Values are truncated to bytes as before, but values outside [0, 255] (and
+// NaN, which maps to 0) are clamped first, because converting such a float to
+// uint8_t is undefined behaviour.
+// Throws std::runtime_error if `output` holds fewer than 3 floats per pixel.
+std::shared_ptr<OwningArrayBuffer>
+TextToImage::postprocess(const std::vector<float> &output) const {
+  // Convert RGB to RGBA
+  const auto imagePixelCount =
+      static_cast<size_t>(static_cast<int64_t>(imageSize) * imageSize);
+  if (output.size() < imagePixelCount * 3) {
+    throw std::runtime_error(
+        "Decoder output is smaller than the requested image.");
+  }
+  const auto toByte = [](float value) -> uint8_t {
+    if (!(value > 0.0f)) {
+      return 0; // negative values and NaN
+    }
+    if (value >= 255.0f) {
+      return 255;
+    }
+    return static_cast<uint8_t>(value);
+  };
+  std::vector<uint8_t> outputRgba(imagePixelCount * 4);
+  for (size_t i = 0; i < imagePixelCount; i++) {
+    outputRgba[i * 4 + 0] = toByte(output[i * 3 + 0]);
+    outputRgba[i * 4 + 1] = toByte(output[i * 3 + 1]);
+    outputRgba[i * 4 + 2] = toByte(output[i * 3 + 2]);
+    outputRgba[i * 4 + 3] = 255;
+  }
+  return std::make_shared<OwningArrayBuffer>(outputRgba);
+}
+// Raises the flag that generate() checks before each denoising step.
+void TextToImage::interrupt() noexcept {
+  interrupted = true;
+}
+// Sum of the lower bounds reported by the encoder, the UNet and the decoder.
+size_t TextToImage::getMemoryLowerBound() const noexcept {
+  const size_t encoderBound = encoder->getMemoryLowerBound();
+  const size_t unetBound = unet->getMemoryLowerBound();
+  const size_t decoderBound = decoder->getMemoryLowerBound();
+  return encoderBound + unetBound + decoderBound;
+}
+// Unloads the three model stages in pipeline order: encoder, UNet, decoder.
+void TextToImage::unload() noexcept {
+  encoder->unload();
+  unet->unload();
+  decoder->unload();
+}
+} // namespace rnexecutorch::models::text_to_image

package/common/rnexecutorch/models/text_to_image/TextToImage.h ADDED Viewed

@@ -0,0 +1,64 @@
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include <ReactCommon/CallInvoker.h>
+#include <jsi/jsi.h>
+#include <rnexecutorch/jsi/OwningArrayBuffer.h>
+#include <rnexecutorch/metaprogramming/ConstructorHelpers.h>
+#include <rnexecutorch/models/text_to_image/Decoder.h>
+#include <rnexecutorch/models/text_to_image/Encoder.h>
+#include <rnexecutorch/models/text_to_image/Scheduler.h>
+#include <rnexecutorch/models/text_to_image/UNet.h>
+namespace rnexecutorch {
+namespace models::text_to_image {
+using namespace facebook;
+/// Text-to-image pipeline built from an encoder, a UNet, a decoder and a
+/// scheduler.
+class TextToImage final {
+public:
+  /// Creates all pipeline stages. The four scheduler* arguments are forwarded
+  /// to the Scheduler; every stage receives `callInvoker`.
+  explicit TextToImage(const std::string &tokenizerSource,
+                       const std::string &encoderSource,
+                       const std::string &unetSource,
+                       const std::string &decoderSource,
+                       float schedulerBetaStart, float schedulerBetaEnd,
+                       int32_t schedulerNumTrainTimesteps,
+                       int32_t schedulerStepsOffset,
+                       std::shared_ptr<react::CallInvoker> callInvoker);
+  /// Generates an image for `input`.
+  /// @param imageSize         output size; must be a multiple of 32
+  /// @param numInferenceSteps number of denoising steps
+  /// @param seed              random seed; a negative value requests a random
+  ///                          seed
+  /// @param callback          invoked with the index of each finished step
+  /// @return the RGBA image, or an empty buffer if generation was interrupted
+  std::shared_ptr<OwningArrayBuffer>
+  generate(std::string input, int32_t imageSize, size_t numInferenceSteps,
+           int32_t seed, std::shared_ptr<jsi::Function> callback);
+  /// Asks a running generate() to stop before its next step.
+  void interrupt() noexcept;
+  /// Sum of the lower bounds of the encoder, UNet and decoder.
+  size_t getMemoryLowerBound() const noexcept;
+  /// Unloads the encoder, UNet and decoder.
+  void unload() noexcept;
+private:
+  void setImageSize(int32_t imageSize);
+  void setSeed(int32_t &seed);
+  std::shared_ptr<OwningArrayBuffer>
+  postprocess(const std::vector<float> &output) const;
+  // The three scalars below are value-initialised so that no member holds an
+  // indeterminate value before generate() assigns it.
+  size_t memorySizeLowerBound{0};
+  int32_t imageSize{0};
+  int32_t latentImageSize{0};
+  static constexpr int32_t numChannels = 4;
+  static constexpr float guidanceScale = 7.5f;
+  static constexpr float latentsScale = 0.18215f;
+  // NOTE(review): plain bool; if interrupt() is called from a different
+  // thread than generate(), this should become std::atomic<bool>.
+  bool interrupted = false;
+  std::shared_ptr<react::CallInvoker> callInvoker;
+  std::unique_ptr<Scheduler> scheduler;
+  std::unique_ptr<Encoder> encoder;
+  std::unique_ptr<UNet> unet;
+  std::unique_ptr<Decoder> decoder;
+};
+} // namespace models::text_to_image
+// Registers the TextToImage constructor. The listed types follow the order of
+// the constructor's parameters (the string sources are listed by value).
+REGISTER_CONSTRUCTOR(models::text_to_image::TextToImage, std::string,
+                     std::string, std::string, std::string, float, float,
+                     int32_t, int32_t, std::shared_ptr<react::CallInvoker>);
+} // namespace rnexecutorch

package/common/rnexecutorch/models/text_to_image/UNet.cpp ADDED Viewed

@@ -0,0 +1,38 @@
+#include "UNet.h"
+namespace rnexecutorch::models::text_to_image {
+using namespace executorch::extension;
+UNet::UNet(const std::string &modelSource,
+           std::shared_ptr<react::CallInvoker> callInvoker)
+    : BaseModel(modelSource, callInvoker) {}
+// Runs one UNet forward pass.
+//
+// `latents` is duplicated to form a batch of two with shape
+// {2, numChannels, latentImageSize, latentImageSize} and passed to the model
+// together with `timestep` and `embeddingsTensor`. The first output tensor is
+// returned as a flat float vector.
+//
+// Throws std::runtime_error if `latents` does not hold exactly
+// numChannels * latentImageSize * latentImageSize values or if the forward
+// call fails.
+std::vector<float> UNet::generate(std::vector<float> &latents, int32_t timestep,
+                                  TensorPtr &embeddingsTensor) const {
+  // The tensor below only wraps the concatenated buffer, so its size has to
+  // agree with the declared shape.
+  const int64_t expectedLatents =
+      static_cast<int64_t>(numChannels) * latentImageSize * latentImageSize;
+  if (latentImageSize <= 0 ||
+      static_cast<int64_t>(latents.size()) != expectedLatents) {
+    throw std::runtime_error(
+        "Latents size does not match the configured latent image size.");
+  }
+  std::vector<float> latentsConcat;
+  // Two copies of the latents are stored back to back.
+  latentsConcat.reserve(2 * latents.size());
+  latentsConcat.insert(latentsConcat.end(), latents.begin(), latents.end());
+  latentsConcat.insert(latentsConcat.end(), latents.begin(), latents.end());
+  std::vector<int32_t> latentsShape = {2, numChannels, latentImageSize,
+                                       latentImageSize};
+  auto timestepTensor =
+      make_tensor_ptr<int64_t>({static_cast<int64_t>(timestep)});
+  auto latentsTensor =
+      make_tensor_ptr(latentsShape, latentsConcat.data(), ScalarType::Float);
+  auto forwardResult =
+      BaseModel::forward({latentsTensor, timestepTensor, embeddingsTensor});
+  if (!forwardResult.ok()) {
+    throw std::runtime_error(
+        "Function forward in UNet failed with error code: " +
+        std::to_string(static_cast<uint32_t>(forwardResult.error())));
+  }
+  auto forwardResultTensor = forwardResult->at(0).toTensor();
+  const auto *dataPtr = forwardResultTensor.const_data_ptr<float>();
+  return {dataPtr, dataPtr + forwardResultTensor.numel()};
+}
+} // namespace rnexecutorch::models::text_to_image

package/common/rnexecutorch/models/text_to_image/UNet.h ADDED Viewed

@@ -0,0 +1,28 @@
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include <executorch/extension/tensor/tensor.h>
+#include <ReactCommon/CallInvoker.h>
+#include <rnexecutorch/models/BaseModel.h>
+namespace rnexecutorch::models::text_to_image {
+using namespace executorch::extension;
+/// UNet stage of the text-to-image pipeline, built on top of BaseModel.
+class UNet final : public BaseModel {
+public:
+  explicit UNet(const std::string &modelSource,
+                std::shared_ptr<react::CallInvoker> callInvoker);
+  /// Runs the model on a duplicated copy of `latents` at `timestep`,
+  /// conditioned on `embeddingsTensor`, and returns the first output tensor
+  /// as a flat float vector.
+  std::vector<float> generate(std::vector<float> &latents, int32_t timestep,
+                              TensorPtr &embeddingsTensor) const;
+  /// Side length of the latent image; must be assigned before generate().
+  /// Starts at 0 so it never holds an indeterminate value.
+  int32_t latentImageSize{0};
+private:
+  static constexpr int32_t numChannels = 4;
+};
+} // namespace rnexecutorch::models::text_to_image

package/common/rnexecutorch/models/voice_activity_detection/Constants.h ADDED Viewed

@@ -0,0 +1,27 @@
+#pragma once
+#include <bit>
+#include <cstddef>
+#include <cstdint>
+namespace rnexecutorch::models::voice_activity_detection::constants {
+// Sample rate in Hz.
+inline constexpr uint32_t kSampleRate = 16000;
+// Milliseconds-to-seconds factor (kept for existing users of the constant).
+inline constexpr auto kMstoSecond = 0.001f;
+inline constexpr uint32_t kWindowSizeMs = 25;
+inline constexpr uint32_t kHopLengthMs = 10;
+// Window and hop lengths in samples. Computed with integer arithmetic:
+// 0.001f is not exactly representable, so truncating a float product is only
+// correct as long as rounding happens to land on or above the integer.
+inline constexpr uint32_t kWindowSize =
+    kSampleRate * kWindowSizeMs / 1000; // 400
+inline constexpr uint32_t kHopLength =
+    kSampleRate * kHopLengthMs / 1000; // 160
+static_assert(kWindowSize == 400 && kHopLength == 160,
+              "window/hop sizes must match the values the model expects");
+inline constexpr auto kPreemphasisCoeff = 0.97f;
+inline constexpr auto kLeftPadding = (kWindowSize - 1) / 2;
+inline constexpr auto kRightPadding = kWindowSize / 2;
+// Window size rounded up to the next power of two.
+inline constexpr auto kPaddedWindowSize = std::bit_ceil(kWindowSize); // 512
+inline constexpr size_t kModelInputMin = 100;
+inline constexpr size_t kModelInputMax = 1000;
+inline constexpr auto kSpeechThreshold = 0.6f;
+inline constexpr size_t kMinSpeechDuration = 25;  // 250 ms
+inline constexpr size_t kMinSilenceDuration = 10; // 100 ms
+inline constexpr size_t kSpeechPad = 3;           // 30 ms
+} // namespace rnexecutorch::models::voice_activity_detection::constants

package/common/rnexecutorch/models/voice_activity_detection/Types.h ADDED Viewed

@@ -0,0 +1,12 @@
+#pragma once
+#include <cstddef>
+namespace rnexecutorch::models::voice_activity_detection::types {
+/// A detected segment described by its two boundaries.
+/// NOTE(review): the unit of the boundaries (samples, frames, ...) and whether
+/// `end` is inclusive are not visible from this header — confirm at the
+/// producer.
+struct Segment {
+  size_t start; ///< position where the segment begins
+  size_t end;   ///< position where the segment ends
+};
+} // namespace rnexecutorch::models::voice_activity_detection::types

package/common/rnexecutorch/models/voice_activity_detection/Utils.cpp ADDED Viewed

@@ -0,0 +1,15 @@
+#include "Utils.h"
+namespace rnexecutorch::models::voice_activity_detection::utils {
+// Copies the first of every `numClass` consecutive floats of `tensor` into
+// `resultVector`, beginning at `startIdx`, for `size` rows.
+//
+// Returns the index just past the last written element (startIdx + size).
+// No bounds are checked: `resultVector` must already hold at least
+// startIdx + size elements and `tensor` at least numClass * (size - 1) + 1
+// floats.
+size_t getNonSpeechClassProbabilites(const executorch::aten::Tensor &tensor,
+                                     size_t numClass, size_t size,
+                                     std::vector<float> &resultVector,
+                                     size_t startIdx) {
+  const float *const probabilities = tensor.const_data_ptr<float>();
+  size_t writeIdx = startIdx;
+  for (size_t row = 0; row < size; ++row, ++writeIdx) {
+    resultVector[writeIdx] = probabilities[numClass * row];
+  }
+  return startIdx + size;
+}
+} // namespace rnexecutorch::models::voice_activity_detection::utils

package/common/rnexecutorch/models/voice_activity_detection/Utils.h ADDED Viewed

@@ -0,0 +1,13 @@
+#pragma once
+#include <cstddef>
+#include <executorch/extension/tensor/tensor.h>
+#include <vector>
+namespace rnexecutorch::models::voice_activity_detection::utils {
+/// Writes one probability per row of `tensor` into `resultVector`.
+/// @param tensor       model output holding float data
+/// @param numClass     number of values per row
+/// @param size         number of rows to read
+/// @param resultVector destination; must be large enough for
+///                     startIdx + size elements
+/// @param startIdx     first index of `resultVector` to write
+/// @return the index following the last written element
+size_t getNonSpeechClassProbabilites(const executorch::aten::Tensor &tensor,
+                                     size_t numClass, size_t size,
+                                     std::vector<float> &resultVector,
+                                     size_t startIdx);
+} // namespace rnexecutorch::models::voice_activity_detection::utils