react-native-executorch 0.5.3 → 0.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/CMakeLists.txt +2 -1
- package/common/rnexecutorch/data_processing/Numerical.cpp +27 -19
- package/common/rnexecutorch/data_processing/Numerical.h +53 -4
- package/common/rnexecutorch/data_processing/dsp.cpp +1 -1
- package/common/rnexecutorch/data_processing/dsp.h +1 -1
- package/common/rnexecutorch/data_processing/gzip.cpp +47 -0
- package/common/rnexecutorch/data_processing/gzip.h +7 -0
- package/common/rnexecutorch/host_objects/ModelHostObject.h +24 -0
- package/common/rnexecutorch/metaprogramming/TypeConcepts.h +21 -1
- package/common/rnexecutorch/models/BaseModel.cpp +3 -2
- package/common/rnexecutorch/models/BaseModel.h +3 -2
- package/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +103 -39
- package/common/rnexecutorch/models/speech_to_text/SpeechToText.h +39 -21
- package/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp +310 -0
- package/common/rnexecutorch/models/speech_to_text/asr/ASR.h +62 -0
- package/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.cpp +82 -0
- package/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.h +25 -0
- package/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp +99 -0
- package/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h +33 -0
- package/common/rnexecutorch/models/speech_to_text/types/DecodingOptions.h +15 -0
- package/common/rnexecutorch/models/speech_to_text/types/GenerationResult.h +12 -0
- package/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h +12 -0
- package/common/rnexecutorch/models/speech_to_text/types/Segment.h +14 -0
- package/common/rnexecutorch/models/speech_to_text/types/Word.h +13 -0
- package/lib/module/modules/natural_language_processing/SpeechToTextModule.js +75 -53
- package/lib/module/modules/natural_language_processing/SpeechToTextModule.js.map +1 -1
- package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts +5 -5
- package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts +7 -12
- package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts.map +1 -1
- package/lib/typescript/types/stt.d.ts +0 -9
- package/lib/typescript/types/stt.d.ts.map +1 -1
- package/package.json +1 -1
- package/react-native-executorch.podspec +2 -0
- package/src/modules/natural_language_processing/SpeechToTextModule.ts +118 -54
- package/src/types/stt.ts +0 -12
- package/common/rnexecutorch/models/EncoderDecoderBase.cpp +0 -21
- package/common/rnexecutorch/models/EncoderDecoderBase.h +0 -31
- package/common/rnexecutorch/models/speech_to_text/SpeechToTextStrategy.h +0 -27
- package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.cpp +0 -50
- package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.h +0 -25
- package/lib/Error.js +0 -53
- package/lib/ThreadPool.d.ts +0 -10
- package/lib/ThreadPool.js +0 -28
- package/lib/common/Logger.d.ts +0 -8
- package/lib/common/Logger.js +0 -19
- package/lib/constants/directories.js +0 -2
- package/lib/constants/llmDefaults.d.ts +0 -6
- package/lib/constants/llmDefaults.js +0 -16
- package/lib/constants/modelUrls.d.ts +0 -223
- package/lib/constants/modelUrls.js +0 -322
- package/lib/constants/ocr/models.d.ts +0 -882
- package/lib/constants/ocr/models.js +0 -182
- package/lib/constants/ocr/symbols.js +0 -139
- package/lib/constants/sttDefaults.d.ts +0 -28
- package/lib/constants/sttDefaults.js +0 -68
- package/lib/controllers/LLMController.d.ts +0 -47
- package/lib/controllers/LLMController.js +0 -213
- package/lib/controllers/OCRController.js +0 -67
- package/lib/controllers/SpeechToTextController.d.ts +0 -56
- package/lib/controllers/SpeechToTextController.js +0 -349
- package/lib/controllers/VerticalOCRController.js +0 -70
- package/lib/hooks/computer_vision/useClassification.d.ts +0 -15
- package/lib/hooks/computer_vision/useClassification.js +0 -7
- package/lib/hooks/computer_vision/useImageEmbeddings.d.ts +0 -15
- package/lib/hooks/computer_vision/useImageEmbeddings.js +0 -7
- package/lib/hooks/computer_vision/useImageSegmentation.d.ts +0 -38
- package/lib/hooks/computer_vision/useImageSegmentation.js +0 -7
- package/lib/hooks/computer_vision/useOCR.d.ts +0 -20
- package/lib/hooks/computer_vision/useOCR.js +0 -41
- package/lib/hooks/computer_vision/useObjectDetection.d.ts +0 -15
- package/lib/hooks/computer_vision/useObjectDetection.js +0 -7
- package/lib/hooks/computer_vision/useStyleTransfer.d.ts +0 -15
- package/lib/hooks/computer_vision/useStyleTransfer.js +0 -7
- package/lib/hooks/computer_vision/useVerticalOCR.d.ts +0 -21
- package/lib/hooks/computer_vision/useVerticalOCR.js +0 -43
- package/lib/hooks/general/useExecutorchModule.d.ts +0 -13
- package/lib/hooks/general/useExecutorchModule.js +0 -7
- package/lib/hooks/natural_language_processing/useLLM.d.ts +0 -10
- package/lib/hooks/natural_language_processing/useLLM.js +0 -78
- package/lib/hooks/natural_language_processing/useSpeechToText.d.ts +0 -27
- package/lib/hooks/natural_language_processing/useSpeechToText.js +0 -49
- package/lib/hooks/natural_language_processing/useTextEmbeddings.d.ts +0 -16
- package/lib/hooks/natural_language_processing/useTextEmbeddings.js +0 -7
- package/lib/hooks/natural_language_processing/useTokenizer.d.ts +0 -17
- package/lib/hooks/natural_language_processing/useTokenizer.js +0 -52
- package/lib/hooks/useModule.js +0 -45
- package/lib/hooks/useNonStaticModule.d.ts +0 -20
- package/lib/hooks/useNonStaticModule.js +0 -49
- package/lib/index.d.ts +0 -48
- package/lib/index.js +0 -58
- package/lib/module/utils/SpeechToTextModule/ASR.js +0 -191
- package/lib/module/utils/SpeechToTextModule/ASR.js.map +0 -1
- package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js +0 -73
- package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js.map +0 -1
- package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js +0 -56
- package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js.map +0 -1
- package/lib/module/utils/stt.js +0 -22
- package/lib/module/utils/stt.js.map +0 -1
- package/lib/modules/BaseModule.js +0 -25
- package/lib/modules/BaseNonStaticModule.js +0 -14
- package/lib/modules/computer_vision/ClassificationModule.d.ts +0 -8
- package/lib/modules/computer_vision/ClassificationModule.js +0 -17
- package/lib/modules/computer_vision/ImageEmbeddingsModule.d.ts +0 -8
- package/lib/modules/computer_vision/ImageEmbeddingsModule.js +0 -17
- package/lib/modules/computer_vision/ImageSegmentationModule.d.ts +0 -11
- package/lib/modules/computer_vision/ImageSegmentationModule.js +0 -27
- package/lib/modules/computer_vision/OCRModule.d.ts +0 -14
- package/lib/modules/computer_vision/OCRModule.js +0 -17
- package/lib/modules/computer_vision/ObjectDetectionModule.d.ts +0 -9
- package/lib/modules/computer_vision/ObjectDetectionModule.js +0 -17
- package/lib/modules/computer_vision/StyleTransferModule.d.ts +0 -8
- package/lib/modules/computer_vision/StyleTransferModule.js +0 -17
- package/lib/modules/computer_vision/VerticalOCRModule.d.ts +0 -14
- package/lib/modules/computer_vision/VerticalOCRModule.js +0 -19
- package/lib/modules/general/ExecutorchModule.d.ts +0 -7
- package/lib/modules/general/ExecutorchModule.js +0 -14
- package/lib/modules/natural_language_processing/LLMModule.d.ts +0 -28
- package/lib/modules/natural_language_processing/LLMModule.js +0 -45
- package/lib/modules/natural_language_processing/SpeechToTextModule.d.ts +0 -24
- package/lib/modules/natural_language_processing/SpeechToTextModule.js +0 -36
- package/lib/modules/natural_language_processing/TextEmbeddingsModule.d.ts +0 -9
- package/lib/modules/natural_language_processing/TextEmbeddingsModule.js +0 -21
- package/lib/modules/natural_language_processing/TokenizerModule.d.ts +0 -12
- package/lib/modules/natural_language_processing/TokenizerModule.js +0 -30
- package/lib/native/NativeETInstaller.js +0 -2
- package/lib/native/NativeOCR.js +0 -2
- package/lib/native/NativeVerticalOCR.js +0 -2
- package/lib/native/RnExecutorchModules.d.ts +0 -7
- package/lib/native/RnExecutorchModules.js +0 -18
- package/lib/tsconfig.tsbuildinfo +0 -1
- package/lib/types/common.d.ts +0 -32
- package/lib/types/common.js +0 -25
- package/lib/types/imageSegmentation.js +0 -26
- package/lib/types/llm.d.ts +0 -46
- package/lib/types/llm.js +0 -9
- package/lib/types/objectDetection.js +0 -94
- package/lib/types/ocr.js +0 -1
- package/lib/types/stt.d.ts +0 -94
- package/lib/types/stt.js +0 -85
- package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts +0 -27
- package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts.map +0 -1
- package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts +0 -23
- package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts.map +0 -1
- package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts +0 -13
- package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts.map +0 -1
- package/lib/typescript/utils/stt.d.ts +0 -2
- package/lib/typescript/utils/stt.d.ts.map +0 -1
- package/lib/utils/ResourceFetcher.d.ts +0 -24
- package/lib/utils/ResourceFetcher.js +0 -305
- package/lib/utils/ResourceFetcherUtils.d.ts +0 -54
- package/lib/utils/ResourceFetcherUtils.js +0 -127
- package/lib/utils/llm.d.ts +0 -6
- package/lib/utils/llm.js +0 -72
- package/lib/utils/stt.js +0 -21
- package/src/utils/SpeechToTextModule/ASR.ts +0 -303
- package/src/utils/SpeechToTextModule/OnlineProcessor.ts +0 -87
- package/src/utils/SpeechToTextModule/hypothesisBuffer.ts +0 -79
- package/src/utils/stt.ts +0 -28
package/common/rnexecutorch/data_processing/Numerical.cpp

@@ -9,7 +9,7 @@
 #include <string>
 
 namespace rnexecutorch::numerical {
-void softmax(std::vector<float> &v) {
+void softmax(std::span<float> v) {
   float max = *std::max_element(v.begin(), v.end());
 
   float sum = 0.0f;
@@ -22,32 +22,40 @@ void softmax(std::vector<float> &v) {
   }
 }
 
-void
-
-
-    sum += val * val;
+void softmaxWithTemperature(std::span<float> input, float temperature) {
+  if (input.empty()) {
+    return;
   }
 
-  if (
-
+  if (temperature <= 0.0F) {
+    throw std::invalid_argument(
+        "Temperature must be greater than 0 for softmax with temperature.");
   }
 
-
-
-
+  const auto maxElement = *std::ranges::max_element(input);
+
+  for (auto &value : input) {
+    value = std::exp((value - maxElement) / temperature);
   }
-}
 
-
-
-
-
+  const auto sum = std::reduce(input.begin(), input.end());
+
+  // sum is at least 1 since exp(max - max) == exp(0) == 1
+  for (auto &value : input) {
+    value /= sum;
   }
+}
 
-
-
-
-
+void normalize(std::span<float> input) {
+  const auto sumOfSquares =
+      std::inner_product(input.begin(), input.end(), input.begin(), 0.0F);
+
+  constexpr auto kEpsilon = 1.0e-15F;
+
+  const auto norm = std::sqrt(sumOfSquares) + kEpsilon;
+
+  for (auto &value : input) {
+    value /= norm;
   }
 }
 
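For context, here is a minimal usage sketch of the new span-based API above. The include path and the standalone main harness are assumptions for illustration, not part of the package:

#include <cstdio>
#include <span>
#include <vector>

#include "rnexecutorch/data_processing/Numerical.h" // assumed include path

int main() {
  std::vector<float> logits{2.0F, 1.0F, 0.1F};

  // Plain softmax operates in-place through a span view of the vector.
  std::vector<float> probs = logits;
  rnexecutorch::numerical::softmax(probs);

  // Temperature < 1 sharpens the distribution, > 1 flattens it, <= 0 throws.
  std::vector<float> sharp = logits;
  rnexecutorch::numerical::softmaxWithTemperature(sharp, 0.5F);

  for (size_t i = 0; i < logits.size(); ++i) {
    std::printf("%zu: softmax=%.3f  T=0.5 softmax=%.3f\n", i, probs[i],
                sharp[i]);
  }
  return 0;
}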
package/common/rnexecutorch/data_processing/Numerical.h

@@ -4,10 +4,59 @@
 #include <vector>
 
 namespace rnexecutorch::numerical {
-
-
-
-
+
+/**
+ * @brief Applies the softmax function in-place to a sequence of numbers.
+ *
+ * @param input A mutable span of floating-point numbers. After the function
+ * returns, `input` contains the softmax probabilities.
+ */
+void softmax(std::span<float> input);
+
+/**
+ * @brief Applies the softmax function with temperature scaling in-place to a
+ * sequence of numbers.
+ *
+ * The temperature parameter controls the "sharpness" of the resulting
+ * probability distribution. A temperature of 1.0 means no scaling, while lower
+ * values make the distribution sharper (more peaked), and higher values make it
+ * softer (more uniform).
+ *
+ * @param input A mutable span of floating-point numbers. After the function
+ * returns, `input` contains the softmax probabilities.
+ * @param temperature A positive float value used to scale the logits before
+ * applying softmax. Must be greater than 0.
+ */
+void softmaxWithTemperature(std::span<float> input, float temperature);
+
+/**
+ * @brief Normalizes the elements of the given float span in-place using the
+ * L2 norm method.
+ *
+ * This function scales the input vector such that its L2 norm (Euclidean norm)
+ * becomes 1. If the norm is zero, the result is a zero vector with the same
+ * size as the input.
+ *
+ * @param input A mutable span of floating-point values representing the data to
+ * be normalized.
+ */
+void normalize(std::span<float> input);
+
+/**
+ * @brief Computes mean pooling across the modelOutput adjusted by an attention
+ * mask.
+ *
+ * This function aggregates the `modelOutput` span by sections defined by
+ * `attnMask`, computing the mean of sections influenced by the mask. The result
+ * is a vector where each element is the mean of a segment from the original
+ * data.
+ *
+ * @param modelOutput A span of floating-point numbers representing the model
+ * output.
+ * @param attnMask A span of integers where each integer is a weight
+ * corresponding to the elements in `modelOutput`.
+ * @return A std::vector<float> containing the computed mean values of segments.
+ */
 std::vector<float> meanPooling(std::span<const float> modelOutput,
                                std::span<const int64_t> attnMask);
 /**
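The meanPooling documentation above describes masked mean pooling over token embeddings. As an illustration only (this is not the package's implementation; the row-major [tokens x hiddenSize] layout, the hiddenSize parameter, and treating the mask as binary are assumptions), the computation it describes typically looks like this:

#include <cstdint>
#include <span>
#include <vector>

// Illustrative sketch of attention-mask mean pooling; NOT the package's code.
std::vector<float> meanPoolingSketch(std::span<const float> modelOutput,
                                     std::span<const int64_t> attnMask,
                                     size_t hiddenSize) {
  std::vector<float> pooled(hiddenSize, 0.0F);
  float maskSum = 0.0F;
  for (size_t token = 0; token < attnMask.size(); ++token) {
    if (attnMask[token] == 0) {
      continue; // padding positions do not contribute
    }
    maskSum += 1.0F;
    for (size_t j = 0; j < hiddenSize; ++j) {
      pooled[j] += modelOutput[token * hiddenSize + j];
    }
  }
  if (maskSum > 0.0F) {
    for (float &value : pooled) {
      value /= maskSum; // mean over the unmasked tokens
    }
  }
  return pooled;
}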
package/common/rnexecutorch/data_processing/dsp.cpp

@@ -18,7 +18,7 @@ std::vector<float> hannWindow(size_t size) {
   return window;
 }
 
-std::vector<float> stftFromWaveform(std::span<float> waveform,
+std::vector<float> stftFromWaveform(std::span<const float> waveform,
                                     size_t fftWindowSize, size_t hopSize) {
   // Initialize FFT
   FFT fft(fftWindowSize);
package/common/rnexecutorch/data_processing/dsp.h

@@ -6,7 +6,7 @@
 namespace rnexecutorch::dsp {
 
 std::vector<float> hannWindow(size_t size);
-std::vector<float> stftFromWaveform(std::span<float> waveform,
+std::vector<float> stftFromWaveform(std::span<const float> waveform,
                                     size_t fftWindowSize, size_t hopSize);
 
 } // namespace rnexecutorch::dsp
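The only change in dsp is widening stftFromWaveform to take std::span<const float>. A small self-contained sketch (the stand-in function below is illustrative, not the package API) of what that buys callers:

#include <span>
#include <vector>

// Stand-in with the new-style parameter; a std::span<float> version could not
// be called with a const buffer.
static float firstSample(std::span<const float> waveform) {
  return waveform.empty() ? 0.0F : waveform.front();
}

int main() {
  const std::vector<float> waveform{0.1F, 0.2F, 0.3F};
  // std::span<const float> binds to const data, so read-only callers no longer
  // need to copy or const_cast before computing the STFT.
  return static_cast<int>(firstSample(waveform));
}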
package/common/rnexecutorch/data_processing/gzip.cpp (new file)

@@ -0,0 +1,47 @@
+#include <vector>
+#include <zlib.h>
+
+#include "gzip.h"
+
+namespace rnexecutorch::gzip {
+
+namespace {
+constexpr int32_t kGzipWrapper = 16;     // gzip header/trailer
+constexpr int32_t kMemLevel = 8;         // memory level
+constexpr size_t kChunkSize = 16 * 1024; // 16 KiB stream buffer
+} // namespace
+
+size_t deflateSize(const std::string &input) {
+  z_stream strm{};
+  if (::deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
+                     MAX_WBITS + kGzipWrapper, kMemLevel,
+                     Z_DEFAULT_STRATEGY) != Z_OK) {
+    throw std::runtime_error("deflateInit2 failed");
+  }
+
+  size_t outSize = 0;
+
+  strm.next_in = reinterpret_cast<z_const Bytef *>(
+      const_cast<z_const char *>(input.data()));
+  strm.avail_in = static_cast<uInt>(input.size());
+
+  std::vector<unsigned char> buf(kChunkSize);
+  int ret;
+  do {
+    strm.next_out = buf.data();
+    strm.avail_out = static_cast<uInt>(buf.size());
+
+    ret = ::deflate(&strm, strm.avail_in ? Z_NO_FLUSH : Z_FINISH);
+    if (ret == Z_STREAM_ERROR) {
+      ::deflateEnd(&strm);
+      throw std::runtime_error("deflate stream error");
+    }
+
+    outSize += buf.size() - strm.avail_out;
+  } while (ret != Z_STREAM_END);
+
+  ::deflateEnd(&strm);
+  return outSize;
+}
+
+} // namespace rnexecutorch::gzip
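deflateSize streams the input through a fixed 16 KiB buffer and only accumulates the output byte count, so it reports the gzip-compressed size without keeping the compressed payload in memory. A hedged usage sketch (assuming gzip.h declares rnexecutorch::gzip::deflateSize and that the include path mirrors the file layout above):

#include <cstdio>
#include <string>

#include "rnexecutorch/data_processing/gzip.h" // assumed include path

int main() {
  const std::string text(1024, 'a'); // highly repetitive, compresses well
  const size_t compressedBytes = rnexecutorch::gzip::deflateSize(text);
  std::printf("raw=%zu bytes, gzip=%zu bytes\n", text.size(), compressedBytes);
  return 0;
}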
package/common/rnexecutorch/host_objects/ModelHostObject.h

@@ -62,6 +62,30 @@ public:
                                      "decode"));
   }
 
+  if constexpr (meta::HasTranscribe<Model>) {
+    addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
+                                     promiseHostFunction<&Model::transcribe>,
+                                     "transcribe"));
+  }
+
+  if constexpr (meta::HasStream<Model>) {
+    addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
+                                     promiseHostFunction<&Model::stream>,
+                                     "stream"));
+  }
+
+  if constexpr (meta::HasStreamInsert<Model>) {
+    addFunctions(JSI_EXPORT_FUNCTION(
+        ModelHostObject<Model>, promiseHostFunction<&Model::streamInsert>,
+        "streamInsert"));
+  }
+
+  if constexpr (meta::HasStreamStop<Model>) {
+    addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
+                                     promiseHostFunction<&Model::streamStop>,
+                                     "streamStop"));
+  }
+
   if constexpr (meta::SameAs<Model, TokenizerModule>) {
     addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
                                      promiseHostFunction<&Model::encode>,
package/common/rnexecutorch/metaprogramming/TypeConcepts.h

@@ -26,6 +26,26 @@ concept HasDecode = requires(T t) {
   { &T::decode };
 };
 
+template <typename T>
+concept HasTranscribe = requires(T t) {
+  { &T::transcribe };
+};
+
+template <typename T>
+concept HasStream = requires(T t) {
+  { &T::stream };
+};
+
+template <typename T>
+concept HasStreamInsert = requires(T t) {
+  { &T::streamInsert };
+};
+
+template <typename T>
+concept HasStreamStop = requires(T t) {
+  { &T::streamStop };
+};
+
 template <typename T>
 concept IsNumeric = std::is_arithmetic_v<T>;
 
@@ -34,4 +54,4 @@ concept ProvidesMemoryLowerBound = requires(T t) {
   { &T::getMemoryLowerBound };
 };
 
-} // namespace rnexecutorch::meta
+} // namespace rnexecutorch::meta
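These concepts only check that a member with the given name exists; together with the if constexpr blocks added to ModelHostObject.h above, they let the host object register a JSI binding only for model types that actually provide the method. A self-contained toy (not package code) showing the same detection pattern:

#include <iostream>
#include <string>

template <typename T>
concept HasTranscribe = requires(T t) {
  { &T::transcribe };
};

struct SpeechModel {
  std::string transcribe() { return "hello"; }
};
struct EmbeddingModel {};

template <typename Model> void registerBindings() {
  if constexpr (HasTranscribe<Model>) {
    std::cout << "registering transcribe\n"; // would call addFunctions(...)
  } else {
    std::cout << "no transcribe on this model\n";
  }
}

int main() {
  registerBindings<SpeechModel>();    // prints: registering transcribe
  registerBindings<EmbeddingModel>(); // prints: no transcribe on this model
}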
package/common/rnexecutorch/models/BaseModel.cpp

@@ -142,7 +142,8 @@ BaseModel::getMethodMeta(const std::string &methodName) {
   return module_->method_meta(methodName);
 }
 
-Result<std::vector<EValue>> BaseModel::forward(const EValue &input_evalue) {
+Result<std::vector<EValue>>
+BaseModel::forward(const EValue &input_evalue) const {
   if (!module_) {
     throw std::runtime_error("Model not loaded: Cannot perform forward pass");
   }
@@ -150,7 +151,7 @@ Result<std::vector<EValue>> BaseModel::forward(const EValue &input_evalue) {
 }
 
 Result<std::vector<EValue>>
-BaseModel::forward(const std::vector<EValue> &input_evalues) {
+BaseModel::forward(const std::vector<EValue> &input_evalues) const {
   if (!module_) {
     throw std::runtime_error("Model not loaded: Cannot perform forward pass");
   }
package/common/rnexecutorch/models/BaseModel.h

@@ -26,8 +26,9 @@ public:
   getAllInputShapes(std::string methodName = "forward");
   std::vector<JSTensorViewOut>
   forwardJS(std::vector<JSTensorViewIn> tensorViewVec);
-  Result<std::vector<EValue>> forward(const EValue &input_value);
-  Result<std::vector<EValue>>
+  Result<std::vector<EValue>> forward(const EValue &input_value) const;
+  Result<std::vector<EValue>>
+  forward(const std::vector<EValue> &input_value) const;
   Result<std::vector<EValue>> execute(const std::string &methodName,
                                       const std::vector<EValue> &input_value);
   Result<executorch::runtime::MethodMeta>
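Marking both forward overloads const is what allows inference to be driven through a const BaseModel pointer, for example from SpeechToText's own const methods such as encode and decode. A toy stand-in (not the package's BaseModel) showing the effect:

#include <vector>

struct Model {
  // const-qualified, mirroring the BaseModel::forward change above
  std::vector<float> forward(const std::vector<float> &input) const {
    return input; // placeholder for a real forward pass
  }
};

// Compiles only because forward() is const; a non-const forward() could not
// be called through a const reference.
float firstOutput(const Model &model) {
  return model.forward({1.0F, 2.0F}).front();
}

int main() { return static_cast<int>(firstOutput(Model{})); }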
package/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp

@@ -1,64 +1,128 @@
-#include <
-
-#include
+#include <thread>
+
+#include "SpeechToText.h"
 
 namespace rnexecutorch::models::speech_to_text {
 
 using namespace ::executorch::extension;
+using namespace asr;
+using namespace types;
+using namespace stream;
 
-SpeechToText::SpeechToText(const std::string &
-                           const std::string &
-                           const std::string &
+SpeechToText::SpeechToText(const std::string &encoderSource,
+                           const std::string &decoderSource,
+                           const std::string &tokenizerSource,
                            std::shared_ptr<react::CallInvoker> callInvoker)
-    :
-
-
+    : callInvoker(std::move(callInvoker)),
+      encoder(std::make_unique<BaseModel>(encoderSource, this->callInvoker)),
+      decoder(std::make_unique<BaseModel>(decoderSource, this->callInvoker)),
+      tokenizer(std::make_unique<TokenizerModule>(tokenizerSource,
+                                                  this->callInvoker)),
+      asr(std::make_unique<ASR>(this->encoder.get(), this->decoder.get(),
+                                this->tokenizer.get())),
+      processor(std::make_unique<OnlineASRProcessor>(this->asr.get())),
+      isStreaming(false), readyToProcess(false) {}
+
+std::shared_ptr<OwningArrayBuffer>
+SpeechToText::encode(std::span<float> waveform) const {
+  std::vector<float> encoderOutput = this->asr->encode(waveform);
+  return this->makeOwningBuffer(encoderOutput);
 }
 
-
-
-
-
-
-                          ". Only 'whisper' is supported.");
-}
+std::shared_ptr<OwningArrayBuffer>
+SpeechToText::decode(std::span<int32_t> tokens,
+                     std::span<float> encoderOutput) const {
+  std::vector<float> decoderOutput = this->asr->decode(tokens, encoderOutput);
+  return this->makeOwningBuffer(decoderOutput);
 }
 
-
-
+std::string SpeechToText::transcribe(std::span<float> waveform,
+                                     std::string languageOption) const {
+  std::vector<Segment> segments =
+      this->asr->transcribe(waveform, DecodingOptions(languageOption));
+  std::string transcription;
+
+  size_t transcriptionLength = 0;
+  for (auto &segment : segments) {
+    for (auto &word : segment.words) {
+      transcriptionLength += word.content.size();
+    }
+  }
+  transcription.reserve(transcriptionLength);
 
-
-
-
-
-        std::to_string(static_cast<int>(result.error())));
+  for (auto &segment : segments) {
+    for (auto &word : segment.words) {
+      transcription += word.content;
+    }
   }
+  return transcription;
+}
 
-
+size_t SpeechToText::getMemoryLowerBound() const noexcept {
+  return this->encoder->getMemoryLowerBound() +
+         this->decoder->getMemoryLowerBound() +
+         this->tokenizer->getMemoryLowerBound();
 }
 
 std::shared_ptr<OwningArrayBuffer>
-SpeechToText::
-
-
-
+SpeechToText::makeOwningBuffer(std::span<const float> vectorView) const {
+  auto owningArrayBuffer =
+      std::make_shared<OwningArrayBuffer>(vectorView.size_bytes());
+  std::memcpy(owningArrayBuffer->data(), vectorView.data(),
+              vectorView.size_bytes());
+  return owningArrayBuffer;
+}
+
+void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
+                          std::string languageOption) {
+  if (this->isStreaming) {
+    throw std::runtime_error("Streaming is already in progress");
+  }
+
+  auto nativeCallback = [this, callback](const std::string &committed,
+                                         const std::string &nonCommitted,
+                                         bool isDone) {
+    this->callInvoker->invokeAsync(
+        [callback, committed, nonCommitted, isDone](jsi::Runtime &rt) {
+          callback->call(rt, jsi::String::createFromUtf8(rt, committed),
+                         jsi::String::createFromUtf8(rt, nonCommitted),
+                         jsi::Value(isDone));
+        });
+  };
+
+  this->resetStreamState();
+
+  this->isStreaming = true;
+  while (this->isStreaming) {
+    if (!this->readyToProcess ||
+        this->processor->audioBuffer.size() < SpeechToText::kMinAudioSamples) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(100));
+      continue;
+    }
+    ProcessResult res =
+        this->processor->processIter(DecodingOptions(languageOption));
+    nativeCallback(res.committed, res.nonCommitted, false);
+    this->readyToProcess = false;
   }
 
-
+  std::string committed = this->processor->finish();
+  nativeCallback(committed, "", true);
+}
 
-
-  const auto decoderResult =
-      decoder_->execute(decoderMethod, {prevTokensTensor, encoderOutput});
+void SpeechToText::streamStop() { this->isStreaming = false; }
 
-
-
-
-        std::to_string(static_cast<int>(decoderResult.error())));
+void SpeechToText::streamInsert(std::span<float> waveform) {
+  if (!this->isStreaming) {
+    throw std::runtime_error("Streaming is not started");
   }
+  this->processor->insertAudioChunk(waveform);
+  this->readyToProcess = true;
+}
 
-
-
-
+void SpeechToText::resetStreamState() {
+  this->isStreaming = false;
+  this->readyToProcess = false;
+  this->processor = std::make_unique<OnlineASRProcessor>(this->asr.get());
 }
 
 } // namespace rnexecutorch::models::speech_to_text
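The streaming implementation above is a simple producer/consumer loop: stream() polls until streamInsert() has buffered at least kMinAudioSamples and flagged readyToProcess, then emits committed and non-committed text through the JS callback, and streamStop() ends the loop before the final finish() flush. A simplified, self-contained sketch of that call sequence (FakeStreamer and its atomics are stand-ins, not the package's SpeechToText):

#include <atomic>
#include <chrono>
#include <thread>
#include <vector>

struct FakeStreamer {
  std::atomic<bool> isStreaming{false};
  std::atomic<bool> readyToProcess{false};

  void stream() { // runs on a worker thread, like the promise-backed JSI call
    isStreaming = true;
    while (isStreaming) {
      if (!readyToProcess) {
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
        continue;
      }
      // processIter() + callback(committed, nonCommitted, false) go here
      readyToProcess = false;
    }
    // finish() + callback(committed, "", true) go here
  }

  void streamInsert(const std::vector<float> & /*waveform*/) {
    readyToProcess = true; // insertAudioChunk() would buffer the audio first
  }

  void streamStop() { isStreaming = false; }
};

int main() {
  FakeStreamer streamer;
  std::thread worker([&] { streamer.stream(); });
  streamer.streamInsert(std::vector<float>(16000, 0.0F)); // ~1 s at 16 kHz
  std::this_thread::sleep_for(std::chrono::milliseconds(300));
  streamer.streamStop();
  worker.join();
  return 0;
}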
package/common/rnexecutorch/models/speech_to_text/SpeechToText.h

@@ -1,38 +1,56 @@
 #pragma once
 
-#include "
-#include "executorch/runtime/core/evalue.h"
-#include <cstdint>
-#include <memory>
-#include <span>
-#include <string>
-#include <vector>
-
-#include "rnexecutorch/metaprogramming/ConstructorHelpers.h"
-#include <rnexecutorch/models/EncoderDecoderBase.h>
-#include <rnexecutorch/models/speech_to_text/SpeechToTextStrategy.h>
+#include "rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h"
 
 namespace rnexecutorch {
+
 namespace models::speech_to_text {
-
+
+class SpeechToText {
 public:
-  explicit SpeechToText(const std::string &
-                        const std::string &
-                        const std::string &
+  explicit SpeechToText(const std::string &encoderSource,
+                        const std::string &decoderSource,
+                        const std::string &tokenizerSource,
                         std::shared_ptr<react::CallInvoker> callInvoker);
-
-  std::shared_ptr<OwningArrayBuffer>
+
+  std::shared_ptr<OwningArrayBuffer> encode(std::span<float> waveform) const;
+  std::shared_ptr<OwningArrayBuffer>
+  decode(std::span<int32_t> tokens, std::span<float> encoderOutput) const;
+  std::string transcribe(std::span<float> waveform,
+                         std::string languageOption) const;
+
+  size_t getMemoryLowerBound() const noexcept;
+
+  // Stream
+  void stream(std::shared_ptr<jsi::Function> callback,
+              std::string languageOption);
+  void streamStop();
+  void streamInsert(std::span<float> waveform);
 
 private:
-
-
-  std::unique_ptr<
+  std::unique_ptr<BaseModel> encoder;
+  std::unique_ptr<BaseModel> decoder;
+  std::unique_ptr<TokenizerModule> tokenizer;
+  std::unique_ptr<asr::ASR> asr;
 
-
+  std::shared_ptr<OwningArrayBuffer>
+  makeOwningBuffer(std::span<const float> vectorView) const;
+
+  // Stream
+  std::shared_ptr<react::CallInvoker> callInvoker;
+  std::unique_ptr<stream::OnlineASRProcessor> processor;
+  bool isStreaming;
+  bool readyToProcess;
+
+  constexpr static int32_t kMinAudioSamples = 16000; // 1 second
+
+  void resetStreamState();
 };
+
 } // namespace models::speech_to_text
 
 REGISTER_CONSTRUCTOR(models::speech_to_text::SpeechToText, std::string,
                      std::string, std::string,
                      std::shared_ptr<react::CallInvoker>);
+
 } // namespace rnexecutorch