react-native-executorch 0.8.2 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/java/com/swmansion/rnexecutorch/ETInstallerUnavailable.kt +27 -0
- package/android/src/main/java/com/swmansion/rnexecutorch/RnExecutorchPackage.kt +12 -1
- package/common/rnexecutorch/host_objects/ModelHostObject.h +12 -1
- package/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp +6 -0
- package/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h +3 -0
- package/common/rnexecutorch/models/llm/LLM.cpp +31 -3
- package/common/rnexecutorch/models/llm/LLM.h +2 -0
- package/common/rnexecutorch/models/text_to_image/TextToImage.cpp +2 -0
- package/common/rnexecutorch/models/text_to_image/TextToImage.h +2 -0
- package/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp +6 -0
- package/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h +5 -0
- package/common/runner/base_llm_runner.cpp +8 -6
- package/common/runner/base_llm_runner.h +8 -4
- package/common/runner/encoders/vision_encoder.cpp +12 -4
- package/common/runner/irunner.h +15 -0
- package/common/runner/multimodal_decoder_runner.h +3 -2
- package/common/runner/multimodal_runner.cpp +4 -16
- package/common/runner/multimodal_runner.h +0 -4
- package/common/runner/sampler.cpp +32 -13
- package/common/runner/sampler.h +59 -1
- package/common/runner/text_decoder_runner.cpp +31 -3
- package/common/runner/text_decoder_runner.h +13 -46
- package/common/runner/text_runner.cpp +3 -26
- package/common/runner/text_runner.h +0 -4
- package/common/runner/text_token_generator.h +20 -18
- package/lib/module/constants/modelUrls.js +53 -10
- package/lib/module/constants/modelUrls.js.map +1 -1
- package/lib/module/controllers/LLMController.js +75 -22
- package/lib/module/controllers/LLMController.js.map +1 -1
- package/lib/module/hooks/natural_language_processing/useLLM.js +1 -0
- package/lib/module/hooks/natural_language_processing/useLLM.js.map +1 -1
- package/lib/module/index.js +11 -0
- package/lib/module/index.js.map +1 -1
- package/lib/module/modules/natural_language_processing/LLMModule.js +1 -1
- package/lib/module/types/llm.js +4 -1
- package/lib/module/types/llm.js.map +1 -1
- package/lib/typescript/constants/modelUrls.d.ts +126 -0
- package/lib/typescript/constants/modelUrls.d.ts.map +1 -1
- package/lib/typescript/controllers/LLMController.d.ts +3 -1
- package/lib/typescript/controllers/LLMController.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +7 -0
- package/lib/typescript/index.d.ts.map +1 -1
- package/lib/typescript/modules/natural_language_processing/LLMModule.d.ts +1 -1
- package/lib/typescript/types/llm.d.ts +21 -1
- package/lib/typescript/types/llm.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/constants/modelUrls.ts +45 -2
- package/src/controllers/LLMController.ts +84 -25
- package/src/hooks/natural_language_processing/useLLM.ts +1 -0
- package/src/index.ts +11 -0
- package/src/modules/natural_language_processing/LLMModule.ts +1 -1
- package/src/types/llm.ts +21 -1
package/android/src/main/java/com/swmansion/rnexecutorch/ETInstallerUnavailable.kt
ADDED

@@ -0,0 +1,27 @@
+package com.swmansion.rnexecutorch
+
+import com.facebook.react.bridge.ReactApplicationContext
+import com.facebook.react.bridge.ReactMethod
+import com.facebook.react.common.annotations.FrameworkAPI
+import com.facebook.react.module.annotations.ReactModule
+
+/**
+ * Fallback TurboModule returned when native ExecuTorch libraries cannot be
+ * loaded (e.g. 32-bit Android devices where only arm64-v8a binaries are
+ * shipped). Extends the same spec as ETInstaller so JS sees a real linked
+ * module, but install() returns false to signal unavailability.
+ */
+@OptIn(FrameworkAPI::class)
+@ReactModule(name = ETInstallerUnavailable.NAME)
+class ETInstallerUnavailable(
+  reactContext: ReactApplicationContext,
+) : NativeETInstallerSpec(reactContext) {
+  companion object {
+    const val NAME = NativeETInstallerSpec.NAME
+  }
+
+  @ReactMethod(isBlockingSynchronousMethod = true)
+  override fun install(): Boolean {
+    return false
+  }
+}
package/android/src/main/java/com/swmansion/rnexecutorch/RnExecutorchPackage.kt
CHANGED

@@ -15,7 +15,18 @@ class RnExecutorchPackage : TurboReactPackage() {
     reactContext: ReactApplicationContext,
   ): NativeModule? =
     if (name == ETInstaller.NAME) {
-      ETInstaller(reactContext)
+      try {
+        ETInstaller(reactContext)
+      } catch (e: RuntimeException) {
+        if (e.cause is UnsatisfiedLinkError) {
+          // Native library not available (e.g. 32-bit device without arm64-v8a .so).
+          // Return a fallback module whose install() returns false so JS can
+          // distinguish "unsupported ABI" from "package not linked."
+          ETInstallerUnavailable(reactContext)
+        } else {
+          throw e
+        }
+      }
     } else {
       null
     }
package/common/rnexecutorch/host_objects/ModelHostObject.h
CHANGED

@@ -140,6 +140,15 @@ public:
         synchronousHostFunction<&Model::setTopp>,
         "setTopp"));
 
+    addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
+                                     synchronousHostFunction<&Model::setMinP>,
+                                     "setMinP"));
+
+    addFunctions(JSI_EXPORT_FUNCTION(
+        ModelHostObject<Model>,
+        synchronousHostFunction<&Model::setRepetitionPenalty>,
+        "setRepetitionPenalty"));
+
     addFunctions(JSI_EXPORT_FUNCTION(
         ModelHostObject<Model>,
         synchronousHostFunction<&Model::getMaxContextLength>,

@@ -375,7 +384,9 @@ public:
       // We need to dispatch a thread if we want the function to be
       // asynchronous. In this thread all accesses to jsi::Runtime need to
       // be done via the callInvoker.
-      threads::GlobalThreadPool::detach([this,
+      threads::GlobalThreadPool::detach([model = this->model,
+                                         callInvoker = this->callInvoker,
+                                         promise,
                                          argsConverted =
                                              std::move(argsConverted)]() {
         try {
package/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
CHANGED

@@ -35,8 +35,14 @@ TokenIdsWithAttentionMask TextEmbeddings::preprocess(const std::string &input) {
   return {.inputIds = inputIds64, .attentionMask = attentionMask};
 }
 
+void TextEmbeddings::unload() noexcept {
+  std::scoped_lock lock(inference_mutex_);
+  BaseModel::unload();
+}
+
 std::shared_ptr<OwningArrayBuffer>
 TextEmbeddings::generate(const std::string input) {
+  std::scoped_lock lock(inference_mutex_);
   auto preprocessed = preprocess(input);
 
   std::vector<int32_t> tokenIdsShape = {
package/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
CHANGED

@@ -1,6 +1,7 @@
 #pragma once
 
 #include "rnexecutorch/metaprogramming/ConstructorHelpers.h"
+#include <mutex>
 #include <rnexecutorch/TokenizerModule.h>
 #include <rnexecutorch/models/embeddings/BaseEmbeddings.h>
 

@@ -20,8 +21,10 @@ public:
   [[nodiscard(
       "Registered non-void function")]] std::shared_ptr<OwningArrayBuffer>
   generate(const std::string input);
+  void unload() noexcept;
 
 private:
+  mutable std::mutex inference_mutex_;
   std::vector<std::vector<int32_t>> inputShapes;
   TokenIdsWithAttentionMask preprocess(const std::string &input);
   std::unique_ptr<TokenizerModule> tokenizer;
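The lock-in-generate / lock-in-unload pairing above recurs in TextToImage and VoiceActivityDetection below. A minimal standalone sketch of the pattern, with hypothetical stand-in types (FakeBaseModel and GuardedModel are not package classes):

#include <mutex>

struct FakeBaseModel {          // stand-in for the package's BaseModel
  bool loaded = true;
  void unload() noexcept { loaded = false; }
};

class GuardedModel : FakeBaseModel {
public:
  int generate(int input) {
    // unload() cannot free resources while an inference is in flight:
    // both methods serialize on the same mutex.
    std::scoped_lock lock(inference_mutex_);
    return loaded ? input * 2 : 0;  // placeholder for real inference
  }
  void unload() noexcept {
    std::scoped_lock lock(inference_mutex_);
    FakeBaseModel::unload();
  }
private:
  mutable std::mutex inference_mutex_;
};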
package/common/rnexecutorch/models/llm/LLM.cpp
CHANGED

@@ -20,7 +20,7 @@ using executorch::runtime::Error;
 LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
          std::vector<std::string> capabilities,
          std::shared_ptr<react::CallInvoker> callInvoker)
-    : BaseModel(modelSource, callInvoker, Module::LoadMode::
+    : BaseModel(modelSource, callInvoker, Module::LoadMode::Mmap) {
 
   if (capabilities.empty()) {
     runner_ =

@@ -42,8 +42,12 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
     throw RnExecutorchError(loadResult, "Failed to load LLM runner");
   }
 
-
-
+  // I am purposefully not adding file size of the model here. The reason is
+  // that Hermes would crash the app if we try to alloc too much memory here.
+  // Also, given we're using mmap, the true memory consumption of a model is not
+  // really equal to the size of the model. The size of the tokenizer file is a
+  // hint to the GC that this object might be worth getting rid of.
+  memorySizeLowerBound = fs::file_size(fs::path(tokenizerSource));
 }
 
 std::string LLM::generate(std::string input,

@@ -246,6 +250,30 @@ void LLM::setTopp(float topp) {
   runner_->set_topp(topp);
 }
 
+void LLM::setMinP(float minP) {
+  if (!runner_ || !runner_->is_loaded()) {
+    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
+                            "Can't configure a model that's not loaded");
+  }
+  if (minP < 0.0f || minP > 1.0f) {
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
+                            "Min-p must be between 0.0 and 1.0");
+  }
+  runner_->set_min_p(minP);
+}
+
+void LLM::setRepetitionPenalty(float repetitionPenalty) {
+  if (!runner_ || !runner_->is_loaded()) {
+    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
+                            "Can't configure a model that's not loaded");
+  }
+  if (repetitionPenalty < 0.0f) {
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
+                            "Repetition penalty must be non-negative");
+  }
+  runner_->set_repetition_penalty(repetitionPenalty);
+}
+
 int32_t LLM::getMaxContextLength() const {
   if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(
package/common/rnexecutorch/models/llm/LLM.h
CHANGED

@@ -38,6 +38,8 @@ public:
   void setCountInterval(size_t countInterval);
   void setTemperature(float temperature);
   void setTopp(float topp);
+  void setMinP(float minP);
+  void setRepetitionPenalty(float repetitionPenalty);
   void setTimeInterval(size_t timeInterval);
   int32_t getMaxContextLength() const;
 
package/common/rnexecutorch/models/text_to_image/TextToImage.cpp
CHANGED

@@ -58,6 +58,7 @@ std::shared_ptr<OwningArrayBuffer>
 TextToImage::generate(std::string input, int32_t imageSize,
                       size_t numInferenceSteps, int32_t seed,
                       std::shared_ptr<jsi::Function> callback) {
+  std::scoped_lock lock(inference_mutex_);
   setImageSize(imageSize);
   setSeed(seed);
 

@@ -137,6 +138,7 @@ size_t TextToImage::getMemoryLowerBound() const noexcept {
 }
 
 void TextToImage::unload() noexcept {
+  std::scoped_lock lock(inference_mutex_);
   encoder->unload();
   unet->unload();
   decoder->unload();
package/common/rnexecutorch/models/text_to_image/TextToImage.h
CHANGED

@@ -1,6 +1,7 @@
 #pragma once
 
 #include <memory>
+#include <mutex>
 #include <string>
 #include <vector>
 

@@ -49,6 +50,7 @@ private:
   static constexpr float guidanceScale = 7.5f;
   static constexpr float latentsScale = 0.18215f;
   bool interrupted = false;
+  mutable std::mutex inference_mutex_;
 
   std::shared_ptr<react::CallInvoker> callInvoker;
   std::unique_ptr<Scheduler> scheduler;
package/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp
CHANGED

@@ -54,8 +54,14 @@ VoiceActivityDetection::preprocess(std::span<float> waveform) const {
   return frameBuffer;
 }
 
+void VoiceActivityDetection::unload() noexcept {
+  std::scoped_lock lock(inference_mutex_);
+  BaseModel::unload();
+}
+
 std::vector<types::Segment>
 VoiceActivityDetection::generate(std::span<float> waveform) const {
+  std::scoped_lock lock(inference_mutex_);
 
   auto windowedInput = preprocess(waveform);
   auto [chunksNumber, remainder] = std::div(
package/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
CHANGED

@@ -5,6 +5,7 @@
 #include <executorch/extension/tensor/tensor.h>
 #include <executorch/extension/tensor/tensor_ptr.h>
 #include <executorch/runtime/core/evalue.h>
+#include <mutex>
 #include <span>
 
 #include "rnexecutorch/metaprogramming/ConstructorHelpers.h"

@@ -23,7 +24,11 @@ public:
   [[nodiscard("Registered non-void function")]] std::vector<types::Segment>
   generate(std::span<float> waveform) const;
 
+  void unload() noexcept;
+
 private:
+  mutable std::mutex inference_mutex_;
+
   std::vector<std::array<float, constants::kPaddedWindowSize>>
   preprocess(std::span<float> waveform) const;
   std::vector<types::Segment> postprocess(const std::vector<float> &scores,
package/common/runner/base_llm_runner.cpp
CHANGED

@@ -139,20 +139,22 @@ int32_t BaseLLMRunner::get_max_context_length() const {
 
 void BaseLLMRunner::set_temperature(float temperature) noexcept {
   config_.temperature = temperature;
-  set_temperature_impl(temperature);
 }
 
-void BaseLLMRunner::set_topp(float topp) noexcept {
-  config_.topp = topp;
-  set_topp_impl(topp);
+void BaseLLMRunner::set_topp(float topp) noexcept { config_.topp = topp; }
+
+void BaseLLMRunner::set_min_p(float min_p) noexcept { config_.min_p = min_p; }
+
+void BaseLLMRunner::set_repetition_penalty(float repetition_penalty) noexcept {
+  config_.repetition_penalty = repetition_penalty;
 }
 
 void BaseLLMRunner::set_count_interval(size_t count_interval) {
-  set_count_interval_impl(count_interval);
+  config_.output_token_batch_size = count_interval;
 }
 
 void BaseLLMRunner::set_time_interval(size_t time_interval) {
-  set_time_interval_impl(time_interval);
+  config_.batch_time_interval_ms = time_interval;
 }
 
 int32_t BaseLLMRunner::resolve_max_new_tokens(int32_t num_prompt_tokens,
package/common/runner/base_llm_runner.h
CHANGED

@@ -53,6 +53,8 @@ public:
 
   void set_temperature(float temperature) noexcept;
   void set_topp(float topp) noexcept;
+  void set_min_p(float min_p) noexcept;
+  void set_repetition_penalty(float repetition_penalty) noexcept;
   void set_count_interval(size_t count_interval);
   void set_time_interval(size_t time_interval);
 

@@ -65,10 +67,12 @@ public:
 protected:
   virtual ::executorch::runtime::Error load_subcomponents() = 0;
   virtual void stop_impl() = 0;
-  virtual void set_temperature_impl(float temperature) = 0;
-  virtual void set_topp_impl(float topp) = 0;
-  virtual void set_count_interval_impl(size_t count_interval) = 0;
-  virtual void set_time_interval_impl(size_t time_interval) = 0;
+  // Sampling values and token-batching intervals live entirely in `config_`.
+  // The TextDecoderRunner / TextTokenGenerator shared by both TextRunner and
+  // MultimodalRunner are constructed with a const reference to `config_`
+  // and read those fields on every iteration, so writes via the public
+  // set_* methods on BaseLLMRunner take effect immediately with no virtual
+  // dispatch needed.
 
   int32_t resolve_max_new_tokens(int32_t num_prompt_tokens, int32_t max_seq_len,
                                  int32_t max_context_len,
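A minimal sketch of why the removed _impl virtuals are no longer needed: the reader holds a const reference to the writer's config struct, so a setter write is visible on the reader's next access. Config, Reader, and Owner are illustrative names, not package types:

#include <cstdio>

struct Config { float temperature = 0.0f; };

struct Reader {
  const Config &config;  // holds a const reference, never a copy
  void step() const { std::printf("temp=%.1f\n", config.temperature); }
};

struct Owner {
  Config config_;
  Reader reader{config_};  // constructed with a reference to config_
  void set_temperature(float t) noexcept { config_.temperature = t; }
};

int main() {
  Owner o;
  o.reader.step();          // temp=0.0
  o.set_temperature(0.7f);
  o.reader.step();          // temp=0.7 -- the write is visible immediately
}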
package/common/runner/encoders/vision_encoder.cpp
CHANGED

@@ -77,15 +77,23 @@ Result<VisionEncoder::ImageShape> VisionEncoder::getInputShape() const {
 std::vector<float>
 VisionEncoder::preprocessImage(const std::string &path,
                                const ImageShape &targetShape) const {
-
-
-
+  // The bundled vision-encoder PTEs (e.g. LFM2.5-VL) bake rescale + normalize
+  // into the exported graph, so we hand raw 0-255 float pixel values to the
+  // module. Adding rescale / normalize here would double-apply the transform
+  // and destroy the input distribution. We reuse `resizePadded` for the
+  // aspect-ratio-preserving letterbox (it picks the pad colour from the
+  // source image corners, which blends better than a flat gray), then
+  // convert BGR->RGB and repack the raw pixels into CHW float.
+  cv::Mat src = rnexecutorch::image_processing::readImage(path);
+  cv::Mat canvas = rnexecutorch::image_processing::resizePadded(
+      src, cv::Size(targetShape.width, targetShape.height));
+  cv::cvtColor(canvas, canvas, cv::COLOR_BGR2RGB);
 
   const int32_t pixelCount = targetShape.height * targetShape.width;
   std::vector<float> chw(targetShape.channels * pixelCount);
   for (int32_t i = 0; i < pixelCount; ++i) {
     cv::Vec3b px =
-
+        canvas.at<cv::Vec3b>(i / targetShape.width, i % targetShape.width);
     for (int32_t c = 0; c < targetShape.channels; ++c) {
       chw[c * pixelCount + i] = static_cast<float>(px[c]);
     }
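The CHW repack above can be read in isolation. A small sketch without OpenCV, assuming 8-bit interleaved HWC input; the hwcToChw helper is hypothetical, not a package function:

#include <cstdint>
#include <vector>

// Pixel index i runs row-major over H*W; channel c is strided by the full
// image plane, matching chw[c * pixelCount + i] in the diff.
std::vector<float> hwcToChw(const std::vector<uint8_t> &hwc, int32_t height,
                            int32_t width, int32_t channels) {
  const int32_t pixelCount = height * width;
  std::vector<float> chw(channels * pixelCount);
  for (int32_t i = 0; i < pixelCount; ++i) {
    for (int32_t c = 0; c < channels; ++c) {
      // raw 0-255 values, no rescale/normalize: the exported graph does that
      chw[c * pixelCount + i] = static_cast<float>(hwc[i * channels + c]);
    }
  }
  return chw;
}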
package/common/runner/irunner.h
CHANGED

@@ -58,6 +58,21 @@ struct GenerationConfig {
   // = more deterministic, higher = more diverse generations.
   float topp = -1.F;
 
+  // Minimum probability threshold: tokens with prob < min_p * max_prob are
+  // excluded. 0.0 disables min_p filtering.
+  float min_p = 0.0f;
+
+  // Multiplicative penalty applied to logits of recently generated tokens.
+  // Values > 1.0 discourage repetition. 1.0 disables the penalty.
+  float repetition_penalty = 1.0f;
+
+  // Token-batching parameters for the streaming token callback. The
+  // generator flushes a batch when either `output_token_batch_size` tokens
+  // have accumulated or `batch_time_interval_ms` milliseconds have elapsed
+  // since the last flush, whichever comes first.
+  size_t output_token_batch_size = 10;
+  size_t batch_time_interval_ms = 120;
+
   // Enable dynamic input shapes (if implemented) or not
   // Impacts the prefill phase and causes TextPrefiller to pass all the tokens
   // at once if set to true.
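A worked example of the two additions with invented numbers (the defaults above are batch size 10 and interval 120 ms):

#include <cstddef>
#include <cstdio>

int main() {
  // min_p: with a max token probability of 0.40 and min_p = 0.05, every
  // token whose probability falls below 0.05 * 0.40 = 0.02 is excluded.
  const float max_prob = 0.40f, min_p = 0.05f;
  std::printf("min_p cutoff: %.3f\n", min_p * max_prob); // 0.020

  // Batching: flush when either limit trips, whichever comes first. Here
  // only 4 tokens have accumulated, but 130 ms > 120 ms, so we flush.
  const std::size_t pending = 4, batch_size = 10;
  const std::size_t elapsed_ms = 130, interval_ms = 120;
  const bool flush = pending >= batch_size || elapsed_ms >= interval_ms;
  std::printf("flush: %s\n", flush ? "yes" : "no"); // yes
}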
package/common/runner/multimodal_decoder_runner.h
CHANGED

@@ -16,8 +16,9 @@
 namespace executorch::extension::llm {
 class MultimodalDecoderRunner : public TextDecoderRunner {
 public:
-  explicit MultimodalDecoderRunner(Module &module, IOManager *io_manager
-
+  explicit MultimodalDecoderRunner(Module &module, IOManager *io_manager,
+                                   const GenerationConfig &config)
+      : TextDecoderRunner(module, io_manager, config) {}
 
   inline ::executorch::runtime::Result<::executorch::aten::Tensor>
   step(TensorPtr &tokens, int64_t start_pos) override {
package/common/runner/multimodal_runner.cpp
CHANGED

@@ -47,8 +47,8 @@ Error MultimodalRunner::load_subcomponents() {
 
   Stats *stats_ptr = &stats_;
 
-  mm_decoder_runner_ =
-
+  mm_decoder_runner_ = std::make_unique<MultimodalDecoderRunner>(
+      *module_, io_manager_.get(), config_);
   IEncoder *image_encoder = nullptr;
   auto enc_it = encoders_.find(MultimodalType::Image);
   if (enc_it != encoders_.end()) {

@@ -58,7 +58,7 @@ Error MultimodalRunner::load_subcomponents() {
       *module_, *mm_decoder_runner_, *tokenizer_, image_encoder);
   mm_token_generator_ = std::make_unique<TextTokenGenerator>(
       tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true,
-      std::move(eos_ids_), stats_ptr);
+      std::move(eos_ids_), stats_ptr, config_);
 
   ET_CHECK_OK_OR_RETURN_ERROR(mm_prefiller_->load());
   ET_CHECK_OK_OR_RETURN_ERROR(mm_token_generator_->load());

@@ -106,7 +106,7 @@ Error MultimodalRunner::generate_internal(
   auto generate_result = mm_token_generator_->generate(
       seed_tokens, pos_,
       static_cast<uint64_t>(std::max(0, resolved_max_new - 1)),
-
+      wrapped_callback);
 
   if (!generate_result.ok())
     return generate_result.error();

@@ -125,16 +125,4 @@ void MultimodalRunner::stop_impl() {
   }
 }
 
-void MultimodalRunner::set_count_interval_impl(size_t count_interval) {
-  if (mm_token_generator_) {
-    mm_token_generator_->set_count_interval(count_interval);
-  }
-}
-
-void MultimodalRunner::set_time_interval_impl(size_t time_interval) {
-  if (mm_token_generator_) {
-    mm_token_generator_->set_time_interval(time_interval);
-  }
-}
-
 } // namespace executorch::extension::llm
package/common/runner/multimodal_runner.h
CHANGED

@@ -30,10 +30,6 @@
 protected:
   ::executorch::runtime::Error load_subcomponents() override;
   void stop_impl() override;
-  void set_temperature_impl(float) override {}
-  void set_topp_impl(float) override {}
-  void set_count_interval_impl(size_t count_interval) override;
-  void set_time_interval_impl(size_t time_interval) override;
 
 private:
   std::map<MultimodalType, std::unique_ptr<IEncoder>> encoders_;
package/common/runner/sampler.cpp
CHANGED

@@ -35,6 +35,7 @@
 #include "sampler.h"
 #include <algorithm>
 #include <ctime>
+#include <vector>
 
 namespace executorch {
 namespace extension {

@@ -119,16 +120,16 @@ int32_t Sampler::sample_topp(T *probabilities, float coin) {
   return probindex[last_idx].index; // in case of rounding errors
 }
 
-Sampler::Sampler(
-    unsigned long long rng_seed
+Sampler::Sampler(int32_t vocab_size, float temperature, float topp,
+                 unsigned long long rng_seed, float min_p,
+                 float repetition_penalty)
     : vocab_size_(vocab_size),
       inv_temperature_((temperature != 0.0f) ? (1.0f / temperature) : 0.0f),
-      topp_(topp),
+      topp_(topp), min_p_(min_p), repetition_penalty_(repetition_penalty),
+      rng_state_(rng_seed) {}
 
 Sampler::Sampler(int vocab_size, float temperature, float topp)
-    :
-    inv_temperature_((temperature != 0.0f) ? (1.0f / temperature) : 0.0f),
-    topp_(topp), rng_state_(std::time(nullptr)) {}
+    : Sampler(vocab_size, temperature, topp, std::time(nullptr), 0.0f, 1.0f) {}
 
 template <typename T> static void softmax(T *x, int size) {
   // find max value (for numerical stability)

@@ -162,22 +163,25 @@ static float random_f32(unsigned long long *state) { // random float32 in [0,1)
   return (random_u32(state) >> 8) / 16777216.0f;
 }
 
-template <typename T>
+template <typename T>
+int32_t Sampler::sample(T *logits, const std::vector<uint64_t> &recent_tokens) {
   // sample the token given the logits and some hyperparameters
   int next;
   if (inv_temperature_ == 0.0f) {
     // greedy argmax sampling: take the token with the highest probability
     next = sample_argmax(logits);
   } else {
-    // apply
-
-
-
-    // apply softmax to the logits to get the probabilities for next token
+    // 1. apply repetition penalty to raw logits (pre-softmax)
+    apply_repetition_penalty(logits, vocab_size_, recent_tokens);
+    // 2. apply the temperature to the logits
+    apply_temperature(logits, vocab_size_);
+    // 3. apply softmax to the logits to get the probabilities for next token
     softmax(logits, vocab_size_);
+    // 4. apply min_p truncation
+    apply_min_p(logits, vocab_size_);
     // flip a (float) coin (this is our source of entropy for sampling)
     float coin = random_f32(&rng_state_);
-    // we sample from this distribution to get the next token
+    // 5. we sample from this distribution to get the next token
     if (topp_ <= 0 || topp_ >= 1) {
       // simply sample from the predicted probability distribution
       next = sample_mult(logits, coin);

@@ -189,6 +193,10 @@ template <typename T> int32_t Sampler::sample(T *logits) {
   return next;
 }
 
+template <typename T> int32_t Sampler::sample(T *logits) {
+  return sample(logits, {});
+}
+
 template int32_t Sampler::sample<float>(float *logits);
 template int32_t Sampler::sample<uint16_t>(uint16_t *logits);
 template int32_t

@@ -196,6 +204,17 @@ Sampler::sample<executorch::aten::Half>(executorch::aten::Half *logits);
 template int32_t
 Sampler::sample<executorch::aten::BFloat16>(executorch::aten::BFloat16 *logits);
 
+template int32_t Sampler::sample<float>(float *logits,
+                                        const std::vector<uint64_t> &);
+template int32_t Sampler::sample<uint16_t>(uint16_t *logits,
+                                           const std::vector<uint64_t> &);
+template int32_t
+Sampler::sample<executorch::aten::Half>(executorch::aten::Half *logits,
+                                        const std::vector<uint64_t> &);
+template int32_t
+Sampler::sample<executorch::aten::BFloat16>(executorch::aten::BFloat16 *logits,
+                                            const std::vector<uint64_t> &);
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch
package/common/runner/sampler.h
CHANGED

@@ -8,12 +8,15 @@
 
 #pragma once
 
+#include <algorithm>
 #include <cctype>
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <memory>
+#include <utility>
+#include <vector>
 #ifdef USE_ATEN_LIB
 #include <torch/torch.h>
 #endif

@@ -36,22 +39,77 @@ template <typename T> struct ProbIndex {
 class Sampler {
 public:
   Sampler(int32_t vocab_size, float temperature, float topp,
-          unsigned long long rng_seed
+          unsigned long long rng_seed, float min_p = 0.0f,
+          float repetition_penalty = 1.0f);
 
   Sampler(int32_t vocab_size, float temperature, float topp);
 
   template <typename T> int32_t sample(T *logits);
 
+  template <typename T>
+  int32_t sample(T *logits, const std::vector<uint64_t> &recent_tokens);
+
 private:
   template <typename T> int32_t sample_topp(T *probabilities, float coin);
   template <typename T> int32_t sample_mult(T *probabilities, float coin);
   template <typename T> int32_t sample_argmax(T *probabilities);
 
+  template <typename T>
+  inline void apply_temperature(T *logits, int32_t vocab_size) {
+    for (std::size_t i = 0; std::cmp_less(i, vocab_size); ++i) {
+      logits[i] =
+          static_cast<T>(static_cast<float>(logits[i]) * inv_temperature_);
+    }
+  }
+
+  template <typename T>
+  inline void
+  apply_repetition_penalty(T *logits, int32_t vocab_size,
+                           const std::vector<uint64_t> &recent_tokens) {
+    if (repetition_penalty_ == 1.0f || recent_tokens.empty())
+      return;
+    for (uint64_t id : recent_tokens) {
+      if (!std::cmp_less(id, vocab_size)) {
+        continue;
+      }
+      T &val = logits[id];
+      if (val > T(0)) {
+        val = static_cast<T>(static_cast<float>(val) / repetition_penalty_);
+      } else {
+        val = static_cast<T>(static_cast<float>(val) * repetition_penalty_);
+      }
+    }
+  }
+
+  template <typename T>
+  inline void apply_min_p(T *probabilities, int32_t vocab_size) {
+    if (min_p_ <= 0.0f) {
+      return;
+    }
+    T max_prob = *std::max_element(probabilities, probabilities + vocab_size);
+    T threshold = static_cast<T>(min_p_ * static_cast<float>(max_prob));
+    T sum = T(0);
+    for (std::size_t i = 0; std::cmp_less(i, vocab_size); ++i) {
+      if (probabilities[i] < threshold) {
+        probabilities[i] = T(0);
+      } else {
+        sum += probabilities[i];
+      }
+    }
+    if (sum > T(0)) {
+      for (std::size_t i = 0; std::cmp_less(i, vocab_size); ++i) {
+        probabilities[i] /= sum;
+      }
+    }
+  }
+
 private:
   int32_t vocab_size_;
   // reciprocal of temperature, or 0 if temperature == 0.
   float inv_temperature_;
   float topp_;
+  float min_p_;
+  float repetition_penalty_;
   unsigned long long rng_state_;
 };
 
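For reference, a self-contained toy run of the pipeline these helpers implement (repetition penalty, then temperature, then softmax, then min_p) on a four-token vocabulary. All values are invented, and this mirrors rather than reuses the package code:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> logits = {2.0f, 1.0f, 0.5f, -1.0f};
  std::vector<uint64_t> recent = {0}; // token 0 was generated recently
  float repetition_penalty = 1.3f, inv_temperature = 1.0f / 0.7f, min_p = 0.1f;

  // 1. penalize recent tokens on raw logits (positive logits are divided,
  //    negative ones multiplied, pushing both toward less likely)
  for (uint64_t id : recent)
    logits[id] = logits[id] > 0 ? logits[id] / repetition_penalty
                                : logits[id] * repetition_penalty;
  // 2. temperature
  for (float &l : logits) l *= inv_temperature;
  // 3. softmax (max-subtracted for numerical stability)
  float mx = *std::max_element(logits.begin(), logits.end()), sum = 0.0f;
  for (float &l : logits) { l = std::exp(l - mx); sum += l; }
  for (float &l : logits) l /= sum;
  // 4. min_p: zero out probs below min_p * max_prob, then renormalize
  float top = *std::max_element(logits.begin(), logits.end()), kept = 0.0f;
  for (float &p : logits) { if (p < min_p * top) p = 0.0f; else kept += p; }
  for (float &p : logits) p /= kept;

  for (float p : logits) std::printf("%.3f ", p); // final distribution
  std::printf("\n");
}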
package/common/runner/text_decoder_runner.cpp
CHANGED

@@ -10,6 +10,7 @@
 
 #include "text_decoder_runner.h"
 #include "arange_util.h"
+#include "irunner.h"
 #include "stats.h"
 
 #include <ctime>

@@ -22,9 +23,8 @@ namespace llm {
 // and a ~5% improvement on Galaxy S22 by switching to
 // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
 TextDecoderRunner::TextDecoderRunner(Module &module, IOManager *io_manager,
-
-    : module_(&module), io_manager_(io_manager),
-      topp_(topp) {}
+                                     const GenerationConfig &config)
+    : module_(&module), io_manager_(io_manager), config_(config) {}
 
 // This function is functional, meaning it shouldn't modify any state of the
 // input. It should be safe to call multiple times with the same inputs. The

@@ -82,6 +82,34 @@ TextDecoderRunner::step(TensorPtr &tokens, int64_t start_pos) {
   }
 }
 
+int32_t TextDecoderRunner::logits_to_token(
+    const executorch::aten::Tensor &logits_tensor,
+    const std::vector<uint64_t> &recent_tokens) {
+  int32_t result = 0;
+
+  struct {
+    [[noreturn]] void fail(torch::executor::Error) {
+      ET_CHECK_MSG(false, "Unsupported dtype in logits_to_token");
+    }
+  } ctx;
+
+  ET_SWITCH_FOUR_TYPES(
+      Float, Half, BFloat16, UInt16, logits_tensor.scalar_type(), ctx,
+      "logits_to_token", CTYPE, [&]() {
+        auto *logits = logits_tensor.mutable_data_ptr<CTYPE>();
+        ssize_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1);
+        if (logits_tensor.dim() == 3) {
+          auto num_tokens = logits_tensor.size(1);
+          logits += (num_tokens - 1) * vocab_size;
+        }
+        Sampler sampler(vocab_size, config_.temperature, config_.topp,
+                        static_cast<unsigned long long>(std::time(nullptr)),
+                        config_.min_p, config_.repetition_penalty);
+        result = sampler.sample(logits, recent_tokens);
+      });
+  return result;
+}
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch
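The dim() == 3 branch above advances the logits pointer to the final token's row before sampling. A tiny sketch of that offset arithmetic on a stand-in buffer (the surrounding macro dispatch is omitted; names here are illustrative):

#include <cstddef>
#include <cstdio>

int main() {
  // For a [1, num_tokens, vocab] logits layout, only the last token's row
  // feeds the sampler: skip (num_tokens - 1) full vocab-sized rows.
  const std::ptrdiff_t num_tokens = 3, vocab_size = 5;
  float logits[num_tokens * vocab_size] = {}; // stand-in for tensor storage
  logits[(num_tokens - 1) * vocab_size + 2] = 9.0f;

  const float *row = logits + (num_tokens - 1) * vocab_size;
  // row[0..vocab_size) is what Sampler::sample() receives.
  std::printf("%.1f\n", row[2]); // 9.0
}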