npm - react-native-executorch - Versions diffs - 0.9.0 → 0.9.2 - Mend

react-native-executorch 0.9.0 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (106)

package/common/runner/multimodal_runner.cpp CHANGED

@@ -3,7 +3,6 @@
 #include "constants.h"
 #include "util.h"
 #include <rnexecutorch/Error.h>
-#include <rnexecutorch/Log.h>
 namespace executorch::extension::llm {
@@ -54,8 +53,14 @@ Error MultimodalRunner::load_subcomponents() {
   if (enc_it != encoders_.end()) {
     image_encoder = enc_it->second.get();
   }
+  IEncoder *audio_encoder = nullptr;
+  auto aud_it = encoders_.find(MultimodalType::Audio);
+  if (aud_it != encoders_.end()) {
+    audio_encoder = aud_it->second.get();
+  }
   mm_prefiller_ = std::make_unique<MultimodalPrefiller>(
-      *module_, *mm_decoder_runner_, *tokenizer_, image_encoder);
+      *module_, *mm_decoder_runner_, *tokenizer_, metadata_, image_encoder,
+      audio_encoder);
   mm_token_generator_ = std::make_unique<TextTokenGenerator>(
       tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true,
       std::move(eos_ids_), stats_ptr, config_);
@@ -78,22 +83,24 @@ Error MultimodalRunner::generate_internal(
   }
   stats_.inference_start_ms = time_in_ms();
-  uint64_t prefill_next_token = 0;
-  for (const auto &input : inputs) {
-    auto prefill_result = mm_prefiller_->prefill(input, pos_);
-    if (!prefill_result.ok())
-      return prefill_result.error();
-    prefill_next_token = prefill_result.get();
-  }
+  auto prefill_result = mm_prefiller_->prefill(inputs, pos_);
+  if (!prefill_result.ok())
+    return prefill_result.error();
+  uint64_t prefill_next_token = prefill_result.get();
   stats_.first_token_ms = time_in_ms();
   stats_.prompt_eval_end_ms = time_in_ms();
   stats_.num_prompt_tokens = pos_;
+  // For dynamic-shape PTEs (Gemma4 iter*), get_max_seq_len is the per-call
+  // decoder chunk size (e.g. 128) and the true generation budget lives in
+  // get_max_context_len. Mirrors text_runner.cpp:95-97.
+  const int32_t seq_cap = config_.enable_dynamic_shape
+                              ? config_.max_context_length
+                              : config_.max_seq_len;
   int32_t resolved_max_new = resolve_max_new_tokens(
-      static_cast<int32_t>(pos_), config_.max_seq_len,
-      config_.max_context_length, config_.max_new_tokens);
+      static_cast<int32_t>(pos_), seq_cap, config_.max_context_length,
+      config_.max_new_tokens);
   std::vector<uint64_t> seed_tokens = {prefill_next_token};
   auto wrapped_callback = [&](const std::string &piece) {

package/common/runner/multimodal_runner.h CHANGED

@@ -10,7 +10,7 @@
 namespace executorch::extension::llm {
-enum class MultimodalType { Image };
+enum class MultimodalType { Image, Audio };
 class MultimodalRunner : public BaseLLMRunner {
 public:

package/common/runner/sampler.cpp CHANGED

@@ -35,6 +35,10 @@
 #include "sampler.h"
 #include <algorithm>
 #include <ctime>
+#include <limits>
+#include <ranges>
+#include <span>
+#include <type_traits>
 #include <vector>
 namespace executorch {
@@ -46,7 +50,7 @@ template <typename T> int32_t Sampler::sample_argmax(T *probabilities) {
   // return the index that has the highest probability
   int max_i = 0;
   T max_p = probabilities[0];
-  for (int i = 1; i < vocab_size_; i++) {
+  for (size_t i = 1; i < vocab_size_; i++) {
     if (probabilities[i] > max_p) {
       max_i = i;
       max_p = probabilities[i];
@@ -60,7 +64,7 @@ int32_t Sampler::sample_mult(T *probabilities, float coin) {
   // sample index from probabilities (they must sum to 1!)
   // coin is a random number in [0, 1), usually from random_f32()
   T cdf = 0.0;
-  for (int i = 0; i < vocab_size_; i++) {
+  for (size_t i = 0; i < vocab_size_; i++) {
     cdf += probabilities[i];
     if (coin < cdf) {
       return i;
@@ -84,7 +88,7 @@ int32_t Sampler::sample_topp(T *probabilities, float coin) {
       std::make_unique<ProbIndex<T>[]>(vocab_size_);
   const float cutoff = (1.0f - topp_) / (n - 1);
-  for (int i = 0; i < n; i++) {
+  for (size_t i = 0; i < n; i++) {
     if (probabilities[i] >= cutoff) {
       probindex[n0].index = i;
       probindex[n0].prob = probabilities[i];
@@ -92,62 +96,147 @@ int32_t Sampler::sample_topp(T *probabilities, float coin) {
     }
   }
-  auto compare = [](const ProbIndex<T> &a, const ProbIndex<T> &b) {
-    return a.prob > b.prob;
-  };
-  std::sort(probindex.get(), probindex.get() + n0, compare);
+  std::sort(probindex.get(), probindex.get() + n0,
+            [](const ProbIndex<T> &a, const ProbIndex<T> &b) {
+              return a.prob > b.prob;
+            });
   // truncate the list where cumulative probability exceeds topp
   T cumulative_prob = 0;
-  int last_idx = n0 - 1; // in case of rounding errors consider all elements
-  for (int i = 0; i < n0; i++) {
+  int last_idx = n0 - 1;
+  for (size_t i = 0; i < n0; i++) {
     cumulative_prob += probindex[i].prob;
-    if (cumulative_prob > topp_) {
+    if (static_cast<float>(cumulative_prob) > topp_) {
       last_idx = i;
-      break; // we've exceeded topp by including last_idx
+      break;
     }
   }
   // sample from the truncated list
-  const T &r = coin * cumulative_prob;
+  float r = coin * static_cast<float>(cumulative_prob);
   T cdf = 0;
-  for (int i = 0; i <= last_idx; i++) {
+  for (size_t i = 0; i <= last_idx; i++) {
     cdf += probindex[i].prob;
-    if (r < cdf) {
+    if (r < static_cast<float>(cdf)) {
       return probindex[i].index;
     }
   }
-  return probindex[last_idx].index; // in case of rounding errors
+  return probindex[last_idx].index;
 }
-Sampler::Sampler(int32_t vocab_size, float temperature, float topp,
-                 unsigned long long rng_seed, float min_p,
-                 float repetition_penalty)
+// Mask logits outside the top-k by rank to -inf. Ties at the k-th boundary
+// are kept (matches HuggingFace TopKLogitsWarper).
+template <typename T> void Sampler::mask_topk(T *logits) {
+  if (topk_ <= 0 || topk_ >= vocab_size_) {
+    return;
+  }
+  // Partial-select the (topk_-th largest) threshold using nth_element on a
+  // copy of logits; O(n) average.
+  std::vector<T> scratch(logits, logits + vocab_size_);
+  std::nth_element(scratch.begin(), scratch.begin() + (topk_ - 1),
+                   scratch.end(), std::greater<T>());
+  const T threshold = scratch[topk_ - 1];
+  constexpr T neg_inf = std::numeric_limits<T>::lowest();
+  for (size_t i = 0; i < vocab_size_; i++) {
+    if (logits[i] < threshold) {
+      logits[i] = neg_inf;
+    }
+  }
+}
+// Mask logits outside the top-p nucleus to -inf. Approximates the exact
+// sort-based nucleus with a histogram over (logit - max): two O(n) passes, no
+// sort. Binning in logit (not probability) space keeps uniform resolution for
+// peaked and flat distributions alike. kRange=40 spans exp() down to ~4e-18.
+template <typename T> void Sampler::mask_topp(T *logits) {
+  if (topp_ <= 0.0f || topp_ >= 1.0f) {
+    return;
+  }
+  constexpr int32_t kBins = 2048;
+  // Compute in a type at least as wide as T so converting logits never loses
+  // precision: double stays double, everything else (float and the narrow
+  // half/bf16/uint16 logit types) widens to float. Accumulating in T directly
+  // would be unsafe for bf16, whose mantissa saturates when summing exp()
+  // over the full vocab.
+  using acc_t = std::conditional_t<std::is_same_v<T, double>, double, float>;
+  constexpr acc_t kRange = 40;
+  std::span<const T> logit_span{logits, static_cast<size_t>(vocab_size_)};
+  const acc_t max_val =
+      static_cast<acc_t>(*std::ranges::max_element(logit_span));
+  std::vector<acc_t> bin_mass(kBins, acc_t(0));
+  acc_t total = 0;
+  for (size_t i = 0; i < vocab_size_; i++) {
+    acc_t d = static_cast<acc_t>(logits[i]) - max_val;
+    acc_t e = std::exp(d);
+    total += e;
+    int32_t bin = static_cast<int32_t>((d + kRange) / kRange * kBins);
+    bin = std::clamp(bin, 0, kBins - 1);
+    bin_mass[bin] += e;
+  }
+  if (total <= acc_t(0)) {
+    return;
+  }
+  // Highest bin downward until the kept mass reaches topp. The crossing bin is
+  // kept (HuggingFace "keep the token that crosses" convention).
+  const acc_t target = static_cast<acc_t>(topp_) * total;
+  acc_t acc = 0;
+  int32_t keep_bin = 0;
+  for (int32_t bin = kBins - 1; bin >= 0; --bin) {
+    acc += bin_mass[bin];
+    if (acc >= target) {
+      keep_bin = bin;
+      break;
+    }
+  }
+  const acc_t d_threshold =
+      static_cast<acc_t>(keep_bin) / kBins * kRange - kRange;
+  constexpr T neg_inf = std::numeric_limits<T>::lowest();
+  for (size_t i = 0; i < vocab_size_; i++) {
+    if (static_cast<acc_t>(logits[i]) - max_val < d_threshold) {
+      logits[i] = neg_inf;
+    }
+  }
+}
+Sampler::Sampler(int32_t vocab_size, GenerationConfig config,
+                 unsigned long long rng_seed)
     : vocab_size_(vocab_size),
-      inv_temperature_((temperature != 0.0f) ? (1.0f / temperature) : 0.0f),
-      topp_(topp), min_p_(min_p), repetition_penalty_(repetition_penalty),
+      inv_temperature_(
+          (config.temperature != 0.0f) ? (1.0f / config.temperature) : 0.0f),
+      topp_(config.topp), min_p_(config.min_p),
+      repetition_penalty_(config.repetition_penalty), topk_(config.topk),
       rng_state_(rng_seed) {}
-Sampler::Sampler(int vocab_size, float temperature, float topp)
-    : Sampler(vocab_size, temperature, topp, std::time(nullptr), 0.0f, 1.0f) {}
+Sampler::Sampler(int32_t vocab_size, GenerationConfig config)
+    : Sampler(vocab_size, config, std::time(nullptr)) {}
 template <typename T> static void softmax(T *x, int size) {
-  // find max value (for numerical stability)
+  // Runs after top-k/top-p masking, which sets rejected logits to lowest().
+  // Skip exp() on those: it underflows to 0 anyway and is slow on device.
+  constexpr T kMasked = std::numeric_limits<T>::lowest();
   T max_val = x[0];
-  for (int i = 1; i < size; i++) {
+  for (size_t i = 1; i < size; i++) {
     if (x[i] > max_val) {
       max_val = x[i];
     }
   }
-  // exp and sum
   T sum = 0;
-  for (int i = 0; i < size; i++) {
+  for (size_t i = 0; i < size; i++) {
+    if (x[i] == kMasked) {
+      x[i] = T(0);
+      continue;
+    }
     x[i] = expf(x[i] - max_val);
     sum += x[i];
   }
-  // normalize
-  for (int i = 0; i < size; i++) {
-    x[i] /= sum;
+  for (size_t i = 0; i < size; i++) {
+    if (x[i] != T(0)) {
+      x[i] /= sum;
+    }
   }
 }
@@ -175,20 +264,18 @@ int32_t Sampler::sample(T *logits, const std::vector<uint64_t> &recent_tokens) {
     apply_repetition_penalty(logits, vocab_size_, recent_tokens);
     // 2. apply the temperature to the logits
     apply_temperature(logits, vocab_size_);
-    // 3. apply softmax to the logits to get the probabilities for next token
+    // 3. mask out logits outside top-k by rank (pre-softmax, becomes 0 mass)
+    mask_topk(logits);
+    // 4. mask out logits outside top-p by rank (pre-softmax)
+    mask_topp(logits);
+    // 5. apply softmax to the logits to get the probabilities for next token
     softmax(logits, vocab_size_);
-    // 4. apply min_p truncation
+    // 6. apply min_p truncation
     apply_min_p(logits, vocab_size_);
     // flip a (float) coin (this is our source of entropy for sampling)
     float coin = random_f32(&rng_state_);
-    // 5. we sample from this distribution to get the next token
-    if (topp_ <= 0 || topp_ >= 1) {
-      // simply sample from the predicted probability distribution
-      next = sample_mult(logits, coin);
-    } else {
-      // top-p (nucleus) sampling, clamping the least likely tokens to zero
-      next = sample_topp(logits, coin);
-    }
+    // 7. we sample from this distribution to get the next token
+    next = sample_mult(logits, coin);
   }
   return next;
 }

package/common/runner/sampler.h CHANGED

@@ -8,6 +8,7 @@
 #pragma once
+#include "runner/irunner.h"
 #include <algorithm>
 #include <cctype>
 #include <cmath>
@@ -28,6 +29,7 @@ namespace executorch {
 namespace extension {
 namespace llm {
 // A simple llama2 sampler.
+struct GenerationConfig;
 inline constexpr auto kTopp = 0.9f;
@@ -38,11 +40,13 @@ template <typename T> struct ProbIndex {
 class Sampler {
 public:
-  Sampler(int32_t vocab_size, float temperature, float topp,
-          unsigned long long rng_seed, float min_p = 0.0f,
-          float repetition_penalty = 1.0f);
-  Sampler(int32_t vocab_size, float temperature, float topp);
+  // topk <= 0 disables top-k filtering. topp <= 0 || topp >= 1 disables top-p.
+  // Pipeline when temperature != 0: temperature -> top-k mask -> top-p mask
+  // -> softmax -> multinomial. Note: topk == 1 with temperature != 0 collapses
+  // to greedy; pass topk = 0 to keep full-vocab temperature sampling.
+  Sampler(int32_t vocab_size, GenerationConfig config,
+          unsigned long long rng_seed);
+  Sampler(int32_t vocab_size, GenerationConfig config);
   template <typename T> int32_t sample(T *logits);
@@ -53,6 +57,9 @@ private:
   template <typename T> int32_t sample_topp(T *probabilities, float coin);
   template <typename T> int32_t sample_mult(T *probabilities, float coin);
   template <typename T> int32_t sample_argmax(T *probabilities);
+  // In-place logit warpers: set excluded indices to -inf.
+  template <typename T> void mask_topk(T *logits);
+  template <typename T> void mask_topp(T *logits);
   template <typename T>
   inline void apply_temperature(T *logits, int32_t vocab_size) {
@@ -110,6 +117,7 @@ private:
   float topp_;
   float min_p_;
   float repetition_penalty_;
+  int32_t topk_;
   unsigned long long rng_state_;
 };

package/common/runner/text_decoder_runner.cpp CHANGED

@@ -31,7 +31,6 @@ TextDecoderRunner::TextDecoderRunner(Module &module, IOManager *io_manager,
 // outer loop (call site) is responsible for managing state.
 ::executorch::runtime::Result<executorch::aten::Tensor>
 TextDecoderRunner::step(TensorPtr &tokens, int64_t start_pos) {
-  // ET_LOG(Info, "Input token %" PRIu64, input_token);
   auto method_meta_result = module_->method_meta("forward");
   if (!method_meta_result.ok()) {
     return method_meta_result.error();
@@ -102,9 +101,7 @@ int32_t TextDecoderRunner::logits_to_token(
           auto num_tokens = logits_tensor.size(1);
           logits += (num_tokens - 1) * vocab_size;
         }
-        Sampler sampler(vocab_size, config_.temperature, config_.topp,
-                        static_cast<unsigned long long>(std::time(nullptr)),
-                        config_.min_p, config_.repetition_penalty);
+        Sampler sampler(vocab_size, config_);
         result = sampler.sample(logits, recent_tokens);
       });
   return result;

package/common/runner/text_decoder_runner.h CHANGED

@@ -10,6 +10,7 @@
 #pragma once
+#include "constants.h"
 #include "io_manager.h"
 #include "sampler.h"
@@ -40,8 +41,8 @@ public:
   step(TensorPtr &input, int64_t start_pos);
   /**
-   * Load the Module for text decode purpose.
-   * @return The error code.
+   * Load the Module for text decode purpose. Loads the dynamic-shape `forward`
+   * method used for both prefill and decode.
    */
   virtual ::executorch::runtime::Error load() {
     return module_->load_method("forward");

package/common/runner/text_prefiller.cpp CHANGED

@@ -18,10 +18,11 @@ namespace llm {
 TextPrefiller::TextPrefiller(TextDecoderRunner *text_decoder_runner,
                              bool use_kv_cache, bool enable_parallel_prefill,
-                             int64_t max_seq_len)
+                             int64_t max_seq_len, int32_t prefill_chunk_size)
     : text_decoder_runner_(text_decoder_runner), use_kv_cache_(use_kv_cache),
       enable_parallel_prefill_(enable_parallel_prefill),
-      max_seq_len_(max_seq_len > 0 ? max_seq_len : 128) {}
+      max_seq_len_(max_seq_len > 0 ? max_seq_len : 128),
+      prefill_chunk_size_(prefill_chunk_size) {}
 ::executorch::runtime::Result<uint64_t>
 TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens,
@@ -31,17 +32,17 @@ TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens,
     ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load());
   }
-  // Check if we need to chunk the prompt tokens
   int32_t num_prompt_tokens = prompt_tokens.size();
+  int32_t chunk_size =
+      prefill_chunk_size_ > 0 ? prefill_chunk_size_ : max_seq_len_;
-  // If prompt tokens exceed max_seq_len_, we need to chunk them
-  if (num_prompt_tokens > max_seq_len_) {
+  if (num_prompt_tokens > chunk_size) {
     uint64_t cur_token = 0;
     int num_tokens_to_process = 0;
     while (num_tokens_to_process < num_prompt_tokens) {
-      auto num_tokens_to_prefill_with = std::min<int>(
-          num_prompt_tokens - num_tokens_to_process, max_seq_len_);
+      auto num_tokens_to_prefill_with =
+          std::min<int>(num_prompt_tokens - num_tokens_to_process, chunk_size);
       std::vector<uint64_t> prompt_tokens_to_process(
           num_tokens_to_prefill_with);
@@ -75,7 +76,6 @@ TextPrefiller::prefill_chunk(std::vector<uint64_t> &prompt_tokens,
   // store the token
   uint64_t cur_token;
   if (enable_parallel_prefill_ || !use_kv_cache_) {
-    // initialize tensor wrappers
     auto tokens = from_blob(prompt_tokens.data(), {1, num_prompt_tokens},
                             executorch::aten::ScalarType::Long);

package/common/runner/text_prefiller.h CHANGED

@@ -19,8 +19,14 @@ namespace llm {
 class TextPrefiller {
 public:
+  // prefill_chunk_size: when > 0, the prompt is always processed in steps of
+  // this size (see prefill()). Set to the model's forward sequence-length cap
+  // for the MLX backend (its forward is exported with a sliding-window bound
+  // and one-shot prefill spikes Metal memory). Other backends (XNNPACK/CoreML)
+  // pass 0 → original one-shot behavior.
   TextPrefiller(TextDecoderRunner *text_decoder_runner, bool use_kv_cache,
-                bool enable_parallel_prefill, int64_t max_seq_len = 128);
+                bool enable_parallel_prefill, int64_t max_seq_len = 128,
+                int32_t prefill_chunk_size = 0);
   virtual ~TextPrefiller() = default;
   /**
@@ -70,6 +76,7 @@ private:
   bool use_kv_cache_;
   bool enable_parallel_prefill_;
   int64_t max_seq_len_;
+  int32_t prefill_chunk_size_;
 };
 } // namespace llm

package/common/runner/text_runner.cpp CHANGED

@@ -26,11 +26,24 @@ Error TextRunner::load_subcomponents() {
   Stats *stats_ptr = &stats_;
-  text_decoder_runner_ = std::make_unique<TextDecoderRunner>(
-      *module_, io_manager_.get(), config_);
+  text_decoder_runner_ =
+      std::make_unique<TextDecoderRunner>(*module_, io_manager_.get(), config_);
+  int32_t prefill_chunk_size = 0;
+  auto fwd_meta = module_->method_meta("forward");
+  if (fwd_meta.ok() && fwd_meta->uses_backend("MLXBackend")) {
+    auto input_meta = fwd_meta->input_tensor_meta(0);
+    if (input_meta.ok()) {
+      auto sizes = input_meta->sizes();
+      if (sizes.size() >= 2 && sizes[sizes.size() - 1] > 0) {
+        prefill_chunk_size = sizes[sizes.size() - 1];
+      }
+    }
+  }
   text_prefiller_ = std::make_unique<TextPrefiller>(
       text_decoder_runner_.get(), config_.enable_kv_cache,
-      config_.enable_dynamic_shape, config_.max_seq_len);
+      config_.enable_dynamic_shape, config_.max_seq_len, prefill_chunk_size);
   text_token_generator_ = std::make_unique<TextTokenGenerator>(
       tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache,
       std::move(eos_ids_), stats_ptr, config_);
@@ -65,6 +78,10 @@ Error TextRunner::generate_internal(
   stats_.inference_start_ms = time_in_ms();
+  // Multi-turn: JS re-renders the full chat history each call, so reset KV
+  // position to 0 and re-prefill from scratch.
+  pos_ = 0;
   int64_t context_len_left =
       static_cast<int64_t>(config_.max_context_length) - pos_;
@@ -79,16 +96,25 @@ Error TextRunner::generate_internal(
   std::vector<uint64_t> prompt_tokens = encodeResult.get();
   int num_prompt_tokens = prompt_tokens.size();
+  // For dynamic-shape PTEs (e.g. Gemma4 MLX/Vulkan), get_max_seq_len is the
+  // per-call decoder chunk size (e.g. the sliding window) and the real
+  // generation budget lives in get_max_context_len. Static-shape PTEs set both
+  // equal, so this collapses to the old behavior. Without this the budget is
+  // computed from the small chunk size, so max_new_tokens can resolve to ~0 and
+  // generation ends immediately after prefill.
+  const int32_t seq_cap = config_.enable_dynamic_shape
+                              ? config_.max_context_length
+                              : config_.max_seq_len;
   ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens >= 1, InvalidArgument,
                            "Expected at least 1 prompt token");
-  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < config_.max_seq_len,
-                           InvalidArgument,
-                           "num_prompt_tokens %d >= max_seq_len %" PRId32,
-                           num_prompt_tokens, config_.max_seq_len);
+  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < seq_cap, InvalidArgument,
+                           "num_prompt_tokens %d >= seq cap %" PRId32,
+                           num_prompt_tokens, seq_cap);
   int32_t max_new_tokens = resolve_max_new_tokens(
-      num_prompt_tokens, config_.max_seq_len,
-      static_cast<int32_t>(context_len_left), config_.max_new_tokens);
+      num_prompt_tokens, seq_cap, static_cast<int32_t>(context_len_left),
+      config_.max_new_tokens);
   ET_CHECK_OR_RETURN_ERROR(max_new_tokens > 0, InvalidArgument,
                            "Max new tokens %d is <= 0", max_new_tokens);

package/common/runner/text_token_generator.h CHANGED

@@ -100,8 +100,8 @@ public:
       prev_token = cur_token;
       stats_->on_sampling_begin();
-      cur_token =
-          text_decoder_runner_->logits_to_token(logits_tensor, generated_tokens);
+      cur_token = text_decoder_runner_->logits_to_token(logits_tensor,
+                                                        generated_tokens);
       stats_->on_sampling_end();
       pos++;
@@ -152,7 +152,6 @@ public:
       if (should_stop_) {
         break;
       }
       // data-dependent terminating condition: we have n_eos_ number of EOS
       if (eos_ids_->find(cur_token) != eos_ids_->end()) {
         printf("\n");

package/common/runner/util.h CHANGED

@@ -8,7 +8,6 @@
 #pragma once
 #include "constants.h"
-#include "text_prefiller.h"
 #include <cctype>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/tensor/tensor.h>

package/lib/module/constants/llmDefaults.js CHANGED

@@ -6,7 +6,7 @@ import { SlidingWindowContextStrategy } from '../utils/llms/context_strategy';
  * Default system prompt used to guide the behavior of Large Language Models (LLMs).
  * @category Utilities - LLM
  */
-export const DEFAULT_SYSTEM_PROMPT = "You are a knowledgeable, efficient, and direct AI assistant. Provide concise answers, focusing on the key information needed. Offer suggestions tactfully when appropriate to improve outcomes. Engage in productive collaboration with the user. Don't return too much text.";
+export const DEFAULT_SYSTEM_PROMPT = "You are a knowledgeable, efficient, and direct AI assistant. Provide concise answers, focusing on the key information needed. Offer suggestions tactfully when appropriate to improve outcomes. Engage in productive collaboration with the user. Don't return too much text. If provided with audio samples treat it with at most importance";
 /**
  * Generates a default structured output prompt based on the provided JSON schema.

package/lib/module/constants/llmDefaults.js.map CHANGED

	@@ -1 +1 @@
1	- {"version":3,"names":["SlidingWindowContextStrategy","DEFAULT_SYSTEM_PROMPT","DEFAULT_STRUCTURED_OUTPUT_PROMPT","structuredOutputSchema","DEFAULT_MESSAGE_HISTORY","DEFAULT_CONTEXT_BUFFER_TOKENS","DEFAULT_CHAT_CONFIG","systemPrompt","initialMessageHistory","contextStrategy"],"sourceRoot":"../../../src","sources":["constants/llmDefaults.ts"],"mappings":";;AACA,SAASA,4BAA4B,QAAQ,gCAAgC;;AAE7E;AACA;AACA;AACA;AACA,OAAO,MAAMC,qBAAqB,GAChC,+QAA+Q;;AAEjR;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,MAAMC,gCAAgC,GAC3CC,sBAA8B,IAC3B;AACL;AACA;AACA;AACA;AACA;AACA,EAAEA,sBAAsB;AACxB,CAAC;;AAED;AACA;AACA;AACA;AACA,OAAO,MAAMC,uBAAkC,GAAG,EAAE;;AAEpD;AACA;AACA;AACA;AACA,OAAO,MAAMC,6BAA6B,GAAG,GAAG;;AAEhD;AACA;AACA;AACA;AACA,OAAO,MAAMC,mBAA+B,GAAG;EAC7CC,YAAY,EAAEN,qBAAqB;EACnCO,qBAAqB,EAAEJ,uBAAuB;EAC9CK,eAAe,EAAE,IAAIT,4BAA4B,CAC/CK,6BACF;AACF,CAAC","ignoreList":[]}
1	+ {"version":3,"names":["SlidingWindowContextStrategy","DEFAULT_SYSTEM_PROMPT","DEFAULT_STRUCTURED_OUTPUT_PROMPT","structuredOutputSchema","DEFAULT_MESSAGE_HISTORY","DEFAULT_CONTEXT_BUFFER_TOKENS","DEFAULT_CHAT_CONFIG","systemPrompt","initialMessageHistory","contextStrategy"],"sourceRoot":"../../../src","sources":["constants/llmDefaults.ts"],"mappings":";;AACA,SAASA,4BAA4B,QAAQ,gCAAgC;;AAE7E;AACA;AACA;AACA;AACA,OAAO,MAAMC,qBAAqB,GAChC,+UAA+U;;AAEjV;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,MAAMC,gCAAgC,GAC3CC,sBAA8B,IAC3B;AACL;AACA;AACA;AACA;AACA;AACA,EAAEA,sBAAsB;AACxB,CAAC;;AAED;AACA;AACA;AACA;AACA,OAAO,MAAMC,uBAAkC,GAAG,EAAE;;AAEpD;AACA;AACA;AACA;AACA,OAAO,MAAMC,6BAA6B,GAAG,GAAG;;AAEhD;AACA;AACA;AACA;AACA,OAAO,MAAMC,mBAA+B,GAAG;EAC7CC,YAAY,EAAEN,qBAAqB;EACnCO,qBAAqB,EAAEJ,uBAAuB;EAC9CK,eAAe,EAAE,IAAIT,4BAA4B,CAC/CK,6BACF;AACF,CAAC","ignoreList":[]}