npm - react-native-executorch - Versions diffs - 0.9.0 → 0.9.1 - Mend

react-native-executorch 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

package/common/runner/sampler.cpp CHANGED Viewed

@@ -35,6 +35,7 @@
 #include "sampler.h"
 #include <algorithm>
 #include <ctime>
+#include <limits>
 #include <vector>
 namespace executorch {
@@ -46,7 +47,7 @@ template <typename T> int32_t Sampler::sample_argmax(T *probabilities) {
   // return the index that has the highest probability
   int max_i = 0;
   T max_p = probabilities[0];
-  for (int i = 1; i < vocab_size_; i++) {
+  for (size_t i = 1; i < vocab_size_; i++) {
     if (probabilities[i] > max_p) {
       max_i = i;
       max_p = probabilities[i];
@@ -60,7 +61,7 @@ int32_t Sampler::sample_mult(T *probabilities, float coin) {
   // sample index from probabilities (they must sum to 1!)
   // coin is a random number in [0, 1), usually from random_f32()
   T cdf = 0.0;
-  for (int i = 0; i < vocab_size_; i++) {
+  for (size_t i = 0; i < vocab_size_; i++) {
     cdf += probabilities[i];
     if (coin < cdf) {
       return i;
@@ -84,7 +85,7 @@ int32_t Sampler::sample_topp(T *probabilities, float coin) {
       std::make_unique<ProbIndex<T>[]>(vocab_size_);
   const float cutoff = (1.0f - topp_) / (n - 1);
-  for (int i = 0; i < n; i++) {
+  for (size_t i = 0; i < n; i++) {
     if (probabilities[i] >= cutoff) {
       probindex[n0].index = i;
       probindex[n0].prob = probabilities[i];
@@ -92,61 +93,138 @@ int32_t Sampler::sample_topp(T *probabilities, float coin) {
     }
   }
-  auto compare = [](const ProbIndex<T> &a, const ProbIndex<T> &b) {
-    return a.prob > b.prob;
-  };
-  std::sort(probindex.get(), probindex.get() + n0, compare);
+  std::sort(probindex.get(), probindex.get() + n0,
+            [](const ProbIndex<T> &a, const ProbIndex<T> &b) {
+              return a.prob > b.prob;
+            });
   // truncate the list where cumulative probability exceeds topp
   T cumulative_prob = 0;
-  int last_idx = n0 - 1; // in case of rounding errors consider all elements
-  for (int i = 0; i < n0; i++) {
+  int last_idx = n0 - 1;
+  for (size_t i = 0; i < n0; i++) {
     cumulative_prob += probindex[i].prob;
-    if (cumulative_prob > topp_) {
+    if (static_cast<float>(cumulative_prob) > topp_) {
       last_idx = i;
-      break; // we've exceeded topp by including last_idx
+      break;
     }
   }
   // sample from the truncated list
-  const T &r = coin * cumulative_prob;
+  float r = coin * static_cast<float>(cumulative_prob);
   T cdf = 0;
-  for (int i = 0; i <= last_idx; i++) {
+  for (size_t i = 0; i <= last_idx; i++) {
     cdf += probindex[i].prob;
-    if (r < cdf) {
+    if (r < static_cast<float>(cdf)) {
       return probindex[i].index;
     }
   }
-  return probindex[last_idx].index; // in case of rounding errors
+  return probindex[last_idx].index;
 }
-Sampler::Sampler(int32_t vocab_size, float temperature, float topp,
-                 unsigned long long rng_seed, float min_p,
-                 float repetition_penalty)
+// Mask logits outside the top-k by rank so they end up with zero probability
+// mass after softmax. Ties at the k-th boundary are kept (matches HuggingFace
+// TopKLogitsWarper).
+// Masked entries are set to std::numeric_limits<T>::lowest(), i.e. the most
+// negative finite value of T rather than a true -inf.
+// No-op when topk_ <= 0 or topk_ >= vocab_size_.
+template <typename T> void Sampler::mask_topk(T *logits) {
+  if (topk_ <= 0 || topk_ >= vocab_size_) {
+    return;
+  }
+  // Safe cast: the guard above implies 0 < topk_ < vocab_size_. Using one
+  // unsigned bound avoids signed/unsigned comparisons in the loop below.
+  const auto vocab = static_cast<size_t>(vocab_size_);
+  // Partial-select the (topk_-th largest) threshold using nth_element on a
+  // copy of logits; O(n) average. A lambda is used instead of std::greater so
+  // this does not rely on <functional> being pulled in transitively.
+  std::vector<T> scratch(logits, logits + vocab);
+  std::nth_element(scratch.begin(), scratch.begin() + (topk_ - 1),
+                   scratch.end(),
+                   [](const T &a, const T &b) { return a > b; });
+  const T threshold = scratch[topk_ - 1];
+  constexpr T neg_inf = std::numeric_limits<T>::lowest();
+  for (size_t i = 0; i < vocab; i++) {
+    // Strict '<' keeps every logit equal to the threshold (tie handling).
+    if (logits[i] < threshold) {
+      logits[i] = neg_inf;
+    }
+  }
+}
+// Mask logits whose softmax-prob falls outside the top-p nucleus.
+// Keeps the token that crosses the threshold (HuggingFace convention).
+// Masked entries are set to std::numeric_limits<T>::lowest(), i.e. the most
+// negative finite value of T rather than a true -inf.
+// No-op when topp_ <= 0 or topp_ >= 1, when the vocabulary is empty, or when
+// the softmax normaliser is not a positive number (e.g. NaN logits).
+template <typename T> void Sampler::mask_topp(T *logits) {
+  if (vocab_size_ <= 0 || topp_ <= 0.0f || topp_ >= 1.0f) {
+    return;
+  }
+  // One unsigned bound for every loop; safe because vocab_size_ > 0 here.
+  const auto vocab = static_cast<size_t>(vocab_size_);
+  // Softmax into the scratch probindex[] (do not mutate logits yet).
+  T max_val = logits[0];
+  for (size_t i = 1; i < vocab; i++) {
+    if (logits[i] > max_val) {
+      max_val = logits[i];
+    }
+  }
+  std::unique_ptr<ProbIndex<T>[]> probindex =
+      std::make_unique<ProbIndex<T>[]>(vocab);
+  T sum = 0;
+  for (size_t i = 0; i < vocab; i++) {
+    // std::exp(float) selects the float overload from <cmath>; std::expf is
+    // not declared in namespace std by some older standard libraries.
+    T e = static_cast<T>(std::exp(static_cast<float>(logits[i] - max_val)));
+    probindex[i].prob = e;
+    probindex[i].index = i;
+    sum += e;
+  }
+  // Written as !(sum > 0) so that a NaN sum also bails out here instead of
+  // reaching std::sort, whose comparator requires a strict weak ordering.
+  if (!(sum > T(0))) {
+    return;
+  }
+  for (size_t i = 0; i < vocab; i++) {
+    probindex[i].prob /= sum;
+  }
+  std::sort(probindex.get(), probindex.get() + vocab,
+            [](const ProbIndex<T> &a, const ProbIndex<T> &b) {
+              return a.prob > b.prob;
+            });
+  // Find the smallest prefix whose cumulative probability >= topp_. If
+  // rounding keeps the running sum below topp_, every token is kept.
+  T cumulative = 0;
+  size_t last_idx = vocab - 1;
+  for (size_t i = 0; i < vocab; i++) {
+    cumulative += probindex[i].prob;
+    if (static_cast<float>(cumulative) >= topp_) {
+      last_idx = i;
+      break;
+    }
+  }
+  // probindex holds each vocabulary index exactly once, so the entries after
+  // the kept prefix are precisely the tokens outside the nucleus.
+  constexpr T neg_inf = std::numeric_limits<T>::lowest();
+  for (size_t i = last_idx + 1; i < vocab; i++) {
+    logits[probindex[i].index] = neg_inf;
+  }
+}
+Sampler::Sampler(int32_t vocab_size, GenerationConfig config,
+                 unsigned long long rng_seed)
     : vocab_size_(vocab_size),
-      inv_temperature_((temperature != 0.0f) ? (1.0f / temperature) : 0.0f),
-      topp_(topp), min_p_(min_p), repetition_penalty_(repetition_penalty),
+      inv_temperature_(
+          (config.temperature != 0.0f) ? (1.0f / config.temperature) : 0.0f),
+      topp_(config.topp), min_p_(config.min_p),
+      repetition_penalty_(config.repetition_penalty), topk_(config.topk),
       rng_state_(rng_seed) {}
-Sampler::Sampler(int vocab_size, float temperature, float topp)
-    : Sampler(vocab_size, temperature, topp, std::time(nullptr), 0.0f, 1.0f) {}
+Sampler::Sampler(int32_t vocab_size, GenerationConfig config)
+    : Sampler(vocab_size, config, std::time(nullptr)) {}
 template <typename T> static void softmax(T *x, int size) {
   // find max value (for numerical stability)
   T max_val = x[0];
-  for (int i = 1; i < size; i++) {
+  for (size_t i = 1; i < size; i++) {
     if (x[i] > max_val) {
       max_val = x[i];
     }
   }
   // exp and sum
   T sum = 0;
-  for (int i = 0; i < size; i++) {
+  for (size_t i = 0; i < size; i++) {
     x[i] = expf(x[i] - max_val);
     sum += x[i];
   }
   // normalize
-  for (int i = 0; i < size; i++) {
+  for (size_t i = 0; i < size; i++) {
     x[i] /= sum;
   }
 }
@@ -175,20 +253,18 @@ int32_t Sampler::sample(T *logits, const std::vector<uint64_t> &recent_tokens) {
     apply_repetition_penalty(logits, vocab_size_, recent_tokens);
     // 2. apply the temperature to the logits
     apply_temperature(logits, vocab_size_);
-    // 3. apply softmax to the logits to get the probabilities for next token
+    // 3. mask out logits outside top-k by rank (pre-softmax, becomes 0 mass)
+    mask_topk(logits);
+    // 4. mask out logits outside the top-p nucleus, i.e. by cumulative
+    //    softmax probability rather than by rank (pre-softmax)
+    mask_topp(logits);
+    // 5. apply softmax to the logits to get the probabilities for next token
     softmax(logits, vocab_size_);
-    // 4. apply min_p truncation
+    // 6. apply min_p truncation
     apply_min_p(logits, vocab_size_);
     // flip a (float) coin (this is our source of entropy for sampling)
     float coin = random_f32(&rng_state_);
-    // 5. we sample from this distribution to get the next token
-    if (topp_ <= 0 || topp_ >= 1) {
-      // simply sample from the predicted probability distribution
-      next = sample_mult(logits, coin);
-    } else {
-      // top-p (nucleus) sampling, clamping the least likely tokens to zero
-      next = sample_topp(logits, coin);
-    }
+    // 7. we sample from this distribution to get the next token
+    next = sample_mult(logits, coin);
   }
   return next;
 }

package/common/runner/sampler.h CHANGED Viewed

@@ -8,6 +8,7 @@
 #pragma once
+#include "runner/irunner.h"
 #include <algorithm>
 #include <cctype>
 #include <cmath>
@@ -28,6 +29,7 @@ namespace executorch {
 namespace extension {
 namespace llm {
 // A simple llama2 sampler.
+struct GenerationConfig;
 inline constexpr auto kTopp = 0.9f;
@@ -38,11 +40,13 @@ template <typename T> struct ProbIndex {
 class Sampler {
 public:
-  Sampler(int32_t vocab_size, float temperature, float topp,
-          unsigned long long rng_seed, float min_p = 0.0f,
-          float repetition_penalty = 1.0f);
-  Sampler(int32_t vocab_size, float temperature, float topp);
+  // topk <= 0 disables top-k filtering. topp <= 0 || topp >= 1 disables top-p.
+  // Pipeline when temperature != 0: repetition penalty -> temperature ->
+  // top-k mask -> top-p mask -> softmax -> min_p -> multinomial. Note:
+  // topk == 1 with temperature != 0 collapses to greedy; pass topk = 0 to
+  // keep full-vocab temperature sampling.
+  Sampler(int32_t vocab_size, GenerationConfig config,
+          unsigned long long rng_seed);
+  Sampler(int32_t vocab_size, GenerationConfig config);
   template <typename T> int32_t sample(T *logits);
@@ -53,6 +57,9 @@ private:
   template <typename T> int32_t sample_topp(T *probabilities, float coin);
   template <typename T> int32_t sample_mult(T *probabilities, float coin);
   template <typename T> int32_t sample_argmax(T *probabilities);
+  // In-place logit warpers: set excluded indices to -inf.
+  template <typename T> void mask_topk(T *logits);
+  template <typename T> void mask_topp(T *logits);
   template <typename T>
   inline void apply_temperature(T *logits, int32_t vocab_size) {
@@ -110,6 +117,7 @@ private:
   float topp_;
   float min_p_;
   float repetition_penalty_;
+  int32_t topk_;
   unsigned long long rng_state_;
 };

package/common/runner/text_decoder_runner.cpp CHANGED Viewed

@@ -31,7 +31,6 @@ TextDecoderRunner::TextDecoderRunner(Module &module, IOManager *io_manager,
 // outer loop (call site) is responsible for managing state.
 ::executorch::runtime::Result<executorch::aten::Tensor>
 TextDecoderRunner::step(TensorPtr &tokens, int64_t start_pos) {
-  // ET_LOG(Info, "Input token %" PRIu64, input_token);
   auto method_meta_result = module_->method_meta("forward");
   if (!method_meta_result.ok()) {
     return method_meta_result.error();
@@ -102,9 +101,7 @@ int32_t TextDecoderRunner::logits_to_token(
           auto num_tokens = logits_tensor.size(1);
           logits += (num_tokens - 1) * vocab_size;
         }
-        Sampler sampler(vocab_size, config_.temperature, config_.topp,
-                        static_cast<unsigned long long>(std::time(nullptr)),
-                        config_.min_p, config_.repetition_penalty);
+        Sampler sampler(vocab_size, config_);
         result = sampler.sample(logits, recent_tokens);
       });
   return result;

package/common/runner/text_decoder_runner.h CHANGED Viewed

@@ -10,6 +10,7 @@
 #pragma once
+#include "constants.h"
 #include "io_manager.h"
 #include "sampler.h"
@@ -40,8 +41,8 @@ public:
   step(TensorPtr &input, int64_t start_pos);
   /**
-   * Load the Module for text decode purpose.
-   * @return The error code.
+   * Load the Module for text decode purpose. Loads the dynamic-shape `forward`
+   * method used for both prefill and decode.
    */
   virtual ::executorch::runtime::Error load() {
     return module_->load_method("forward");

package/common/runner/text_prefiller.cpp CHANGED Viewed

@@ -18,10 +18,11 @@ namespace llm {
 TextPrefiller::TextPrefiller(TextDecoderRunner *text_decoder_runner,
                              bool use_kv_cache, bool enable_parallel_prefill,
-                             int64_t max_seq_len)
+                             int64_t max_seq_len, int32_t prefill_chunk_size)
     : text_decoder_runner_(text_decoder_runner), use_kv_cache_(use_kv_cache),
       enable_parallel_prefill_(enable_parallel_prefill),
-      max_seq_len_(max_seq_len > 0 ? max_seq_len : 128) {}
+      max_seq_len_(max_seq_len > 0 ? max_seq_len : 128),
+      prefill_chunk_size_(prefill_chunk_size) {}
 ::executorch::runtime::Result<uint64_t>
 TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens,
@@ -31,17 +32,17 @@ TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens,
     ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load());
   }
-  // Check if we need to chunk the prompt tokens
   int32_t num_prompt_tokens = prompt_tokens.size();
+  int32_t chunk_size =
+      prefill_chunk_size_ > 0 ? prefill_chunk_size_ : max_seq_len_;
-  // If prompt tokens exceed max_seq_len_, we need to chunk them
-  if (num_prompt_tokens > max_seq_len_) {
+  if (num_prompt_tokens > chunk_size) {
     uint64_t cur_token = 0;
     int num_tokens_to_process = 0;
     while (num_tokens_to_process < num_prompt_tokens) {
-      auto num_tokens_to_prefill_with = std::min<int>(
-          num_prompt_tokens - num_tokens_to_process, max_seq_len_);
+      auto num_tokens_to_prefill_with =
+          std::min<int>(num_prompt_tokens - num_tokens_to_process, chunk_size);
       std::vector<uint64_t> prompt_tokens_to_process(
           num_tokens_to_prefill_with);
@@ -75,7 +76,6 @@ TextPrefiller::prefill_chunk(std::vector<uint64_t> &prompt_tokens,
   // store the token
   uint64_t cur_token;
   if (enable_parallel_prefill_ || !use_kv_cache_) {
-    // initialize tensor wrappers
     auto tokens = from_blob(prompt_tokens.data(), {1, num_prompt_tokens},
                             executorch::aten::ScalarType::Long);

package/common/runner/text_prefiller.h CHANGED Viewed

@@ -19,8 +19,14 @@ namespace llm {
 class TextPrefiller {
 public:
+  // prefill_chunk_size: when > 0, the prompt is always processed in steps of
+  // this size (see prefill()). Set to the model's forward sequence-length cap
+  // for the MLX backend (its forward is exported with a sliding-window bound
+  // and one-shot prefill spikes Metal memory). Other backends (XNNPACK/CoreML)
+  // pass 0 → original one-shot behavior.
   TextPrefiller(TextDecoderRunner *text_decoder_runner, bool use_kv_cache,
-                bool enable_parallel_prefill, int64_t max_seq_len = 128);
+                bool enable_parallel_prefill, int64_t max_seq_len = 128,
+                int32_t prefill_chunk_size = 0);
   virtual ~TextPrefiller() = default;
   /**
@@ -70,6 +76,7 @@ private:
   bool use_kv_cache_;
   bool enable_parallel_prefill_;
   int64_t max_seq_len_;
+  int32_t prefill_chunk_size_;
 };
 } // namespace llm

package/common/runner/text_runner.cpp CHANGED Viewed

@@ -26,11 +26,24 @@ Error TextRunner::load_subcomponents() {
   Stats *stats_ptr = &stats_;
-  text_decoder_runner_ = std::make_unique<TextDecoderRunner>(
-      *module_, io_manager_.get(), config_);
+  text_decoder_runner_ =
+      std::make_unique<TextDecoderRunner>(*module_, io_manager_.get(), config_);
+  int32_t prefill_chunk_size = 0;
+  auto fwd_meta = module_->method_meta("forward");
+  if (fwd_meta.ok() && fwd_meta->uses_backend("MLXBackend")) {
+    auto input_meta = fwd_meta->input_tensor_meta(0);
+    if (input_meta.ok()) {
+      auto sizes = input_meta->sizes();
+      if (sizes.size() >= 2 && sizes[sizes.size() - 1] > 0) {
+        prefill_chunk_size = sizes[sizes.size() - 1];
+      }
+    }
+  }
   text_prefiller_ = std::make_unique<TextPrefiller>(
       text_decoder_runner_.get(), config_.enable_kv_cache,
-      config_.enable_dynamic_shape, config_.max_seq_len);
+      config_.enable_dynamic_shape, config_.max_seq_len, prefill_chunk_size);
   text_token_generator_ = std::make_unique<TextTokenGenerator>(
       tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache,
       std::move(eos_ids_), stats_ptr, config_);
@@ -65,6 +78,10 @@ Error TextRunner::generate_internal(
   stats_.inference_start_ms = time_in_ms();
+  // Multi-turn: JS re-renders the full chat history each call, so reset KV
+  // position to 0 and re-prefill from scratch.
+  pos_ = 0;
   int64_t context_len_left =
       static_cast<int64_t>(config_.max_context_length) - pos_;
@@ -79,16 +96,25 @@ Error TextRunner::generate_internal(
   std::vector<uint64_t> prompt_tokens = encodeResult.get();
   int num_prompt_tokens = prompt_tokens.size();
+  // For dynamic-shape PTEs (e.g. Gemma4 MLX/Vulkan), get_max_seq_len is the
+  // per-call decoder chunk size (e.g. the sliding window) and the real
+  // generation budget lives in get_max_context_len. Static-shape PTEs set both
+  // equal, so this collapses to the old behavior. Without this the budget is
+  // computed from the small chunk size, so max_new_tokens can resolve to ~0 and
+  // generation ends immediately after prefill.
+  const int32_t seq_cap = config_.enable_dynamic_shape
+                              ? config_.max_context_length
+                              : config_.max_seq_len;
   ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens >= 1, InvalidArgument,
                            "Expected at least 1 prompt token");
-  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < config_.max_seq_len,
-                           InvalidArgument,
-                           "num_prompt_tokens %d >= max_seq_len %" PRId32,
-                           num_prompt_tokens, config_.max_seq_len);
+  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < seq_cap, InvalidArgument,
+                           "num_prompt_tokens %d >= seq cap %" PRId32,
+                           num_prompt_tokens, seq_cap);
   int32_t max_new_tokens = resolve_max_new_tokens(
-      num_prompt_tokens, config_.max_seq_len,
-      static_cast<int32_t>(context_len_left), config_.max_new_tokens);
+      num_prompt_tokens, seq_cap, static_cast<int32_t>(context_len_left),
+      config_.max_new_tokens);
   ET_CHECK_OR_RETURN_ERROR(max_new_tokens > 0, InvalidArgument,
                            "Max new tokens %d is <= 0", max_new_tokens);

package/common/runner/text_token_generator.h CHANGED Viewed

@@ -100,8 +100,8 @@ public:
       prev_token = cur_token;
       stats_->on_sampling_begin();
-      cur_token =
-          text_decoder_runner_->logits_to_token(logits_tensor, generated_tokens);
+      cur_token = text_decoder_runner_->logits_to_token(logits_tensor,
+                                                        generated_tokens);
       stats_->on_sampling_end();
       pos++;
@@ -152,7 +152,6 @@ public:
       if (should_stop_) {
         break;
       }
       // data-dependent terminating condition: we have n_eos_ number of EOS
       if (eos_ids_->find(cur_token) != eos_ids_->end()) {
         printf("\n");

package/common/runner/util.h CHANGED Viewed

@@ -8,7 +8,6 @@
 #pragma once
 #include "constants.h"
-#include "text_prefiller.h"
 #include <cctype>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/tensor/tensor.h>

package/lib/module/constants/llmDefaults.js CHANGED Viewed

@@ -6,7 +6,7 @@ import { SlidingWindowContextStrategy } from '../utils/llms/context_strategy';
  * Default system prompt used to guide the behavior of Large Language Models (LLMs).
  * @category Utilities - LLM
  */
-export const DEFAULT_SYSTEM_PROMPT = "You are a knowledgeable, efficient, and direct AI assistant. Provide concise answers, focusing on the key information needed. Offer suggestions tactfully when appropriate to improve outcomes. Engage in productive collaboration with the user. Don't return too much text.";
+export const DEFAULT_SYSTEM_PROMPT = "You are a knowledgeable, efficient, and direct AI assistant. Provide concise answers, focusing on the key information needed. Offer suggestions tactfully when appropriate to improve outcomes. Engage in productive collaboration with the user. Don't return too much text. If provided with audio samples treat it with at most importance";
 /**
  * Generates a default structured output prompt based on the provided JSON schema.

package/lib/module/constants/llmDefaults.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"names":["SlidingWindowContextStrategy","DEFAULT_SYSTEM_PROMPT","DEFAULT_STRUCTURED_OUTPUT_PROMPT","structuredOutputSchema","DEFAULT_MESSAGE_HISTORY","DEFAULT_CONTEXT_BUFFER_TOKENS","DEFAULT_CHAT_CONFIG","systemPrompt","initialMessageHistory","contextStrategy"],"sourceRoot":"../../../src","sources":["constants/llmDefaults.ts"],"mappings":";;AACA,SAASA,4BAA4B,QAAQ,gCAAgC;;AAE7E;AACA;AACA;AACA;AACA,OAAO,MAAMC,qBAAqB,GAChC,+~~QAA~~+Q;;~~AAEjR~~;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,MAAMC,gCAAgC,GAC3CC,sBAA8B,IAC3B;AACL;AACA;AACA;AACA;AACA;AACA,EAAEA,sBAAsB;AACxB,CAAC;;AAED;AACA;AACA;AACA;AACA,OAAO,MAAMC,uBAAkC,GAAG,EAAE;;AAEpD;AACA;AACA;AACA;AACA,OAAO,MAAMC,6BAA6B,GAAG,GAAG;;AAEhD;AACA;AACA;AACA;AACA,OAAO,MAAMC,mBAA+B,GAAG;EAC7CC,YAAY,EAAEN,qBAAqB;EACnCO,qBAAqB,EAAEJ,uBAAuB;EAC9CK,eAAe,EAAE,IAAIT,4BAA4B,CAC/CK,6BACF;AACF,CAAC","ignoreList":[]}
1	+ {"version":3,"names":["SlidingWindowContextStrategy","DEFAULT_SYSTEM_PROMPT","DEFAULT_STRUCTURED_OUTPUT_PROMPT","structuredOutputSchema","DEFAULT_MESSAGE_HISTORY","DEFAULT_CONTEXT_BUFFER_TOKENS","DEFAULT_CHAT_CONFIG","systemPrompt","initialMessageHistory","contextStrategy"],"sourceRoot":"../../../src","sources":["constants/llmDefaults.ts"],"mappings":";;AACA,SAASA,4BAA4B,QAAQ,gCAAgC;;AAE7E;AACA;AACA;AACA;AACA,OAAO,MAAMC,qBAAqB,GAChC,+UAA+U;;AAEjV;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,MAAMC,gCAAgC,GAC3CC,sBAA8B,IAC3B;AACL;AACA;AACA;AACA;AACA;AACA,EAAEA,sBAAsB;AACxB,CAAC;;AAED;AACA;AACA;AACA;AACA,OAAO,MAAMC,uBAAkC,GAAG,EAAE;;AAEpD;AACA;AACA;AACA;AACA,OAAO,MAAMC,6BAA6B,GAAG,GAAG;;AAEhD;AACA;AACA;AACA;AACA,OAAO,MAAMC,mBAA+B,GAAG;EAC7CC,YAAY,EAAEN,qBAAqB;EACnCO,qBAAqB,EAAEJ,uBAAuB;EAC9CK,eAAe,EAAE,IAAIT,4BAA4B,CAC/CK,6BACF;AACF,CAAC","ignoreList":[]}

package/lib/module/constants/modelRegistry.js CHANGED Viewed

@@ -26,7 +26,7 @@ import { RnExecutorchErrorCode } from '../errors/ErrorCodes';
 // Accessors are functions; calling with no opts returns the platform default.
-const BACKEND_ORDER = ['xnnpack', 'coreml', 'vulkan', 'qnn'];
+const BACKEND_ORDER = ['xnnpack', 'coreml', 'mlx', 'vulkan', 'qnn'];
 function firstBackend(variants) {
   for (const b of BACKEND_ORDER) {
     if (variants[b]) return b;
@@ -107,6 +107,32 @@ function tts(c) {
 // Per-backend variant maps for models that ship more than one backend.
 // ─────────────────────────────────────────────────────────────────────────────
+const GEMMA4_E2B_VARIANTS = {
+  mlx: {
+    base: {
+      modelName: 'gemma4-e2b',
+      modelSource: M.GEMMA4_E2B_MLX_MODEL,
+      tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
+      tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG
+    }
+  },
+  xnnpack: {
+    base: {
+      modelName: 'gemma4-e2b',
+      modelSource: M.GEMMA4_E2B_XNNPACK_MODEL,
+      tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
+      tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG
+    }
+  },
+  vulkan: {
+    base: {
+      modelName: 'gemma4-e2b',
+      modelSource: M.GEMMA4_E2B_VULKAN_MODEL,
+      tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
+      tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG
+    }
+  }
+};
 const EFFICIENTNET_V2_S_VARIANTS = {
   xnnpack: {
     base: {
@@ -331,10 +357,15 @@ export const models = {
     lfm2_5_350m: pair(M.LFM2_5_350M, M.LFM2_5_350M_QUANTIZED),
     lfm2_5_1_2b_instruct: pair(M.LFM2_5_1_2B_INSTRUCT, M.LFM2_5_1_2B_INSTRUCT_QUANTIZED),
     bielik_v3_0_1_5b: pair(M.BIELIK_V3_0_1_5B, M.BIELIK_V3_0_1_5B_QUANTIZED),
+    gemma4_e2b: variant(GEMMA4_E2B_VARIANTS, {
+      ios: 'mlx',
+      android: 'vulkan'
+    }),
     // Multimodal LLMs — same hook/module as plain LLMs, listed here so users
     // pick a model by capability ("LLM") rather than by modality.
     lfm2_5_vl_1_6b: base(M.LFM2_5_VL_1_6B_QUANTIZED),
-    lfm2_5_vl_450m: base(M.LFM2_5_VL_450M_QUANTIZED)
+    lfm2_5_vl_450m: base(M.LFM2_5_VL_450M_QUANTIZED),
+    gemma4_e2b_multimodal: base(M.GEMMA4_E2B_MM)
   },
   classification: {
     efficientnet_v2_s: variant(EFFICIENTNET_V2_S_VARIANTS)