npm - react-native-executorch - Versions diffs - 0.5.15 → 0.6.0-nightly-897eae9-20251213 - Mend

react-native-executorch 0.5.15 → 0.6.0-nightly-897eae9-20251213

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (277) hide show

package/common/runner/runner.cpp CHANGED Viewed

@@ -4,6 +4,7 @@
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
  */
 // A simple llama2 runner that includes preprocessing and post processing logic.
@@ -21,8 +22,6 @@ using ::executorch::extension::Module;
 using ::executorch::runtime::Error;
 using ::executorch::runtime::Result;
-namespace llm = ::executorch::extension::llm;
 std::string loadBytesFromFile(const std::string &path) {
   std::ifstream fs(path, std::ios::in | std::ios::binary);
   if (fs.fail()) {
@@ -39,7 +38,6 @@ std::string loadBytesFromFile(const std::string &path) {
 namespace {
 static constexpr auto kEnableDynamicShape = "enable_dynamic_shape";
-static constexpr auto kBosId = "get_bos_id";
 static constexpr auto kEosIds = "get_eos_ids";
 static constexpr auto kMaxSeqLen = "get_max_seq_len";
 static constexpr auto kMaxContextLen = "get_max_context_len";
@@ -48,29 +46,16 @@ static constexpr auto kUseKVCache = "use_kv_cache";
 static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
 } // namespace
-Runner::Runner(const std::string &model_path, const std::string &tokenizer_path,
-               const float temperature,
-               std::optional<const std::string> data_path)
-    // NOTE: we observed ~2x loading performance increase on iPhone 15
-    // and a ~5% improvement on Galaxy S22 by switching to
-    // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
-    : temperature_(temperature), tokenizer_path_(tokenizer_path),
+Runner::Runner(Module *module, const std::string &tokenizer_path,
+               const llm::GenerationConfig &config)
+    : config_(config), module_(module), tokenizer_path_(tokenizer_path),
       metadata_({
           {kEnableDynamicShape, false},
           {kMaxSeqLen, 128},
           {kMaxContextLen, 128},
           {kUseKVCache, true},
           {kUseSDPAWithKVCache, false},
-      }) {
-  if (data_path.has_value()) {
-    module_ = std::make_unique<Module>(model_path, data_path.value(),
-                                       Module::LoadMode::File);
-  } else {
-    module_ = std::make_unique<Module>(model_path, Module::LoadMode::File);
-  }
-  ET_LOG(Info, "Creating LLaMa runner: model_path=%s, tokenizer_path=%s",
-         model_path.c_str(), tokenizer_path.c_str());
-}
+      }) {}
 bool Runner::is_loaded() const {
   return module_->is_loaded() && tokenizer_ && text_decoder_runner_ &&
@@ -81,9 +66,10 @@ Error Runner::load() {
   if (is_loaded()) {
     return Error::Ok;
   }
   ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
-  // load tokenizer.
+  // Load tokenizer.
   auto blob = loadBytesFromFile(tokenizer_path_);
   tokenizer_ = tokenizers::Tokenizer::FromBlobJSON(blob);
@@ -92,9 +78,9 @@ Error Runner::load() {
   auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
   metadata_[kVocabSize] = tokenizer_->GetVocabSize();
+  // Load model metadata
   const auto method_names =
       ET_UNWRAP(module_->method_names(), "Failed reading method names");
   for (auto &pair : metadata_) {
     const auto &method_name = pair.first;
     auto &value = pair.second;
@@ -103,11 +89,13 @@ Error Runner::load() {
                   .toScalar()
                   .to<decltype(metadata_)::mapped_type>();
     } else {
-      ET_LOG(Info, "Methond %s not found, using the default value %" PRId64,
+      ET_LOG(Info, "Method %s not found, using the default value %" PRId64,
              method_name.c_str(), value);
     }
     ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value);
   }
+  // Load EOS token ids
   if (method_names.count(kEosIds)) {
     eos_ids->clear();
     for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) {
@@ -116,15 +104,34 @@ Error Runner::load() {
       ET_LOG(Info, "eos_id = %" PRId64, value);
     }
   }
+  // Determine missing config values
+  // If user does not directly specify configuration parameters such as
+  // max_seq_len (i.e. leaves them as default values), they are determined by
+  // reading the exported model's methods.
+  if (config_.max_seq_len < 0)
+    config_.max_seq_len = static_cast<int32_t>(metadata_.at(kMaxSeqLen));
+  if (config_.max_context_length < 0)
+    config_.max_context_length =
+        static_cast<int32_t>(metadata_.at(kMaxContextLen));
+  if (config_.max_new_tokens < 0)
+    config_.max_new_tokens =
+        std::min(config_.max_seq_len, config_.max_context_length);
+  if (config_.enable_dynamic_shape)
+    config_.enable_dynamic_shape =
+        static_cast<bool>(metadata_.at(kEnableDynamicShape));
+  if (config_.enable_kv_cache)
+    config_.enable_kv_cache = static_cast<bool>(metadata_.at(kUseKVCache));
+  io_manager_ = std::make_unique<llm::IOManager>(*module_);
   text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
-      module_.get(), metadata_.at(kUseKVCache), metadata_.at(kVocabSize),
-      temperature_);
+      module_, io_manager_.get(), config_.temperature, config_.topp);
   text_prefiller_ = std::make_unique<llm::TextPrefiller>(
-      text_decoder_runner_.get(), metadata_.at(kUseKVCache),
-      metadata_.at(kEnableDynamicShape));
+      text_decoder_runner_.get(), config_.enable_kv_cache,
+      config_.enable_dynamic_shape, config_.max_seq_len);
   text_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
-      tokenizer_.get(), text_decoder_runner_.get(), metadata_.at(kUseKVCache),
+      tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache,
       std::move(eos_ids), &stats_);
   return Error::Ok;
@@ -139,9 +146,9 @@ Error Runner::load() {
   }
 Error Runner::generate(const std::string &prompt,
+                       const llm::GenerationConfig &generation_config,
                        std::function<void(const std::string &)> token_callback,
-                       std::function<void(const llm::Stats &)> stats_callback,
-                       bool echo, bool warmup) {
+                       std::function<void(const llm::Stats &)> stats_callback) {
   // Prepare the inputs.
   // Use ones-initialized inputs.
   ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
@@ -151,17 +158,18 @@ Error Runner::generate(const std::string &prompt,
     stats_.model_load_end_ms = llm::time_in_ms();
   }
-  if (warmup) {
+  if (generation_config.warming) {
     ET_LOG(Info, "Doing a warmup run...");
   }
-  RUNNER_ET_LOG(warmup, "RSS after loading model: %f MiB (0 if unsupported)",
+  RUNNER_ET_LOG(generation_config.warming,
+                "RSS after loading model: %f MiB (0 if unsupported)",
                 llm::get_rss_bytes() / 1024.0 / 1024.0);
   // Wrap the token_callback with print function
   std::function<void(const std::string &)> wrapped_callback =
-      [token_callback, warmup](const std::string &piece) {
-        if (!warmup) {
+      [token_callback, &generation_config](const std::string &piece) {
+        if (!generation_config.warming) {
           llm::safe_printf(piece.c_str());
           fflush(stdout);
         }
@@ -175,10 +183,23 @@ Error Runner::generate(const std::string &prompt,
   stats_.inference_start_ms = llm::time_in_ms();
   shouldStop_ = false;
-  // Set the sequence length to the max seq length if not provided
-  int32_t seq_len = (seq_len > 0 && seq_len <= metadata_.at(kMaxSeqLen))
-                        ? seq_len
-                        : metadata_.at(kMaxSeqLen);
+  // Override main config fields with given generation config if specified
+  int32_t max_seq_len = generation_config.max_seq_len >= 0
+                            ? generation_config.max_seq_len
+                            : config_.max_seq_len;
+  int32_t max_context_length = generation_config.max_context_length >= 0
+                                   ? generation_config.max_context_length
+                                   : config_.max_context_length;
+  int32_t new_tokens_limit = generation_config.max_new_tokens >= 0
+                                 ? generation_config.max_new_tokens
+                                 : config_.max_new_tokens;
+  float temperature = generation_config.temperature >= 0.F
+                          ? generation_config.temperature
+                          : config_.temperature;
+  float topp =
+      generation_config.topp >= 0.F ? generation_config.topp : config_.topp;
+  int64_t context_len_left = static_cast<int64_t>(max_context_length) - pos_;
   std::vector<int32_t> prompt_tokens = tokenizer_->Encode(prompt);
   std::vector<uint64_t> prompt_tokens_uint64(prompt_tokens.begin(),
@@ -187,30 +208,38 @@ Error Runner::generate(const std::string &prompt,
   // encode the (string) prompt into tokens sequence
   int num_prompt_tokens = prompt_tokens.size();
-  if (num_prompt_tokens < 1) {
-    ET_LOG(Error,
-           "num_prompt_tokens %d < 1, expected at least 1 token to be passed "
-           "to generate()!",
-           num_prompt_tokens);
-    return Error::InvalidArgument;
-  } else if (num_prompt_tokens >= seq_len) {
-    ET_LOG(Error,
-           "num_prompt_tokens %d >= seq_len %d, Sequence length exceeded - "
-           "please increase the seq_len value passed to generate()!",
-           num_prompt_tokens, seq_len);
-    return Error::InvalidArgument;
-  }
+  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens >= 1, InvalidArgument,
+                           "Expected at least 1 prompt token");
+  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < max_seq_len, InvalidArgument,
+                           "num_prompt_tokens %d >= max_context_len %" PRId32
+                           ", Max seq length exceeded - please increase max "
+                           "seq len value in your export script",
+                           num_prompt_tokens, max_seq_len);
+  // Determine max_new_tokens using the runner's resolve_max_new_tokens helper.
+  // pos_ is already accounted for through context_len_left.
+  int32_t max_new_tokens = resolve_max_new_tokens(
+      num_prompt_tokens, max_seq_len, static_cast<int32_t>(context_len_left),
+      new_tokens_limit);
+  ET_LOG(Info,
+         "Max new tokens resolved: %d, given pos_ %" PRId64
+         ", num_prompt_tokens %zu, max_context_len %" PRId64,
+         max_new_tokens, pos_, prompt_tokens.size(),
+         static_cast<int64_t>(max_context_length));
+  ET_CHECK_OR_RETURN_ERROR(max_new_tokens > 0, InvalidArgument,
+                           "Max new tokens %d is less than or equal to 0",
+                           max_new_tokens);
   // Prefill first
   // Here feed all tokens to the model and get the next predicted token
   // after the prompt. After that we will enter generate loop.
   // print prompts
-  if (echo) {
+  if (generation_config.echo) {
     wrapped_callback(prompt);
   }
-  int64_t pos = 0;
-  auto prefill_res = text_prefiller_->prefill(prompt_tokens_uint64, pos);
+  auto prefill_res = text_prefiller_->prefill(prompt_tokens_uint64, pos_);
   stats_.first_token_ms = llm::time_in_ms();
   stats_.prompt_eval_end_ms = llm::time_in_ms();
   ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
@@ -219,30 +248,36 @@ Error Runner::generate(const std::string &prompt,
   // print the first token from prefill. No prev_token so use cur_token for it.
   const std::string cur_decoded =
       tokenizer_->Decode(std::vector<int32_t>{static_cast<int32_t>(cur_token)});
-  RUNNER_ET_LOG(warmup, "RSS after prompt prefill: %f MiB (0 if unsupported)",
+  RUNNER_ET_LOG(generation_config.warming,
+                "RSS after prompt prefill: %f MiB (0 if unsupported)",
                 llm::get_rss_bytes() / 1024.0 / 1024.0);
   // start the main loop
   prompt_tokens_uint64.push_back(cur_token);
   int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
-      prompt_tokens_uint64, num_prompt_tokens, seq_len, wrapped_callback));
+      prompt_tokens_uint64, pos_, max_new_tokens - 1, temperature, topp,
+      wrapped_callback));
+  pos_ += num_generated_tokens;
   stats_.inference_end_ms = llm::time_in_ms();
-  if (!warmup) {
+  if (!generation_config.warming) {
     printf("\n");
   }
   RUNNER_ET_LOG(
-      warmup, "RSS after finishing text generation: %f MiB (0 if unsupported)",
+      generation_config.warming,
+      "RSS after finishing text generation: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);
-  if (num_prompt_tokens + num_generated_tokens == seq_len) {
-    RUNNER_ET_LOG(warmup, "Sequence length (%i tokens) reached!", seq_len);
+  if (num_generated_tokens == max_new_tokens) {
+    RUNNER_ET_LOG(generation_config.warming, "Max new tokens %i reached!",
+                  max_new_tokens);
   }
   stats_.num_prompt_tokens = num_prompt_tokens;
   stats_.num_generated_tokens = num_generated_tokens;
-  if (warmup) {
+  if (generation_config.warming) {
     ET_LOG(Info, "Warmup run finished!");
   } else {
     // Do not print report during warmup
@@ -256,12 +291,17 @@ Error Runner::generate(const std::string &prompt,
 }
 Error Runner::warmup(const std::string &prompt) {
-  Error err = generate(prompt,
+  // Create a GenerationConfig for warmup
+  llm::GenerationConfig config{.echo = false, .warming = true};
+  // Call generate with the warmup config
+  Error err = generate(prompt, config,
                        /*token_callback=*/nullptr,
-                       /*stats_callbak=*/nullptr,
-                       /*echo=*/false,
-                       /*warmup=*/true);
-  stats_.reset();
+                       /*stats_callbak=*/nullptr);
+  // Reset stats and the KV cache position (pos_) after warmup
+  reset();
   return err;
 }
@@ -273,6 +313,11 @@ void Runner::stop() {
   }
 }
+// Returns the runner to its initial per-session state: the KV cache position
+// goes back to 0 and the collected stats are cleared.
+void Runner::reset() {
+  pos_ = 0;
+  stats_.reset();
+}
 void Runner::set_count_interval(size_t count_interval) {
   text_token_generator_->set_count_interval(count_interval);
 }
@@ -281,4 +326,44 @@ void Runner::set_time_interval(size_t time_interval) {
   text_token_generator_->set_time_interval(time_interval);
 }
+// Stores the new sampling temperature in the main config and, when the text
+// decoder runner has already been created, forwards the value to it as well.
+void Runner::set_temperature(float temperature) noexcept {
+  config_.temperature = temperature;
+  if (!text_decoder_runner_) {
+    // Decoder not created yet; only the config is updated.
+    return;
+  }
+  text_decoder_runner_->set_temperature(temperature);
+}
+// Stores the new top-p value in the main config and, when the text decoder
+// runner has already been created, forwards the value to it as well.
+void Runner::set_topp(float topp) noexcept {
+  config_.topp = topp;
+  if (!text_decoder_runner_) {
+    // Decoder not created yet; only the config is updated.
+    return;
+  }
+  text_decoder_runner_->set_topp(topp);
+}
+// Computes how many new tokens may be generated for the current request.
+//
+// num_prompt_tokens: number of tokens in the encoded prompt.
+// max_seq_len:       cap on prompt + generated tokens; negative means unset.
+// max_context_len:   context budget available to this call.
+// max_new_tokens:    explicit cap on generated tokens; negative means unset.
+//
+// Returns the smallest applicable limit, clamped so it is never negative.
+int32_t Runner::resolve_max_new_tokens(int32_t num_prompt_tokens,
+                                       int32_t max_seq_len,
+                                       int32_t max_context_len,
+                                       int32_t max_new_tokens) const {
+  // Any negative value counts as "unset", matching the `< 0` / `>= 0` checks
+  // used in load() and generate(). Previously only exactly -1 was recognised,
+  // so other negative values were treated as real limits and collapsed to 0.
+  const int32_t total_limit =
+      max_seq_len < 0 ? max_context_len
+                      : std::min(max_seq_len, max_context_len);
+  int32_t result = total_limit - num_prompt_tokens;
+  if (max_new_tokens >= 0) {
+    // An explicit cap on generated tokens can only tighten the budget.
+    result = std::min(result, max_new_tokens);
+  }
+  // Ensure result is not negative
+  return std::max(0, result);
+}
 } // namespace example

package/common/runner/runner.h CHANGED Viewed

@@ -27,42 +27,59 @@
 namespace example {
-class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner {
+namespace llm = ::executorch::extension::llm;
+class Runner : public llm::IRunner {
 public:
-  explicit Runner(const std::string &model_path,
+  explicit Runner(::executorch::extension::Module *module,
                   const std::string &tokenizer_path,
-                  const float temperature = 0.8f,
-                  std::optional<const std::string> data_path = std::nullopt);
+                  const llm::GenerationConfig &config = {
+                      .temperature = 0.8F, .topp = 0.9F}); // The main config
-  bool is_loaded() const;
-  ::executorch::runtime::Error load();
-  ::executorch::runtime::Error
-  generate(const std::string &prompt,
-           std::function<void(const std::string &)> token_callback = {},
-           std::function<void(const ::executorch::extension::llm::Stats &)>
-               stats_callback = {},
-           bool echo = true, bool warming = false);
+  bool is_loaded() const override;
+  ::executorch::runtime::Error load() override;
+  ::executorch::runtime::Error generate(
+      const std::string &prompt,
+      const llm::GenerationConfig &generation_config =
+          {}, // An extra config which temporarily overrides previous model
+              // settings
+      std::function<void(const std::string &)> token_callback = {},
+      std::function<void(const llm::Stats &)> stats_callback = {}) override;
   ::executorch::runtime::Error warmup(const std::string &prompt);
   void set_count_interval(size_t count_interval);
   void set_time_interval(size_t time_interval);
-  void stop();
+  void set_temperature(float temperature) noexcept;
+  void set_topp(float topp) noexcept;
+  void stop() override;
+  void reset() override;
-  ::executorch::extension::llm::Stats stats_;
+  llm::Stats stats_;
 private:
-  float temperature_;
+  // Helper functions
+  int32_t resolve_max_new_tokens(int32_t num_prompt_tokens, int32_t max_seq_len,
+                                 int32_t max_context_len,
+                                 int32_t max_new_tokens = -1) const;
+  // Main config
+  llm::GenerationConfig config_;
+  // Flow control
   bool shouldStop_{false};
+  int64_t pos_ = 0; // The position in KV cache of the input, starting from 0.
+  // Main model
+  ::executorch::extension::Module *module_;
-  // model
-  std::unique_ptr<::executorch::extension::Module> module_;
+  // Subcomponents
   std::string tokenizer_path_;
   std::unique_ptr<tokenizers::Tokenizer> tokenizer_;
   std::unordered_map<std::string, int64_t> metadata_;
-  std::unique_ptr<::executorch::extension::llm::TextDecoderRunner>
-      text_decoder_runner_;
-  std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller_;
-  std::unique_ptr<::executorch::extension::llm::TextTokenGenerator>
-      text_token_generator_;
+  std::unique_ptr<llm::IOManager> io_manager_;
+  std::unique_ptr<llm::TextDecoderRunner> text_decoder_runner_;
+  std::unique_ptr<llm::TextPrefiller> text_prefiller_;
+  std::unique_ptr<llm::TextTokenGenerator> text_token_generator_;
 };
 } // namespace example

package/common/runner/sampler.cpp CHANGED Viewed

@@ -34,6 +34,7 @@
 #include "sampler.h"
 #include <algorithm>
+#include <ctime>
 namespace executorch {
 namespace extension {
@@ -121,9 +122,14 @@ int32_t Sampler::sample_topp(T *probabilities, float coin) {
 Sampler::Sampler(int vocab_size, float temperature, float topp,
                  unsigned long long rng_seed)
     : vocab_size_(vocab_size),
-      inv_temperature_(static_cast<bool>(temperature) ? 1.0f / temperature : 0),
+      inv_temperature_((temperature != 0.0f) ? (1.0f / temperature) : 0.0f),
       topp_(topp), rng_state_(rng_seed) {}
+// Convenience constructor: same as the seeded constructor, with the RNG seed
+// taken from the current time.
+Sampler::Sampler(int vocab_size, float temperature, float topp)
+    : Sampler(vocab_size, temperature, topp,
+              static_cast<unsigned long long>(std::time(nullptr))) {}
 template <typename T> static void softmax(T *x, int size) {
   // find max value (for numerical stability)
   T max_val = x[0];
@@ -184,6 +190,7 @@ template <typename T> int32_t Sampler::sample(T *logits) {
 }
 template int32_t Sampler::sample<float>(float *logits);
+template int32_t Sampler::sample<uint16_t>(uint16_t *logits);
 template int32_t
 Sampler::sample<executorch::aten::Half>(executorch::aten::Half *logits);
 template int32_t

package/common/runner/sampler.h CHANGED Viewed

@@ -26,16 +26,18 @@ namespace extension {
 namespace llm {
 // A simple llama2 sampler.
-template <typename T> struct ET_EXPERIMENTAL ProbIndex {
+template <typename T> struct ProbIndex {
   T prob;
   int32_t index;
 }; // struct used when sorting probabilities during top-p sampling
-class ET_EXPERIMENTAL Sampler {
+class Sampler {
 public:
   Sampler(int32_t vocab_size, float temperature, float topp,
           unsigned long long rng_seed);
+  Sampler(int32_t vocab_size, float temperature, float topp);
   template <typename T> int32_t sample(T *logits);
 private:

package/common/runner/stats.h CHANGED Viewed

@@ -18,7 +18,7 @@ namespace executorch {
 namespace extension {
 namespace llm {
-struct ET_EXPERIMENTAL Stats {
+struct Stats {
   // Scaling factor for timestamps - in this case, we use ms.
   const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
   // Time stamps for the different stages of the execution
@@ -82,8 +82,6 @@ private:
   long aggregate_sampling_timer_start_timestamp = 0;
 };
-static constexpr auto kTopp = 0.9f;
 inline std::string stats_to_json_string(const Stats &stats) {
   std::stringstream ss;
   ss << "{\"prompt_tokens\":" << stats.num_prompt_tokens << ","
@@ -157,7 +155,6 @@ namespace executorch {
 namespace llm {
 // TODO(T197294990): Remove these deprecated aliases once all users have moved
 // to the new `::executorch` namespaces.
-using ::executorch::extension::llm::kTopp;
 using ::executorch::extension::llm::print_report;
 using ::executorch::extension::llm::Stats;
 } // namespace llm

package/common/runner/text_decoder_runner.cpp CHANGED Viewed

@@ -9,11 +9,11 @@
 // Given inputs, run a text decoder and return logits.
 #include "text_decoder_runner.h"
+#include "arange_util.h"
+#include "stats.h"
 #include <ctime>
-#include "stats.h"
 namespace executorch {
 namespace extension {
 namespace llm {
@@ -21,23 +21,37 @@ namespace llm {
 // NOTE: we observed ~2x loading performance increase on iPhone 15
 // and a ~5% improvement on Galaxy S22 by switching to
 // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
-TextDecoderRunner::TextDecoderRunner(Module *module, bool use_kv_cache,
-                                     int32_t vocab_size, float temperature)
-    : module_(module),
-      sampler_(std::make_unique<Sampler>(
-          vocab_size, temperature, kTopp,
-          static_cast<unsigned long long>(std::time(nullptr)))),
-      use_kv_cache_(use_kv_cache) {}
+TextDecoderRunner::TextDecoderRunner(Module *module, IOManager *io_manager,
+                                     float temperature, float topp)
+    : module_(module), io_manager_(io_manager), temperature_(temperature),
+      topp_(topp) {}
 // This function is functional, meaning it shouldn't modify any state of the
 // input. It should be safe to call multiple times with the same inputs. The
 // outer loop (call site) is responsible for managing state.
 ::executorch::runtime::Result<executorch::aten::Tensor>
-TextDecoderRunner::step(TensorPtr &tokens, TensorPtr &start_pos) {
+TextDecoderRunner::step(TensorPtr &tokens, int64_t start_pos) {
   // ET_LOG(Info, "Input token %" PRIu64, input_token);
-  if (use_kv_cache_) {
-    auto outputs_res = module_->forward({tokens, start_pos});
+  auto method_meta = ET_UNWRAP(module_->method_meta("forward"));
+  // If only 1 input, we are not using kv cache
+  bool use_kv_cache = method_meta.num_inputs() > 1;
+  std::vector<int64_t> cache_positions;
+  if (use_kv_cache) {
+    auto start_pos_tensor = ET_UNWRAP(populate_start_pos_or_cache_position(
+        module_, start_pos, cache_positions, tokens->numel(), "forward"));
+    std::vector<runtime::EValue> inputs;
+    auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor);
+    ET_CHECK_OK_OR_RETURN_ERROR(inputs_res.error());
+    inputs = inputs_res.get();
+    auto outputs_res = module_->forward(inputs);
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
+    auto update_err = io_manager_->update_decode(outputs_res.get());
+    ET_CHECK_OK_OR_RETURN_ERROR(update_err);
     ET_CHECK_MSG(outputs_res.get().size() == 1,
                  "More then one output returned from executing LLM.");
     ET_CHECK_MSG(outputs_res.get()[0].isTensor(),