npm - react-native-executorch - Versions diffs - 0.9.0-nightly-0e95b89-20260525 → 0.9.1 - Mend

react-native-executorch 0.9.0-nightly-0e95b89-20260525 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

package/android/libs/classes.jar CHANGED Viewed

Binary file

package/common/rnexecutorch/host_objects/JsiConversions.h CHANGED Viewed

@@ -4,6 +4,7 @@
 #include <cstdint>
 #include <set>
 #include <span>
+#include <string>
 #include <type_traits>
 #include <unordered_map>
 #include <variant>
@@ -17,6 +18,7 @@
 #include <rnexecutorch/metaprogramming/TypeConcepts.h>
 #include <rnexecutorch/models/instance_segmentation/Types.h>
+#include <rnexecutorch/models/llm/Types.h>
 #include <rnexecutorch/models/object_detection/Constants.h>
 #include <rnexecutorch/models/object_detection/Types.h>
 #include <rnexecutorch/models/ocr/Types.h>
@@ -223,6 +225,22 @@ inline std::vector<float> getValue<std::vector<float>>(const jsi::Value &val,
   return getArrayAsVector<float>(val, runtime);
 }
+template <>
+inline std::vector<std::vector<float>>
+getValue<std::vector<std::vector<float>>>(const jsi::Value &val,
+                                          jsi::Runtime &runtime) {
+  jsi::Array array = val.asObject(runtime).asArray(runtime);
+  const size_t length = array.size(runtime);
+  std::vector<std::vector<float>> result;
+  result.reserve(length);
+  for (size_t i = 0; i < length; ++i) {
+    jsi::Value element = array.getValueAtIndex(runtime, i);
+    auto span = getTypedArrayAsSpan<float>(element, runtime);
+    result.emplace_back(span.begin(), span.end());
+  }
+  return result;
+}
 template <>
 inline std::vector<int64_t>
 getValue<std::vector<int64_t>>(const jsi::Value &val, jsi::Runtime &runtime) {
@@ -302,6 +320,31 @@ getValue<std::span<uint64_t>>(const jsi::Value &val, jsi::Runtime &runtime) {
   return getTypedArrayAsSpan<uint64_t>(val, runtime);
 }
+template <>
+inline models::llm::MultimodalInputs
+getValue<models::llm::MultimodalInputs>(const jsi::Value &val,
+                                        jsi::Runtime &runtime) {
+  models::llm::MultimodalInputs multimodalInputs;
+  jsi::Object obj = val.asObject(runtime);
+  jsi::Value v = obj.getProperty(runtime, "imageToken");
+  if (!v.isUndefined() && !v.isNull()) {
+    auto &images = multimodalInputs.images.emplace();
+    images.token = getValue<std::string>(v, runtime);
+    v = obj.getProperty(runtime, "imagePaths");
+    images.paths = getValue<std::vector<std::string>>(v, runtime);
+  }
+  v = obj.getProperty(runtime, "audioToken");
+  if (!v.isUndefined() && !v.isNull()) {
+    auto &audios = multimodalInputs.audios.emplace();
+    audios.token = getValue<std::string>(v, runtime);
+    v = obj.getProperty(runtime, "audioWaveforms");
+    audios.waveforms = getValue<std::vector<std::vector<float>>>(v, runtime);
+  }
+  return multimodalInputs;
+}
 // Conversion from C++ types to jsi --------------------------------------------
 // Implementation functions might return any type, but in a promise we can only

package/common/rnexecutorch/models/llm/LLM.cpp CHANGED Viewed

@@ -1,11 +1,12 @@
 #include "LLM.h"
+#include "rnexecutorch/models/llm/Types.h"
 #include <executorch/extension/tensor/tensor.h>
 #include <filesystem>
 #include <map>
 #include <rnexecutorch/Error.h>
-#include <rnexecutorch/Log.h>
 #include <rnexecutorch/threads/GlobalThreadPool.h>
+#include <runner/encoders/audio_encoder.h>
 #include <runner/encoders/vision_encoder.h>
 #include <runner/multimodal_runner.h>
 #include <runner/text_runner.h>
@@ -21,7 +22,6 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
          std::vector<std::string> capabilities,
          std::shared_ptr<react::CallInvoker> callInvoker)
     : BaseModel(modelSource, callInvoker, Module::LoadMode::Mmap) {
   if (capabilities.empty()) {
     runner_ =
         std::make_unique<llm::TextRunner>(std::move(module_), tokenizerSource);
@@ -31,6 +31,9 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
       if (cap == "vision") {
         encoders[llm::MultimodalType::Image] =
             std::make_unique<llm::VisionEncoder>(*module_);
+      } else if (cap == "audio") {
+        encoders[llm::MultimodalType::Audio] =
+            std::make_unique<llm::AudioEncoder>(*module_);
       }
     }
     runner_ = std::make_unique<llm::MultimodalRunner>(
@@ -75,62 +78,73 @@ std::string LLM::generate(std::string input,
 }
 std::string LLM::generateMultimodal(std::string prompt,
-                                    std::vector<std::string> imagePaths,
-                                    std::string imageToken,
-                                    std::shared_ptr<jsi::Function> callback) {
+                                    std::shared_ptr<jsi::Function> callback,
+                                    MultimodalInputs mutlimodalInputs) {
   if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Runner is not loaded");
   }
   if (!runner_->is_multimodal()) {
-    throw RnExecutorchError(
-        RnExecutorchErrorCode::InvalidUserInput,
-        "This model does not support multimodal input. Use generate(prompt, "
-        "callback) for text-only generation.");
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+                            "This model does not support multimodal input.");
   }
-  if (imageToken.empty()) {
+  if (!mutlimodalInputs.images.has_value() &&
+      !mutlimodalInputs.audios.has_value()) {
     throw RnExecutorchError(
         RnExecutorchErrorCode::InvalidUserInput,
-        "imageToken must not be empty. Pass the model's image token (e.g. "
-        "from tokenizer_config.json).");
+        "At least one of imageToken/audioToken must be non-empty");
   }
-  const size_t kImageTokenLen = imageToken.size();
+  // Scan the prompt once, splitting at the earliest placeholder at each step
+  // so that image/audio placeholders can be freely interleaved in the prompt.
   std::vector<llm::MultimodalInput> inputs;
-  size_t imageIdx = 0;
-  size_t searchPos = 0;
-  while (true) {
-    size_t found = prompt.find(imageToken, searchPos);
-    if (found == std::string::npos) {
-      if (searchPos < prompt.size()) {
-        inputs.push_back(llm::make_text_input(prompt.substr(searchPos)));
-      }
+  size_t imageIdx = 0, audioIdx = 0, pos = 0;
+  while (pos < prompt.size()) {
+    size_t imgAt = mutlimodalInputs.images.has_value()
+                       ? prompt.find(mutlimodalInputs.images.value().token, pos)
+                       : std::string::npos;
+    size_t audAt = mutlimodalInputs.audios.has_value()
+                       ? prompt.find(mutlimodalInputs.audios.value().token, pos)
+                       : std::string::npos;
+    if (imgAt == std::string::npos && audAt == std::string::npos) {
+      inputs.push_back(llm::make_text_input(prompt.substr(pos)));
       break;
     }
-    // Text segment before this placeholder
-    if (found > searchPos) {
-      inputs.push_back(
-          llm::make_text_input(prompt.substr(searchPos, found - searchPos)));
+    const bool imageFirst = imgAt != std::string::npos &&
+                            (audAt == std::string::npos || imgAt < audAt);
+    size_t at = imageFirst ? imgAt : audAt;
+    if (at > pos) {
+      inputs.push_back(llm::make_text_input(prompt.substr(pos, at - pos)));
     }
-    // Image at this position
-    if (imageIdx >= imagePaths.size()) {
-      throw RnExecutorchError(
-          RnExecutorchErrorCode::InvalidUserInput,
-          "More '" + imageToken +
-              "' placeholders in prompt than image paths provided");
+    if (imageFirst) {
+      auto &images = mutlimodalInputs.images.value();
+      if (imageIdx >= images.paths.size()) {
+        throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+                                "More '" + images.token +
+                                    "' placeholders than image paths");
+      }
+      inputs.push_back(llm::make_image_input(images.paths[imageIdx++]));
+      pos = at + images.token.size();
+    } else {
+      auto &audios = mutlimodalInputs.audios.value();
+      if (audioIdx >= audios.waveforms.size()) {
+        throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+                                "More '" + audios.token +
+                                    "' placeholders than audio waveforms");
+      }
+      inputs.push_back(
+          llm::make_audio_input(std::move(audios.waveforms[audioIdx++])));
+      pos = at + audios.token.size();
     }
-    inputs.push_back(llm::make_image_input(imagePaths[imageIdx++]));
-    searchPos = found + kImageTokenLen;
   }
-  if (imageIdx < imagePaths.size()) {
-    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
-                            "More image paths provided than '" + imageToken +
-                                "' placeholders in prompt");
+  if ((mutlimodalInputs.images.has_value() &&
+       imageIdx < mutlimodalInputs.images.value().paths.size()) ||
+      (mutlimodalInputs.audios.has_value() &&
+       audioIdx < mutlimodalInputs.audios.value().waveforms.size())) {
+    throw RnExecutorchError(
+        RnExecutorchErrorCode::InvalidUserInput,
+        "More image/audio paths provided than placeholders in prompt");
   }
   if (inputs.empty()) {
     throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
                             "No inputs to generate from");
@@ -150,7 +164,6 @@ std::string LLM::generateMultimodal(std::string prompt,
   if (error != Error::Ok) {
     throw RnExecutorchError(error, "Failed to generate multimodal response");
   }
   return output;
 }

package/common/rnexecutorch/models/llm/LLM.h CHANGED Viewed

@@ -7,6 +7,7 @@
 #include <ReactCommon/CallInvoker.h>
 #include <jsi/jsi.h>
 #include <rnexecutorch/models/BaseModel.h>
+#include <rnexecutorch/models/llm/Types.h>
 #include <runner/base_llm_runner.h>
 namespace rnexecutorch {
@@ -22,10 +23,10 @@ public:
   std::string generate(std::string prompt,
                        std::shared_ptr<jsi::Function> callback);
   std::string generateMultimodal(std::string prompt,
-                                 std::vector<std::string> imagePaths,
-                                 std::string imageToken,
-                                 std::shared_ptr<jsi::Function> callback);
+                                 std::shared_ptr<jsi::Function> callback,
+                                 MultimodalInputs mutlimodalInputs = {});
   void interrupt();
   void reset();

package/common/rnexecutorch/models/llm/Types.h ADDED Viewed

@@ -0,0 +1,23 @@
+#pragma once
+#include <optional>
+#include <string>
+#include <vector>
+namespace rnexecutorch::models::llm {
+struct ImageInputs {
+  std::vector<std::string> paths;
+  std::string token;
+};
+struct AudioInputs {
+  std::vector<std::vector<float>> waveforms;
+  std::string token;
+};
+struct MultimodalInputs {
+  std::optional<ImageInputs> images;
+  std::optional<AudioInputs> audios;
+};
+} // namespace rnexecutorch::models::llm

package/common/runner/base_llm_runner.cpp CHANGED Viewed

@@ -56,11 +56,16 @@ Error BaseLLMRunner::load() {
             ? static_cast<int32_t>(metadata_.at(kMaxContextLen))
             : static_cast<int32_t>(metadata_.at(kMaxSeqLen));
   }
-  if (config_.max_new_tokens < 0)
-    config_.max_new_tokens =
-        std::min(config_.max_seq_len, config_.max_context_length);
   config_.enable_dynamic_shape =
       static_cast<bool>(metadata_.at(kEnableDynamicShape));
+  if (config_.max_new_tokens < 0) {
+    // For dynamic-shape PTEs, max_seq_len is the per-call decoder chunk
+    // size, not the generation budget — use max_context_length instead.
+    const int32_t seq_cap = config_.enable_dynamic_shape
+                                ? config_.max_context_length
+                                : config_.max_seq_len;
+    config_.max_new_tokens = std::min(seq_cap, config_.max_context_length);
+  }
   config_.enable_kv_cache = static_cast<bool>(metadata_.at(kUseKVCache));
   eos_ids_ = std::make_unique<std::unordered_set<uint64_t>>();
@@ -149,6 +154,8 @@ void BaseLLMRunner::set_repetition_penalty(float repetition_penalty) noexcept {
   config_.repetition_penalty = repetition_penalty;
 }
+void BaseLLMRunner::set_topk(int32_t topk) noexcept { config_.topk = topk; }
 void BaseLLMRunner::set_count_interval(size_t count_interval) {
   config_.output_token_batch_size = count_interval;
 }

package/common/runner/base_llm_runner.h CHANGED Viewed

@@ -55,6 +55,7 @@ public:
   void set_topp(float topp) noexcept;
   void set_min_p(float min_p) noexcept;
   void set_repetition_penalty(float repetition_penalty) noexcept;
+  void set_topk(int32_t topk) noexcept;
   void set_count_interval(size_t count_interval);
   void set_time_interval(size_t time_interval);

package/common/runner/constants.h CHANGED Viewed

@@ -23,8 +23,22 @@ inline constexpr auto kVisionEncoderMethod = "vision_encoder";
 inline constexpr auto kAudioEncoderMethod = "audio_encoder";
 inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
 inline constexpr auto kTextModelMethod = "text_decoder";
 inline constexpr auto numOfAddedBoSTokens = 0;
 inline constexpr auto numOfAddedEoSTokens = 0;
+// Gemma4
+// PLE models only: token id that marks image placeholder slots in input_ids.
+// token_embedding run on this id produces the per-layer PLE signal for image
+// positions; the inputs_embeds output for those positions is discarded (the
+// vision encoder output replaces it).
+inline constexpr auto kImagePlaceholderId = "image_placeholder_id";
+// True iff the model exposes a per-layer-embedding (PLE) signal alongside
+// inputs_embeds (Gemma4-style). When true, `token_embedding.execute()`
+// returns the tuple (inputs_embeds, ple_tok) and the runner must thread
+// ple_tok into text_decoder; when false (or absent), token_embedding returns
+// inputs_embeds alone. Text-only PTEs that ship a single `forward` method
+// omit this key entirely — it is meaningful only for multimodal PTEs that
+// expose a separate `token_embedding` method.
+inline constexpr auto kHasPLE = "has_ple";
 } // namespace executorch::extension::llm

package/common/runner/encoders/audio_encoder.cpp ADDED Viewed

@@ -0,0 +1,111 @@
+// common/runner/encoders/audio_encoder.cpp
+#include "audio_encoder.h"
+#include <rnexecutorch/Error.h>
+#include <runner/constants.h>
+#include <executorch/extension/tensor/tensor.h>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <vector>
+namespace executorch::extension::llm {
+using ::executorch::aten::SizesType;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::EValue;
+using ::executorch::runtime::Result;
+namespace {
+constexpr int32_t kSamplingRate = 16e3;
+constexpr int32_t kMaxLengthSeconds = 30;
+constexpr int32_t kSamplesPerBlock = 7680;
+constexpr int64_t kAudioBlockKMin = 1;
+constexpr int64_t kAudioBlockKMax =
+    kSamplingRate * kMaxLengthSeconds / kSamplesPerBlock;
+} // namespace
+AudioEncoder::AudioEncoder(::executorch::extension::Module &module)
+    : module_(&module) {}
+Error AudioEncoder::load() {
+  if (is_loaded()) {
+    return Error::Ok;
+  }
+  auto method_names_result = module_->method_names();
+  if (!method_names_result.ok()) {
+    return method_names_result.error();
+  }
+  if (method_names_result->count(kAudioEncoderMethod) == 0) {
+    throw rnexecutorch::RnExecutorchError(
+        rnexecutorch::RnExecutorchErrorCode::InvalidConfig,
+        "Model does not support audio: 'audio_encoder' method not found. "
+        "Check that the .pte file matches the declared capabilities.");
+  }
+  return module_->load_method(kAudioEncoderMethod);
+}
+bool AudioEncoder::is_loaded() const noexcept {
+  return module_->is_method_loaded(kAudioEncoderMethod);
+}
+int32_t AudioEncoder::encoderTokenCount() const noexcept {
+  return last_token_count_;
+}
+Result<EValue> AudioEncoder::encode(const MultimodalInput &input) {
+  if (!is_loaded()) {
+    return Error::InvalidState;
+  }
+  if (!input.is_audio()) {
+    return Error::InvalidArgument;
+  }
+  const auto &wav = input.get_audio();
+  ET_CHECK_OR_RETURN_ERROR(!wav.samples.empty(), InvalidArgument,
+                           "AudioEncoder: empty waveform");
+  const int64_t n_valid = static_cast<int64_t>(wav.samples.size());
+  const int64_t k_blocks = (n_valid + kSamplesPerBlock - 1) / kSamplesPerBlock;
+  ET_CHECK_OR_RETURN_ERROR(
+      k_blocks >= kAudioBlockKMin && k_blocks <= kAudioBlockKMax,
+      InvalidArgument,
+      "AudioEncoder: waveform of %lld samples needs k_blocks=%lld.",
+      static_cast<long long>(n_valid), static_cast<long long>(k_blocks));
+  const int64_t n_padded = k_blocks * kSamplesPerBlock;
+  // Own the padded waveform for the lifetime of this call; from_blob below
+  // borrows without copying. The current export takes
+  //   forward(waveform[1, 7680*k] fp32, num_blocks: int64 scalar)
+  // — input 1 is a rank-0 Long telling the encoder how many of the K_MAX
+  // blocks contain real PCM. Passing a 2-d mask here trips "Attempted to
+  // change tensor rank: old=0, new=2".
+  padded_wav_.assign(static_cast<size_t>(n_padded), 0.0f);
+  std::memcpy(padded_wav_.data(), wav.samples.data(),
+              static_cast<size_t>(n_valid) * sizeof(float));
+  valid_samples_scalar_ = n_valid;
+  auto wav_tensor = ::executorch::extension::from_blob(
+      padded_wav_.data(), {1, static_cast<SizesType>(n_padded)},
+      ::executorch::aten::ScalarType::Float);
+  auto num_blocks_tensor = ::executorch::extension::from_blob(
+      &valid_samples_scalar_, {}, ::executorch::aten::ScalarType::Long);
+  std::vector<EValue> args = {EValue(*wav_tensor), EValue(*num_blocks_tensor)};
+  auto exec_result = ET_UNWRAP(module_->execute(kAudioEncoderMethod, args));
+  ET_CHECK_OR_RETURN_ERROR(!exec_result.empty(), InvalidState,
+                           "audio_encoder returned no outputs");
+  auto audio_tensor = exec_result[0].toTensor();
+  ET_CHECK_OR_RETURN_ERROR(audio_tensor.dim() == 3, InvalidState,
+                           "audio_encoder output rank=%zd, expected 3",
+                           audio_tensor.dim());
+  last_token_count_ = static_cast<int32_t>(audio_tensor.size(1));
+  return exec_result[0];
+}
+} // namespace executorch::extension::llm

package/common/runner/encoders/audio_encoder.h ADDED Viewed

@@ -0,0 +1,40 @@
+// common/runner/encoders/audio_encoder.h
+#pragma once
+#include "iencoder.h"
+#include <executorch/extension/module/module.h>
+#include <executorch/runtime/core/evalue.h>
+#include <runner/multimodal_input.h>
+#include <cstdint>
+#include <vector>
+namespace executorch::extension::llm {
+// Runs the Gemma4 `audio_encoder` PTE method.
+//
+// Contract mirrors SpeechToText (Whisper): JS hands in fp32 mono 16 kHz PCM
+// via `MultimodalInput::get_audio()`; the PTE owns the log-mel frontend so
+// this class just wraps the samples in a `[1, N_samples]` Float tensor and
+// executes. Resampling and WAV/MP3 decoding are the caller's responsibility
+// (e.g. react-native-audio-api).
+class AudioEncoder : public IEncoder {
+public:
+  explicit AudioEncoder(::executorch::extension::Module &module);
+  ::executorch::runtime::Error load() override;
+  bool is_loaded() const noexcept override;
+  ::executorch::runtime::Result<::executorch::runtime::EValue>
+  encode(const MultimodalInput &input) override;
+  // Number of audio embedding tokens produced per encode() call. 0 until first
+  // encode, since Gemma4's audio_encoder has a dynamic T dim.
+  int32_t encoderTokenCount() const noexcept override;
+private:
+  ::executorch::extension::Module *module_;
+  int32_t last_token_count_ = 0;
+  std::vector<float> padded_wav_;
+  int64_t valid_samples_scalar_ = 0;
+};
+} // namespace executorch::extension::llm

package/common/runner/encoders/vision_encoder.cpp CHANGED Viewed

@@ -2,7 +2,6 @@
 #include "vision_encoder.h"
 #include <rnexecutorch/Error.h>
-#include <rnexecutorch/Log.h>
 #include <rnexecutorch/data_processing/ImageProcessing.h>
 #include <runner/constants.h>

package/common/runner/irunner.h CHANGED Viewed

@@ -73,6 +73,11 @@ struct GenerationConfig {
   size_t output_token_batch_size = 10;
   size_t batch_time_interval_ms = 120;
+  // Top-k sampling – keep only the k highest-logit tokens before softmax.
+  // 0 (default) disables top-k filtering. Stacks with topp: temperature ->
+  // top-k -> top-p -> softmax -> multinomial.
+  int32_t topk = 0;
   // Enable dynamic input shapes (if implemented) or not
   // Impacts the prefill phase and causes TextPrefiller to pass all the tokens
   // at once if set to true.

package/common/runner/multimodal_decoder_runner.h CHANGED Viewed

@@ -14,19 +14,50 @@
 #include "text_decoder_runner.h"
 namespace executorch::extension::llm {
+// Supports two PTE contracts, selected per-call from the kHasPLE metadata
+// key (mirrors how kEnableDynamicShape etc. are read — queried on demand,
+// not cached in a member). Callers that need it multiple times in a hot
+// path should snapshot into a local.
+//
+//  * Legacy (has_ple == false):
+//      token_embedding(ids) -> inputs_embeds
+//      text_decoder(inputs_embeds, input_pos)
+//
+//  * Gemma-style PLE (has_ple == true):
+//      token_embedding(ids) -> (inputs_embeds, ple_tok)
+//      text_decoder(inputs_embeds, ple_tok, input_pos)
+//    ple_tok carries Gemma4's per-layer PLE signal keyed on input_ids. It's
+//    computed once in token_embedding and threaded through every decoder call
+//    so PLE fires at every position (including multimodal placeholder slots).
 class MultimodalDecoderRunner : public TextDecoderRunner {
 public:
   explicit MultimodalDecoderRunner(Module &module, IOManager *io_manager,
                                    const GenerationConfig &config)
       : TextDecoderRunner(module, io_manager, config) {}
+  bool has_ple() const {
+    auto r = module_->get(kHasPLE);
+    if (r.error() != ::executorch::runtime::Error::Ok) {
+      return false;
+    }
+    return r->toScalar().to<bool>();
+  }
   inline ::executorch::runtime::Result<::executorch::aten::Tensor>
   step(TensorPtr &tokens, int64_t start_pos) override {
     auto embed_result = module_->execute(kTokenEmbeddingMethod, tokens);
     if (!embed_result.ok()) {
       return embed_result.error();
     }
-    return decode((*embed_result)[0], start_pos);
+    auto &embed_outputs = *embed_result;
+    if (has_ple()) {
+      ET_CHECK_MSG(embed_outputs.size() == 2,
+                   "Expected 2 outputs (inputs_embeds, ple_tok) from "
+                   "token_embedding, got %zu",
+                   embed_outputs.size());
+      return decode(embed_outputs[0], embed_outputs[1], start_pos);
+    }
+    return decode(embed_outputs[0], start_pos);
   }
   inline ::executorch::runtime::Result<::executorch::aten::Tensor>
@@ -46,6 +77,24 @@ public:
     return outputs[0].toTensor();
   }
+  inline ::executorch::runtime::Result<::executorch::aten::Tensor>
+  decode(const ::executorch::runtime::EValue &embeddings,
+         const ::executorch::runtime::EValue &ple_tok, int64_t start_pos) {
+    auto start_pos_tensor = ::executorch::extension::from_blob(
+        &start_pos, {1}, ::executorch::aten::ScalarType::Long);
+    auto outputs_result = module_->execute(
+        kTextModelMethod, {embeddings, ple_tok, start_pos_tensor});
+    if (!outputs_result.ok()) {
+      return outputs_result.error();
+    }
+    auto &outputs = *outputs_result;
+    ET_CHECK_MSG(outputs.size() == 1,
+                 "Expected 1 output from text_decoder, got %zu",
+                 outputs.size());
+    ET_CHECK_MSG(outputs[0].isTensor(), "text_decoder output is not a tensor");
+    return outputs[0].toTensor();
+  }
   inline ::executorch::runtime::Error load() override {
     if (is_method_loaded()) {
       return ::executorch::runtime::Error::Ok;

package/common/runner/multimodal_input.h CHANGED Viewed

@@ -20,6 +20,10 @@ struct ImagePath {
   std::string path;
 };
+struct AudioWaveform {
+  std::vector<float> samples;
+};
 class MultimodalInput {
 public:
   explicit MultimodalInput(std::string text) : data_(std::move(text)) {}
@@ -27,6 +31,7 @@ public:
       : data_(std::move(tokens)) {}
   explicit MultimodalInput(ImagePath image_path)
       : data_(std::move(image_path)) {}
+  explicit MultimodalInput(AudioWaveform audio) : data_(std::move(audio)) {}
   MultimodalInput(const MultimodalInput &) = default;
   MultimodalInput &operator=(const MultimodalInput &) = default;
@@ -42,6 +47,9 @@ public:
   bool is_image() const noexcept {
     return std::holds_alternative<ImagePath>(data_);
   }
+  bool is_audio() const noexcept {
+    return std::holds_alternative<AudioWaveform>(data_);
+  }
   const std::string &get_text() const & { return std::get<std::string>(data_); }
   const std::vector<uint64_t> &get_tokens() const & {
@@ -50,9 +58,13 @@ public:
   const std::string &get_image_path() const & {
     return std::get<ImagePath>(data_).path;
   }
+  const AudioWaveform &get_audio() const & {
+    return std::get<AudioWaveform>(data_);
+  }
 private:
-  std::variant<std::string, std::vector<uint64_t>, ImagePath> data_;
+  std::variant<std::string, std::vector<uint64_t>, ImagePath, AudioWaveform>
+      data_;
 };
 inline MultimodalInput make_text_input(const std::string &text) noexcept {
@@ -64,5 +76,8 @@ inline MultimodalInput make_text_input(std::string &&text) noexcept {
 inline MultimodalInput make_image_input(std::string path) noexcept {
   return MultimodalInput(ImagePath{std::move(path)});
 }
+inline MultimodalInput make_audio_input(std::vector<float> samples) noexcept {
+  return MultimodalInput(AudioWaveform{std::move(samples)});
+}
 } // namespace executorch::extension::llm