npm - react-native-executorch - Versions diffs - 0.5.15 → 0.6.0 - Mend

react-native-executorch 0.5.15 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (277) hide show

package/common/runner/io_manager.h ADDED Viewed

@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+namespace executorch {
+namespace extension {
+namespace llm {
+/**
+ * @brief Base class for managing input/output operations for LLM inference.
+ *
+ * IOManager provides an interface for handling the input preparation and
+ * output processing for both prefill and decode phases of LLM inference.
+ * Every virtual method has a default implementation in this class, so derived
+ * classes override only the methods whose behavior they need to change.
+ */
+class IOManager {
+public:
+  /**
+   * @brief Construct an IOManager bound to a Module.
+   *
+   * Only a reference to the Module is stored, so the Module must outlive this
+   * IOManager.
+   *
+   * @param module The Module whose method metadata is queried.
+   */
+  explicit IOManager(ET_MODULE_NAMESPACE::Module &module) : module_(module) {}
+  /**
+   * @brief Virtual destructor to allow proper cleanup in derived classes.
+   */
+  virtual ~IOManager() = default;
+  /**
+   * @brief Load the IO manager with method metadata for prefill and
+   * decode operations.
+   *
+   * The default implementation ignores both arguments and does no work.
+   *
+   * @param prefill_method The prefill method to initialize with.
+   * @param decode_method The decode method to initialize with.
+   * @return runtime::Error::Ok always, in this default implementation.
+   */
+  ET_NODISCARD virtual runtime::Error load(const std::string &prefill_method,
+                                           const std::string &decode_method) {
+    (void)prefill_method;
+    (void)decode_method;
+    return runtime::Error::Ok;
+  }
+  /**
+   * @brief Load the IO manager using the default method names.
+   *
+   * Uses "forward" for both prefill and decode and forwards to the virtual
+   * two-argument load().
+   *
+   * @return Error code returned by the two-argument load().
+   */
+  ET_NODISCARD runtime::Error load() { return load("forward", "forward"); }
+  /**
+   * @brief Reset the IO manager state.
+   *
+   * The default implementation ignores both arguments and does no work.
+   *
+   * @param prefill_method The prefill method to reset with.
+   * @param decode_method The decode method to reset with.
+   * @return runtime::Error::Ok always, in this default implementation.
+   */
+  ET_NODISCARD virtual runtime::Error reset(const std::string &prefill_method,
+                                            const std::string &decode_method) {
+    (void)prefill_method;
+    (void)decode_method;
+    return runtime::Error::Ok;
+  }
+  /**
+   * @brief Reset the IO manager state using the default method names.
+   *
+   * Uses "forward" for both prefill and decode and forwards to the virtual
+   * two-argument reset().
+   *
+   * @return Error code returned by the two-argument reset().
+   */
+  ET_NODISCARD runtime::Error reset() { return reset("forward", "forward"); }
+  /**
+   * @brief Prepare inputs for the prefill phase of LLM inference.
+   *
+   * The default implementation looks up the metadata of prefill_method on the
+   * bound Module, requires that the method takes exactly two inputs, and then
+   * passes input and start_pos through unchanged.
+   *
+   * @param input The input tensor containing token IDs.
+   * @param start_pos The tensor containing the starting position of the current
+   * input within the context.
+   * @param prefill_method The prefill method to prepare inputs for.
+   * @return On success, a vector holding {input, start_pos}. On failure, the
+   * error from the metadata lookup, or runtime::Error::InvalidState when the
+   * method does not take exactly two inputs.
+   */
+  virtual runtime::Result<std::vector<runtime::EValue>>
+  prepare_prefill(const TensorPtr &input, const TensorPtr &start_pos,
+                  const std::string &prefill_method) {
+    auto method_meta = module_.method_meta(prefill_method);
+    if (!method_meta.ok()) {
+      return method_meta.error();
+    }
+    if (method_meta->num_inputs() != 2) {
+      ET_LOG(Error,
+             "Expected 2 inputs for prefill method, got %zu. Likely the model "
+             "takes the caches or mask as an argument which this IOManager "
+             "does not support.",
+             method_meta->num_inputs());
+      return runtime::Error::InvalidState;
+    }
+    // Cpu IO Manager supports dynamic shapes for prefill, so no work to be done
+    // here.
+    return std::vector<runtime::EValue>{input, start_pos};
+  }
+  /**
+   * @brief Prepare inputs for the prefill phase using the default method name.
+   *
+   * Uses "forward" as the prefill method and forwards to the virtual
+   * three-argument prepare_prefill().
+   *
+   * @param input The input tensor containing token IDs.
+   * @param start_pos The tensor containing the starting position.
+   * @return Result of the three-argument prepare_prefill().
+   */
+  runtime::Result<std::vector<runtime::EValue>>
+  prepare_prefill(const TensorPtr &input, const TensorPtr &start_pos) {
+    return prepare_prefill(input, start_pos, "forward");
+  }
+  /**
+   * @brief Prepare inputs for the decode phase of LLM inference.
+   *
+   * The default implementation looks up the metadata of decode_method on the
+   * bound Module, requires that the method takes exactly two inputs, and then
+   * passes input and start_pos through unchanged.
+   *
+   * @param input The input tensor containing token IDs.
+   * @param start_pos The tensor containing the starting position of the current
+   * input within the context.
+   * @param decode_method The decode method to prepare inputs for.
+   * @return On success, a vector holding {input, start_pos}. On failure, the
+   * error from the metadata lookup, or runtime::Error::InvalidState when the
+   * method does not take exactly two inputs.
+   */
+  virtual runtime::Result<std::vector<runtime::EValue>>
+  prepare_decode(const TensorPtr &input, const TensorPtr &start_pos,
+                 const std::string &decode_method) {
+    auto method_meta = module_.method_meta(decode_method);
+    if (!method_meta.ok()) {
+      return method_meta.error();
+    }
+    if (method_meta->num_inputs() != 2) {
+      ET_LOG(Error,
+             "Expected 2 inputs for decode method, got %zu. Likely the model "
+             "takes the caches or mask as an argument which this IOManager "
+             "does not support.",
+             method_meta->num_inputs());
+      return runtime::Error::InvalidState;
+    }
+    // As in prefill, the two inputs are passed through unchanged, so there is
+    // no work to be done here.
+    return std::vector<runtime::EValue>{input, start_pos};
+  }
+  /**
+   * @brief Prepare inputs for the decode phase using the default method name.
+   *
+   * Uses "forward" as the decode method and forwards to the virtual
+   * three-argument prepare_decode().
+   *
+   * @param input The input tensor containing token IDs.
+   * @param start_pos The tensor containing the starting position.
+   * @return Result of the three-argument prepare_decode().
+   */
+  runtime::Result<std::vector<runtime::EValue>>
+  prepare_decode(const TensorPtr &input, const TensorPtr &start_pos) {
+    return prepare_decode(input, start_pos, "forward");
+  }
+  /**
+   * @brief Process and update internal state with outputs from the prefill
+   * phase.
+   *
+   * The default implementation ignores both arguments and does no work.
+   *
+   * @param model_outputs Vector of outputs from the prefill method execution.
+   * @param prefill_method The prefill method to update with outputs.
+   * @return runtime::Error::Ok always, in this default implementation.
+   */
+  ET_NODISCARD virtual runtime::Error
+  update_prefill(const std::vector<runtime::EValue> &model_outputs,
+                 const std::string &prefill_method) {
+    (void)model_outputs;
+    (void)prefill_method;
+    // No post inference work to do.
+    return runtime::Error::Ok;
+  }
+  /**
+   * @brief Process outputs from the prefill phase using the default method.
+   *
+   * Uses "forward" as the prefill method and forwards to the virtual
+   * two-argument update_prefill().
+   *
+   * @param model_outputs Vector of outputs from the prefill execution.
+   * @return Error code returned by the two-argument update_prefill().
+   */
+  ET_NODISCARD runtime::Error
+  update_prefill(const std::vector<runtime::EValue> &model_outputs) {
+    return update_prefill(model_outputs, "forward");
+  }
+  /**
+   * @brief Process and update internal state with outputs from the decode
+   * phase.
+   *
+   * The default implementation ignores both arguments and does no work.
+   *
+   * @param model_outputs Vector of outputs from the decode method execution.
+   * @param decode_method The decode method to update with outputs.
+   * @return runtime::Error::Ok always, in this default implementation.
+   */
+  ET_NODISCARD virtual runtime::Error
+  update_decode(const std::vector<runtime::EValue> &model_outputs,
+                const std::string &decode_method) {
+    (void)model_outputs;
+    (void)decode_method;
+    // No post inference work to do.
+    return runtime::Error::Ok;
+  }
+  /**
+   * @brief Process outputs from the decode phase using the default method.
+   *
+   * Uses "forward" as the decode method and forwards to the virtual
+   * two-argument update_decode().
+   *
+   * @param model_outputs Vector of outputs from the decode execution.
+   * @return Error code returned by the two-argument update_decode().
+   */
+  ET_NODISCARD runtime::Error
+  update_decode(const std::vector<runtime::EValue> &model_outputs) {
+    return update_decode(model_outputs, "forward");
+  }
+private:
+  /**
+   * @brief Non-owning reference to the Module; prepare_prefill() and
+   * prepare_decode() query it for method metadata.
+   */
+  ET_MODULE_NAMESPACE::Module &module_;
+};
+} // namespace llm
+} // namespace extension
+} // namespace executorch

package/common/runner/irunner.h CHANGED Viewed

@@ -6,41 +6,112 @@
  * LICENSE file in the root directory of this source tree.
  */
-// An interface for LLM runners. Developers can create their own runner that
-// implements their own load and generation logic to run the model.
+// Interface for text generation runners.
 #pragma once
+#include "stats.h"
+#include <cstdint>
 #include <functional>
+#include <memory>
 #include <string>
-#include "stats.h"
-#include <executorch/extension/module/module.h>
+#include <executorch/runtime/core/error.h>
 namespace executorch {
 namespace extension {
 namespace llm {
-class ET_EXPERIMENTAL IRunner {
+// Configuration struct for generation parameters
+struct GenerationConfig {
+  // Whether to echo the input prompt in the output
+  bool echo = false;
+  // Whether this is a warmup run (affects perf benchmarking)
+  bool warming = false;
+  // Maximum number of new tokens to generate
+  // If the max_context_len metadata that's serialized in the .pte file exists,
+  // then the number of prompt tokens + max_new_tokens won't exceed
+  // max_context_len. If this field is -1, it means we will rely on
+  // max_context_len metadata and seq_len value.
+  int32_t max_new_tokens = -1;
+  // Maximum number of total tokens
+  // If the .pte file contains the max_context_len metadata, it will override
+  // this value if it's smaller. If this field is -1, we will use the
+  // max_context_len metadata directly.
+  int32_t max_seq_len = -1;
+  // Maximum context length
+  // If the .pte file contains the max_context_len metadata, it will override
+  // this value if it's smaller. If this field is -1, we will use the metadata.
+  // NOTE(review): wording mirrors max_seq_len; confirm how the two differ.
+  int32_t max_context_length = -1;
+  // Temperature for sampling (higher = more random). NOTE(review): the -1 default looks like an "unset" sentinel; confirm.
+  float temperature = -1.F;
+  // Top-p (nucleus sampling) – limits next token selection to the smallest set
+  // whose cumulative probability exceeds topp. Range: 0.0 to 1.0 (lower = more
+  // deterministic). The -1 default is outside that range, presumably "unset".
+  float topp = -1.F;
+  // Enable dynamic input shapes (if implemented) or not
+  // Impacts the prefill phase and causes TextPrefiller to pass all the tokens
+  // at once if set to true.
+  bool enable_dynamic_shape = true;
+  // Use KV_CACHE implementation (if implemented) or not
+  bool enable_kv_cache = true;
+};
+// Base interface for LLM runners
+class IRunner {
 public:
   virtual ~IRunner() = default;
-  // Checks if the model is loaded.
+  /**
+   * Check if the runner is loaded and ready for inference.
+   *
+   * Pure virtual: every concrete runner must implement it.
+   *
+   * @return true if the runner is loaded, false otherwise
+   */
   virtual bool is_loaded() const = 0;
-  // Load the model and tokenizer.
-  virtual ::executorch::runtime::Error load() = 0;
+  /**
+   * Load the model and prepare for inference.
+   *
+   * Pure virtual: every concrete runner must implement it.
+   *
+   * @return Error::Ok if successful, an error otherwise
+   */
+  virtual runtime::Error load() = 0;
-  // Generate the output tokens.
-  virtual ::executorch::runtime::Error
-  generate(const std::string &prompt,
-           std::function<void(const std::string &)> token_callback = {},
-           std::function<void(const ::executorch::extension::llm::Stats &)>
-               stats_callback = {},
-           bool echo = true, bool warming = false) = 0;
+  /**
+   * Generate text based on the provided prompt and generation config.
+   *
+   * Pure virtual. No default arguments are declared, so callers must pass the
+   * config and both callbacks explicitly.
+   *
+   * @param prompt The input prompt to generate from
+   * @param config Generation configuration parameters
+   * @param token_callback Callback function called for each generated token;
+   * it receives the text as a const std::string &
+   * @param stats_callback Callback function for generation statistics; it
+   * receives a const Stats &
+   * @return Error::Ok if successful, an error otherwise
+   */
+  virtual runtime::Error
+  generate(const std::string &prompt, const GenerationConfig &config,
+           std::function<void(const std::string &)> token_callback,
+           std::function<void(const Stats &)> stats_callback) = 0;
-  // Stop the generation.
+  /**
+   * Stop the generation process.
+   *
+   * Pure virtual: every concrete runner must implement it.
+   */
   virtual void stop() = 0;
+  /**
+   * Force remove prefilled tokens and reset the KV cache start position.
+   *
+   * This method removes the prefilled tokens from the KV cache and resets the
+   * start position to 0. Pure virtual: every concrete runner must implement
+   * it.
+   */
+  virtual void reset() = 0;
 };
 } // namespace llm

package/common/runner/kernel_includes.h ADDED Viewed

@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/**
+ * @file
+ *
+ * Common includes used by all kernel implementations.
+ */
+#pragma once
+// This list should be very conservative since most kernel .cpp files will
+// include these and depend on their transitive deps. Only add a header if 99%
+// of kernels would have included it anyway.
+#include <executorch/runtime/core/exec_aten/exec_aten.h> // IWYU pragma: export
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h> // IWYU pragma: export
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h> // IWYU pragma: export
+#include <executorch/runtime/kernel/kernel_runtime_context.h> // IWYU pragma: export