npm - react-native-executorch - Versions diffs - 0.7.0 → 0.7.2 - Mend

react-native-executorch 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

package/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/post_processor.h CHANGED Viewed

@@ -38,6 +38,10 @@ public:
   /**
    * Process the token IDs (single sequence).
+   *
+   * NOTE: Unlike the Rust implementation, which uses a single method
+   * taking an Encoding and an Option<Encoding>, we use overloads here
+   * to explicitly handle single vs pair sequences while processing raw IDs.
    */
   virtual std::vector<uint64_t>
   process(const std::vector<uint64_t> &tokens,
@@ -54,27 +58,65 @@ public:
 // -- Factory/Common Types -----------------------------------------------------
+// Helper macro to standardize addition of config member fields
+#define POST_PROCESSOR_CONFIG_MEMBER(type, name)                               \
+  std::optional<type> name;                                                    \
+  PostProcessorConfig &set_##name(type arg) {                                  \
+    this->name = std::move(arg);                                               \
+    return *this;                                                              \
+  }
 enum class SequenceId { A, B };
+struct SpecialToken {
+  std::string id;
+  std::vector<uint64_t> ids;
+  std::vector<std::string> tokens;
+};
 struct Piece {
   bool is_special_token;
   std::string id; // For SpecialToken (e.g. "[CLS]"). For Sequence (e.g. "A").
-  uint32_t type_id;
+  uint64_t type_id;
-  static Piece Sequence(SequenceId id, uint32_t type_id) {
+  static Piece Sequence(SequenceId id, uint64_t type_id) {
     return {false, id == SequenceId::A ? "A" : "B", type_id};
   }
-  static Piece SpecialToken(std::string id, uint32_t type_id) {
+  static Piece SpecialToken(std::string id, uint64_t type_id) {
     return {true, std::move(id), type_id};
   }
 };
 using Template = std::vector<Piece>;
+// -- Config -------------------------------------------------------------------
-struct SpecialToken {
-  std::string id;
-  std::vector<uint32_t> ids;
-  std::vector<std::string> tokens;
+class PostProcessorConfig {
+public:
+  using SpecialTokenMap = std::map<std::string, tokenizers::SpecialToken>;
+  using StringIdPair = std::pair<std::string, uint64_t>;
+  std::string type;
+  // TemplateProcessing
+  POST_PROCESSOR_CONFIG_MEMBER(Template, single)
+  POST_PROCESSOR_CONFIG_MEMBER(Template, pair)
+  POST_PROCESSOR_CONFIG_MEMBER(SpecialTokenMap, special_tokens)
+  // Bert / Roberta (unused params in no-op, but kept for parsing logic)
+  POST_PROCESSOR_CONFIG_MEMBER(StringIdPair, sep)
+  POST_PROCESSOR_CONFIG_MEMBER(StringIdPair, cls)
+  POST_PROCESSOR_CONFIG_MEMBER(bool, trim_offsets)
+  POST_PROCESSOR_CONFIG_MEMBER(bool, add_prefix_space)
+  // Sequence
+  using Configs = std::vector<PostProcessorConfig>;
+  POST_PROCESSOR_CONFIG_MEMBER(Configs, processors)
+  explicit PostProcessorConfig(std::string type = "");
+  PostProcessor::Ptr create() const;
+  PostProcessorConfig &parse_json(const nlohmann::json &json_config);
 };
 // -- TemplateProcessing -------------------------------------------------------
@@ -106,11 +148,9 @@ private:
                                        bool add_special_tokens) const;
 };
-// -- BertProcessing -----------------------------------------------------------
-class BertProcessing : public PostProcessor {
+class Sequence : public PostProcessor {
 public:
-  BertProcessing();
+  explicit Sequence(std::vector<PostProcessor::Ptr> processors);
   size_t added_tokens(bool is_pair) const override;
@@ -120,13 +160,17 @@ public:
   std::vector<uint64_t> process(const std::vector<uint64_t> &tokens_a,
                                 const std::vector<uint64_t> &tokens_b,
                                 bool add_special_tokens = true) const override;
-};
-// -- RobertaProcessing --------------------------------------------------------
+private:
+  std::vector<PostProcessor::Ptr> processors_;
+};
-class RobertaProcessing : public PostProcessor {
+// -- BertProcessing -----------------------------------------------------------
+// Used for BERT post-processing (adding special tokens)
+class BertProcessing : public PostProcessor {
 public:
-  RobertaProcessing();
+  BertProcessing(std::pair<std::string, uint64_t> sep,
+                 std::pair<std::string, uint64_t> cls);
   size_t added_tokens(bool is_pair) const override;
@@ -136,13 +180,19 @@ public:
   std::vector<uint64_t> process(const std::vector<uint64_t> &tokens_a,
                                 const std::vector<uint64_t> &tokens_b,
                                 bool add_special_tokens = true) const override;
-};
-// -- Sequence -----------------------------------------------------------------
+private:
+  std::pair<std::string, uint64_t> sep_;
+  std::pair<std::string, uint64_t> cls_;
+};
-class Sequence : public PostProcessor {
+// -- RobertaProcessing --------------------------------------------------------
+// Used for RoBERTa post-processing
+class RobertaProcessing : public PostProcessor {
 public:
-  explicit Sequence(std::vector<PostProcessor::Ptr> processors);
+  RobertaProcessing(std::pair<std::string, uint64_t> sep,
+                    std::pair<std::string, uint64_t> cls, bool trim_offsets,
+                    bool add_prefix_space);
   size_t added_tokens(bool is_pair) const override;
@@ -154,34 +204,43 @@ public:
                                 bool add_special_tokens = true) const override;
 private:
-  std::vector<PostProcessor::Ptr> processors_;
+  std::pair<std::string, uint64_t> sep_;
+  std::pair<std::string, uint64_t> cls_;
+  bool trim_offsets_;
+  bool add_prefix_space_;
 };
-// -- Config -------------------------------------------------------------------
-class PostProcessorConfig {
-public:
-  std::string type;
-  // TemplateProcessing
-  Template single;
-  Template pair;
-  std::map<std::string, SpecialToken> special_tokens;
-  // Bert / Roberta (unused params in no-op, but kept for parsing logic)
-  std::pair<std::string, uint32_t> sep;
-  std::pair<std::string, uint32_t> cls;
-  bool trim_offsets = true;
-  bool add_prefix_space = true;
+// -- ByteLevel
+// ----------------------------------------------------------------
+// TODO: Implement ByteLevelProcessor
+// This is a broader issue, as most of the processing is done on offsets.
+// Our current implementation doesn't support it and would require us to
+// introduce a complex Encoding type, something similar to the original HF
+// implementation:
+// https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/tokenizer/encoding.rs
+// so we could store the offsets from pretokenization step.
+/*
+class ByteLevel : public PostProcessor {
+ public:
+  ByteLevel(bool trim_offsets, bool add_prefix_space);
-  // Sequence
-  std::vector<PostProcessorConfig> processors;
+  size_t added_tokens(bool is_pair) const override;
-  explicit PostProcessorConfig(std::string type = "");
+  std::vector<uint64_t> process(
+      const std::vector<uint64_t>& tokens,
+      bool add_special_tokens = true) const override;
-  PostProcessor::Ptr create() const;
+  std::vector<uint64_t> process(
+      const std::vector<uint64_t>& tokens_a,
+      const std::vector<uint64_t>& tokens_b,
+      bool add_special_tokens = true) const override;
-  PostProcessorConfig &parse_json(const nlohmann::json &json_config);
+ private:
+  bool trim_offsets_;
+  bool add_prefix_space_;
 };
+*/
-} // namespace tokenizers
+// -- Sequence
+// -----------------------------------------------------------------
+} // namespace tokenizers

package/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/pre_tokenizer.h CHANGED Viewed

@@ -53,7 +53,7 @@ public:
 // -- Factory ------------------------------------------------------------------
 // Helper macro to standardize addition of config member fields
-#define CONFIG_MEMBER(type, name)                                              \
+#define PRETOKENIZER_CONFIG_MEMBER(type, name)                                 \
   std::optional<type> name;                                                    \
   PreTokenizerConfig &set_##name(type arg) {                                   \
     this->name = std::move(arg);                                               \
@@ -92,37 +92,38 @@ public:
   /**
    * Used by: RegexPreTokenizer, ByteLevelPreTokenizer
    */
-  CONFIG_MEMBER(std::string, pattern)
+  PRETOKENIZER_CONFIG_MEMBER(std::string, pattern)
   /**
    * Used by: DigitsPreTokenizer
    */
-  CONFIG_MEMBER(bool, individual_digits)
+  PRETOKENIZER_CONFIG_MEMBER(bool, individual_digits)
   /**
    * Used by: ByteLevelPreTokenizer
    */
-  CONFIG_MEMBER(bool, add_prefix_space)
+  PRETOKENIZER_CONFIG_MEMBER(bool, add_prefix_space)
   /**
    * Used by RegexPreTokenizer
    */
-  CONFIG_MEMBER(bool, is_delimiter)
+  PRETOKENIZER_CONFIG_MEMBER(bool, is_delimiter)
   /**
    * Used by RegexPreTokenizer - Split behavior
    */
-  CONFIG_MEMBER(std::string, behavior)
+  PRETOKENIZER_CONFIG_MEMBER(std::string, behavior)
   /**
    * Used by RegexPreTokenizer - Split invert flag
    */
-  CONFIG_MEMBER(bool, invert)
+  PRETOKENIZER_CONFIG_MEMBER(bool, invert)
   /**
    * Used by: SequencePreTokenizer
    */
-  CONFIG_MEMBER(std::vector<PreTokenizerConfig>, pretokenizers)
+  using Configs = std::vector<PreTokenizerConfig>;
+  PRETOKENIZER_CONFIG_MEMBER(Configs, pretokenizers)
   /*----------------*/
   /* Public methods */
@@ -259,6 +260,21 @@ public:
 private:
   const std::vector<PreTokenizer::Ptr> pre_tokenizers_;
-}; // end class ByteLevelPreTokenizer
+}; // end class SequencePreTokenizer
+// -- Bert ---------------------------------------------------------------------
+// Used for BERT-style pre-tokenization (splitting on whitespace and
+// punctuation) CITE:
+// https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/bert.rs
+class BertPreTokenizer : public PreTokenizer {
+public:
+  BertPreTokenizer() = default;
+  /** Perform BERT pre-tokenization */
+  std::vector<std::string>
+  pre_tokenize(const std::string &input) const override;
+}; // end class BertPreTokenizer
 } // namespace tokenizers

package/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/token_decoder.h CHANGED Viewed

@@ -55,6 +55,14 @@ public:
 // -- Factory ------------------------------------------------------------------
+// Helper macro to standardize addition of config member fields
+#define TOKEN_DECODER_CONFIG_MEMBER(type, name)                                \
+  std::optional<type> name;                                                    \
+  TokenDecoderConfig &set_##name(type arg) {                                   \
+    this->name = std::move(arg);                                               \
+    return *this;                                                              \
+  }
 /**
  * Factory and config class for creating a new TokenDecoder
  */
@@ -67,16 +75,20 @@ public:
   std::string type;
   // Parameters for Replace decoder
-  std::string replace_pattern;
-  std::string replace_content;
+  TOKEN_DECODER_CONFIG_MEMBER(std::string, replace_pattern)
+  TOKEN_DECODER_CONFIG_MEMBER(std::string, replace_content)
   // Parameters for Sequence decoder
-  std::vector<nlohmann::json> sequence_decoders;
+  TOKEN_DECODER_CONFIG_MEMBER(std::vector<nlohmann::json>, sequence_decoders)
   // Parameters for Strip decoder
-  std::string strip_content;
-  size_t strip_start;
-  size_t strip_stop;
+  TOKEN_DECODER_CONFIG_MEMBER(std::string, strip_content)
+  TOKEN_DECODER_CONFIG_MEMBER(size_t, strip_start)
+  TOKEN_DECODER_CONFIG_MEMBER(size_t, strip_stop)
+  // Parameters for WordPiece decoder
+  TOKEN_DECODER_CONFIG_MEMBER(std::string, wordpiece_prefix)
+  TOKEN_DECODER_CONFIG_MEMBER(bool, wordpiece_cleanup)
   /*----------------*/
   /* Public methods */
@@ -161,6 +173,21 @@ private:
   size_t stop_;
 }; // end class StripTokenDecoder
+// -- WordPiece ----------------------------------------------------------------
+// Used for WordPiece decoding
+class WordPieceTokenDecoder : public TokenDecoder {
+public:
+  explicit WordPieceTokenDecoder(std::string prefix = "##",
+                                 bool cleanup = true);
+  std::vector<std::string>
+  decode(const std::vector<std::string> &tokens) const override;
+private:
+  std::string prefix_;
+  bool cleanup_;
+}; // end class WordPieceTokenDecoder
 // -- Sequence -----------------------------------------------------------------
 // Applies a sequence of decoders in order

package/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/tokenizer.h CHANGED Viewed

@@ -13,8 +13,8 @@
 #pragma once
-#include "error.h"
-#include "result.h"
+#include <pytorch/tokenizers/error.h>
+#include <pytorch/tokenizers/result.h>
 #include <string>
 #include <vector>

package/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/truncation.h ADDED Viewed

@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// @lint-ignore-every LICENSELINT
+#pragma once
+// Standard
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+// Third Party
+#include <nlohmann/json.hpp>
+namespace tokenizers {
+// -- Truncation ---------------------------------------------------------------
+enum class TruncationStrategy {
+  LongestFirst,
+  OnlyFirst,
+  OnlySecond,
+};
+enum class TruncationDirection {
+  Left,
+  Right,
+};
+struct TruncationParams {
+  TruncationDirection direction = TruncationDirection::Right;
+  size_t max_length = 512;
+  TruncationStrategy strategy = TruncationStrategy::LongestFirst;
+  size_t stride = 0;
+};
+class Truncation {
+public:
+  /** Shared pointer type */
+  typedef std::shared_ptr<Truncation> Ptr;
+  /**
+   * @param params: The truncation parameters
+   */
+  explicit Truncation(const TruncationParams &params);
+  /**
+   * Truncate the tokens according to the configuration.
+   *
+   * @param tokens The tokens to truncate.
+   * @param num_tokens_to_add The number of special tokens that will be added
+   * later. These are subtracted from max_length during truncation calculation.
+   */
+  std::vector<uint64_t> truncate(std::vector<uint64_t> tokens,
+                                 size_t num_tokens_to_add = 0) const;
+  /**
+   * Truncate a pair of sequences according to the configuration.
+   */
+  std::pair<std::vector<uint64_t>, std::vector<uint64_t>>
+  truncate_pair(std::vector<uint64_t> a, std::vector<uint64_t> b,
+                size_t num_tokens_to_add = 0) const;
+private:
+  TruncationParams params_;
+};
+// -- Factory ------------------------------------------------------------------
+class TruncationConfig {
+public:
+  /**
+   * Construct the truncation instance from the member data
+   */
+  Truncation::Ptr create() const;
+  /**
+   * Populate from a json config file
+   */
+  TruncationConfig &parse_json(const nlohmann::json &json_config);
+  // Configuration members
+  TruncationParams params;
+};
+} // namespace tokenizers

package/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/wordpiece_model.h ADDED Viewed

@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// @lint-ignore-every LICENSELINT
+#pragma once
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+#include <pytorch/tokenizers/model.h>
+#include <pytorch/tokenizers/regex.h>
+#include <pytorch/tokenizers/result.h>
+#include <pytorch/tokenizers/string_integer_map.h>
+namespace tokenizers {
+class WordPieceModel : public Model {
+public:
+  explicit WordPieceModel(detail::TokenMap token_map,
+                          detail::TokenMap special_token_map,
+                          std::string unk_token,
+                          std::string continuing_subword_prefix,
+                          size_t max_input_chars_per_word,
+                          std::optional<uint64_t> unk_token_id,
+                          std::optional<uint64_t> bos_token_id,
+                          std::optional<uint64_t> eos_token_id);
+  ~WordPieceModel() override = default;
+  Result<std::vector<uint64_t>>
+  tokenize(const std::string &piece) const override;
+  Result<std::string> id_to_piece(uint64_t token) const override;
+  Result<uint64_t> piece_to_id(const std::string &token) const override;
+  int32_t vocab_size() const override { return vocab_size_; }
+  bool is_special_token(uint64_t token) const override;
+  bool is_loaded() const override { return initialized_; }
+  std::pair<std::optional<std::string>, std::string>
+  split_with_allowed_special_token(const std::string &input,
+                                   size_t offset) const override;
+  uint64_t bos_token_id() const override { return bos_token_id_.value_or(0); }
+  uint64_t eos_token_id() const override { return eos_token_id_.value_or(0); }
+private:
+  detail::TokenMap token_map_;
+  detail::TokenMap special_token_map_;
+  std::unique_ptr<IRegex> special_token_regex_;
+  std::string unk_token_;
+  std::string continuing_subword_prefix_;
+  size_t max_input_chars_per_word_;
+  std::optional<uint64_t> unk_token_id_;
+  std::optional<uint64_t> bos_token_id_;
+  std::optional<uint64_t> eos_token_id_;
+  bool initialized_ = false;
+  int32_t vocab_size_ = 0;
+};
+} // namespace tokenizers

package/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/ExecutorchLib CHANGED Viewed

Binary file

package/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/ExecutorchLib CHANGED Viewed

Binary file