npm - sentencepiece-buf - Versions diffs - 0.2.1-0 - Mend

sentencepiece-buf 0.2.1-0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/README.md +9 -0
package/dist/sentencepiece_model_pb.d.ts +717 -0
package/dist/sentencepiece_model_pb.js +94 -0
package/dist/sentencepiece_pb.d.ts +135 -0
package/dist/sentencepiece_pb.js +47 -0
package/package.json +49 -0

package/README.md ADDED Viewed

@@ -0,0 +1,9 @@
+# sentencepiece-buf
+This is an NPM package containing a lightweight compiled version of [`sentencepiece.proto`](https://github.com/google/sentencepiece/blob/v0.2.1/src/sentencepiece.proto) for JavaScript using [`@bufbuild/protobuf`](https://github.com/bufbuild/protobuf-es).
+It can be imported and run from Node.js and Web.
+## Versioning
+The version of this package tracks the `sentencepiece` repository version, with an additional suffix for changes. For instance `v0.2.1-0` of this repository is `v0.2.1` of SentencePiece.

package/dist/sentencepiece_model_pb.d.ts ADDED Viewed

@@ -0,0 +1,717 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// @generated by protoc-gen-es v2.10.2 with parameter "target=js+dts,import_extension=js,js_import_style=module"
+// @generated from file sentencepiece_model.proto (package sentencepiece, syntax proto2)
+/* eslint-disable */
+import type { GenEnum, GenFile, GenMessage } from "@bufbuild/protobuf/codegenv2";
+import type { Message } from "@bufbuild/protobuf";
+/**
+ * Describes the file sentencepiece_model.proto.
+ */
+export declare const file_sentencepiece_model: GenFile;
+/**
+ * TrainerSpec encodes various parameters for SentencePiece training.
+ * Next id: 55
+ *
+ * @generated from message sentencepiece.TrainerSpec
+ */
+export declare type TrainerSpec = Message<"sentencepiece.TrainerSpec"> & {
+  /**
+   * /////////////////////////////////////////////////////////////////
+   * General parameters
+   *
+   * Input corpus files.
+   *  Trainer accepts the following two formats:
+   *  A) Monolingual: plain text, one sentence per line.
+   *  B) Bilingual:   TSV, source sentence <tab> target sentence
+   *  When bilingual data is passed, shared vocabulary model is built.
+   *  Note that the input file must be raw corpus, not a preprocessed corpus.
+   *  Trainer only loads the first `input_sentence_size` sentences specified
+   *  with this parameter.
+   *
+   * @generated from field: repeated string input = 1;
+   */
+  input: string[];
+  /**
+   * Input corpus format:
+   * "text": one-sentence-per-line text format (default)
+   * "tsv":  sentence <tab> freq
+   *
+   * @generated from field: optional string input_format = 7;
+   */
+  inputFormat: string;
+  /**
+   * Output model file prefix.
+   * <model_prefix>.model and <model_prefix>.vocab are generated.
+   *
+   * @generated from field: optional string model_prefix = 2;
+   */
+  modelPrefix: string;
+  /**
+   * @generated from field: optional sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
+   */
+  modelType: TrainerSpec_ModelType;
+  /**
+   * Vocabulary size. 8k is the default size.
+   *
+   * @generated from field: optional int32 vocab_size = 4 [default = 8000];
+   */
+  vocabSize: number;
+  /**
+   * List of the languages this model can accept.
+   * Since the model is language-agnostic, this field is used as a reference.
+   *
+   * @generated from field: repeated string accept_language = 5;
+   */
+  acceptLanguage: string[];
+  /**
+   * Size of self-test samples, which are encoded in the model file.
+   *
+   * @generated from field: optional int32 self_test_sample_size = 6 [default = 0];
+   */
+  selfTestSampleSize: number;
+  /**
+   * Whether to use DP version of sentencepiece. Use it with TSV input format
+   * (requires precomputed word tab counts to work).
+   *
+   * @generated from field: optional bool enable_differential_privacy = 50 [default = false];
+   */
+  enableDifferentialPrivacy: boolean;
+  /**
+   * Set these parameters if you need DP version of sentencepiece.
+   * std of noise to add.
+   *
+   * @generated from field: optional float differential_privacy_noise_level = 51 [default = 0];
+   */
+  differentialPrivacyNoiseLevel: number;
+  /**
+   * Clipping threshold to apply after adding noise. All the words with
+   * frequency less than this value are dropped.
+   *
+   * @generated from field: optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
+   */
+  differentialPrivacyClippingThreshold: bigint;
+  /**
+   * /////////////////////////////////////////////////////////////////
+   * Training parameters.
+   *
+   * Uses characters which cover the corpus with the ratio of `chars_coverage`.
+   * This parameter determines the set of basic Alphabet of sentence piece.
+   * 1.0 - `chars_coverage` characters are treated as UNK.
+   * See also required_chars field.
+   *
+   * @generated from field: optional float character_coverage = 10 [default = 0.9995];
+   */
+  characterCoverage: number;
+  /**
+   * Maximum size of sentences the trainer loads from `input` parameter.
+   * Trainer simply loads the `input` files in sequence.
+   * It is better to shuffle the input corpus randomly.
+   *
+   * @generated from field: optional uint64 input_sentence_size = 11 [default = 0];
+   */
+  inputSentenceSize: bigint;
+  /**
+   * @generated from field: optional bool shuffle_input_sentence = 19 [default = true];
+   */
+  shuffleInputSentence: boolean;
+  /**
+   * Maximum size of sentences to make seed sentence pieces.
+   * Extended suffix array is constructed to extract frequent
+   * sub-strings from the corpus. This uses 20N working space,
+   * where N is the size of corpus.
+   *
+   * @generated from field: optional int32 mining_sentence_size = 12 [deprecated = true];
+   * @deprecated
+   */
+  miningSentenceSize: number;
+  /**
+   * Maximum size of sentences to train sentence pieces.
+   *
+   * @generated from field: optional int32 training_sentence_size = 13 [deprecated = true];
+   * @deprecated
+   */
+  trainingSentenceSize: number;
+  /**
+   * The size of seed sentencepieces.
+   * `seed_sentencepiece_size` must be larger than `vocab_size`.
+   *
+   * @generated from field: optional int32 seed_sentencepiece_size = 14 [default = 1000000];
+   */
+  seedSentencepieceSize: number;
+  /**
+   * In every EM sub-iterations, keeps top
+   * `shrinking_factor` * `current sentencepieces size` with respect to
+   * the loss of the sentence piece. This value should be smaller than 1.0.
+   *
+   * @generated from field: optional float shrinking_factor = 15 [default = 0.75];
+   */
+  shrinkingFactor: number;
+  /**
+   * The maximum sentence length in bytes. Sentences whose length is
+   * larger than `max_sentence_length` are simply ignored.
+   * Longer input tends to bring the following risks:
+   *  * Overflow during EM training (unigram language model only)
+   *  * Performance drop because of O(n log n) cost in BPE.
+   *
+   * @generated from field: optional int32 max_sentence_length = 18 [default = 4192];
+   */
+  maxSentenceLength: number;
+  /**
+   * Number of threads in the training.
+   *
+   * @generated from field: optional int32 num_threads = 16 [default = 16];
+   */
+  numThreads: number;
+  /**
+   * Number of EM sub iterations.
+   *
+   * @generated from field: optional int32 num_sub_iterations = 17 [default = 2];
+   */
+  numSubIterations: number;
+  /**
+   * /////////////////////////////////////////////////////////////////
+   * SentencePiece parameters which control the shapes of sentence piece.
+   *
+   * Maximum length of sentencepiece.
+   *
+   * @generated from field: optional int32 max_sentencepiece_length = 20 [default = 16];
+   */
+  maxSentencepieceLength: number;
+  /**
+   * Uses Unicode script to split sentence pieces.
+   * When `split_by_unicode_script` is true, we do not allow sentence piece to
+   * include multiple Unicode scripts, e.g. "F1" is not a valid piece.
+   * Exception: CJ characters (Hiragana/Katakana/Han) are all handled
+   * as one script type, since Japanese word can consist of multiple scripts.
+   * This exception is always applied regardless of the accept-language
+   * parameter.
+   *
+   * @generated from field: optional bool split_by_unicode_script = 21 [default = true];
+   */
+  splitByUnicodeScript: boolean;
+  /**
+   * When `split_by_number` is true, put a boundary at each transition between
+   * number and non-number. If we want to treat "F1" as one token, set this flag
+   * to false.
+   *
+   * @generated from field: optional bool split_by_number = 23 [default = true];
+   */
+  splitByNumber: boolean;
+  /**
+   * Use a white space to split sentence pieces.
+   * When `split_by_whitespace` is false, we may have the piece containing
+   * a white space in the middle. e.g., "in_the".
+   *
+   * @generated from field: optional bool split_by_whitespace = 22 [default = true];
+   */
+  splitByWhitespace: boolean;
+  /**
+   * Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello =>
+   * hello_. When `treat_whitespace_as_suffix` is true,
+   * NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
+   * of sentence.
+   *
+   * @generated from field: optional bool treat_whitespace_as_suffix = 24 [default = false];
+   */
+  treatWhitespaceAsSuffix: boolean;
+  /**
+   * Allows pieces that only contain whitespaces instead of appearing only as
+   * prefix or suffix of other pieces.
+   *
+   * @generated from field: optional bool allow_whitespace_only_pieces = 26 [default = false];
+   */
+  allowWhitespaceOnlyPieces: boolean;
+  /**
+   * Split all digits (0-9) into separate pieces.
+   *
+   * @generated from field: optional bool split_digits = 25 [default = false];
+   */
+  splitDigits: boolean;
+  /**
+   * Defines the pre-tokenization delimiter.
+   * When specified, pieces crossing this delimiter are not included
+   * in the vocab. The delimiter string is then virtually ignored
+   * during training. This field allows constraints on the vocabulary
+   * selection. Note that this field is available in unigram mode.
+   *
+   * @generated from field: optional string pretokenization_delimiter = 53 [default = ""];
+   */
+  pretokenizationDelimiter: string;
+  /**
+   * /////////////////////////////////////////////////////////////////
+   * Vocabulary management
+   *
+   * Defines control symbols used as an indicator to
+   * change the behavior of the decoder. <s> and </s> are pre-defined.
+   * We can use this field to encode various meta information,
+   * including language indicator in multilingual model.
+   * These symbols are not visible to users, but visible to
+   * the decoder. Note that when the input sentence contains control symbols,
+   * they are not treated as one token, but segmented into normal pieces.
+   * Control symbols must be inserted independently from the segmentation.
+   *
+   * @generated from field: repeated string control_symbols = 30;
+   */
+  controlSymbols: string[];
+  /**
+   * Defines user defined symbols.
+   * These symbols are added with extremely high score
+   * so they are always treated as one unique symbol in any context.
+   * Typical usage of user_defined_symbols is placeholder for named entities.
+   *
+   * @generated from field: repeated string user_defined_symbols = 31;
+   */
+  userDefinedSymbols: string[];
+  /**
+   * Defines required characters. Each UTF8 character in this string is included
+   * in the character set regardless of character_coverage value. Unlike
+   * user_defined_symbols, these characters have scores based on the frequency
+   * on input sentences, and the model can form subwords using characters
+   * in this field.
+   *
+   * @generated from field: optional string required_chars = 36;
+   */
+  requiredChars: string;
+  /**
+   * Decomposes unknown pieces into UTF-8 bytes.
+   *
+   * @generated from field: optional bool byte_fallback = 35 [default = false];
+   */
+  byteFallback: boolean;
+  /**
+   * When creating the vocabulary file, defines whether or not to additionally
+   * output the score for each piece.
+   *
+   * @generated from field: optional bool vocabulary_output_piece_score = 32 [default = true];
+   */
+  vocabularyOutputPieceScore: boolean;
+  /**
+   * `vocab_size` is treated as hard limit. Crash if
+   * the model can not produce the vocab of size `vocab_size`,
+   * When `hard_vocab_limit` is false, vocab_size is treated
+   * as soft limit. Note that when model_type=char,
+   * always assumes hard_vocab_limit = false.
+   *
+   * @generated from field: optional bool hard_vocab_limit = 33 [default = true];
+   */
+  hardVocabLimit: boolean;
+  /**
+   * use all symbols for vocab extraction. This flag is valid
+   * if model type is either CHAR or WORD
+   *
+   * @generated from field: optional bool use_all_vocab = 34 [default = false];
+   */
+  useAllVocab: boolean;
+  /**
+   * /////////////////////////////////////////////////////////////////
+   * Reserved special meta tokens.
+   * * -1 is not used.
+   * * unk_id must not be -1.
+   * Ids must start with 0 and be contiguous.
+   *
+   * <unk>
+   *
+   * @generated from field: optional int32 unk_id = 40 [default = 0];
+   */
+  unkId: number;
+  /**
+   * <s>
+   *
+   * @generated from field: optional int32 bos_id = 41 [default = 1];
+   */
+  bosId: number;
+  /**
+   * </s>
+   *
+   * @generated from field: optional int32 eos_id = 42 [default = 2];
+   */
+  eosId: number;
+  /**
+   * <pad> (padding)
+   *
+   * @generated from field: optional int32 pad_id = 43 [default = -1];
+   */
+  padId: number;
+  /**
+   * @generated from field: optional string unk_piece = 45 [default = "<unk>"];
+   */
+  unkPiece: string;
+  /**
+   * @generated from field: optional string bos_piece = 46 [default = "<s>"];
+   */
+  bosPiece: string;
+  /**
+   * @generated from field: optional string eos_piece = 47 [default = "</s>"];
+   */
+  eosPiece: string;
+  /**
+   * @generated from field: optional string pad_piece = 48 [default = "<pad>"];
+   */
+  padPiece: string;
+  /**
+   * Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
+   * since this character can be useful both for user and
+   * developer. We can easily figure out that <unk> is emitted.
+   *
+   * @generated from field: optional string unk_surface = 44 [default = " ⁇ "];
+   */
+  unkSurface: string;
+  /**
+   * Increase bit depth to allow unigram model training on large
+   * (>10M sentences) corpora. A Side-effect of enabling this flag
+   * is increased memory usage.
+   *
+   * @generated from field: optional bool train_extremely_large_corpus = 49 [default = false];
+   */
+  trainExtremelyLargeCorpus: boolean;
+  /**
+   * Path to a seed sentencepieces file, with one tab-separated
+   * seed sentencepiece <tab> frequency per line.
+   *
+   * @generated from field: optional string seed_sentencepieces_file = 54 [default = ""];
+   */
+  seedSentencepiecesFile: string;
+};
+/**
+ * Describes the message sentencepiece.TrainerSpec.
+ * Use `create(TrainerSpecSchema)` to create a new message.
+ */
+export declare const TrainerSpecSchema: GenMessage<TrainerSpec>;
+/**
+ * Model type.
+ *
+ * NOTE(review): the upstream comment reads "only have UNIGRAM now", but this
+ * enum declares UNIGRAM, BPE, WORD and CHAR — that remark looks stale.
+ *
+ * @generated from enum sentencepiece.TrainerSpec.ModelType
+ */
+export enum TrainerSpec_ModelType {
+  /**
+   * Unigram language model with dynamic algorithm
+   *
+   * @generated from enum value: UNIGRAM = 1;
+   */
+  UNIGRAM = 1,
+  /**
+   * Byte Pair Encoding
+   *
+   * @generated from enum value: BPE = 2;
+   */
+  BPE = 2,
+  /**
+   * Delimited by whitespace.
+   *
+   * @generated from enum value: WORD = 3;
+   */
+  WORD = 3,
+  /**
+   * Tokenizes into a character sequence.
+   *
+   * @generated from enum value: CHAR = 4;
+   */
+  CHAR = 4,
+}
+/**
+ * Describes the enum sentencepiece.TrainerSpec.ModelType.
+ */
+export declare const TrainerSpec_ModelTypeSchema: GenEnum<TrainerSpec_ModelType>;
+/**
+ * NormalizerSpec encodes various parameters for string normalization.
+ *
+ * @generated from message sentencepiece.NormalizerSpec
+ */
+export declare type NormalizerSpec = Message<"sentencepiece.NormalizerSpec"> & {
+  /**
+   * name of normalization rule.
+   *
+   * @generated from field: optional string name = 1;
+   */
+  name: string;
+  /**
+   * Pre-compiled normalization rule created by
+   * Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
+   * Usually this field is set by Builder::GetNormalizerSpec() method.
+   *
+   * @generated from field: optional bytes precompiled_charsmap = 2;
+   */
+  precompiledCharsmap: Uint8Array<ArrayBuffer>;
+  /**
+   * Adds dummy whitespace at the beginning of text in order to
+   * treat "world" in "world" and "hello world" in the same way.
+   *
+   * @generated from field: optional bool add_dummy_prefix = 3 [default = true];
+   */
+  addDummyPrefix: boolean;
+  /**
+   * Removes leading, trailing, and duplicate internal whitespace.
+   *
+   * @generated from field: optional bool remove_extra_whitespaces = 4 [default = true];
+   */
+  removeExtraWhitespaces: boolean;
+  /**
+   * Replaces whitespace with meta symbol.
+   * This field must be true to train sentence piece model.
+   *
+   * @generated from field: optional bool escape_whitespaces = 5 [default = true];
+   */
+  escapeWhitespaces: boolean;
+  /**
+   * Custom normalization rule file in TSV format.
+   * https://github.com/google/sentencepiece/blob/master/doc/normalization.md
+   * This field is only used in SentencePieceTrainer::Train() method, which
+   * compiles the rule into the binary rule stored in `precompiled_charsmap`.
+   *
+   * @generated from field: optional string normalization_rule_tsv = 6;
+   */
+  normalizationRuleTsv: string;
+};
+/**
+ * Describes the message sentencepiece.NormalizerSpec.
+ * Use `create(NormalizerSpecSchema)` to create a new message.
+ */
+export declare const NormalizerSpecSchema: GenMessage<NormalizerSpec>;
+/**
+ * Proto to store samples for self-testing.
+ *
+ * @generated from message sentencepiece.SelfTestData
+ */
+export declare type SelfTestData = Message<"sentencepiece.SelfTestData"> & {
+  /**
+   * @generated from field: repeated sentencepiece.SelfTestData.Sample samples = 1;
+   */
+  samples: SelfTestData_Sample[];
+};
+/**
+ * Describes the message sentencepiece.SelfTestData.
+ * Use `create(SelfTestDataSchema)` to create a new message.
+ */
+export declare const SelfTestDataSchema: GenMessage<SelfTestData>;
+/**
+ * @generated from message sentencepiece.SelfTestData.Sample
+ */
+export declare type SelfTestData_Sample = Message<"sentencepiece.SelfTestData.Sample"> & {
+  /**
+   * @generated from field: optional string input = 1;
+   */
+  input: string;
+  /**
+   * @generated from field: optional string expected = 2;
+   */
+  expected: string;
+};
+/**
+ * Describes the message sentencepiece.SelfTestData.Sample.
+ * Use `create(SelfTestData_SampleSchema)` to create a new message.
+ */
+export declare const SelfTestData_SampleSchema: GenMessage<SelfTestData_Sample>;
+/**
+ * ModelProto stores model parameters.
+ * SentencePieceProcessor is supposed to be self-contained.
+ * All settings/parameters which may change the behavior must be encoded
+ * in ModelProto.
+ *
+ * @generated from message sentencepiece.ModelProto
+ */
+export declare type ModelProto = Message<"sentencepiece.ModelProto"> & {
+  /**
+   * Sentence pieces with scores.
+   *
+   * @generated from field: repeated sentencepiece.ModelProto.SentencePiece pieces = 1;
+   */
+  pieces: ModelProto_SentencePiece[];
+  /**
+   * Spec used to generate this model file.
+   *
+   * @generated from field: optional sentencepiece.TrainerSpec trainer_spec = 2;
+   */
+  trainerSpec?: TrainerSpec;
+  /**
+   * Spec for text normalization.
+   *
+   * @generated from field: optional sentencepiece.NormalizerSpec normalizer_spec = 3;
+   */
+  normalizerSpec?: NormalizerSpec;
+  /**
+   * Stores sample input and its expected segmentation to verify the model.
+   *
+   * @generated from field: optional sentencepiece.SelfTestData self_test_data = 4;
+   */
+  selfTestData?: SelfTestData;
+  /**
+   * Spec for text de-normalization.
+   *
+   * @generated from field: optional sentencepiece.NormalizerSpec denormalizer_spec = 5;
+   */
+  denormalizerSpec?: NormalizerSpec;
+};
+/**
+ * Describes the message sentencepiece.ModelProto.
+ * Use `create(ModelProtoSchema)` to create a new message.
+ */
+export declare const ModelProtoSchema: GenMessage<ModelProto>;
+/**
+ * @generated from message sentencepiece.ModelProto.SentencePiece
+ */
+export declare type ModelProto_SentencePiece = Message<"sentencepiece.ModelProto.SentencePiece"> & {
+  /**
+   * piece must not be empty.
+   *
+   * @generated from field: optional string piece = 1;
+   */
+  piece: string;
+  /**
+   * @generated from field: optional float score = 2;
+   */
+  score: number;
+  /**
+   * @generated from field: optional sentencepiece.ModelProto.SentencePiece.Type type = 3 [default = NORMAL];
+   */
+  type: ModelProto_SentencePiece_Type;
+};
+/**
+ * Describes the message sentencepiece.ModelProto.SentencePiece.
+ * Use `create(ModelProto_SentencePieceSchema)` to create a new message.
+ */
+export declare const ModelProto_SentencePieceSchema: GenMessage<ModelProto_SentencePiece>;
+/**
+ * Type tag carried by a `ModelProto_SentencePiece` in its `type` field.
+ *
+ * Note that the numeric values are not in declaration order:
+ * BYTE (6) is declared before UNUSED (5).
+ *
+ * @generated from enum sentencepiece.ModelProto.SentencePiece.Type
+ */
+export enum ModelProto_SentencePiece_Type {
+  /**
+   * normal symbol
+   *
+   * @generated from enum value: NORMAL = 1;
+   */
+  NORMAL = 1,
+  /**
+   * unknown symbol. only <unk> for now.
+   *
+   * @generated from enum value: UNKNOWN = 2;
+   */
+  UNKNOWN = 2,
+  /**
+   * control symbols. </s>, <s>, <2ja> etc.
+   *
+   * @generated from enum value: CONTROL = 3;
+   */
+  CONTROL = 3,
+  /**
+   * user defined symbols.
+   * Typical usage of USER_DEFINED symbol
+   * is placeholder.
+   *
+   * @generated from enum value: USER_DEFINED = 4;
+   */
+  USER_DEFINED = 4,
+  /**
+   * byte symbols. Used when `byte_fallback` is true.
+   *
+   * @generated from enum value: BYTE = 6;
+   */
+  BYTE = 6,
+  /**
+   * this piece is not used.
+   *
+   * @generated from enum value: UNUSED = 5;
+   */
+  UNUSED = 5,
+}
+/**
+ * Describes the enum sentencepiece.ModelProto.SentencePiece.Type.
+ */
+export declare const ModelProto_SentencePiece_TypeSchema: GenEnum<ModelProto_SentencePiece_Type>;

package/dist/sentencepiece_model_pb.js ADDED Viewed

@@ -0,0 +1,94 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// @generated by protoc-gen-es v2.10.2 with parameter "target=js+dts,import_extension=js,js_import_style=module"
+// @generated from file sentencepiece_model.proto (package sentencepiece, syntax proto2)
+/* eslint-disable */
+import { enumDesc, fileDesc, messageDesc, tsEnum } from "@bufbuild/protobuf/codegenv2";
+/**
+ * Describes the file sentencepiece_model.proto.
+ */
+export const file_sentencepiece_model = /*@__PURE__*/
+  fileDesc("ChlzZW50ZW5jZXBpZWNlX21vZGVsLnByb3RvEg1zZW50ZW5jZXBpZWNlIqQMCgtUcmFpbmVyU3BlYxINCgVpbnB1dBgBIAMoCRIUCgxpbnB1dF9mb3JtYXQYByABKAkSFAoMbW9kZWxfcHJlZml4GAIgASgJEkEKCm1vZGVsX3R5cGUYAyABKA4yJC5zZW50ZW5jZXBpZWNlLlRyYWluZXJTcGVjLk1vZGVsVHlwZToHVU5JR1JBTRIYCgp2b2NhYl9zaXplGAQgASgFOgQ4MDAwEhcKD2FjY2VwdF9sYW5ndWFnZRgFIAMoCRIgChVzZWxmX3Rlc3Rfc2FtcGxlX3NpemUYBiABKAU6ATASKgobZW5hYmxlX2RpZmZlcmVudGlhbF9wcml2YWN5GDIgASgIOgVmYWxzZRIrCiBkaWZmZXJlbnRpYWxfcHJpdmFjeV9ub2lzZV9sZXZlbBgzIAEoAjoBMBIyCidkaWZmZXJlbnRpYWxfcHJpdmFjeV9jbGlwcGluZ190aHJlc2hvbGQYNCABKAQ6ATASIgoSY2hhcmFjdGVyX2NvdmVyYWdlGAogASgCOgYwLjk5OTUSHgoTaW5wdXRfc2VudGVuY2Vfc2l6ZRgLIAEoBDoBMBIkChZzaHVmZmxlX2lucHV0X3NlbnRlbmNlGBMgASgIOgR0cnVlEiAKFG1pbmluZ19zZW50ZW5jZV9zaXplGAwgASgFQgIYARIiChZ0cmFpbmluZ19zZW50ZW5jZV9zaXplGA0gASgFQgIYARIoChdzZWVkX3NlbnRlbmNlcGllY2Vfc2l6ZRgOIAEoBToHMTAwMDAwMBIeChBzaHJpbmtpbmdfZmFjdG9yGA8gASgCOgQwLjc1EiEKE21heF9zZW50ZW5jZV9sZW5ndGgYEiABKAU6BDQxOTISFwoLbnVtX3RocmVhZHMYECABKAU6AjE2Eh0KEm51bV9zdWJfaXRlcmF0aW9ucxgRIAEoBToBMhIkChhtYXhfc2VudGVuY2VwaWVjZV9sZW5ndGgYFCABKAU6AjE2EiUKF3NwbGl0X2J5X3VuaWNvZGVfc2NyaXB0GBUgASgIOgR0cnVlEh0KD3NwbGl0X2J5X251bWJlchgXIAEoCDoEdHJ1ZRIhChNzcGxpdF9ieV93aGl0ZXNwYWNlGBYgASgIOgR0cnVlEikKGnRyZWF0X3doaXRlc3BhY2VfYXNfc3VmZml4GBggASgIOgVmYWxzZRIrChxhbGxvd193aGl0ZXNwYWNlX29ubHlfcGllY2VzGBogASgIOgVmYWxzZRIbCgxzcGxpdF9kaWdpdHMYGSABKAg6BWZhbHNlEiMKGXByZXRva2VuaXphdGlvbl9kZWxpbWl0ZXIYNSABKAk6ABIXCg9jb250cm9sX3N5bWJvbHMYHiADKAkSHAoUdXNlcl9kZWZpbmVkX3N5bWJvbHMYHyADKAkSFgoOcmVxdWlyZWRfY2hhcnMYJCABKAkSHAoNYnl0ZV9mYWxsYmFjaxgjIAEoCDoFZmFsc2USKwoddm9jYWJ1bGFyeV9vdXRwdXRfcGllY2Vfc2NvcmUYICABKAg6BHRydWUSHgoQaGFyZF92b2NhYl9saW1pdBghIAEoCDoEdHJ1ZRIcCg11c2VfYWxsX3ZvY2FiGCIgASgIOgVmYWxzZRIRCgZ1bmtfaWQYKCABKAU6ATASEQoGYm9zX2lkGCkgASgFOgExEhEKBmVvc19pZBgqIAEoBToBMhISCgZwYWRfaWQYKyABKAU6Ai0xEhgKCXVua19waWVjZRgtIAEoCToFPHVuaz4SFgoJYm9zX3BpZWNlGC4gASgJOgM8cz4SFwoJZW9zX3BpZWNlGC8gASgJOgQ8L3M+EhgKCXBhZF9waWVjZRgwIAEoCToFPHBhZD4SGgoLdW5rX3N1cmZhY2UYLCABKAk6BSDigYcgEisKHHRyYWluX2V4dHJlbWVseV9
sYXJnZV9jb3JwdXMYMSABKAg6BWZhbHNlEiIKGHNlZWRfc2VudGVuY2VwaWVjZXNfZmlsZRg2IAEoCToAIjUKCU1vZGVsVHlwZRILCgdVTklHUkFNEAESBwoDQlBFEAISCAoEV09SRBADEggKBENIQVIQBCoJCMgBEICAgIACItEBCg5Ob3JtYWxpemVyU3BlYxIMCgRuYW1lGAEgASgJEhwKFHByZWNvbXBpbGVkX2NoYXJzbWFwGAIgASgMEh4KEGFkZF9kdW1teV9wcmVmaXgYAyABKAg6BHRydWUSJgoYcmVtb3ZlX2V4dHJhX3doaXRlc3BhY2VzGAQgASgIOgR0cnVlEiAKEmVzY2FwZV93aGl0ZXNwYWNlcxgFIAEoCDoEdHJ1ZRIeChZub3JtYWxpemF0aW9uX3J1bGVfdHN2GAYgASgJKgkIyAEQgICAgAIieQoMU2VsZlRlc3REYXRhEjMKB3NhbXBsZXMYASADKAsyIi5zZW50ZW5jZXBpZWNlLlNlbGZUZXN0RGF0YS5TYW1wbGUaKQoGU2FtcGxlEg0KBWlucHV0GAEgASgJEhAKCGV4cGVjdGVkGAIgASgJKgkIyAEQgICAgAIi/gMKCk1vZGVsUHJvdG8SNwoGcGllY2VzGAEgAygLMicuc2VudGVuY2VwaWVjZS5Nb2RlbFByb3RvLlNlbnRlbmNlUGllY2USMAoMdHJhaW5lcl9zcGVjGAIgASgLMhouc2VudGVuY2VwaWVjZS5UcmFpbmVyU3BlYxI2Cg9ub3JtYWxpemVyX3NwZWMYAyABKAsyHS5zZW50ZW5jZXBpZWNlLk5vcm1hbGl6ZXJTcGVjEjMKDnNlbGZfdGVzdF9kYXRhGAQgASgLMhsuc2VudGVuY2VwaWVjZS5TZWxmVGVzdERhdGESOAoRZGVub3JtYWxpemVyX3NwZWMYBSABKAsyHS5zZW50ZW5jZXBpZWNlLk5vcm1hbGl6ZXJTcGVjGtIBCg1TZW50ZW5jZVBpZWNlEg0KBXBpZWNlGAEgASgJEg0KBXNjb3JlGAIgASgCEkIKBHR5cGUYAyABKA4yLC5zZW50ZW5jZXBpZWNlLk1vZGVsUHJvdG8uU2VudGVuY2VQaWVjZS5UeXBlOgZOT1JNQUwiVAoEVHlwZRIKCgZOT1JNQUwQARILCgdVTktOT1dOEAISCwoHQ09OVFJPTBADEhAKDFVTRVJfREVGSU5FRBAEEggKBEJZVEUQBhIKCgZVTlVTRUQQBSoJCMgBEICAgIACKgkIyAEQgICAgAJCAkgD");
+/**
+ * Describes the message sentencepiece.TrainerSpec.
+ * Use `create(TrainerSpecSchema)` to create a new message.
+ */
+export const TrainerSpecSchema = /*@__PURE__*/
+  messageDesc(file_sentencepiece_model, 0);
+/**
+ * Describes the enum sentencepiece.TrainerSpec.ModelType.
+ */
+export const TrainerSpec_ModelTypeSchema = /*@__PURE__*/
+  enumDesc(file_sentencepiece_model, 0, 0);
+/**
+ * Model type (UNIGRAM, BPE, WORD or CHAR; the upstream note "only have UNIGRAM now" looks stale).
+ *
+ * @generated from enum sentencepiece.TrainerSpec.ModelType
+ */
+export const TrainerSpec_ModelType = /*@__PURE__*/
+  tsEnum(TrainerSpec_ModelTypeSchema);
+/**
+ * Describes the message sentencepiece.NormalizerSpec.
+ * Use `create(NormalizerSpecSchema)` to create a new message.
+ */
+export const NormalizerSpecSchema = /*@__PURE__*/
+  messageDesc(file_sentencepiece_model, 1);
+/**
+ * Describes the message sentencepiece.SelfTestData.
+ * Use `create(SelfTestDataSchema)` to create a new message.
+ */
+export const SelfTestDataSchema = /*@__PURE__*/
+  messageDesc(file_sentencepiece_model, 2);
+/**
+ * Describes the message sentencepiece.SelfTestData.Sample.
+ * Use `create(SelfTestData_SampleSchema)` to create a new message.
+ */
+export const SelfTestData_SampleSchema = /*@__PURE__*/
+  messageDesc(file_sentencepiece_model, 2, 0);
+/**
+ * Describes the message sentencepiece.ModelProto.
+ * Use `create(ModelProtoSchema)` to create a new message.
+ */
+export const ModelProtoSchema = /*@__PURE__*/
+  messageDesc(file_sentencepiece_model, 3);
+/**
+ * Describes the message sentencepiece.ModelProto.SentencePiece.
+ * Use `create(ModelProto_SentencePieceSchema)` to create a new message.
+ */
+export const ModelProto_SentencePieceSchema = /*@__PURE__*/
+  messageDesc(file_sentencepiece_model, 3, 0);
+/**
+ * Describes the enum sentencepiece.ModelProto.SentencePiece.Type.
+ */
+export const ModelProto_SentencePiece_TypeSchema = /*@__PURE__*/
+  enumDesc(file_sentencepiece_model, 3, 0, 0);
+/**
+ * @generated from enum sentencepiece.ModelProto.SentencePiece.Type
+ */
+export const ModelProto_SentencePiece_Type = /*@__PURE__*/
+  tsEnum(ModelProto_SentencePiece_TypeSchema);

package/dist/sentencepiece_pb.d.ts ADDED Viewed

@@ -0,0 +1,135 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// @generated by protoc-gen-es v2.10.2 with parameter "target=js+dts,import_extension=js,js_import_style=module"
+// @generated from file sentencepiece.proto (package sentencepiece, syntax proto2)
+/* eslint-disable */
+import type { GenFile, GenMessage } from "@bufbuild/protobuf/codegenv2";
+import type { Message } from "@bufbuild/protobuf";
+/**
+ * Describes the file sentencepiece.proto.
+ *
+ * Type-only declaration: the value itself is created in sentencepiece_pb.js
+ * by a `fileDesc(...)` call.
+ */
+export declare const file_sentencepiece: GenFile;
+/**
+ * SentencePieceText manages a user-facing source sentence,
+ * postprocessed target sentence, and internal segmentation
+ * with byte offsets.
+ *
+ * `text` and `score` are declared `optional` in the proto but are typed here
+ * as plain `string` / `number` (no `undefined`). Presumably an unset field
+ * reads as its default value — TODO confirm against the protobuf-es proto2
+ * documentation.
+ *
+ * @generated from message sentencepiece.SentencePieceText
+ */
+export declare type SentencePieceText = Message<"sentencepiece.SentencePieceText"> & {
+  /**
+   * User input or postprocessed text. This should be immutable
+   * since the byte range in SentencePiece is pointing to a span over this
+   * text. Meta symbols for whitespaces are not included.
+   *
+   * @generated from field: optional string text = 1;
+   */
+  text: string;
+  /**
+   * A sequence of sentence pieces. These `pieces` are guaranteed to be ordered
+   * in increasing order of `begin` field. Except for the last piece, `end` of
+   * each piece is always equal to `begin` of the next piece.
+   *
+   * @generated from field: repeated sentencepiece.SentencePieceText.SentencePiece pieces = 2;
+   */
+  pieces: SentencePieceText_SentencePiece[];
+  /**
+   * Score (usually log probability) for MultiSentencePieceText.
+   *
+   * NOTE(review): no `MultiSentencePieceText` is declared in this file.
+   * Presumably this refers to the entries of `NBestSentencePieceText.nbests`
+   * — confirm against upstream SentencePiece.
+   *
+   * @generated from field: optional float score = 3;
+   */
+  score: number;
+};
+/**
+ * Describes the message sentencepiece.SentencePieceText.
+ * Use `create(SentencePieceTextSchema)` to create a new message.
+ *
+ * Type-only declaration: the value is defined in sentencepiece_pb.js as
+ * `messageDesc(file_sentencepiece, 0)`.
+ */
+export declare const SentencePieceTextSchema: GenMessage<SentencePieceText>;
+/**
+ * Element type of `SentencePieceText.pieces`: one piece together with its
+ * vocabulary id, its client-facing surface string and the `begin`/`end`
+ * span of the text that the surface covers.
+ *
+ * @generated from message sentencepiece.SentencePieceText.SentencePiece
+ */
+export declare type SentencePieceText_SentencePiece = Message<"sentencepiece.SentencePieceText.SentencePiece"> & {
+  /**
+   * Internal representation for the decoder.
+   * - Decoder can use |piece| as a basic token.
+   * - the piece must be non-empty.
+   * - A whitespace is replaced with a meta symbol.
+   * - Concatenation of pieces is not always the same as the |text|.
+   *
+   * @generated from field: optional string piece = 1;
+   */
+  piece: string;
+  /**
+   * Vocabulary id.
+   *
+   * @generated from field: optional uint32 id = 2;
+   */
+  id: number;
+  /**
+   * External representation for the client.
+   * - It is always guaranteed that
+   *   text.substr(begin, end - begin) == surface.
+   * - Concatenation of surface is always the same as the |text|.
+   * - |surface| may contain whitespaces.
+   * - |surface| may be empty if the piece encodes
+   *   a control vocabulary. e.g., <s>, </s>, <unk>.
+   * - When |surface| is empty, always begin == end. (zero-length span).
+   *
+   * @generated from field: optional string surface = 3;
+   */
+  surface: string;
+  /**
+   * Start of the span of the text covered by `surface`
+   * (text.substr(begin, end - begin) == surface, per the `surface` notes).
+   * Presumably a byte offset, since the SentencePieceText description
+   * mentions byte offsets — TODO confirm.
+   *
+   * @generated from field: optional uint32 begin = 4;
+   */
+  begin: number;
+  /**
+   * End of that span; exclusive, given the substr relation on `surface`.
+   * Equals `begin` when `surface` is empty.
+   *
+   * @generated from field: optional uint32 end = 5;
+   */
+  end: number;
+};
+/**
+ * Describes the message sentencepiece.SentencePieceText.SentencePiece.
+ * Use `create(SentencePieceText_SentencePieceSchema)` to create a new message.
+ *
+ * Type-only declaration: the value is defined in sentencepiece_pb.js as
+ * `messageDesc(file_sentencepiece, 0, 0)`.
+ */
+export declare const SentencePieceText_SentencePieceSchema: GenMessage<SentencePieceText_SentencePiece>;
+/**
+ * Wrapper message holding a list of `SentencePieceText` values in `nbests`.
+ * Presumably these are the n-best alternative segmentations of one input,
+ * going by the message name — TODO confirm against upstream SentencePiece.
+ *
+ * @generated from message sentencepiece.NBestSentencePieceText
+ */
+export declare type NBestSentencePieceText = Message<"sentencepiece.NBestSentencePieceText"> & {
+  /**
+   * The contained results; each `SentencePieceText` carries its own `score`.
+   *
+   * @generated from field: repeated sentencepiece.SentencePieceText nbests = 1;
+   */
+  nbests: SentencePieceText[];
+};
+/**
+ * Describes the message sentencepiece.NBestSentencePieceText.
+ * Use `create(NBestSentencePieceTextSchema)` to create a new message.
+ *
+ * Type-only declaration: the value is defined in sentencepiece_pb.js as
+ * `messageDesc(file_sentencepiece, 1)`.
+ */
+export declare const NBestSentencePieceTextSchema: GenMessage<NBestSentencePieceText>;

package/dist/sentencepiece_pb.js ADDED Viewed

@@ -0,0 +1,47 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// @generated by protoc-gen-es v2.10.2 with parameter "target=js+dts,import_extension=js,js_import_style=module"
+// @generated from file sentencepiece.proto (package sentencepiece, syntax proto2)
+/* eslint-disable */
+import { fileDesc, messageDesc } from "@bufbuild/protobuf/codegenv2";
+/**
+ * Describes the file sentencepiece.proto.
+ *
+ * The string passed to `fileDesc` is the embedded file descriptor
+ * (presumably a base64-encoded serialized FileDescriptorProto — TODO confirm
+ * against the protobuf-es codegen docs). It is generated output; see the
+ * "@generated" header of this file.
+ */
+export const file_sentencepiece = /*@__PURE__*/
+  fileDesc("ChNzZW50ZW5jZXBpZWNlLnByb3RvEg1zZW50ZW5jZXBpZWNlIt8BChFTZW50ZW5jZVBpZWNlVGV4dBIMCgR0ZXh0GAEgASgJEj4KBnBpZWNlcxgCIAMoCzIuLnNlbnRlbmNlcGllY2UuU2VudGVuY2VQaWVjZVRleHQuU2VudGVuY2VQaWVjZRINCgVzY29yZRgDIAEoAhpiCg1TZW50ZW5jZVBpZWNlEg0KBXBpZWNlGAEgASgJEgoKAmlkGAIgASgNEg8KB3N1cmZhY2UYAyABKAkSDQoFYmVnaW4YBCABKA0SCwoDZW5kGAUgASgNKgkIyAEQgICAgAIqCQjIARCAgICAAiJKChZOQmVzdFNlbnRlbmNlUGllY2VUZXh0EjAKBm5iZXN0cxgBIAMoCzIgLnNlbnRlbmNlcGllY2UuU2VudGVuY2VQaWVjZVRleHRCAkgD");
+/**
+ * Describes the message sentencepiece.SentencePieceText.
+ * Use `create(SentencePieceTextSchema)` to create a new message.
+ *
+ * Obtained from `file_sentencepiece` with index 0; typed as
+ * `GenMessage<SentencePieceText>` in sentencepiece_pb.d.ts.
+ */
+export const SentencePieceTextSchema = /*@__PURE__*/
+  messageDesc(file_sentencepiece, 0);
+/**
+ * Describes the message sentencepiece.SentencePieceText.SentencePiece.
+ * Use `create(SentencePieceText_SentencePieceSchema)` to create a new message.
+ *
+ * The arguments (0, 0) extend the index 0 used for `SentencePieceTextSchema`
+ * with a further index 0, matching the nested message name. Typed as
+ * `GenMessage<SentencePieceText_SentencePiece>` in sentencepiece_pb.d.ts.
+ */
+export const SentencePieceText_SentencePieceSchema = /*@__PURE__*/
+  messageDesc(file_sentencepiece, 0, 0);
+/**
+ * Describes the message sentencepiece.NBestSentencePieceText.
+ * Use `create(NBestSentencePieceTextSchema)` to create a new message.
+ *
+ * Obtained from `file_sentencepiece` with index 1; typed as
+ * `GenMessage<NBestSentencePieceText>` in sentencepiece_pb.d.ts.
+ */
+export const NBestSentencePieceTextSchema = /*@__PURE__*/
+  messageDesc(file_sentencepiece, 1);

package/package.json ADDED Viewed

@@ -0,0 +1,49 @@
+{
+  "name": "sentencepiece-buf",
+  "version": "0.2.1-0",
+  "description": "Compiled Protobuf definitions for SentencePiece files",
+  "keywords": [
+    "sentencepiece",
+    "protobuf"
+  ],
+  "author": {
+    "name": "Eric Zhang",
+    "email": "ekzhang1@gmail.com",
+    "url": "https://www.ekzhang.com"
+  },
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/ekzhang/protobuf-es-pkg.git"
+  },
+  "type": "module",
+  "files": [
+    "/dist/*.{js,d.ts}"
+  ],
+  "main": "dist/sentencepiece_pb.js",
+  "types": "dist/sentencepiece_pb.d.ts",
+  "module": "dist/sentencepiece_pb.js",
+  "exports": {
+    ".": {
+      "types": "./dist/sentencepiece_pb.d.ts",
+      "import": "./dist/sentencepiece_pb.js"
+    },
+    "./model": {
+      "types": "./dist/sentencepiece_model_pb.d.ts",
+      "import": "./dist/sentencepiece_model_pb.js"
+    }
+  },
+  "license": "MIT",
+  "dependencies": {
+    "@bufbuild/protobuf": "^2.10.2"
+  },
+  "devDependencies": {
+    "@bufbuild/buf": "^1.61.0",
+    "@bufbuild/protoc-gen-es": "^2.10.2"
+  },
+  "engines": {
+    "pnpm": ">=10.0.0"
+  },
+  "scripts": {
+    "build": "buf generate && perl -pi -e 's/Uint8Array(?!<)/Uint8Array<ArrayBuffer>/g' dist/*.d.ts"
+  }
+}