@huggingface/transformers 3.0.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/LICENSE +202 -0
  2. package/README.md +376 -0
  3. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  4. package/dist/transformers.cjs +30741 -0
  5. package/dist/transformers.cjs.map +1 -0
  6. package/dist/transformers.js +33858 -0
  7. package/dist/transformers.js.map +1 -0
  8. package/dist/transformers.min.cjs +173 -0
  9. package/dist/transformers.min.cjs.map +1 -0
  10. package/dist/transformers.min.js +231 -0
  11. package/dist/transformers.min.js.map +1 -0
  12. package/package.json +92 -0
  13. package/src/backends/onnx.js +151 -0
  14. package/src/configs.js +360 -0
  15. package/src/env.js +152 -0
  16. package/src/generation/configuration_utils.js +381 -0
  17. package/src/generation/logits_process.js +716 -0
  18. package/src/generation/logits_sampler.js +204 -0
  19. package/src/generation/parameters.js +35 -0
  20. package/src/generation/stopping_criteria.js +156 -0
  21. package/src/generation/streamers.js +212 -0
  22. package/src/models/whisper/common_whisper.js +151 -0
  23. package/src/models/whisper/generation_whisper.js +89 -0
  24. package/src/models.js +7028 -0
  25. package/src/ops/registry.js +92 -0
  26. package/src/pipelines.js +3341 -0
  27. package/src/processors.js +2614 -0
  28. package/src/tokenizers.js +4395 -0
  29. package/src/transformers.js +28 -0
  30. package/src/utils/audio.js +704 -0
  31. package/src/utils/constants.js +2 -0
  32. package/src/utils/core.js +149 -0
  33. package/src/utils/data-structures.js +445 -0
  34. package/src/utils/devices.js +11 -0
  35. package/src/utils/dtypes.js +62 -0
  36. package/src/utils/generic.js +35 -0
  37. package/src/utils/hub.js +671 -0
  38. package/src/utils/image.js +745 -0
  39. package/src/utils/maths.js +1050 -0
  40. package/src/utils/tensor.js +1378 -0
  41. package/types/backends/onnx.d.ts +26 -0
  42. package/types/backends/onnx.d.ts.map +1 -0
  43. package/types/configs.d.ts +59 -0
  44. package/types/configs.d.ts.map +1 -0
  45. package/types/env.d.ts +106 -0
  46. package/types/env.d.ts.map +1 -0
  47. package/types/generation/configuration_utils.d.ts +320 -0
  48. package/types/generation/configuration_utils.d.ts.map +1 -0
  49. package/types/generation/logits_process.d.ts +354 -0
  50. package/types/generation/logits_process.d.ts.map +1 -0
  51. package/types/generation/logits_sampler.d.ts +51 -0
  52. package/types/generation/logits_sampler.d.ts.map +1 -0
  53. package/types/generation/parameters.d.ts +47 -0
  54. package/types/generation/parameters.d.ts.map +1 -0
  55. package/types/generation/stopping_criteria.d.ts +81 -0
  56. package/types/generation/stopping_criteria.d.ts.map +1 -0
  57. package/types/generation/streamers.d.ts +81 -0
  58. package/types/generation/streamers.d.ts.map +1 -0
  59. package/types/models/whisper/common_whisper.d.ts +8 -0
  60. package/types/models/whisper/common_whisper.d.ts.map +1 -0
  61. package/types/models/whisper/generation_whisper.d.ts +76 -0
  62. package/types/models/whisper/generation_whisper.d.ts.map +1 -0
  63. package/types/models.d.ts +3845 -0
  64. package/types/models.d.ts.map +1 -0
  65. package/types/ops/registry.d.ts +11 -0
  66. package/types/ops/registry.d.ts.map +1 -0
  67. package/types/pipelines.d.ts +2403 -0
  68. package/types/pipelines.d.ts.map +1 -0
  69. package/types/processors.d.ts +917 -0
  70. package/types/processors.d.ts.map +1 -0
  71. package/types/tokenizers.d.ts +999 -0
  72. package/types/tokenizers.d.ts.map +1 -0
  73. package/types/transformers.d.ts +13 -0
  74. package/types/transformers.d.ts.map +1 -0
  75. package/types/utils/audio.d.ts +130 -0
  76. package/types/utils/audio.d.ts.map +1 -0
  77. package/types/utils/constants.d.ts +2 -0
  78. package/types/utils/constants.d.ts.map +1 -0
  79. package/types/utils/core.d.ts +91 -0
  80. package/types/utils/core.d.ts.map +1 -0
  81. package/types/utils/data-structures.d.ts +236 -0
  82. package/types/utils/data-structures.d.ts.map +1 -0
  83. package/types/utils/devices.d.ts +8 -0
  84. package/types/utils/devices.d.ts.map +1 -0
  85. package/types/utils/dtypes.d.ts +22 -0
  86. package/types/utils/dtypes.d.ts.map +1 -0
  87. package/types/utils/generic.d.ts +11 -0
  88. package/types/utils/generic.d.ts.map +1 -0
  89. package/types/utils/hub.d.ts +191 -0
  90. package/types/utils/hub.d.ts.map +1 -0
  91. package/types/utils/image.d.ts +119 -0
  92. package/types/utils/image.d.ts.map +1 -0
  93. package/types/utils/maths.d.ts +280 -0
  94. package/types/utils/maths.d.ts.map +1 -0
  95. package/types/utils/tensor.d.ts +392 -0
  96. package/types/utils/tensor.d.ts.map +1 -0
@@ -0,0 +1,999 @@
1
+ /**
2
+ * Checks whether the given Unicode codepoint represents a CJK (Chinese, Japanese, or Korean) character.
3
+ *
4
+ * A "chinese character" is defined as anything in the CJK Unicode block:
5
+ * https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
6
+ *
7
+ * Note that the CJK Unicode block is NOT all Japanese and Korean characters, despite its name.
8
+ * The modern Korean Hangul alphabet is a different block, as is Japanese Hiragana and Katakana.
9
+ * Those alphabets are used to write space-separated words, so they are not treated specially
10
+ * and are handled like all other languages.
11
+ *
12
+ * @param {number|bigint} cp The Unicode codepoint to check.
13
+ * @returns {boolean} True if the codepoint represents a CJK character, false otherwise.
14
+ */
15
+ export function is_chinese_char(cp: number | bigint): boolean;
16
+ declare const TokenizerModel_base: new () => {
17
+ (...args: any[]): any;
18
+ _call(...args: any[]): any;
19
+ };
20
+ /**
21
+ * Abstract base class for tokenizer models.
22
+ *
23
+ * @extends Callable
24
+ */
25
+ export class TokenizerModel extends TokenizerModel_base {
26
+ /**
27
+ * Instantiates a new TokenizerModel instance based on the configuration object provided.
28
+ * @param {Object} config The configuration object for the TokenizerModel.
29
+ * @param {...*} args Optional arguments to pass to the specific TokenizerModel constructor.
30
+ * @returns {TokenizerModel} A new instance of a TokenizerModel.
31
+ * @throws Will throw an error if the TokenizerModel type in the config is not recognized.
32
+ */
33
+ static fromConfig(config: any, ...args: any[]): TokenizerModel;
34
+ /**
35
+ * Creates a new instance of TokenizerModel.
36
+ * @param {Object} config The configuration object for the TokenizerModel.
37
+ */
38
+ constructor(config: any);
39
+ config: any;
40
+ /** @type {string[]} */
41
+ vocab: string[];
42
+ /**
43
+ * A mapping of tokens to ids.
44
+ * @type {Map<string, number>}
45
+ */
46
+ tokens_to_ids: Map<string, number>;
47
+ unk_token_id: any;
48
+ unk_token: any;
49
+ end_of_word_suffix: any;
50
+ /** @type {boolean} Whether to fuse unknown tokens when encoding. Defaults to false. */
51
+ fuse_unk: boolean;
52
+ /**
53
+ * Internal function to call the TokenizerModel instance.
54
+ * @param {string[]} tokens The tokens to encode.
55
+ * @returns {string[]} The encoded token IDs.
56
+ */
57
+ _call(tokens: string[]): string[];
58
+ /**
59
+ * Encodes a list of tokens into a list of token IDs.
60
+ * @param {string[]} tokens The tokens to encode.
61
+ * @returns {string[]} The encoded tokens.
62
+ * @throws Will throw an error if not implemented in a subclass.
63
+ */
64
+ encode(tokens: string[]): string[];
65
+ /**
66
+ * Converts a list of tokens into a list of token IDs.
67
+ * @param {string[]} tokens The tokens to convert.
68
+ * @returns {number[]} The converted token IDs.
69
+ */
70
+ convert_tokens_to_ids(tokens: string[]): number[];
71
+ /**
72
+ * Converts a list of token IDs into a list of tokens.
73
+ * @param {number[]|bigint[]} ids The token IDs to convert.
74
+ * @returns {string[]} The converted tokens.
75
+ */
76
+ convert_ids_to_tokens(ids: number[] | bigint[]): string[];
77
+ }
78
+ declare const PreTrainedTokenizer_base: new () => {
79
+ (...args: any[]): any;
80
+ _call(...args: any[]): any;
81
+ };
82
+ /**
83
+ * @typedef {Object} Message
84
+ * @property {string} role The role of the message (e.g., "user" or "assistant" or "system").
85
+ * @property {string} content The content of the message.
86
+ */
87
+ export class PreTrainedTokenizer extends PreTrainedTokenizer_base {
88
+ /**
89
+ * Loads a pre-trained tokenizer from the given `pretrained_model_name_or_path`.
90
+ *
91
+ * @param {string} pretrained_model_name_or_path The path to the pre-trained tokenizer.
92
+ * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer.
93
+ *
94
+ * @throws {Error} Throws an error if the tokenizer.json or tokenizer_config.json files are not found in the `pretrained_model_name_or_path`.
95
+ * @returns {Promise<PreTrainedTokenizer>} A new instance of the `PreTrainedTokenizer` class.
96
+ */
97
+ static from_pretrained(pretrained_model_name_or_path: string, { progress_callback, config, cache_dir, local_files_only, revision, legacy, }?: PretrainedTokenizerOptions): Promise<PreTrainedTokenizer>;
98
+ /**
99
+ * Create a new PreTrainedTokenizer instance.
100
+ * @param {Object} tokenizerJSON The JSON of the tokenizer.
101
+ * @param {Object} tokenizerConfig The config of the tokenizer.
102
+ */
103
+ constructor(tokenizerJSON: any, tokenizerConfig: any);
104
+ return_token_type_ids: boolean;
105
+ padding_side: string;
106
+ _tokenizer_config: any;
107
+ normalizer: Normalizer;
108
+ pre_tokenizer: PreTokenizer;
109
+ model: TokenizerModel;
110
+ post_processor: PostProcessor;
111
+ decoder: Decoder;
112
+ special_tokens: any[];
113
+ all_special_ids: number[];
114
+ /** @type {AddedToken[]} */
115
+ added_tokens: AddedToken[];
116
+ additional_special_tokens: any;
117
+ added_tokens_regex: RegExp;
118
+ mask_token: string;
119
+ mask_token_id: number;
120
+ pad_token: string;
121
+ pad_token_id: number;
122
+ sep_token: string;
123
+ sep_token_id: number;
124
+ unk_token: string;
125
+ unk_token_id: number;
126
+ model_max_length: any;
127
+ /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */
128
+ remove_space: boolean;
129
+ clean_up_tokenization_spaces: any;
130
+ do_lowercase_and_remove_accent: any;
131
+ legacy: boolean;
132
+ chat_template: any;
133
+ _compiled_template_cache: Map<any, any>;
134
+ /**
135
+ * Returns the value of the first matching key in the tokenizer config object.
136
+ * @param {...string} keys One or more keys to search for in the tokenizer config object.
137
+ * @returns {string|null} The value associated with the first matching key, or null if no match is found.
138
+ * @throws {Error} If an object is found for a matching key and its __type property is not "AddedToken".
139
+ * @private
140
+ */
141
+ private getToken;
142
+ /**
143
+ * @typedef {number[]|number[][]|Tensor} BatchEncodingItem
144
+ *
145
+ * @typedef {Object} BatchEncoding Holds the output of the tokenizer's call function.
146
+ * @property {BatchEncodingItem} input_ids List of token ids to be fed to a model.
147
+ * @property {BatchEncodingItem} attention_mask List of indices specifying which tokens should be attended to by the model.
148
+ * @property {BatchEncodingItem} [token_type_ids] List of token type ids to be fed to a model.
149
+ */
150
+ /**
151
+ * Encode/tokenize the given text(s).
152
+ * @param {string|string[]} text The text to tokenize.
153
+ * @param {Object} options An optional object containing the following properties:
154
+ * @param {string|string[]} [options.text_pair=null] Optional second sequence to be encoded. If set, must be the same type as text.
155
+ * @param {boolean|'max_length'} [options.padding=false] Whether to pad the input sequences.
156
+ * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model.
157
+ * @param {boolean} [options.truncation=null] Whether to truncate the input sequences.
158
+ * @param {number} [options.max_length=null] Maximum length of the returned list and optionally padding length.
159
+ * @param {boolean} [options.return_tensor=true] Whether to return the results as Tensors or arrays.
160
+ * @param {boolean} [options.return_token_type_ids=null] Whether to return the token type ids.
161
+ * @returns {BatchEncoding} Object to be passed to the model.
162
+ */
163
+ _call(text: string | string[], { text_pair, add_special_tokens, padding, truncation, max_length, return_tensor, return_token_type_ids, }?: {
164
+ text_pair?: string | string[];
165
+ padding?: boolean | 'max_length';
166
+ add_special_tokens?: boolean;
167
+ truncation?: boolean;
168
+ max_length?: number;
169
+ return_tensor?: boolean;
170
+ return_token_type_ids?: boolean;
171
+ }): {
172
+ /**
173
+ * List of token ids to be fed to a model.
174
+ */
175
+ input_ids: number[] | number[][] | Tensor;
176
+ /**
177
+ * List of indices specifying which tokens should be attended to by the model.
178
+ */
179
+ attention_mask: number[] | number[][] | Tensor;
180
+ /**
181
+ * List of token type ids to be fed to a model.
182
+ */
183
+ token_type_ids?: number[] | number[][] | Tensor;
184
+ };
185
+ /**
186
+ * Encodes a single text using the preprocessor pipeline of the tokenizer.
187
+ *
188
+ * @param {string|null} text The text to encode.
189
+ * @returns {string[]|null} The encoded tokens.
190
+ */
191
+ _encode_text(text: string | null): string[] | null;
192
+ /**
193
+ * Encodes a single text or a pair of texts using the model's tokenizer.
194
+ *
195
+ * @param {string} text The text to encode.
196
+ * @param {Object} options An optional object containing the following properties:
197
+ * @param {string} [options.text_pair=null] The optional second text to encode.
198
+ * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model.
199
+ * @param {boolean} [options.return_token_type_ids=null] Whether to return token_type_ids.
200
+ * @returns {EncodingSingle} An object containing the encoded text.
201
+ * @private
202
+ */
203
+ private _encode_plus;
204
+ /**
205
+ * Internal helper function to tokenize a text, and optionally a pair of texts.
206
+ * @param {string} text The text to tokenize.
207
+ * @param {Object} options An optional object containing the following properties:
208
+ * @param {string} [options.pair=null] The optional second text to tokenize.
209
+ * @param {boolean} [options.add_special_tokens=false] Whether or not to add the special tokens associated with the corresponding model.
210
+ * @returns {{tokens: string[], token_type_ids?: number[]}} An object containing the tokens and optionally the token type IDs.
211
+ */
212
+ _tokenize_helper(text: string, { pair, add_special_tokens, }?: {
213
+ pair?: string;
214
+ add_special_tokens?: boolean;
215
+ }): {
216
+ tokens: string[];
217
+ token_type_ids?: number[];
218
+ };
219
+ /**
220
+ * Converts a string into a sequence of tokens.
221
+ * @param {string} text The sequence to be encoded.
222
+ * @param {Object} options An optional object containing the following properties:
223
+ * @param {string} [options.pair] A second sequence to be encoded with the first.
224
+ * @param {boolean} [options.add_special_tokens=false] Whether or not to add the special tokens associated with the corresponding model.
225
+ * @returns {string[]} The list of tokens.
226
+ */
227
+ tokenize(text: string, { pair, add_special_tokens, }?: {
228
+ pair?: string;
229
+ add_special_tokens?: boolean;
230
+ }): string[];
231
+ /**
232
+ * Encodes a single text or a pair of texts using the model's tokenizer.
233
+ *
234
+ * @param {string} text The text to encode.
235
+ * @param {Object} options An optional object containing the following properties:
236
+ * @param {string} [options.text_pair=null] The optional second text to encode.
237
+ * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model.
238
+ * @param {boolean} [options.return_token_type_ids=null] Whether to return token_type_ids.
239
+ * @returns {number[]} An array of token IDs representing the encoded text(s).
240
+ */
241
+ encode(text: string, { text_pair, add_special_tokens, return_token_type_ids, }?: {
242
+ text_pair?: string;
243
+ add_special_tokens?: boolean;
244
+ return_token_type_ids?: boolean;
245
+ }): number[];
246
+ /**
247
+ * Decode a batch of tokenized sequences.
248
+ * @param {number[][]|Tensor} batch List/Tensor of tokenized input sequences.
249
+ * @param {Object} decode_args (Optional) Object with decoding arguments.
250
+ * @returns {string[]} List of decoded sequences.
251
+ */
252
+ batch_decode(batch: number[][] | Tensor, decode_args?: any): string[];
253
+ /**
254
+ * Decodes a sequence of token IDs back to a string.
255
+ *
256
+ * @param {number[]|bigint[]|Tensor} token_ids List/Tensor of token IDs to decode.
257
+ * @param {Object} [decode_args={}]
258
+ * @param {boolean} [decode_args.skip_special_tokens=false] If true, special tokens are removed from the output string.
259
+ * @param {boolean} [decode_args.clean_up_tokenization_spaces=true] If true, spaces before punctuations and abbreviated forms are removed.
260
+ *
261
+ * @returns {string} The decoded string.
262
+ * @throws {Error} If `token_ids` is not a non-empty array of integers.
263
+ */
264
+ decode(token_ids: number[] | bigint[] | Tensor, decode_args?: {
265
+ skip_special_tokens?: boolean;
266
+ clean_up_tokenization_spaces?: boolean;
267
+ }): string;
268
+ /**
269
+ * Decode a single list of token ids to a string.
270
+ * @param {number[]|bigint[]} token_ids List of token ids to decode
271
+ * @param {Object} decode_args Optional arguments for decoding
272
+ * @param {boolean} [decode_args.skip_special_tokens=false] Whether to skip special tokens during decoding
273
+ * @param {boolean} [decode_args.clean_up_tokenization_spaces=null] Whether to clean up tokenization spaces during decoding.
274
+ * If null, the value is set to `this.decoder.cleanup` if it exists, falling back to `this.clean_up_tokenization_spaces` if it exists, falling back to `true`.
275
+ * @returns {string} The decoded string
276
+ */
277
+ decode_single(token_ids: number[] | bigint[], { skip_special_tokens, clean_up_tokenization_spaces, }: {
278
+ skip_special_tokens?: boolean;
279
+ clean_up_tokenization_spaces?: boolean;
280
+ }): string;
281
+ /**
282
+ * Converts a list of message objects with `"role"` and `"content"` keys to a list of token
283
+ * ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
284
+ * determine the format and control tokens to use when converting.
285
+ *
286
+ * See [here](https://huggingface.co/docs/transformers/chat_templating) for more information.
287
+ *
288
+ * **Example:** Applying a chat template to a conversation.
289
+ *
290
+ * ```javascript
291
+ * import { AutoTokenizer } from "@huggingface/transformers";
292
+ *
293
+ * const tokenizer = await AutoTokenizer.from_pretrained("Xenova/mistral-tokenizer-v1");
294
+ *
295
+ * const chat = [
296
+ * { "role": "user", "content": "Hello, how are you?" },
297
+ * { "role": "assistant", "content": "I'm doing great. How can I help you today?" },
298
+ * { "role": "user", "content": "I'd like to show off how chat templating works!" },
299
+ * ]
300
+ *
301
+ * const text = tokenizer.apply_chat_template(chat, { tokenize: false });
302
+ * // "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]"
303
+ *
304
+ * const input_ids = tokenizer.apply_chat_template(chat, { tokenize: true, return_tensor: false });
305
+ * // [1, 733, 16289, 28793, 22557, 28725, 910, 460, 368, 28804, 733, 28748, 16289, 28793, 28737, 28742, 28719, 2548, 1598, 28723, 1602, 541, 315, 1316, 368, 3154, 28804, 2, 28705, 733, 16289, 28793, 315, 28742, 28715, 737, 298, 1347, 805, 910, 10706, 5752, 1077, 3791, 28808, 733, 28748, 16289, 28793]
306
+ * ```
307
+ *
308
+ * @param {Message[]} conversation A list of message objects with `"role"` and `"content"` keys,
309
+ * representing the chat history so far.
310
+ * @param {Object} options An optional object containing the following properties:
311
+ * @param {string} [options.chat_template=null] A Jinja template to use for this conversion. If
312
+ * this is not passed, the model's chat template will be used instead.
313
+ * @param {Object[]} [options.tools=null]
314
+ * A list of tools (callable functions) that will be accessible to the model. If the template does not
315
+ * support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
316
+ * giving the name, description and argument types for the tool. See our
317
+ * [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
318
+ * for more information.
319
+ * @param {Record<string, string>[]} [options.documents=null]
320
+ * A list of dicts representing documents that will be accessible to the model if it is performing RAG
321
+ * (retrieval-augmented generation). If the template does not support RAG, this argument will have no
322
+ * effect. We recommend that each document should be a dict containing "title" and "text" keys. Please
323
+ * see the RAG section of the [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#arguments-for-RAG)
324
+ * for examples of passing documents with chat templates.
325
+ * @param {boolean} [options.add_generation_prompt=false] Whether to end the prompt with the token(s) that indicate
326
+ * the start of an assistant message. This is useful when you want to generate a response from the model.
327
+ * Note that this argument will be passed to the chat template, and so it must be supported in the
328
+ * template for this argument to have any effect.
329
+ * @param {boolean} [options.tokenize=true] Whether to tokenize the output. If false, the output will be a string.
330
+ * @param {boolean} [options.padding=false] Whether to pad sequences to the maximum length. Has no effect if tokenize is false.
331
+ * @param {boolean} [options.truncation=false] Whether to truncate sequences to the maximum length. Has no effect if tokenize is false.
332
+ * @param {number} [options.max_length=null] Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is false.
333
+ * If not specified, the tokenizer's `max_length` attribute will be used as a default.
334
+ * @param {boolean} [options.return_tensor=true] Whether to return the output as a Tensor or an Array. Has no effect if tokenize is false.
335
+ * @param {boolean} [options.return_dict=true] Whether to return a dictionary with named outputs. Has no effect if tokenize is false.
336
+ * @param {Object} [options.tokenizer_kwargs={}] Additional options to pass to the tokenizer.
337
+ * @returns {string | Tensor | number[]| number[][]|BatchEncoding} The tokenized output.
338
+ */
339
+ apply_chat_template(conversation: Message[], { tools, documents, chat_template, add_generation_prompt, tokenize, padding, truncation, max_length, return_tensor, return_dict, tokenizer_kwargs, ...kwargs }?: {
340
+ chat_template?: string;
341
+ tools?: any[];
342
+ documents?: Record<string, string>[];
343
+ add_generation_prompt?: boolean;
344
+ tokenize?: boolean;
345
+ padding?: boolean;
346
+ truncation?: boolean;
347
+ max_length?: number;
348
+ return_tensor?: boolean;
349
+ return_dict?: boolean;
350
+ tokenizer_kwargs?: any;
351
+ }): string | number[] | number[][] | Tensor | {
352
+ /**
353
+ * List of token ids to be fed to a model.
354
+ */
355
+ input_ids: number[] | number[][] | Tensor;
356
+ /**
357
+ * List of indices specifying which tokens should be attended to by the model.
358
+ */
359
+ attention_mask: number[] | number[][] | Tensor;
360
+ /**
361
+ * List of token type ids to be fed to a model.
362
+ */
363
+ token_type_ids?: number[] | number[][] | Tensor;
364
+ };
365
+ }
366
+ /**
367
+ * BertTokenizer is a class used to tokenize text for BERT models.
368
+ * @extends PreTrainedTokenizer
369
+ */
370
+ export class BertTokenizer extends PreTrainedTokenizer {
371
+ }
372
+ /**
373
+ * Albert tokenizer
374
+ * @extends PreTrainedTokenizer
375
+ */
376
+ export class AlbertTokenizer extends PreTrainedTokenizer {
377
+ }
378
+ export class MobileBertTokenizer extends PreTrainedTokenizer {
379
+ }
380
+ export class SqueezeBertTokenizer extends PreTrainedTokenizer {
381
+ }
382
+ export class DebertaTokenizer extends PreTrainedTokenizer {
383
+ }
384
+ export class DebertaV2Tokenizer extends PreTrainedTokenizer {
385
+ }
386
+ export class HerbertTokenizer extends PreTrainedTokenizer {
387
+ }
388
+ export class ConvBertTokenizer extends PreTrainedTokenizer {
389
+ }
390
+ export class RoFormerTokenizer extends PreTrainedTokenizer {
391
+ }
392
+ export class DistilBertTokenizer extends PreTrainedTokenizer {
393
+ }
394
+ export class CamembertTokenizer extends PreTrainedTokenizer {
395
+ }
396
+ export class XLMTokenizer extends PreTrainedTokenizer {
397
+ constructor(tokenizerJSON: any, tokenizerConfig: any);
398
+ }
399
+ export class ElectraTokenizer extends PreTrainedTokenizer {
400
+ }
401
+ export class T5Tokenizer extends PreTrainedTokenizer {
402
+ }
403
+ export class GPT2Tokenizer extends PreTrainedTokenizer {
404
+ }
405
+ export class BartTokenizer extends PreTrainedTokenizer {
406
+ }
407
+ export class MBartTokenizer extends PreTrainedTokenizer {
408
+ constructor(tokenizerJSON: any, tokenizerConfig: any);
409
+ languageRegex: RegExp;
410
+ language_codes: any[];
411
+ lang_to_token: (x: any) => any;
412
+ /**
413
+ * Helper function to build translation inputs for an `MBartTokenizer`.
414
+ * @param {string|string[]} raw_inputs The text to tokenize.
415
+ * @param {Object} tokenizer_options Options to be sent to the tokenizer
416
+ * @param {Object} generate_kwargs Generation options.
417
+ * @returns {Object} Object to be passed to the model.
418
+ */
419
+ _build_translation_inputs(raw_inputs: string | string[], tokenizer_options: any, generate_kwargs: any): any;
420
+ }
421
+ export class MBart50Tokenizer extends MBartTokenizer {
422
+ }
423
+ export class RobertaTokenizer extends PreTrainedTokenizer {
424
+ }
425
+ export class BloomTokenizer extends PreTrainedTokenizer {
426
+ constructor(tokenizerJSON: any, tokenizerConfig: any);
427
+ }
428
+ export class LlamaTokenizer extends PreTrainedTokenizer {
429
+ constructor(tokenizerJSON: any, tokenizerConfig: any);
430
+ legacy: any;
431
+ }
432
+ export class CodeLlamaTokenizer extends PreTrainedTokenizer {
433
+ }
434
+ export class XLMRobertaTokenizer extends PreTrainedTokenizer {
435
+ }
436
+ export class MPNetTokenizer extends PreTrainedTokenizer {
437
+ }
438
+ export class FalconTokenizer extends PreTrainedTokenizer {
439
+ }
440
+ export class GPTNeoXTokenizer extends PreTrainedTokenizer {
441
+ }
442
+ export class EsmTokenizer extends PreTrainedTokenizer {
443
+ }
444
+ export class Qwen2Tokenizer extends PreTrainedTokenizer {
445
+ }
446
+ export class GemmaTokenizer extends PreTrainedTokenizer {
447
+ }
448
+ export class Grok1Tokenizer extends PreTrainedTokenizer {
449
+ }
450
+ /**
451
+ * The NllbTokenizer class is used to tokenize text for NLLB ("No Language Left Behind") models.
452
+ *
453
+ * No Language Left Behind (NLLB) is a first-of-its-kind, AI breakthrough project
454
+ * that open-sources models capable of delivering high-quality translations directly
455
+ * between any pair of 200+ languages — including low-resource languages like Asturian,
456
+ * Luganda, Urdu and more. It aims to help people communicate with anyone, anywhere,
457
+ * regardless of their language preferences. For more information, check out their
458
+ * [paper](https://arxiv.org/abs/2207.04672).
459
+ *
460
+ * For a list of supported languages (along with their language codes),
461
+ * @see {@link https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200}
462
+ */
463
+ export class NllbTokenizer extends PreTrainedTokenizer {
464
+ constructor(tokenizerJSON: any, tokenizerConfig: any);
465
+ languageRegex: RegExp;
466
+ language_codes: any[];
467
+ lang_to_token: (x: any) => any;
468
+ /**
469
+ * Helper function to build translation inputs for an `NllbTokenizer`.
470
+ * @param {string|string[]} raw_inputs The text to tokenize.
471
+ * @param {Object} tokenizer_options Options to be sent to the tokenizer
472
+ * @param {Object} generate_kwargs Generation options.
473
+ * @returns {Object} Object to be passed to the model.
474
+ */
475
+ _build_translation_inputs(raw_inputs: string | string[], tokenizer_options: any, generate_kwargs: any): any;
476
+ }
477
+ /**
478
+ * The M2M100Tokenizer class is used to tokenize text for M2M100 ("Many-to-Many") models.
479
+ *
480
+ * M2M100 is a multilingual encoder-decoder (seq-to-seq) model trained for Many-to-Many
481
+ * multilingual translation. It was introduced in this [paper](https://arxiv.org/abs/2010.11125)
482
+ * and first released in [this](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100) repository.
483
+ *
484
+ * For a list of supported languages (along with their language codes),
485
+ * @see {@link https://huggingface.co/facebook/m2m100_418M#languages-covered}
486
+ */
487
+ export class M2M100Tokenizer extends PreTrainedTokenizer {
488
+ constructor(tokenizerJSON: any, tokenizerConfig: any);
489
+ languageRegex: RegExp;
490
+ language_codes: any[];
491
+ lang_to_token: (x: any) => string;
492
+ /**
493
+ * Helper function to build translation inputs for an `M2M100Tokenizer`.
494
+ * @param {string|string[]} raw_inputs The text to tokenize.
495
+ * @param {Object} tokenizer_options Options to be sent to the tokenizer
496
+ * @param {Object} generate_kwargs Generation options.
497
+ * @returns {Object} Object to be passed to the model.
498
+ */
499
+ _build_translation_inputs(raw_inputs: string | string[], tokenizer_options: any, generate_kwargs: any): any;
500
+ }
501
/**
 * WhisperTokenizer tokenizer
 * @extends PreTrainedTokenizer
 */
export class WhisperTokenizer extends PreTrainedTokenizer {
    /**
     * Token id at which the timestamp tokens start.
     * NOTE(review): inferred from the accessor name — confirm against the implementation in tokenizers.js.
     */
    get timestamp_begin(): number;
    /**
     * Decodes automatic speech recognition (ASR) sequences.
     * @param {Array<{tokens: bigint[], token_timestamps?: number[], stride: number[]}>} sequences The sequences to decode.
     * @param {Object} options The options to use for decoding.
     * @returns {Array<string|{chunks?: undefined|Array<{language: string|null, timestamp: Array<number|null>, text: string}>}>} The decoded sequences.
     */
    _decode_asr(sequences: Array<{
        tokens: bigint[];
        token_timestamps?: number[];
        stride: number[];
    }>, { return_timestamps, return_language, time_precision, force_full_sequences }?: any): (string | {
        chunks?: undefined | Array<{
            language: string | null;
            timestamp: Array<number | null>;
            text: string;
        }>;
    })[];
    /**
     * Finds the longest common sequence among the provided sequences.
     * @param {number[][]} sequences An array of sequences of token ids to compare.
     * @returns {number[][]} The longest common sequence found.
     * @throws {Error} If there is a bug within the function.
     * @private
     */
    private findLongestCommonSequence;
    /** @private */
    private collateWordTimestamps;
    /**
     * Groups tokens by word. Returns a tuple containing a list of strings with the words,
     * and a list of `token_id` sequences with the tokens making up each word.
     * @param {number[]} tokens
     * @param {string} [language]
     * @param {string} prepend_punctionations
     * @param {string} append_punctuations
     *
     * @private
     */
    private combineTokensIntoWords;
    /**
     * @param {number[]|bigint[]} token_ids List of token IDs to decode.
     * @param {Object} decode_args Optional arguments for decoding
     * @private
     */
    private decodeWithTimestamps;
    /**
     * Combine tokens into words by splitting at any position where the tokens are decoded as valid unicode points.
     * @param {number[]} tokens
     * @returns {*}
     * @private
     */
    private splitTokensOnUnicode;
    /**
     * Combine tokens into words by splitting at whitespace and punctuation tokens.
     * @param {number[]} tokens
     * @private
     */
    private splitTokensOnSpaces;
    /**
     * Merges punctuation tokens with neighboring words.
     * @param {string[]} words
     * @param {number[][]} tokens
     * @param {number[][]} indices
     * @param {string} prepended
     * @param {string} appended
     * @private
     */
    private mergePunctuations;
    /**
     * Helper function to build translation inputs for a `WhisperTokenizer`,
     * depending on the language, task, and whether to predict timestamp tokens.
     *
     * Used to override the prefix tokens appended to the start of the label sequence.
     *
     * **Example: Get ids for a language**
     * ```javascript
     * // instantiate the tokenizer and set the prefix token to Spanish
     * const tokenizer = await WhisperTokenizer.from_pretrained('Xenova/whisper-tiny');
     * const forced_decoder_ids = tokenizer.get_decoder_prompt_ids({ language: 'spanish' });
     * // [(1, 50262), (2, 50363)]
     * ```
     *
     * @param {Object} options Options to generate the decoder prompt.
     * @param {string} [options.language] The language of the transcription text.
     * The corresponding language id token is appended to the start of the sequence for multilingual
     * speech recognition and speech translation tasks, e.g. for "Spanish" the token "<|es|>" is appended
     * to the start of sequence.
     * @param {string} [options.task] Task identifier to append at the start of sequence (if any).
     * This should be used for multilingual fine-tuning, with "transcribe" for speech recognition and
     * "translate" for speech translation.
     * @param {boolean} [options.no_timestamps] Whether to add the <|notimestamps|> token at the start of the sequence.
     * @returns {number[][]} The decoder prompt ids.
     */
    get_decoder_prompt_ids({ language, task, no_timestamps, }?: {
        language?: string;
        task?: string;
        no_timestamps?: boolean;
    }): number[][];
}
605
/** CodeGen tokenizer — uses the default `PreTrainedTokenizer` behavior (no overrides declared). */
export class CodeGenTokenizer extends PreTrainedTokenizer {
}
/** CLIP tokenizer — uses the default `PreTrainedTokenizer` behavior (no overrides declared). */
export class CLIPTokenizer extends PreTrainedTokenizer {
}
/** SigLIP tokenizer — uses the default `PreTrainedTokenizer` behavior (no overrides declared). */
export class SiglipTokenizer extends PreTrainedTokenizer {
}
611
/**
 * @todo This model is not yet supported by Hugging Face's "fast" tokenizers library (https://github.com/huggingface/tokenizers).
 * Therefore, this implementation (which is based on fast tokenizers) may produce slightly inaccurate results.
 */
export class MarianTokenizer extends PreTrainedTokenizer {
    // Regular expression used to match the language-code prefix that must be
    // stripped from the text before sentencepiece encoding (see _encode_text).
    languageRegex: RegExp;
    // Language codes this tokenizer recognizes as valid prefixes.
    supported_language_codes: string[];
    /**
     * Encodes a single text. Overriding this method is necessary since the language codes
     * must be removed before encoding with sentencepiece model.
     * @see https://github.com/huggingface/transformers/blob/12d51db243a00726a548a43cc333390ebae731e3/src/transformers/models/marian/tokenization_marian.py#L204-L213
     *
     * @param {string|null} text The text to encode.
     * @returns {Array} The encoded tokens.
     */
    _encode_text(text: string | null): any[];
}
628
/** Wav2Vec2 CTC tokenizer — uses the default `PreTrainedTokenizer` behavior (no overrides declared). */
export class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer {
}
/** Blenderbot tokenizer — uses the default `PreTrainedTokenizer` behavior (no overrides declared). */
export class BlenderbotTokenizer extends PreTrainedTokenizer {
}
/** BlenderbotSmall tokenizer — uses the default `PreTrainedTokenizer` behavior (no overrides declared). */
export class BlenderbotSmallTokenizer extends PreTrainedTokenizer {
}
/** SpeechT5 tokenizer — uses the default `PreTrainedTokenizer` behavior (no overrides declared). */
export class SpeechT5Tokenizer extends PreTrainedTokenizer {
}
/** Nougat tokenizer — uses the default `PreTrainedTokenizer` behavior (no overrides declared). */
export class NougatTokenizer extends PreTrainedTokenizer {
}
/**
 * VITS tokenizer. Overrides the base constructor — presumably to adjust the
 * tokenizer JSON/config before delegating; confirm against the implementation.
 */
export class VitsTokenizer extends PreTrainedTokenizer {
    constructor(tokenizerJSON: any, tokenizerConfig: any);
}
/** Cohere tokenizer — uses the default `PreTrainedTokenizer` behavior (no overrides declared). */
export class CohereTokenizer extends PreTrainedTokenizer {
}
643
/**
 * Helper class which is used to instantiate pretrained tokenizers with the `from_pretrained` function.
 * The chosen tokenizer class is determined by the type specified in the tokenizer config.
 *
 * @example
 * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/bert-base-uncased');
 */
export class AutoTokenizer {
    // Maps the `tokenizer_class` name found in a model's tokenizer config to the
    // concrete tokenizer class to instantiate. `PreTrainedTokenizer` is included
    // as the generic fallback entry.
    static TOKENIZER_CLASS_MAPPING: {
        T5Tokenizer: typeof T5Tokenizer;
        DistilBertTokenizer: typeof DistilBertTokenizer;
        CamembertTokenizer: typeof CamembertTokenizer;
        DebertaTokenizer: typeof DebertaTokenizer;
        DebertaV2Tokenizer: typeof DebertaV2Tokenizer;
        BertTokenizer: typeof BertTokenizer;
        HerbertTokenizer: typeof HerbertTokenizer;
        ConvBertTokenizer: typeof ConvBertTokenizer;
        RoFormerTokenizer: typeof RoFormerTokenizer;
        XLMTokenizer: typeof XLMTokenizer;
        ElectraTokenizer: typeof ElectraTokenizer;
        MobileBertTokenizer: typeof MobileBertTokenizer;
        SqueezeBertTokenizer: typeof SqueezeBertTokenizer;
        AlbertTokenizer: typeof AlbertTokenizer;
        GPT2Tokenizer: typeof GPT2Tokenizer;
        BartTokenizer: typeof BartTokenizer;
        MBartTokenizer: typeof MBartTokenizer;
        MBart50Tokenizer: typeof MBart50Tokenizer;
        RobertaTokenizer: typeof RobertaTokenizer;
        WhisperTokenizer: typeof WhisperTokenizer;
        CodeGenTokenizer: typeof CodeGenTokenizer;
        CLIPTokenizer: typeof CLIPTokenizer;
        SiglipTokenizer: typeof SiglipTokenizer;
        MarianTokenizer: typeof MarianTokenizer;
        BloomTokenizer: typeof BloomTokenizer;
        NllbTokenizer: typeof NllbTokenizer;
        M2M100Tokenizer: typeof M2M100Tokenizer;
        LlamaTokenizer: typeof LlamaTokenizer;
        CodeLlamaTokenizer: typeof CodeLlamaTokenizer;
        XLMRobertaTokenizer: typeof XLMRobertaTokenizer;
        MPNetTokenizer: typeof MPNetTokenizer;
        FalconTokenizer: typeof FalconTokenizer;
        GPTNeoXTokenizer: typeof GPTNeoXTokenizer;
        EsmTokenizer: typeof EsmTokenizer;
        Wav2Vec2CTCTokenizer: typeof Wav2Vec2CTCTokenizer;
        BlenderbotTokenizer: typeof BlenderbotTokenizer;
        BlenderbotSmallTokenizer: typeof BlenderbotSmallTokenizer;
        SpeechT5Tokenizer: typeof SpeechT5Tokenizer;
        NougatTokenizer: typeof NougatTokenizer;
        VitsTokenizer: typeof VitsTokenizer;
        Qwen2Tokenizer: typeof Qwen2Tokenizer;
        GemmaTokenizer: typeof GemmaTokenizer;
        Grok1Tokenizer: typeof Grok1Tokenizer;
        CohereTokenizer: typeof CohereTokenizer;
        PreTrainedTokenizer: typeof PreTrainedTokenizer;
    };
    /**
     * Instantiate one of the tokenizer classes of the library from a pretrained model.
     *
     * The tokenizer class to instantiate is selected based on the `tokenizer_class` property of the config object
     * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
     *
     * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
     * - A string, the *model id* of a pretrained tokenizer hosted inside a model repo on huggingface.co.
     *   Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
     *   user or organization name, like `dbmdz/bert-base-german-cased`.
     * - A path to a *directory* containing tokenizer files, e.g., `./my_model_directory/`.
     * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer.
     *
     * @returns {Promise<PreTrainedTokenizer>} A new instance of the PreTrainedTokenizer class.
     */
    static from_pretrained(pretrained_model_name_or_path: string, { progress_callback, config, cache_dir, local_files_only, revision, legacy, }?: PretrainedTokenizerOptions): Promise<PreTrainedTokenizer>;
}
715
/**
 * Additional tokenizer-specific properties.
 */
export type TokenizerProperties = {
    /**
     * Whether or not the `legacy` behavior of the tokenizer should be used.
     */
    legacy?: boolean;
};
/**
 * Options accepted when loading a tokenizer with `from_pretrained`:
 * the generic hub loading options plus the tokenizer-specific flags above.
 */
export type PretrainedTokenizerOptions = import('./utils/hub.js').PretrainedOptions & TokenizerProperties;
725
/**
 * A node in the doubly-linked list used during byte-pair-encoding merges.
 */
export type BPENode = {
    /**
     * The token associated with the node
     */
    token: string;
    /**
     * A positional bias for the node.
     */
    bias: number;
    /**
     * The score of the node.
     */
    score?: number;
    /**
     * The previous node in the linked list.
     */
    prev?: BPENode;
    /**
     * The next node in the linked list.
     */
    next?: BPENode;
};
/**
 * How a pre-tokenizer split treats the matched delimiter: dropped, kept as its
 * own token, merged into an adjacent token, or collapsed into contiguous runs.
 */
export type SplitDelimiterBehavior = 'removed' | 'isolated' | 'mergedWithPrevious' | 'mergedWithNext' | 'contiguous';
748
export type PostProcessedOutput = {
    /**
     * List of tokens produced by the post-processor.
     */
    tokens: string[];
    /**
     * List of token type ids produced by the post-processor.
     */
    token_type_ids?: number[];
};
export type EncodingSingle = {
    /**
     * List of token ids to be fed to a model.
     */
    input_ids: number[];
    /**
     * List of indices specifying which tokens should be attended to by the model.
     * (Description was previously swapped with `token_type_ids`.)
     */
    attention_mask: number[];
    /**
     * List of token type ids (segment ids) to be fed to a model.
     * (Description was previously swapped with `attention_mask`.)
     */
    token_type_ids?: number[];
};
export type Message = {
    /**
     * The role of the message (e.g., "user" or "assistant" or "system").
     */
    role: string;
    /**
     * The content of the message.
     */
    content: string;
};
782
// Generated constructor type for the callable base class: instances can be
// invoked directly, which delegates to `_call`.
declare const Normalizer_base: new () => {
    (...args: any[]): any;
    _call(...args: any[]): any;
};
/**
 * A base class for text normalization.
 * @abstract
 */
declare class Normalizer extends Normalizer_base {
    /**
     * Factory method for creating normalizers from config objects.
     * @static
     * @param {Object} config The configuration object for the normalizer.
     * @returns {Normalizer} A Normalizer object.
     * @throws {Error} If an unknown Normalizer type is specified in the config.
     */
    static fromConfig(config: any): Normalizer;
    /**
     * @param {Object} config The configuration object for the normalizer.
     */
    constructor(config: any);
    config: any;
    /**
     * Normalize the input text.
     * @abstract
     * @param {string} text The text to normalize.
     * @returns {string} The normalized text.
     * @throws {Error} If this method is not implemented in a subclass.
     */
    normalize(text: string): string;
    /**
     * Alias for {@link Normalizer#normalize}.
     * @param {string} text The text to normalize.
     * @returns {string} The normalized text.
     */
    _call(text: string): string;
}
819
// Generated constructor type for the callable base class: instances can be
// invoked directly, which delegates to `_call`.
declare const PreTokenizer_base: new () => {
    (...args: any[]): any;
    _call(...args: any[]): any;
};
/**
 * A callable class representing a pre-tokenizer used in tokenization. Subclasses
 * should implement the `pre_tokenize_text` method to define the specific pre-tokenization logic.
 * @extends Callable
 */
declare class PreTokenizer extends PreTokenizer_base {
    /**
     * Factory method that returns an instance of a subclass of `PreTokenizer` based on the provided configuration.
     *
     * @static
     * @param {Object} config A configuration object for the pre-tokenizer.
     * @returns {PreTokenizer} An instance of a subclass of `PreTokenizer`.
     * @throws {Error} If the provided configuration object does not correspond to any known pre-tokenizer.
     */
    static fromConfig(config: any): PreTokenizer;
    /**
     * Method that should be implemented by subclasses to define the specific pre-tokenization logic.
     *
     * @abstract
     * @param {string} text The text to pre-tokenize.
     * @param {Object} [options] Additional options for the pre-tokenization logic.
     * @returns {string[]} The pre-tokenized text.
     * @throws {Error} If the method is not implemented in the subclass.
     */
    pre_tokenize_text(text: string, options?: any): string[];
    /**
     * Tokenizes the given text into pre-tokens.
     * @param {string|string[]} text The text or array of texts to pre-tokenize.
     * @param {Object} [options] Additional options for the pre-tokenization logic.
     * @returns {string[]} An array of pre-tokens.
     */
    pre_tokenize(text: string | string[], options?: any): string[];
    /**
     * Alias for {@link PreTokenizer#pre_tokenize}.
     * @param {string|string[]} text The text or array of texts to pre-tokenize.
     * @param {Object} [options] Additional options for the pre-tokenization logic.
     * @returns {string[]} An array of pre-tokens.
     */
    _call(text: string | string[], options?: any): string[];
}
863
// Generated constructor type for the callable base class: instances can be
// invoked directly, which delegates to `_call`.
declare const PostProcessor_base: new () => {
    (...args: any[]): any;
    _call(...args: any[]): any;
};
/**
 * @typedef {Object} PostProcessedOutput
 * @property {string[]} tokens List of tokens produced by the post-processor.
 * @property {number[]} [token_type_ids] List of token type ids produced by the post-processor.
 */
/**
 * @typedef {Object} EncodingSingle
 * @property {number[]} input_ids List of token ids to be fed to a model.
 * @property {number[]} attention_mask List of indices specifying which tokens should be attended to by the model
 * @property {number[]} [token_type_ids] List of token type ids (segment ids) to be fed to a model
 */
/**
 * @extends Callable
 */
declare class PostProcessor extends PostProcessor_base {
    /**
     * Factory method to create a PostProcessor object from a configuration object.
     *
     * @param {Object} config Configuration object representing a PostProcessor.
     * @returns {PostProcessor} A PostProcessor object created from the given configuration.
     * @throws {Error} If an unknown PostProcessor type is encountered.
     */
    static fromConfig(config: any): PostProcessor;
    /**
     * @param {Object} config The configuration for the post-processor.
     */
    constructor(config: any);
    config: any;
    /**
     * Method to be implemented in subclass to apply post-processing on the given tokens.
     *
     * @param {Array} tokens The input tokens to be post-processed.
     * @param {...*} args Additional arguments required by the post-processing logic.
     * @returns {PostProcessedOutput} The post-processed tokens.
     * @throws {Error} If the method is not implemented in subclass.
     */
    post_process(tokens: any[], ...args: any[]): PostProcessedOutput;
    /**
     * Alias for {@link PostProcessor#post_process}.
     * @param {Array} tokens The text or array of texts to post-process.
     * @param {...*} args Additional arguments required by the post-processing logic.
     * @returns {PostProcessedOutput} The post-processed tokens.
     */
    _call(tokens: any[], ...args: any[]): PostProcessedOutput;
}
912
// Generated constructor type for the callable base class: instances can be
// invoked directly, which delegates to `_call`.
declare const Decoder_base: new () => {
    (...args: any[]): any;
    _call(...args: any[]): any;
};
/**
 * The base class for token decoders.
 * @extends Callable
 */
declare class Decoder extends Decoder_base {
    /**
     * Creates a decoder instance based on the provided configuration.
     *
     * @param {Object} config The configuration object.
     * @returns {Decoder} A decoder instance.
     * @throws {Error} If an unknown decoder type is provided.
     */
    static fromConfig(config: any): Decoder;
    /**
     * Creates an instance of `Decoder`.
     *
     * @param {Object} config The configuration object.
     */
    constructor(config: any);
    config: any;
    /** @type {AddedToken[]} */
    added_tokens: AddedToken[];
    // Suffix appended to end-of-word tokens; presumably read from the decoder
    // config — confirm against the implementation.
    end_of_word_suffix: any;
    // Whether offsets should be trimmed — TODO confirm semantics in tokenizers.js.
    trim_offsets: any;
    /**
     * Calls the `decode` method.
     *
     * @param {string[]} tokens The list of tokens.
     * @returns {string} The decoded string.
     */
    _call(tokens: string[]): string;
    /**
     * Decodes a list of tokens.
     * @param {string[]} tokens The list of tokens.
     * @returns {string} The decoded string.
     */
    decode(tokens: string[]): string;
    /**
     * Apply the decoder to a list of tokens.
     *
     * @param {string[]} tokens The list of tokens.
     * @returns {string[]} The decoded list of tokens.
     * @throws {Error} If the `decode_chain` method is not implemented in the subclass.
     */
    decode_chain(tokens: string[]): string[];
}
962
/**
 * Represent a token added by the user on top of the existing Model vocabulary.
 * AddedToken can be configured to specify the behavior they should have in various situations like:
 * - Whether they should only match single words
 * - Whether to include any whitespace on its left or right
 */
declare class AddedToken {
    /**
     * Creates a new instance of AddedToken.
     * @param {Object} config Added token configuration object.
     * @param {string} config.content The content of the added token.
     * @param {number} config.id The id of the added token.
     * @param {boolean} [config.single_word=false] Whether this token must be a single word or can break words.
     * @param {boolean} [config.lstrip=false] Whether this token should strip whitespaces on its left.
     * @param {boolean} [config.rstrip=false] Whether this token should strip whitespaces on its right.
     * @param {boolean} [config.normalized=false] Whether this token should be normalized.
     * @param {boolean} [config.special=false] Whether this token is special.
     */
    constructor(config: {
        content: string;
        id: number;
        single_word?: boolean;
        lstrip?: boolean;
        rstrip?: boolean;
        normalized?: boolean;
        special?: boolean;
    });
    content: string;
    id: number;
    single_word: boolean;
    lstrip: boolean;
    rstrip: boolean;
    special: boolean;
    normalized: boolean;
}
997
+ import { Tensor } from './utils/tensor.js';
998
+ export {};
999
+ //# sourceMappingURL=tokenizers.d.ts.map