@huggingface/transformers 3.0.0-alpha.9 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/README.md +82 -50
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +2550 -2552
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +3639 -3567
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +25 -25
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +41 -42
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +56 -57
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +2586 -2564
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +14 -13
  16. package/src/backends/onnx.js +24 -19
  17. package/src/configs.js +19 -4
  18. package/src/env.js +5 -9
  19. package/src/generation/logits_process.js +40 -37
  20. package/src/models.js +356 -539
  21. package/src/ops/registry.js +14 -3
  22. package/src/pipelines.js +5 -5
  23. package/src/processors.js +392 -351
  24. package/src/tokenizers.js +140 -175
  25. package/src/utils/constants.js +1 -1
  26. package/src/utils/core.js +12 -0
  27. package/src/utils/data-structures.js +13 -11
  28. package/src/utils/hub.js +1 -1
  29. package/src/utils/maths.js +14 -5
  30. package/src/utils/tensor.js +60 -13
  31. package/types/backends/onnx.d.ts +5 -2
  32. package/types/backends/onnx.d.ts.map +1 -1
  33. package/types/configs.d.ts +29 -3
  34. package/types/configs.d.ts.map +1 -1
  35. package/types/env.d.ts +4 -2
  36. package/types/env.d.ts.map +1 -1
  37. package/types/generation/logits_process.d.ts.map +1 -1
  38. package/types/models.d.ts +116 -289
  39. package/types/models.d.ts.map +1 -1
  40. package/types/ops/registry.d.ts +6 -6
  41. package/types/ops/registry.d.ts.map +1 -1
  42. package/types/pipelines.d.ts +1 -2
  43. package/types/pipelines.d.ts.map +1 -1
  44. package/types/processors.d.ts +58 -51
  45. package/types/processors.d.ts.map +1 -1
  46. package/types/tokenizers.d.ts +23 -32
  47. package/types/tokenizers.d.ts.map +1 -1
  48. package/types/utils/constants.d.ts +1 -1
  49. package/types/utils/constants.d.ts.map +1 -1
  50. package/types/utils/core.d.ts +7 -0
  51. package/types/utils/core.d.ts.map +1 -1
  52. package/types/utils/data-structures.d.ts +6 -6
  53. package/types/utils/data-structures.d.ts.map +1 -1
  54. package/types/utils/hub.d.ts +1 -1
  55. package/types/utils/hub.d.ts.map +1 -1
  56. package/types/utils/maths.d.ts +2 -2
  57. package/types/utils/maths.d.ts.map +1 -1
  58. package/types/utils/tensor.d.ts +27 -1
  59. package/types/utils/tensor.d.ts.map +1 -1
package/src/tokenizers.js CHANGED
@@ -28,6 +28,7 @@ import {
28
28
  escapeRegExp,
29
29
  isIntegralNumber,
30
30
  mergeArrays,
31
+ len,
31
32
  } from './utils/core.js';
32
33
 
33
34
  import {
@@ -195,7 +196,7 @@ function clean_up_tokenization(text) {
195
196
  * @returns {string} The text with accents removed.
196
197
  */
197
198
  function remove_accents(text) {
198
- return text.replace(/[\u0300-\u036f]/g, '');
199
+ return text.replace(/\p{M}/gu, '');
199
200
  }
200
201
 
201
202
  /**
@@ -236,23 +237,26 @@ export function is_chinese_char(cp) {
236
237
  }
237
238
 
238
239
  /**
239
- * Helper function to fuse consecutive values in an array equal to the specified value.
240
- * @param {string[]} arr The input array
241
- * @param {any} value The value to fuse on.
242
- * @param {Map<string, any>} mapping The mapping from input domain to value.
240
+ * Helper function to fuse consecutive unknown tokens.
241
+ * @param {string[]} arr The list of input tokens
242
+ * @param {Map<string, any>} tokens_to_ids The mapping from tokens to token ids.
243
+ * @param {number} unk_token_id The value to fuse on.
244
+ * @private
243
245
  */
244
- function fuse(arr, value, mapping) {
246
+ function fuse_unk(arr, tokens_to_ids, unk_token_id) {
245
247
  const fused = [];
246
248
  let i = 0;
247
249
  while (i < arr.length) {
248
250
  fused.push(arr[i])
249
- if ((mapping.get(arr[i]) ?? value) !== value) {
251
+ if ((tokens_to_ids.get(arr[i]) ?? unk_token_id) !== unk_token_id) {
250
252
  ++i;
251
253
  continue;
252
254
  }
253
255
 
254
- while (i < arr.length && (mapping.get(arr[i]) ?? value) === value) {
255
- ++i;
256
+ while (++i < arr.length && (tokens_to_ids.get(arr[i]) ?? unk_token_id) === unk_token_id) {
257
+ if (tokens_to_ids.get(fused.at(-1)) !== unk_token_id) {
258
+ fused[fused.length - 1] += arr[i];
259
+ }
256
260
  }
257
261
  }
258
262
 
@@ -270,12 +274,17 @@ function whitespace_split(text) {
270
274
 
271
275
  const PUNCTUATION_REGEX = '\\p{P}\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E';
272
276
  const PUNCTUATION_ONLY_REGEX = new RegExp(`^[${PUNCTUATION_REGEX}]+$`, 'gu');
277
+ const BLOOM_SPLIT_CHARS = '.,!?\u2026\u3002\uff0c\u3001\u0964\u06d4\u060c';
273
278
 
274
- // A mapping of regex patterns to their equivalent (but longer) JS-compatible versions.
279
+ // A mapping of regex patterns to their equivalent (but possibly longer) JS-compatible versions.
275
280
  const PROBLEMATIC_REGEX_MAP = new Map([
276
281
  // This uses the case insensitive group modifier, which is not supported in JavaScript.
277
282
  // When parsing the regex, an "Invalid group" error is thrown.
278
283
  ["(?i:'s|'t|'re|'ve|'m|'ll|'d)", "(?:'([sS]|[tT]|[rR][eE]|[vV][eE]|[mM]|[lL][lL]|[dD]))"],
284
+
285
+ // Used to override the default (invalid) regex of the bloom pretokenizer.
286
+ // For more information, see https://github.com/huggingface/transformers.js/issues/94
287
+ [` ?[^(\\s|[${BLOOM_SPLIT_CHARS}])]+`, ` ?[^\\s${BLOOM_SPLIT_CHARS}]+`],
279
288
  ])
280
289
 
281
290
 
@@ -353,14 +362,21 @@ export class TokenizerModel extends Callable {
353
362
  case 'Unigram':
354
363
  // @ts-ignore
355
364
  return new Unigram(config, ...args);
356
-
357
365
  case 'BPE':
358
366
  return new BPE(config);
359
367
 
360
368
  default:
369
+ // Some tokenizers, like for google-t5/t5-small, do not have a `type` field.
370
+ // In this case, we can infer the tokenizer type based on the structure of the `vocab` field.
361
371
  if (config.vocab) {
362
- // @ts-ignore
363
- return new LegacyTokenizerModel(config, ...args);
372
+ if (Array.isArray(config.vocab)) {
373
+ // config.vocab is of type `[string, number][]`
374
+ // @ts-ignore
375
+ return new Unigram(config, ...args);
376
+ } else {
377
+ // @ts-ignore
378
+ return new LegacyTokenizerModel(config, ...args);
379
+ }
364
380
  }
365
381
  throw new Error(`Unknown TokenizerModel type: ${config.type}`);
366
382
  }
@@ -369,15 +385,15 @@ export class TokenizerModel extends Callable {
369
385
  /**
370
386
  * Internal function to call the TokenizerModel instance.
371
387
  * @param {string[]} tokens The tokens to encode.
372
- * @returns {string[]} The encoded token IDs.
388
+ * @returns {string[]} The encoded tokens.
373
389
  */
374
390
  _call(tokens) {
375
- let ids = this.encode(tokens);
391
+ tokens = this.encode(tokens);
376
392
  if (this.fuse_unk) {
377
393
  // Fuse unknown tokens
378
- ids = fuse(ids, this.unk_token_id, this.tokens_to_ids);
394
+ tokens = fuse_unk(tokens, this.tokens_to_ids, this.unk_token_id);
379
395
  }
380
- return ids;
396
+ return tokens;
381
397
  }
382
398
 
383
399
  /**
@@ -538,18 +554,18 @@ class Unigram extends TokenizerModel {
538
554
  this.unk_token = this.vocab[config.unk_id];
539
555
 
540
556
  this.tokens_to_ids = new Map(this.vocab.map((x, i) => [x, i]));
541
- this.bosToken = ' '; // beginning of a sentence token
557
+ this.bos_token = ' '; // beginning of a sentence token
542
558
 
543
- this.bosTokenId = this.tokens_to_ids.get(this.bosToken); // NOTE: may be undefined
544
- this.eosToken = moreConfig.eos_token;
559
+ this.bos_token_id = this.tokens_to_ids.get(this.bos_token); // NOTE: may be undefined
560
+ this.eos_token = moreConfig.eos_token;
545
561
 
546
- this.eosTokenId = this.tokens_to_ids.get(this.eosToken);
547
- this.unkToken = this.vocab[this.unk_token_id];
562
+ this.eos_token_id = this.tokens_to_ids.get(this.eos_token);
563
+ this.unk_token = this.vocab[this.unk_token_id];
548
564
 
549
565
  this.minScore = min(this.scores)[0];
550
566
 
551
- this.unkScore = this.minScore - 10.0;
552
- this.scores[this.unk_token_id] = this.unkScore;
567
+ this.unk_score = this.minScore - 10.0;
568
+ this.scores[this.unk_token_id] = this.unk_score;
553
569
 
554
570
  this.trie = new CharTrie();
555
571
  this.trie.extend(this.vocab);
@@ -564,26 +580,27 @@ class Unigram extends TokenizerModel {
564
580
  * @param {TokenLattice} lattice The token lattice to populate with nodes.
565
581
  */
566
582
  populateNodes(lattice) {
567
- const sentence = lattice.sentence;
568
- const len = sentence.length;
583
+ const chars = lattice.chars;
584
+ const mblen = 1;
569
585
  let beginPos = 0;
570
- while (beginPos < len) {
571
- const mblen = 1;
586
+ while (beginPos < chars.length) {
572
587
  let hasSingleNode = false;
573
- const tokens = [];
574
588
 
575
- for (let token of this.trie.commonPrefixSearch(sentence.slice(beginPos))) {
589
+ const tokens = [];
590
+ const sliced = chars.slice(beginPos).join('');
591
+ const prefixedTokens = this.trie.commonPrefixSearch(sliced);
592
+ for (const token of prefixedTokens) {
576
593
  tokens.push(token);
577
594
  const tokenId = this.tokens_to_ids.get(token);
578
595
  const tokenScore = this.scores[tokenId];
579
- const n = token.length;
596
+ const n = len(token);
580
597
  lattice.insert(beginPos, n, tokenScore, tokenId);
581
598
  if (!hasSingleNode && n === mblen) {
582
599
  hasSingleNode = true;
583
600
  }
584
601
  }
585
602
  if (!hasSingleNode) {
586
- lattice.insert(beginPos, mblen, this.unkScore, this.unk_token_id);
603
+ lattice.insert(beginPos, mblen, this.unk_score, this.unk_token_id);
587
604
  }
588
605
  beginPos += mblen;
589
606
  }
@@ -596,7 +613,7 @@ class Unigram extends TokenizerModel {
596
613
  * @returns {string[]} An array of subtokens obtained by encoding the input tokens using the unigram model.
597
614
  */
598
615
  tokenize(normalized) {
599
- const lattice = new TokenLattice(normalized, this.bosTokenId, this.eosTokenId);
616
+ const lattice = new TokenLattice(normalized, this.bos_token_id, this.eos_token_id);
600
617
  this.populateNodes(lattice);
601
618
  return lattice.tokens();
602
619
  }
@@ -666,7 +683,7 @@ class BPE extends TokenizerModel {
666
683
  * Create a BPE instance.
667
684
  * @param {Object} config The configuration object for BPE.
668
685
  * @param {Object} config.vocab A mapping of tokens to ids.
669
- * @param {string[]} config.merges An array of BPE merges as strings.
686
+ * @param {string[]|[string, string][]} config.merges An array of BPE merges, either as space-separated strings or as [string, string] pairs.
670
687
  * @param {string} config.unk_token The unknown token used for out of vocabulary words.
671
688
  * @param {string} config.end_of_word_suffix The suffix to place at the end of each word.
672
689
  * @param {string} [config.continuing_subword_suffix] The suffix to insert between words.
@@ -676,8 +693,6 @@ class BPE extends TokenizerModel {
676
693
  constructor(config) {
677
694
  super(config);
678
695
 
679
- this.BPE_SPLIT_TOKEN = ' ';
680
-
681
696
  /** @type {Map<string, number>} */
682
697
  this.tokens_to_ids = objectToMap(config.vocab);
683
698
 
@@ -689,8 +704,15 @@ class BPE extends TokenizerModel {
689
704
  this.vocab[value] = key;
690
705
  }
691
706
 
692
- this.bpe_ranks = new Map(config.merges.map((x, i) => [x, i]));
693
- this.merges = config.merges.map(x => x.split(this.BPE_SPLIT_TOKEN));
707
+ // Tokenizers >= 0.20.0 serializes BPE merges as a [string, string][] instead of a string[],
708
+ // which resolves the ambiguity for merges containing spaces.
709
+ const use_new_merge_format = Array.isArray(config.merges[0]);
710
+
711
+ /** @type {[string, string][]} */
712
+ this.merges = use_new_merge_format
713
+ ? /** @type {[string, string][]} */(config.merges)
714
+ : (/** @type {string[]} */(config.merges)).map(x => /** @type {[string, string]} */(x.split(' ', 2)));
715
+ this.bpe_ranks = new Map(this.merges.map((x, i) => [JSON.stringify(x), i]));
694
716
 
695
717
  this.end_of_word_suffix = config.end_of_word_suffix;
696
718
 
@@ -850,7 +872,7 @@ class BPE extends TokenizerModel {
850
872
  // `score` is a measure of the merge priority: lower means higher priority
851
873
  // We use the BPE rank as a measure of priority (i.e., the location of the merge in the merges list)
852
874
  // We also add a fractional component to the score to break ties (with the earlier character having higher priority)
853
- const rank = this.bpe_ranks.get(node.token + this.BPE_SPLIT_TOKEN + node.next.token);
875
+ const rank = this.bpe_ranks.get(JSON.stringify([node.token, node.next.token]));
854
876
  if (rank !== undefined) {
855
877
  node.score = rank + node.bias;
856
878
  queue.push(node);
@@ -875,15 +897,19 @@ class BPE extends TokenizerModel {
875
897
  for (const t of bpe_token_list) {
876
898
  if (this.tokens_to_ids.has(t)) {
877
899
  outputTokens.push(t);
878
- } else {
879
- if (this.byte_fallback) {
880
- outputTokens.push(
881
- ...Array.from(this.text_encoder.encode(t))
882
- .map(x => `<0x${x.toString(16).toUpperCase().padStart(2, '0')}>`)
883
- );
900
+ } else if (this.byte_fallback) {
901
+ const byteTokens = Array.from(this.text_encoder.encode(t))
902
+ .map(x => `<0x${x.toString(16).toUpperCase().padStart(2, '0')}>`);
903
+ if (byteTokens.every(x => this.tokens_to_ids.has(x))) {
904
+ // Ensure the byte tokens are actually in the vocabulary, otherwise
905
+ // we fall back to the unknown token. For more information, see
906
+ // https://github.com/huggingface/transformers/issues/28096.
907
+ outputTokens.push(...byteTokens);
884
908
  } else {
885
909
  outputTokens.push(this.unk_token);
886
910
  }
911
+ } else {
912
+ outputTokens.push(this.unk_token);
887
913
  }
888
914
  }
889
915
  }
@@ -1207,7 +1233,8 @@ class BertNormalizer extends Normalizer {
1207
1233
  * @returns {string} The text with accents removed.
1208
1234
  */
1209
1235
  stripAccents(text) {
1210
- return text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
1236
+ // "Mark, Nonspacing" (Mn)
1237
+ return text.normalize('NFD').replace(/\p{Mn}/gu, '');
1211
1238
  }
1212
1239
 
1213
1240
 
@@ -2325,7 +2352,7 @@ class Precompiled extends Normalizer {
2325
2352
  // TODO: detect when a different `this.charsmap` is used.
2326
2353
 
2327
2354
  text = text.replace(/[\u0001-\u0008\u000B\u000E-\u001F\u007F\u008F\u009F]/gm, ''); // Remove control characters
2328
- text = text.replace(/[\u0009\u000A\u000C\u000D\u1680\u200B\u200C\u200E\u200F\u2028\u2029\u2581\uFEFF\uFFFD]/gm, '\u0020'); // Replace certain characters with a space
2355
+ text = text.replace(/[\u0009\u000A\u000C\u000D\u00A0\u1680\u2000-\u200F\u2028\u2029\u202F\u205F\u2581\u3000\uFEFF\uFFFD]/gm, '\u0020'); // Replace certain characters with a space
2329
2356
 
2330
2357
  if (text.includes('\uFF5E')) {
2331
2358
  // To match the sentencepiece implementation 100%, we must handle a very strange edge-case.
@@ -2550,16 +2577,15 @@ export class PreTrainedTokenizer extends Callable {
2550
2577
 
2551
2578
  // Another slight hack to add `end_of_word_suffix` (if present) to the decoder
2552
2579
  // This is needed for cases where BPE model and ByteLevel decoder are used
2553
- // For more information, see https://github.com/xenova/transformers.js/issues/74
2580
+ // For more information, see https://github.com/huggingface/transformers.js/issues/74
2554
2581
  // TODO: save this to the decoder when exporting?
2555
2582
  this.decoder.end_of_word_suffix = this.model.end_of_word_suffix;
2556
2583
  }
2557
2584
 
2558
-
2559
2585
  this.added_tokens_regex = this.added_tokens.length > 0 ? new RegExp(
2560
- this.added_tokens
2586
+ this.added_tokens.slice()
2561
2587
  // Sort by length (desc) to avoid early partial matches
2562
- .toSorted((a, b) => b.content.length - a.content.length)
2588
+ .sort((a, b) => b.content.length - a.content.length)
2563
2589
  .map(x => `${x.lstrip ? '\\s*' : ''}(${escapeRegExp(x.content)})${x.rstrip ? '\\s*' : ''}`)
2564
2590
  .join('|')
2565
2591
  ) : null;
@@ -3057,6 +3083,67 @@ export class PreTrainedTokenizer extends Callable {
3057
3083
 
3058
3084
  return decoded;
3059
3085
  }
3086
+
3087
+ /**
3088
+ * Retrieve the chat template string used for tokenizing chat messages. This template is used
3089
+ * internally by the `apply_chat_template` method and can also be used externally to retrieve the model's chat
3090
+ * template for better generation tracking.
3091
+ *
3092
+ * @param {Object} options An optional object containing the following properties:
3093
+ * @param {string} [options.chat_template=null]
3094
+ * A Jinja template or the name of a template to use for this conversion.
3095
+ * It is usually not necessary to pass anything to this argument,
3096
+ * as the model's template will be used by default.
3097
+ * @param {Object[]} [options.tools=null]
3098
+ * A list of tools (callable functions) that will be accessible to the model. If the template does not
3099
+ * support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
3100
+ * giving the name, description and argument types for the tool. See our
3101
+ * [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
3102
+ * for more information.
3103
+ * @returns {string} The chat template string.
3104
+ */
3105
+ get_chat_template({
3106
+ chat_template = null,
3107
+ tools = null,
3108
+ } = {}) {
3109
+
3110
+ // First, handle the cases when the model has a dict of multiple templates
3111
+ if (this.chat_template && typeof this.chat_template === 'object') {
3112
+ const template_dict = this.chat_template;
3113
+
3114
+ if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
3115
+ // The user can pass the name of a template to the chat template argument instead of an entire template
3116
+ chat_template = template_dict[chat_template];
3117
+ } else if (chat_template === null) {
3118
+ if (tools !== null && 'tool_use' in template_dict) {
3119
+ chat_template = template_dict['tool_use'];
3120
+ } else if ('default' in template_dict) {
3121
+ chat_template = template_dict['default'];
3122
+ } else {
3123
+ throw Error(
3124
+ `This model has multiple chat templates with no default specified! Please either pass a chat ` +
3125
+ `template or the name of the template you wish to use to the 'chat_template' argument. Available ` +
3126
+ `template names are ${Object.keys(template_dict).sort()}.`
3127
+ )
3128
+ }
3129
+ }
3130
+ } else if (chat_template === null) {
3131
+ // These are the cases when the model has a single template
3132
+ // priority: `chat_template` argument > `tokenizer.chat_template`
3133
+ if (this.chat_template) {
3134
+ chat_template = this.chat_template;
3135
+ } else {
3136
+ throw Error(
3137
+ "Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
3138
+ "argument was passed! For information about writing templates and setting the " +
3139
+ "tokenizer.chat_template attribute, please see the documentation at " +
3140
+ "https://huggingface.co/docs/transformers/main/en/chat_templating"
3141
+ )
3142
+ }
3143
+ }
3144
+ return chat_template;
3145
+ }
3146
+
3060
3147
  /**
3061
3148
  * Converts a list of message objects with `"role"` and `"content"` keys to a list of token
3062
3149
  * ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
@@ -3130,39 +3217,8 @@ export class PreTrainedTokenizer extends Callable {
3130
3217
  ...kwargs
3131
3218
  } = {}) {
3132
3219
 
3133
- // First, handle the cases when the model has a dict of multiple templates
3134
- if (
3135
- (this.chat_template && typeof this.chat_template === 'object')
3136
- || this.chat_template === null
3137
- ) {
3138
- const template_dict = this.chat_template;
3220
+ chat_template = this.get_chat_template({ chat_template, tools });
3139
3221
 
3140
- if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
3141
- // The user can pass the name of a template to the chat template argument instead of an entire template
3142
- chat_template = template_dict[chat_template];
3143
- } else if (chat_template === null && 'default' in template_dict) {
3144
- chat_template = template_dict['default'];
3145
- } else if (chat_template === null) {
3146
- throw Error(
3147
- `This model has multiple chat templates with no default specified! Please either pass a chat ` +
3148
- `template or the name of the template you wish to use to the 'chat_template' argument. Available ` +
3149
- `template names are ${Object.keys(template_dict).sort()}.`
3150
- )
3151
- }
3152
- } else {
3153
- // These are the cases when the model has a single template
3154
- // priority: `chat_template` argument > `tokenizer.chat_template`
3155
- if (this.chat_template) {
3156
- chat_template = this.chat_template;
3157
- } else {
3158
- throw Error(
3159
- "Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
3160
- "argument was passed! For information about writing templates and setting the " +
3161
- "tokenizer.chat_template attribute, please see the documentation at " +
3162
- "https://huggingface.co/docs/transformers/main/en/chat_templating"
3163
- )
3164
- }
3165
- }
3166
3222
  if (typeof chat_template !== 'string') {
3167
3223
  throw Error(`chat_template must be a string, but got ${typeof chat_template}`);
3168
3224
  }
@@ -3283,19 +3339,7 @@ export class MBart50Tokenizer extends MBartTokenizer { } // NOTE: extends MBartT
3283
3339
 
3284
3340
  export class RobertaTokenizer extends PreTrainedTokenizer { }
3285
3341
 
3286
- export class BloomTokenizer extends PreTrainedTokenizer {
3287
-
3288
- constructor(tokenizerJSON, tokenizerConfig) {
3289
- // Override the default (invalid) regex of the pretokenizer.
3290
- // For more information, see https://github.com/xenova/transformers.js/issues/94
3291
- const splitChars = '.,!?\u2026\u3002\uff0c\u3001\u0964\u06d4\u060c';
3292
- const patternObject = tokenizerJSON.pre_tokenizer?.pretokenizers[0]?.pattern;
3293
- if (patternObject && patternObject.Regex === ` ?[^(\\s|[${splitChars}])]+`) {
3294
- patternObject.Regex = ` ?[^\\s${splitChars}]+`;
3295
- }
3296
- super(tokenizerJSON, tokenizerConfig);
3297
- }
3298
- }
3342
+ export class BloomTokenizer extends PreTrainedTokenizer { }
3299
3343
 
3300
3344
  const SPIECE_UNDERLINE = "▁";
3301
3345
 
@@ -4132,85 +4176,6 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
4132
4176
  newIndices.filter(x => x.length > 0),
4133
4177
  ]
4134
4178
  }
4135
-
4136
- /**
4137
- * Helper function to build translation inputs for a `WhisperTokenizer`,
4138
- * depending on the language, task, and whether to predict timestamp tokens.
4139
- *
4140
- * Used to override the prefix tokens appended to the start of the label sequence.
4141
- *
4142
- * **Example: Get ids for a language**
4143
- * ```javascript
4144
- * // instantiate the tokenizer and set the prefix token to Spanish
4145
- * const tokenizer = await WhisperTokenizer.from_pretrained('Xenova/whisper-tiny');
4146
- * const forced_decoder_ids = tokenizer.get_decoder_prompt_ids({ language: 'spanish' });
4147
- * // [(1, 50262), (2, 50363)]
4148
- * ```
4149
- *
4150
- * @param {Object} options Options to generate the decoder prompt.
4151
- * @param {string} [options.language] The language of the transcription text.
4152
- * The corresponding language id token is appended to the start of the sequence for multilingual
4153
- * speech recognition and speech translation tasks, e.g. for "Spanish" the token "<|es|>" is appended
4154
- * to the start of sequence.
4155
- * @param {string} [options.task] Task identifier to append at the start of sequence (if any).
4156
- * This should be used for mulitlingual fine-tuning, with "transcribe" for speech recognition and
4157
- * "translate" for speech translation.
4158
- * @param {boolean} [options.no_timestamps] Whether to add the <|notimestamps|> token at the start of the sequence.
4159
- * @returns {number[][]} The decoder prompt ids.
4160
- */
4161
- get_decoder_prompt_ids({
4162
- language = null,
4163
- task = null,
4164
- no_timestamps = true,
4165
- } = {}) {
4166
-
4167
- // <|lang_id|> <|task|> <|notimestamps|>
4168
-
4169
- const forced_decoder_ids = [];
4170
-
4171
- if (language) {
4172
- // User wishes to specify the language
4173
- const language_code = whisper_language_to_code(language);
4174
- const language_token_id = this.model.tokens_to_ids.get(`<|${language_code}|>`);
4175
- if (language_token_id === undefined) {
4176
- throw new Error(`Unable to find language "${language_code}" in model vocabulary. Please report this issue at ${GITHUB_ISSUE_URL}.`)
4177
- }
4178
-
4179
- forced_decoder_ids.push(language_token_id);
4180
- } else {
4181
- // No token will be forced, which leaves the model to predict the language
4182
- forced_decoder_ids.push(null);
4183
- }
4184
-
4185
- if (task) {
4186
- task = task.toLowerCase();
4187
- if (task !== 'transcribe' && task !== 'translate') {
4188
- throw new Error(`Task "${task}" is not supported. Must be one of: ["transcribe", "translate"]`);
4189
- }
4190
-
4191
- const task_token_id = this.model.tokens_to_ids.get(`<|${task}|>`);
4192
- if (task_token_id === undefined) {
4193
- throw new Error(`Unable to find task "${task}" in model vocabulary. Please report this issue at ${GITHUB_ISSUE_URL}.`)
4194
- }
4195
-
4196
- forced_decoder_ids.push(task_token_id);
4197
- } else {
4198
- // No token will be forced, which leaves the model to predict the task
4199
- forced_decoder_ids.push(null);
4200
- }
4201
-
4202
- if (no_timestamps) {
4203
- const no_timestamps_id = this.model.tokens_to_ids.get(`<|notimestamps|>`);
4204
- if (no_timestamps_id === undefined) {
4205
- throw new Error(`Unable to find "<|notimestamps|>" in model vocabulary. Please report this issue at ${GITHUB_ISSUE_URL}.`);
4206
- }
4207
-
4208
- forced_decoder_ids.push(no_timestamps_id);
4209
- }
4210
-
4211
- return forced_decoder_ids.map((x, i) => [i + 1, x]).filter(x => x[1] !== null);
4212
-
4213
- }
4214
4179
  }
4215
4180
  export class CodeGenTokenizer extends PreTrainedTokenizer { }
4216
4181
  export class CLIPTokenizer extends PreTrainedTokenizer { }
@@ -1,2 +1,2 @@
1
1
 
2
- export const GITHUB_ISSUE_URL = 'https://github.com/xenova/transformers.js/issues/new/choose';
2
+ export const GITHUB_ISSUE_URL = 'https://github.com/huggingface/transformers.js/issues/new/choose';
package/src/utils/core.js CHANGED
@@ -147,3 +147,15 @@ export function pick(o, props) {
147
147
  })
148
148
  );
149
149
  }
150
+
151
+ /**
152
+ * Calculate the length of a string, taking multi-byte characters into account.
153
+ * This mimics the behavior of Python's `len` function.
154
+ * @param {string} s The string to calculate the length of.
155
+ * @returns {number} The length of the string.
156
+ */
157
+ export function len(s) {
158
+ let length = 0;
159
+ for (const c of s) ++length;
160
+ return length;
161
+ }
@@ -229,7 +229,7 @@ export class CharTrie {
229
229
  * @param {string[]} texts The strings to add to the trie.
230
230
  */
231
231
  extend(texts) {
232
- for (let text of texts) {
232
+ for (const text of texts) {
233
233
  this.push(text);
234
234
  }
235
235
  }
@@ -240,7 +240,7 @@ export class CharTrie {
240
240
  */
241
241
  push(text) {
242
242
  let node = this.root;
243
- for (let ch of text) {
243
+ for (const ch of text) {
244
244
  let child = node.children.get(ch);
245
245
  if (child === undefined) {
246
246
  child = CharTrieNode.default();
@@ -258,12 +258,14 @@ export class CharTrie {
258
258
  */
259
259
  *commonPrefixSearch(text) {
260
260
  let node = this.root;
261
+ if (node === undefined) return;
262
+
261
263
  let prefix = "";
262
- for (let i = 0; i < text.length && node !== undefined; ++i) {
263
- const ch = text[i];
264
+ for (const ch of text) {
264
265
  prefix += ch;
265
266
  node = node.children.get(ch);
266
- if (node !== undefined && node.isLeaf) {
267
+ if (node === undefined) return;
268
+ if (node.isLeaf) {
267
269
  yield prefix;
268
270
  }
269
271
  }
@@ -305,8 +307,8 @@ export class TokenLattice {
305
307
  * @param {number} eosTokenId The end-of-sequence token ID.
306
308
  */
307
309
  constructor(sentence, bosTokenId, eosTokenId) {
308
- this.sentence = sentence;
309
- this.len = sentence.length;
310
+ this.chars = Array.from(sentence);
311
+ this.len = this.chars.length;
310
312
  this.bosTokenId = bosTokenId;
311
313
  this.eosTokenId = eosTokenId;
312
314
  this.nodes = [];
@@ -340,7 +342,7 @@ export class TokenLattice {
340
342
  /**
341
343
  * Implements the Viterbi algorithm to compute the most likely sequence of tokens.
342
344
  *
343
- * @returns {TokenLatticeNode[]} The array of nodes representing the most likely sequence of tokens.
345
+ * @returns {TokenLatticeNode[]} The most likely sequence of tokens.
344
346
  */
345
347
  viterbi() {
346
348
  const len = this.len;
@@ -394,11 +396,11 @@ export class TokenLattice {
394
396
  * @returns {string} The piece of the sentence covered by the given node.
395
397
  */
396
398
  piece(node) {
397
- return this.sentence.slice(node.pos, node.pos + node.length);
399
+ return this.chars.slice(node.pos, node.pos + node.length).join('');
398
400
  }
399
401
 
400
402
  /**
401
- * @returns {Array} The array of nodes representing the most likely sequence of tokens.
403
+ * @returns {string[]} The most likely sequence of tokens.
402
404
  */
403
405
  tokens() {
404
406
  const nodes = this.viterbi();
@@ -406,7 +408,7 @@ export class TokenLattice {
406
408
  }
407
409
 
408
410
  /**
409
- * @returns {Array} The array of nodes representing the most likely sequence of tokens.
411
+ * @returns {number[]} The most likely sequence of token ids.
410
412
  */
411
413
  tokenIds() {
412
414
  const nodes = this.viterbi();
package/src/utils/hub.js CHANGED
@@ -14,7 +14,7 @@ import { dispatchCallback } from './core.js';
14
14
  /**
15
15
  * @typedef {Object} PretrainedOptions Options for loading a pretrained model.
16
16
  * @property {function} [progress_callback=null] If specified, this function will be called during model construction, to provide the user with progress updates.
17
- * @property {Object} [config=null] Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
17
+ * @property {import('../configs.js').PretrainedConfig} [config=null] Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
18
18
  * - The model is a model provided by the library (loaded with the *model id* string of a pretrained model).
19
19
  * - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a configuration JSON file named *config.json* is found in the directory.
20
20
  * @property {string} [cache_dir=null] Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used.
@@ -158,11 +158,20 @@ export function softmax(arr) {
158
158
  * @returns {T} The resulting log_softmax array.
159
159
  */
160
160
  export function log_softmax(arr) {
161
- // Compute the softmax values
162
- const softmaxArr = softmax(arr);
161
+ // Compute the maximum value in the array
162
+ const maxVal = max(arr)[0];
163
+
164
+ // Compute the sum of the exponentials
165
+ let sumExps = 0;
166
+ for(let i = 0; i < arr.length; ++i) {
167
+ sumExps += Math.exp(arr[i] - maxVal);
168
+ }
163
169
 
164
- // Apply log formula to each element
165
- const logSoftmaxArr = softmaxArr.map(x => Math.log(x));
170
+ // Compute the log of the sum
171
+ const logSum = Math.log(sumExps);
172
+
173
+ // Compute the log-softmax values
174
+ const logSoftmaxArr = arr.map(x => x - maxVal - logSum);
166
175
 
167
176
  return /** @type {T} */(logSoftmaxArr);
168
177
  }
@@ -217,7 +226,7 @@ export function magnitude(arr) {
217
226
  /**
218
227
  * Returns the value and index of the minimum element in an array.
219
228
  * @param {number[]|TypedArray} arr array of numbers.
220
- * @returns {number[]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
229
+ * @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
221
230
  * @throws {Error} If array is empty.
222
231
  */
223
232
  export function min(arr) {