@huggingface/transformers 3.0.0-alpha.9 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -22
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.cjs +2515 -2525
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +3529 -3455
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +25 -25
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +39 -40
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +56 -57
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +2551 -2538
- package/dist/transformers.mjs.map +1 -1
- package/package.json +14 -13
- package/src/backends/onnx.js +24 -19
- package/src/configs.js +19 -4
- package/src/env.js +5 -9
- package/src/generation/logits_process.js +40 -37
- package/src/models.js +326 -514
- package/src/ops/registry.js +14 -3
- package/src/pipelines.js +5 -4
- package/src/processors.js +390 -351
- package/src/tokenizers.js +140 -175
- package/src/utils/constants.js +1 -1
- package/src/utils/core.js +12 -0
- package/src/utils/data-structures.js +13 -11
- package/src/utils/hub.js +1 -1
- package/src/utils/maths.js +14 -5
- package/src/utils/tensor.js +60 -13
- package/types/backends/onnx.d.ts +5 -2
- package/types/backends/onnx.d.ts.map +1 -1
- package/types/configs.d.ts +29 -3
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +4 -2
- package/types/env.d.ts.map +1 -1
- package/types/generation/logits_process.d.ts.map +1 -1
- package/types/models.d.ts +116 -289
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +6 -6
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +1 -2
- package/types/pipelines.d.ts.map +1 -1
- package/types/processors.d.ts +55 -51
- package/types/processors.d.ts.map +1 -1
- package/types/tokenizers.d.ts +23 -32
- package/types/tokenizers.d.ts.map +1 -1
- package/types/utils/constants.d.ts +1 -1
- package/types/utils/constants.d.ts.map +1 -1
- package/types/utils/core.d.ts +7 -0
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/data-structures.d.ts +6 -6
- package/types/utils/data-structures.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/maths.d.ts +2 -2
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +27 -1
- package/types/utils/tensor.d.ts.map +1 -1
package/src/tokenizers.js
CHANGED
|
@@ -28,6 +28,7 @@ import {
|
|
|
28
28
|
escapeRegExp,
|
|
29
29
|
isIntegralNumber,
|
|
30
30
|
mergeArrays,
|
|
31
|
+
len,
|
|
31
32
|
} from './utils/core.js';
|
|
32
33
|
|
|
33
34
|
import {
|
|
@@ -195,7 +196,7 @@ function clean_up_tokenization(text) {
|
|
|
195
196
|
* @returns {string} The text with accents removed.
|
|
196
197
|
*/
|
|
197
198
|
function remove_accents(text) {
|
|
198
|
-
return text.replace(/
|
|
199
|
+
return text.replace(/\p{M}/gu, '');
|
|
199
200
|
}
|
|
200
201
|
|
|
201
202
|
/**
|
|
@@ -236,23 +237,26 @@ export function is_chinese_char(cp) {
|
|
|
236
237
|
}
|
|
237
238
|
|
|
238
239
|
/**
|
|
239
|
-
* Helper function to fuse consecutive
|
|
240
|
-
* @param {string[]} arr The input
|
|
241
|
-
* @param {any}
|
|
242
|
-
* @param {
|
|
240
|
+
* Helper function to fuse consecutive unknown tokens.
|
|
241
|
+
* @param {string[]} arr The list of input tokens
|
|
242
|
+
* @param {Map<string, any>} tokens_to_ids The mapping from tokens to token ids.
|
|
243
|
+
* @param {number} unk_token_id The value to fuse on.
|
|
244
|
+
* @private
|
|
243
245
|
*/
|
|
244
|
-
function
|
|
246
|
+
function fuse_unk(arr, tokens_to_ids, unk_token_id) {
|
|
245
247
|
const fused = [];
|
|
246
248
|
let i = 0;
|
|
247
249
|
while (i < arr.length) {
|
|
248
250
|
fused.push(arr[i])
|
|
249
|
-
if ((
|
|
251
|
+
if ((tokens_to_ids.get(arr[i]) ?? unk_token_id) !== unk_token_id) {
|
|
250
252
|
++i;
|
|
251
253
|
continue;
|
|
252
254
|
}
|
|
253
255
|
|
|
254
|
-
while (i < arr.length && (
|
|
255
|
-
|
|
256
|
+
while (++i < arr.length && (tokens_to_ids.get(arr[i]) ?? unk_token_id) === unk_token_id) {
|
|
257
|
+
if (tokens_to_ids.get(fused.at(-1)) !== unk_token_id) {
|
|
258
|
+
fused[fused.length - 1] += arr[i];
|
|
259
|
+
}
|
|
256
260
|
}
|
|
257
261
|
}
|
|
258
262
|
|
|
@@ -270,12 +274,17 @@ function whitespace_split(text) {
|
|
|
270
274
|
|
|
271
275
|
const PUNCTUATION_REGEX = '\\p{P}\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E';
|
|
272
276
|
const PUNCTUATION_ONLY_REGEX = new RegExp(`^[${PUNCTUATION_REGEX}]+$`, 'gu');
|
|
277
|
+
const BLOOM_SPLIT_CHARS = '.,!?\u2026\u3002\uff0c\u3001\u0964\u06d4\u060c';
|
|
273
278
|
|
|
274
|
-
// A mapping of regex patterns to their equivalent (but longer) JS-compatible versions.
|
|
279
|
+
// A mapping of regex patterns to their equivalent (but possibly longer) JS-compatible versions.
|
|
275
280
|
const PROBLEMATIC_REGEX_MAP = new Map([
|
|
276
281
|
// This uses the case insensitive group modifier, which is not supported in JavaScript.
|
|
277
282
|
// When parsing the regex, an "Invalid group" error is thrown.
|
|
278
283
|
["(?i:'s|'t|'re|'ve|'m|'ll|'d)", "(?:'([sS]|[tT]|[rR][eE]|[vV][eE]|[mM]|[lL][lL]|[dD]))"],
|
|
284
|
+
|
|
285
|
+
// Used to override the default (invalid) regex of the bloom pretokenizer.
|
|
286
|
+
// For more information, see https://github.com/huggingface/transformers.js/issues/94
|
|
287
|
+
[` ?[^(\\s|[${BLOOM_SPLIT_CHARS}])]+`, ` ?[^\\s${BLOOM_SPLIT_CHARS}]+`],
|
|
279
288
|
])
|
|
280
289
|
|
|
281
290
|
|
|
@@ -353,14 +362,21 @@ export class TokenizerModel extends Callable {
|
|
|
353
362
|
case 'Unigram':
|
|
354
363
|
// @ts-ignore
|
|
355
364
|
return new Unigram(config, ...args);
|
|
356
|
-
|
|
357
365
|
case 'BPE':
|
|
358
366
|
return new BPE(config);
|
|
359
367
|
|
|
360
368
|
default:
|
|
369
|
+
// Some tokenizers, like for google-t5/t5-small, do not have a `type` field.
|
|
370
|
+
// In this case, we can infer the tokenizer type based on the structure of the `vocab` field.
|
|
361
371
|
if (config.vocab) {
|
|
362
|
-
|
|
363
|
-
|
|
372
|
+
if (Array.isArray(config.vocab)) {
|
|
373
|
+
// config.vocab is of type `[string, number][]`
|
|
374
|
+
// @ts-ignore
|
|
375
|
+
return new Unigram(config, ...args);
|
|
376
|
+
} else {
|
|
377
|
+
// @ts-ignore
|
|
378
|
+
return new LegacyTokenizerModel(config, ...args);
|
|
379
|
+
}
|
|
364
380
|
}
|
|
365
381
|
throw new Error(`Unknown TokenizerModel type: ${config.type}`);
|
|
366
382
|
}
|
|
@@ -369,15 +385,15 @@ export class TokenizerModel extends Callable {
|
|
|
369
385
|
/**
|
|
370
386
|
* Internal function to call the TokenizerModel instance.
|
|
371
387
|
* @param {string[]} tokens The tokens to encode.
|
|
372
|
-
* @returns {string[]} The encoded
|
|
388
|
+
* @returns {string[]} The encoded tokens.
|
|
373
389
|
*/
|
|
374
390
|
_call(tokens) {
|
|
375
|
-
|
|
391
|
+
tokens = this.encode(tokens);
|
|
376
392
|
if (this.fuse_unk) {
|
|
377
393
|
// Fuse unknown tokens
|
|
378
|
-
|
|
394
|
+
tokens = fuse_unk(tokens, this.tokens_to_ids, this.unk_token_id);
|
|
379
395
|
}
|
|
380
|
-
return
|
|
396
|
+
return tokens;
|
|
381
397
|
}
|
|
382
398
|
|
|
383
399
|
/**
|
|
@@ -538,18 +554,18 @@ class Unigram extends TokenizerModel {
|
|
|
538
554
|
this.unk_token = this.vocab[config.unk_id];
|
|
539
555
|
|
|
540
556
|
this.tokens_to_ids = new Map(this.vocab.map((x, i) => [x, i]));
|
|
541
|
-
this.
|
|
557
|
+
this.bos_token = ' '; // beginning of a sentence token
|
|
542
558
|
|
|
543
|
-
this.
|
|
544
|
-
this.
|
|
559
|
+
this.bos_token_id = this.tokens_to_ids.get(this.bos_token); // NOTE: may be undefined
|
|
560
|
+
this.eos_token = moreConfig.eos_token;
|
|
545
561
|
|
|
546
|
-
this.
|
|
547
|
-
this.
|
|
562
|
+
this.eos_token_id = this.tokens_to_ids.get(this.eos_token);
|
|
563
|
+
this.unk_token = this.vocab[this.unk_token_id];
|
|
548
564
|
|
|
549
565
|
this.minScore = min(this.scores)[0];
|
|
550
566
|
|
|
551
|
-
this.
|
|
552
|
-
this.scores[this.unk_token_id] = this.
|
|
567
|
+
this.unk_score = this.minScore - 10.0;
|
|
568
|
+
this.scores[this.unk_token_id] = this.unk_score;
|
|
553
569
|
|
|
554
570
|
this.trie = new CharTrie();
|
|
555
571
|
this.trie.extend(this.vocab);
|
|
@@ -564,26 +580,27 @@ class Unigram extends TokenizerModel {
|
|
|
564
580
|
* @param {TokenLattice} lattice The token lattice to populate with nodes.
|
|
565
581
|
*/
|
|
566
582
|
populateNodes(lattice) {
|
|
567
|
-
const
|
|
568
|
-
const
|
|
583
|
+
const chars = lattice.chars;
|
|
584
|
+
const mblen = 1;
|
|
569
585
|
let beginPos = 0;
|
|
570
|
-
while (beginPos <
|
|
571
|
-
const mblen = 1;
|
|
586
|
+
while (beginPos < chars.length) {
|
|
572
587
|
let hasSingleNode = false;
|
|
573
|
-
const tokens = [];
|
|
574
588
|
|
|
575
|
-
|
|
589
|
+
const tokens = [];
|
|
590
|
+
const sliced = chars.slice(beginPos).join('');
|
|
591
|
+
const prefixedTokens = this.trie.commonPrefixSearch(sliced);
|
|
592
|
+
for (const token of prefixedTokens) {
|
|
576
593
|
tokens.push(token);
|
|
577
594
|
const tokenId = this.tokens_to_ids.get(token);
|
|
578
595
|
const tokenScore = this.scores[tokenId];
|
|
579
|
-
const n = token
|
|
596
|
+
const n = len(token);
|
|
580
597
|
lattice.insert(beginPos, n, tokenScore, tokenId);
|
|
581
598
|
if (!hasSingleNode && n === mblen) {
|
|
582
599
|
hasSingleNode = true;
|
|
583
600
|
}
|
|
584
601
|
}
|
|
585
602
|
if (!hasSingleNode) {
|
|
586
|
-
lattice.insert(beginPos, mblen, this.
|
|
603
|
+
lattice.insert(beginPos, mblen, this.unk_score, this.unk_token_id);
|
|
587
604
|
}
|
|
588
605
|
beginPos += mblen;
|
|
589
606
|
}
|
|
@@ -596,7 +613,7 @@ class Unigram extends TokenizerModel {
|
|
|
596
613
|
* @returns {string[]} An array of subtokens obtained by encoding the input tokens using the unigram model.
|
|
597
614
|
*/
|
|
598
615
|
tokenize(normalized) {
|
|
599
|
-
const lattice = new TokenLattice(normalized, this.
|
|
616
|
+
const lattice = new TokenLattice(normalized, this.bos_token_id, this.eos_token_id);
|
|
600
617
|
this.populateNodes(lattice);
|
|
601
618
|
return lattice.tokens();
|
|
602
619
|
}
|
|
@@ -666,7 +683,7 @@ class BPE extends TokenizerModel {
|
|
|
666
683
|
* Create a BPE instance.
|
|
667
684
|
* @param {Object} config The configuration object for BPE.
|
|
668
685
|
* @param {Object} config.vocab A mapping of tokens to ids.
|
|
669
|
-
* @param {string[]} config.merges An array of BPE merges as strings.
|
|
686
|
+
* @param {string[]|[string, string][]} config.merges An array of BPE merges, either as space-separated strings or as [string, string] pairs.
|
|
670
687
|
* @param {string} config.unk_token The unknown token used for out of vocabulary words.
|
|
671
688
|
* @param {string} config.end_of_word_suffix The suffix to place at the end of each word.
|
|
672
689
|
* @param {string} [config.continuing_subword_suffix] The suffix to insert between words.
|
|
@@ -676,8 +693,6 @@ class BPE extends TokenizerModel {
|
|
|
676
693
|
constructor(config) {
|
|
677
694
|
super(config);
|
|
678
695
|
|
|
679
|
-
this.BPE_SPLIT_TOKEN = ' ';
|
|
680
|
-
|
|
681
696
|
/** @type {Map<string, number>} */
|
|
682
697
|
this.tokens_to_ids = objectToMap(config.vocab);
|
|
683
698
|
|
|
@@ -689,8 +704,15 @@ class BPE extends TokenizerModel {
|
|
|
689
704
|
this.vocab[value] = key;
|
|
690
705
|
}
|
|
691
706
|
|
|
692
|
-
|
|
693
|
-
|
|
707
|
+
// Tokenizers >= 0.20.0 serializes BPE merges as a [string, string][] instead of a string[],
|
|
708
|
+
// which resolves the ambiguity for merges containing spaces.
|
|
709
|
+
const use_new_merge_format = Array.isArray(config.merges[0]);
|
|
710
|
+
|
|
711
|
+
/** @type {[string, string][]} */
|
|
712
|
+
this.merges = use_new_merge_format
|
|
713
|
+
? /** @type {[string, string][]} */(config.merges)
|
|
714
|
+
: (/** @type {string[]} */(config.merges)).map(x => /** @type {[string, string]} */(x.split(' ', 2)));
|
|
715
|
+
this.bpe_ranks = new Map(this.merges.map((x, i) => [JSON.stringify(x), i]));
|
|
694
716
|
|
|
695
717
|
this.end_of_word_suffix = config.end_of_word_suffix;
|
|
696
718
|
|
|
@@ -850,7 +872,7 @@ class BPE extends TokenizerModel {
|
|
|
850
872
|
// `score` is a measure of the merge priority: lower means higher priority
|
|
851
873
|
// We use the BPE rank as a measure of priority (i.e., the location of the merge in the merges list)
|
|
852
874
|
// We also add a fractional component to the score to break ties (with the earlier character having higher priority)
|
|
853
|
-
const rank = this.bpe_ranks.get(node.token
|
|
875
|
+
const rank = this.bpe_ranks.get(JSON.stringify([node.token, node.next.token]));
|
|
854
876
|
if (rank !== undefined) {
|
|
855
877
|
node.score = rank + node.bias;
|
|
856
878
|
queue.push(node);
|
|
@@ -875,15 +897,19 @@ class BPE extends TokenizerModel {
|
|
|
875
897
|
for (const t of bpe_token_list) {
|
|
876
898
|
if (this.tokens_to_ids.has(t)) {
|
|
877
899
|
outputTokens.push(t);
|
|
878
|
-
} else {
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
900
|
+
} else if (this.byte_fallback) {
|
|
901
|
+
const byteTokens = Array.from(this.text_encoder.encode(t))
|
|
902
|
+
.map(x => `<0x${x.toString(16).toUpperCase().padStart(2, '0')}>`);
|
|
903
|
+
if (byteTokens.every(x => this.tokens_to_ids.has(x))) {
|
|
904
|
+
// Ensure the byte tokens are actually in the vocabulary, otherwise
|
|
905
|
+
// we fall back to the unknown token. For more information, see
|
|
906
|
+
// https://github.com/huggingface/transformers/issues/28096.
|
|
907
|
+
outputTokens.push(...byteTokens);
|
|
884
908
|
} else {
|
|
885
909
|
outputTokens.push(this.unk_token);
|
|
886
910
|
}
|
|
911
|
+
} else {
|
|
912
|
+
outputTokens.push(this.unk_token);
|
|
887
913
|
}
|
|
888
914
|
}
|
|
889
915
|
}
|
|
@@ -1207,7 +1233,8 @@ class BertNormalizer extends Normalizer {
|
|
|
1207
1233
|
* @returns {string} The text with accents removed.
|
|
1208
1234
|
*/
|
|
1209
1235
|
stripAccents(text) {
|
|
1210
|
-
|
|
1236
|
+
// "Mark, Nonspacing" (Mn)
|
|
1237
|
+
return text.normalize('NFD').replace(/\p{Mn}/gu, '');
|
|
1211
1238
|
}
|
|
1212
1239
|
|
|
1213
1240
|
|
|
@@ -2325,7 +2352,7 @@ class Precompiled extends Normalizer {
|
|
|
2325
2352
|
// TODO: detect when a different `this.charsmap` is used.
|
|
2326
2353
|
|
|
2327
2354
|
text = text.replace(/[\u0001-\u0008\u000B\u000E-\u001F\u007F\u008F\u009F]/gm, ''); // Remove control characters
|
|
2328
|
-
text = text.replace(/[\u0009\u000A\u000C\u000D\u1680\
|
|
2355
|
+
text = text.replace(/[\u0009\u000A\u000C\u000D\u00A0\u1680\u2000-\u200F\u2028\u2029\u202F\u205F\u2581\u3000\uFEFF\uFFFD]/gm, '\u0020'); // Replace certain characters with a space
|
|
2329
2356
|
|
|
2330
2357
|
if (text.includes('\uFF5E')) {
|
|
2331
2358
|
// To match the sentencepiece implementation 100%, we must handle a very strange edge-case.
|
|
@@ -2550,16 +2577,15 @@ export class PreTrainedTokenizer extends Callable {
|
|
|
2550
2577
|
|
|
2551
2578
|
// Another slight hack to add `end_of_word_suffix` (if present) to the decoder
|
|
2552
2579
|
// This is needed for cases where BPE model and ByteLevel decoder are used
|
|
2553
|
-
// For more information, see https://github.com/
|
|
2580
|
+
// For more information, see https://github.com/huggingface/transformers.js/issues/74
|
|
2554
2581
|
// TODO: save this to the decoder when exporting?
|
|
2555
2582
|
this.decoder.end_of_word_suffix = this.model.end_of_word_suffix;
|
|
2556
2583
|
}
|
|
2557
2584
|
|
|
2558
|
-
|
|
2559
2585
|
this.added_tokens_regex = this.added_tokens.length > 0 ? new RegExp(
|
|
2560
|
-
this.added_tokens
|
|
2586
|
+
this.added_tokens.slice()
|
|
2561
2587
|
// Sort by length (desc) to avoid early partial matches
|
|
2562
|
-
.
|
|
2588
|
+
.sort((a, b) => b.content.length - a.content.length)
|
|
2563
2589
|
.map(x => `${x.lstrip ? '\\s*' : ''}(${escapeRegExp(x.content)})${x.rstrip ? '\\s*' : ''}`)
|
|
2564
2590
|
.join('|')
|
|
2565
2591
|
) : null;
|
|
@@ -3057,6 +3083,67 @@ export class PreTrainedTokenizer extends Callable {
|
|
|
3057
3083
|
|
|
3058
3084
|
return decoded;
|
|
3059
3085
|
}
|
|
3086
|
+
|
|
3087
|
+
/**
|
|
3088
|
+
* Retrieve the chat template string used for tokenizing chat messages. This template is used
|
|
3089
|
+
* internally by the `apply_chat_template` method and can also be used externally to retrieve the model's chat
|
|
3090
|
+
* template for better generation tracking.
|
|
3091
|
+
*
|
|
3092
|
+
* @param {Object} options An optional object containing the following properties:
|
|
3093
|
+
* @param {string} [options.chat_template=null]
|
|
3094
|
+
* A Jinja template or the name of a template to use for this conversion.
|
|
3095
|
+
* It is usually not necessary to pass anything to this argument,
|
|
3096
|
+
* as the model's template will be used by default.
|
|
3097
|
+
* @param {Object[]} [options.tools=null]
|
|
3098
|
+
* A list of tools (callable functions) that will be accessible to the model. If the template does not
|
|
3099
|
+
* support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
|
|
3100
|
+
* giving the name, description and argument types for the tool. See our
|
|
3101
|
+
* [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
|
|
3102
|
+
* for more information.
|
|
3103
|
+
* @returns {string} The chat template string.
|
|
3104
|
+
*/
|
|
3105
|
+
get_chat_template({
|
|
3106
|
+
chat_template = null,
|
|
3107
|
+
tools = null,
|
|
3108
|
+
} = {}) {
|
|
3109
|
+
|
|
3110
|
+
// First, handle the cases when the model has a dict of multiple templates
|
|
3111
|
+
if (this.chat_template && typeof this.chat_template === 'object') {
|
|
3112
|
+
const template_dict = this.chat_template;
|
|
3113
|
+
|
|
3114
|
+
if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
|
|
3115
|
+
// The user can pass the name of a template to the chat template argument instead of an entire template
|
|
3116
|
+
chat_template = template_dict[chat_template];
|
|
3117
|
+
} else if (chat_template === null) {
|
|
3118
|
+
if (tools !== null && 'tool_use' in template_dict) {
|
|
3119
|
+
chat_template = template_dict['tool_use'];
|
|
3120
|
+
} else if ('default' in template_dict) {
|
|
3121
|
+
chat_template = template_dict['default'];
|
|
3122
|
+
} else {
|
|
3123
|
+
throw Error(
|
|
3124
|
+
`This model has multiple chat templates with no default specified! Please either pass a chat ` +
|
|
3125
|
+
`template or the name of the template you wish to use to the 'chat_template' argument. Available ` +
|
|
3126
|
+
`template names are ${Object.keys(template_dict).sort()}.`
|
|
3127
|
+
)
|
|
3128
|
+
}
|
|
3129
|
+
}
|
|
3130
|
+
} else if (chat_template === null) {
|
|
3131
|
+
// These are the cases when the model has a single template
|
|
3132
|
+
// priority: `chat_template` argument > `tokenizer.chat_template`
|
|
3133
|
+
if (this.chat_template) {
|
|
3134
|
+
chat_template = this.chat_template;
|
|
3135
|
+
} else {
|
|
3136
|
+
throw Error(
|
|
3137
|
+
"Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
|
|
3138
|
+
"argument was passed! For information about writing templates and setting the " +
|
|
3139
|
+
"tokenizer.chat_template attribute, please see the documentation at " +
|
|
3140
|
+
"https://huggingface.co/docs/transformers/main/en/chat_templating"
|
|
3141
|
+
)
|
|
3142
|
+
}
|
|
3143
|
+
}
|
|
3144
|
+
return chat_template;
|
|
3145
|
+
}
|
|
3146
|
+
|
|
3060
3147
|
/**
|
|
3061
3148
|
* Converts a list of message objects with `"role"` and `"content"` keys to a list of token
|
|
3062
3149
|
* ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
|
|
@@ -3130,39 +3217,8 @@ export class PreTrainedTokenizer extends Callable {
|
|
|
3130
3217
|
...kwargs
|
|
3131
3218
|
} = {}) {
|
|
3132
3219
|
|
|
3133
|
-
|
|
3134
|
-
if (
|
|
3135
|
-
(this.chat_template && typeof this.chat_template === 'object')
|
|
3136
|
-
|| this.chat_template === null
|
|
3137
|
-
) {
|
|
3138
|
-
const template_dict = this.chat_template;
|
|
3220
|
+
chat_template = this.get_chat_template({ chat_template, tools });
|
|
3139
3221
|
|
|
3140
|
-
if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
|
|
3141
|
-
// The user can pass the name of a template to the chat template argument instead of an entire template
|
|
3142
|
-
chat_template = template_dict[chat_template];
|
|
3143
|
-
} else if (chat_template === null && 'default' in template_dict) {
|
|
3144
|
-
chat_template = template_dict['default'];
|
|
3145
|
-
} else if (chat_template === null) {
|
|
3146
|
-
throw Error(
|
|
3147
|
-
`This model has multiple chat templates with no default specified! Please either pass a chat ` +
|
|
3148
|
-
`template or the name of the template you wish to use to the 'chat_template' argument. Available ` +
|
|
3149
|
-
`template names are ${Object.keys(template_dict).sort()}.`
|
|
3150
|
-
)
|
|
3151
|
-
}
|
|
3152
|
-
} else {
|
|
3153
|
-
// These are the cases when the model has a single template
|
|
3154
|
-
// priority: `chat_template` argument > `tokenizer.chat_template`
|
|
3155
|
-
if (this.chat_template) {
|
|
3156
|
-
chat_template = this.chat_template;
|
|
3157
|
-
} else {
|
|
3158
|
-
throw Error(
|
|
3159
|
-
"Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
|
|
3160
|
-
"argument was passed! For information about writing templates and setting the " +
|
|
3161
|
-
"tokenizer.chat_template attribute, please see the documentation at " +
|
|
3162
|
-
"https://huggingface.co/docs/transformers/main/en/chat_templating"
|
|
3163
|
-
)
|
|
3164
|
-
}
|
|
3165
|
-
}
|
|
3166
3222
|
if (typeof chat_template !== 'string') {
|
|
3167
3223
|
throw Error(`chat_template must be a string, but got ${typeof chat_template}`);
|
|
3168
3224
|
}
|
|
@@ -3283,19 +3339,7 @@ export class MBart50Tokenizer extends MBartTokenizer { } // NOTE: extends MBartT
|
|
|
3283
3339
|
|
|
3284
3340
|
export class RobertaTokenizer extends PreTrainedTokenizer { }
|
|
3285
3341
|
|
|
3286
|
-
export class BloomTokenizer extends PreTrainedTokenizer {
|
|
3287
|
-
|
|
3288
|
-
constructor(tokenizerJSON, tokenizerConfig) {
|
|
3289
|
-
// Override the default (invalid) regex of the pretokenizer.
|
|
3290
|
-
// For more information, see https://github.com/xenova/transformers.js/issues/94
|
|
3291
|
-
const splitChars = '.,!?\u2026\u3002\uff0c\u3001\u0964\u06d4\u060c';
|
|
3292
|
-
const patternObject = tokenizerJSON.pre_tokenizer?.pretokenizers[0]?.pattern;
|
|
3293
|
-
if (patternObject && patternObject.Regex === ` ?[^(\\s|[${splitChars}])]+`) {
|
|
3294
|
-
patternObject.Regex = ` ?[^\\s${splitChars}]+`;
|
|
3295
|
-
}
|
|
3296
|
-
super(tokenizerJSON, tokenizerConfig);
|
|
3297
|
-
}
|
|
3298
|
-
}
|
|
3342
|
+
export class BloomTokenizer extends PreTrainedTokenizer { }
|
|
3299
3343
|
|
|
3300
3344
|
const SPIECE_UNDERLINE = "▁";
|
|
3301
3345
|
|
|
@@ -4132,85 +4176,6 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
|
|
|
4132
4176
|
newIndices.filter(x => x.length > 0),
|
|
4133
4177
|
]
|
|
4134
4178
|
}
|
|
4135
|
-
|
|
4136
|
-
/**
|
|
4137
|
-
* Helper function to build translation inputs for a `WhisperTokenizer`,
|
|
4138
|
-
* depending on the language, task, and whether to predict timestamp tokens.
|
|
4139
|
-
*
|
|
4140
|
-
* Used to override the prefix tokens appended to the start of the label sequence.
|
|
4141
|
-
*
|
|
4142
|
-
* **Example: Get ids for a language**
|
|
4143
|
-
* ```javascript
|
|
4144
|
-
* // instantiate the tokenizer and set the prefix token to Spanish
|
|
4145
|
-
* const tokenizer = await WhisperTokenizer.from_pretrained('Xenova/whisper-tiny');
|
|
4146
|
-
* const forced_decoder_ids = tokenizer.get_decoder_prompt_ids({ language: 'spanish' });
|
|
4147
|
-
* // [(1, 50262), (2, 50363)]
|
|
4148
|
-
* ```
|
|
4149
|
-
*
|
|
4150
|
-
* @param {Object} options Options to generate the decoder prompt.
|
|
4151
|
-
* @param {string} [options.language] The language of the transcription text.
|
|
4152
|
-
* The corresponding language id token is appended to the start of the sequence for multilingual
|
|
4153
|
-
* speech recognition and speech translation tasks, e.g. for "Spanish" the token "<|es|>" is appended
|
|
4154
|
-
* to the start of sequence.
|
|
4155
|
-
* @param {string} [options.task] Task identifier to append at the start of sequence (if any).
|
|
4156
|
-
* This should be used for mulitlingual fine-tuning, with "transcribe" for speech recognition and
|
|
4157
|
-
* "translate" for speech translation.
|
|
4158
|
-
* @param {boolean} [options.no_timestamps] Whether to add the <|notimestamps|> token at the start of the sequence.
|
|
4159
|
-
* @returns {number[][]} The decoder prompt ids.
|
|
4160
|
-
*/
|
|
4161
|
-
get_decoder_prompt_ids({
|
|
4162
|
-
language = null,
|
|
4163
|
-
task = null,
|
|
4164
|
-
no_timestamps = true,
|
|
4165
|
-
} = {}) {
|
|
4166
|
-
|
|
4167
|
-
// <|lang_id|> <|task|> <|notimestamps|>
|
|
4168
|
-
|
|
4169
|
-
const forced_decoder_ids = [];
|
|
4170
|
-
|
|
4171
|
-
if (language) {
|
|
4172
|
-
// User wishes to specify the language
|
|
4173
|
-
const language_code = whisper_language_to_code(language);
|
|
4174
|
-
const language_token_id = this.model.tokens_to_ids.get(`<|${language_code}|>`);
|
|
4175
|
-
if (language_token_id === undefined) {
|
|
4176
|
-
throw new Error(`Unable to find language "${language_code}" in model vocabulary. Please report this issue at ${GITHUB_ISSUE_URL}.`)
|
|
4177
|
-
}
|
|
4178
|
-
|
|
4179
|
-
forced_decoder_ids.push(language_token_id);
|
|
4180
|
-
} else {
|
|
4181
|
-
// No token will be forced, which leaves the model to predict the language
|
|
4182
|
-
forced_decoder_ids.push(null);
|
|
4183
|
-
}
|
|
4184
|
-
|
|
4185
|
-
if (task) {
|
|
4186
|
-
task = task.toLowerCase();
|
|
4187
|
-
if (task !== 'transcribe' && task !== 'translate') {
|
|
4188
|
-
throw new Error(`Task "${task}" is not supported. Must be one of: ["transcribe", "translate"]`);
|
|
4189
|
-
}
|
|
4190
|
-
|
|
4191
|
-
const task_token_id = this.model.tokens_to_ids.get(`<|${task}|>`);
|
|
4192
|
-
if (task_token_id === undefined) {
|
|
4193
|
-
throw new Error(`Unable to find task "${task}" in model vocabulary. Please report this issue at ${GITHUB_ISSUE_URL}.`)
|
|
4194
|
-
}
|
|
4195
|
-
|
|
4196
|
-
forced_decoder_ids.push(task_token_id);
|
|
4197
|
-
} else {
|
|
4198
|
-
// No token will be forced, which leaves the model to predict the task
|
|
4199
|
-
forced_decoder_ids.push(null);
|
|
4200
|
-
}
|
|
4201
|
-
|
|
4202
|
-
if (no_timestamps) {
|
|
4203
|
-
const no_timestamps_id = this.model.tokens_to_ids.get(`<|notimestamps|>`);
|
|
4204
|
-
if (no_timestamps_id === undefined) {
|
|
4205
|
-
throw new Error(`Unable to find "<|notimestamps|>" in model vocabulary. Please report this issue at ${GITHUB_ISSUE_URL}.`);
|
|
4206
|
-
}
|
|
4207
|
-
|
|
4208
|
-
forced_decoder_ids.push(no_timestamps_id);
|
|
4209
|
-
}
|
|
4210
|
-
|
|
4211
|
-
return forced_decoder_ids.map((x, i) => [i + 1, x]).filter(x => x[1] !== null);
|
|
4212
|
-
|
|
4213
|
-
}
|
|
4214
4179
|
}
|
|
4215
4180
|
export class CodeGenTokenizer extends PreTrainedTokenizer { }
|
|
4216
4181
|
export class CLIPTokenizer extends PreTrainedTokenizer { }
|
package/src/utils/constants.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
|
|
2
|
-
export const GITHUB_ISSUE_URL = 'https://github.com/
|
|
2
|
+
export const GITHUB_ISSUE_URL = 'https://github.com/huggingface/transformers.js/issues/new/choose';
|
package/src/utils/core.js
CHANGED
|
@@ -147,3 +147,15 @@ export function pick(o, props) {
|
|
|
147
147
|
})
|
|
148
148
|
);
|
|
149
149
|
}
|
|
150
|
+
|
|
151
|
+
/**
|
|
152
|
+
* Calculate the length of a string in Unicode code points (rather than UTF-16 code units), so that characters outside the Basic Multilingual Plane count once.
|
|
153
|
+
* This mimics the behavior of Python's `len` function.
|
|
154
|
+
* @param {string} s The string to calculate the length of.
|
|
155
|
+
* @returns {number} The length of the string.
|
|
156
|
+
*/
|
|
157
|
+
export function len(s) {
|
|
158
|
+
let length = 0;
|
|
159
|
+
for (const c of s) ++length;
|
|
160
|
+
return length;
|
|
161
|
+
}
|
|
@@ -229,7 +229,7 @@ export class CharTrie {
|
|
|
229
229
|
* @param {string[]} texts The strings to add to the trie.
|
|
230
230
|
*/
|
|
231
231
|
extend(texts) {
|
|
232
|
-
for (
|
|
232
|
+
for (const text of texts) {
|
|
233
233
|
this.push(text);
|
|
234
234
|
}
|
|
235
235
|
}
|
|
@@ -240,7 +240,7 @@ export class CharTrie {
|
|
|
240
240
|
*/
|
|
241
241
|
push(text) {
|
|
242
242
|
let node = this.root;
|
|
243
|
-
for (
|
|
243
|
+
for (const ch of text) {
|
|
244
244
|
let child = node.children.get(ch);
|
|
245
245
|
if (child === undefined) {
|
|
246
246
|
child = CharTrieNode.default();
|
|
@@ -258,12 +258,14 @@ export class CharTrie {
|
|
|
258
258
|
*/
|
|
259
259
|
*commonPrefixSearch(text) {
|
|
260
260
|
let node = this.root;
|
|
261
|
+
if (node === undefined) return;
|
|
262
|
+
|
|
261
263
|
let prefix = "";
|
|
262
|
-
for (
|
|
263
|
-
const ch = text[i];
|
|
264
|
+
for (const ch of text) {
|
|
264
265
|
prefix += ch;
|
|
265
266
|
node = node.children.get(ch);
|
|
266
|
-
if (node
|
|
267
|
+
if (node === undefined) return;
|
|
268
|
+
if (node.isLeaf) {
|
|
267
269
|
yield prefix;
|
|
268
270
|
}
|
|
269
271
|
}
|
|
@@ -305,8 +307,8 @@ export class TokenLattice {
|
|
|
305
307
|
* @param {number} eosTokenId The end-of-sequence token ID.
|
|
306
308
|
*/
|
|
307
309
|
constructor(sentence, bosTokenId, eosTokenId) {
|
|
308
|
-
this.
|
|
309
|
-
this.len =
|
|
310
|
+
this.chars = Array.from(sentence);
|
|
311
|
+
this.len = this.chars.length;
|
|
310
312
|
this.bosTokenId = bosTokenId;
|
|
311
313
|
this.eosTokenId = eosTokenId;
|
|
312
314
|
this.nodes = [];
|
|
@@ -340,7 +342,7 @@ export class TokenLattice {
|
|
|
340
342
|
/**
|
|
341
343
|
* Implements the Viterbi algorithm to compute the most likely sequence of tokens.
|
|
342
344
|
*
|
|
343
|
-
* @returns {TokenLatticeNode[]} The
|
|
345
|
+
* @returns {TokenLatticeNode[]} The most likely sequence of tokens.
|
|
344
346
|
*/
|
|
345
347
|
viterbi() {
|
|
346
348
|
const len = this.len;
|
|
@@ -394,11 +396,11 @@ export class TokenLattice {
|
|
|
394
396
|
* @returns {string} The piece of the sentence covered by the given node.
|
|
395
397
|
*/
|
|
396
398
|
piece(node) {
|
|
397
|
-
return this.
|
|
399
|
+
return this.chars.slice(node.pos, node.pos + node.length).join('');
|
|
398
400
|
}
|
|
399
401
|
|
|
400
402
|
/**
|
|
401
|
-
* @returns {
|
|
403
|
+
* @returns {string[]} The most likely sequence of tokens.
|
|
402
404
|
*/
|
|
403
405
|
tokens() {
|
|
404
406
|
const nodes = this.viterbi();
|
|
@@ -406,7 +408,7 @@ export class TokenLattice {
|
|
|
406
408
|
}
|
|
407
409
|
|
|
408
410
|
/**
|
|
409
|
-
* @returns {
|
|
411
|
+
* @returns {number[]} The most likely sequence of token ids.
|
|
410
412
|
*/
|
|
411
413
|
tokenIds() {
|
|
412
414
|
const nodes = this.viterbi();
|
package/src/utils/hub.js
CHANGED
|
@@ -14,7 +14,7 @@ import { dispatchCallback } from './core.js';
|
|
|
14
14
|
/**
|
|
15
15
|
* @typedef {Object} PretrainedOptions Options for loading a pretrained model.
|
|
16
16
|
* @property {function} [progress_callback=null] If specified, this function will be called during model construction, to provide the user with progress updates.
|
|
17
|
-
* @property {
|
|
17
|
+
* @property {import('../configs.js').PretrainedConfig} [config=null] Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
|
18
18
|
* - The model is a model provided by the library (loaded with the *model id* string of a pretrained model).
|
|
19
19
|
* - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a configuration JSON file named *config.json* is found in the directory.
|
|
20
20
|
* @property {string} [cache_dir=null] Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used.
|
package/src/utils/maths.js
CHANGED
|
@@ -158,11 +158,20 @@ export function softmax(arr) {
|
|
|
158
158
|
* @returns {T} The resulting log_softmax array.
|
|
159
159
|
*/
|
|
160
160
|
export function log_softmax(arr) {
|
|
161
|
-
// Compute the
|
|
162
|
-
const
|
|
161
|
+
// Compute the maximum value in the array
|
|
162
|
+
const maxVal = max(arr)[0];
|
|
163
|
+
|
|
164
|
+
// Compute the sum of the exponentials
|
|
165
|
+
let sumExps = 0;
|
|
166
|
+
for(let i = 0; i < arr.length; ++i) {
|
|
167
|
+
sumExps += Math.exp(arr[i] - maxVal);
|
|
168
|
+
}
|
|
163
169
|
|
|
164
|
-
//
|
|
165
|
-
const
|
|
170
|
+
// Compute the log of the sum
|
|
171
|
+
const logSum = Math.log(sumExps);
|
|
172
|
+
|
|
173
|
+
// Compute the log-softmax values
|
|
174
|
+
const logSoftmaxArr = arr.map(x => x - maxVal - logSum);
|
|
166
175
|
|
|
167
176
|
return /** @type {T} */(logSoftmaxArr);
|
|
168
177
|
}
|
|
@@ -217,7 +226,7 @@ export function magnitude(arr) {
|
|
|
217
226
|
/**
|
|
218
227
|
* Returns the value and index of the minimum element in an array.
|
|
219
228
|
* @param {number[]|TypedArray} arr array of numbers.
|
|
220
|
-
* @returns {number
|
|
229
|
+
* @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
|
|
221
230
|
* @throws {Error} If array is empty.
|
|
222
231
|
*/
|
|
223
232
|
export function min(arr) {
|