npm - @huggingface/transformers - Versions diffs - 3.0.0-alpha.9 → 3.0.1 - Mend

@huggingface/transformers 3.0.0-alpha.9 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

package/README.md +82 -50
package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
package/dist/transformers.cjs +2550 -2552
package/dist/transformers.cjs.map +1 -1
package/dist/transformers.js +3639 -3567
package/dist/transformers.js.map +1 -1
package/dist/transformers.min.cjs +25 -25
package/dist/transformers.min.cjs.map +1 -1
package/dist/transformers.min.js +41 -42
package/dist/transformers.min.js.map +1 -1
package/dist/transformers.min.mjs +56 -57
package/dist/transformers.min.mjs.map +1 -1
package/dist/transformers.mjs +2586 -2564
package/dist/transformers.mjs.map +1 -1
package/package.json +14 -13
package/src/backends/onnx.js +24 -19
package/src/configs.js +19 -4
package/src/env.js +5 -9
package/src/generation/logits_process.js +40 -37
package/src/models.js +356 -539
package/src/ops/registry.js +14 -3
package/src/pipelines.js +5 -5
package/src/processors.js +392 -351
package/src/tokenizers.js +140 -175
package/src/utils/constants.js +1 -1
package/src/utils/core.js +12 -0
package/src/utils/data-structures.js +13 -11
package/src/utils/hub.js +1 -1
package/src/utils/maths.js +14 -5
package/src/utils/tensor.js +60 -13
package/types/backends/onnx.d.ts +5 -2
package/types/backends/onnx.d.ts.map +1 -1
package/types/configs.d.ts +29 -3
package/types/configs.d.ts.map +1 -1
package/types/env.d.ts +4 -2
package/types/env.d.ts.map +1 -1
package/types/generation/logits_process.d.ts.map +1 -1
package/types/models.d.ts +116 -289
package/types/models.d.ts.map +1 -1
package/types/ops/registry.d.ts +6 -6
package/types/ops/registry.d.ts.map +1 -1
package/types/pipelines.d.ts +1 -2
package/types/pipelines.d.ts.map +1 -1
package/types/processors.d.ts +58 -51
package/types/processors.d.ts.map +1 -1
package/types/tokenizers.d.ts +23 -32
package/types/tokenizers.d.ts.map +1 -1
package/types/utils/constants.d.ts +1 -1
package/types/utils/constants.d.ts.map +1 -1
package/types/utils/core.d.ts +7 -0
package/types/utils/core.d.ts.map +1 -1
package/types/utils/data-structures.d.ts +6 -6
package/types/utils/data-structures.d.ts.map +1 -1
package/types/utils/hub.d.ts +1 -1
package/types/utils/hub.d.ts.map +1 -1
package/types/utils/maths.d.ts +2 -2
package/types/utils/maths.d.ts.map +1 -1
package/types/utils/tensor.d.ts +27 -1
package/types/utils/tensor.d.ts.map +1 -1

package/src/tokenizers.js CHANGED Viewed

@@ -28,6 +28,7 @@ import {
     escapeRegExp,
     isIntegralNumber,
     mergeArrays,
+    len,
 } from './utils/core.js';
 import {
@@ -195,7 +196,7 @@ function clean_up_tokenization(text) {
  * @returns {string} The text with accents removed.
  */
 function remove_accents(text) {
-    return text.replace(/[\u0300-\u036f]/g, '');
+    return text.replace(/\p{M}/gu, '');
 }
 /**
@@ -236,23 +237,26 @@ export function is_chinese_char(cp) {
 }
 /**
- * Helper function to fuse consecutive values in an array equal to the specified value.
- * @param {string[]} arr The input array
- * @param {any} value The value to fuse on.
- * @param {Map<string, any>} mapping The mapping from input domain to value.
+ * Helper function to fuse consecutive unknown tokens.
+ * @param {string[]} arr The list of input tokens
+ * @param {Map<string, any>} tokens_to_ids The mapping from tokens to token ids.
+ * @param {number} unk_token_id The value to fuse on.
+ * @private
  */
-function fuse(arr, value, mapping) {
+function fuse_unk(arr, tokens_to_ids, unk_token_id) {
     const fused = [];
     let i = 0;
     while (i < arr.length) {
         fused.push(arr[i])
-        if ((mapping.get(arr[i]) ?? value) !== value) {
+        if ((tokens_to_ids.get(arr[i]) ?? unk_token_id) !== unk_token_id) {
             ++i;
             continue;
         }
-        while (i < arr.length && (mapping.get(arr[i]) ?? value) === value) {
-            ++i;
+        while (++i < arr.length && (tokens_to_ids.get(arr[i]) ?? unk_token_id) === unk_token_id) {
+            if (tokens_to_ids.get(fused.at(-1)) !== unk_token_id) {
+                fused[fused.length - 1] += arr[i];
+            }
         }
     }
@@ -270,12 +274,17 @@ function whitespace_split(text) {
 const PUNCTUATION_REGEX = '\\p{P}\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E';
 const PUNCTUATION_ONLY_REGEX = new RegExp(`^[${PUNCTUATION_REGEX}]+$`, 'gu');
+const BLOOM_SPLIT_CHARS = '.,!?\u2026\u3002\uff0c\u3001\u0964\u06d4\u060c';
-// A mapping of regex patterns to their equivalent (but longer) JS-compatible versions.
+// A mapping of regex patterns to their equivalent (but possibly longer) JS-compatible versions.
 const PROBLEMATIC_REGEX_MAP = new Map([
     // This uses the case insensitive group modifier, which is not supported in JavaScript.
     // When parsing the regex, an "Invalid group" error is thrown.
     ["(?i:'s|'t|'re|'ve|'m|'ll|'d)", "(?:'([sS]|[tT]|[rR][eE]|[vV][eE]|[mM]|[lL][lL]|[dD]))"],
+    // Used to override the default (invalid) regex of the bloom pretokenizer.
+    // For more information, see https://github.com/huggingface/transformers.js/issues/94
+    [` ?[^(\\s|[${BLOOM_SPLIT_CHARS}])]+`, ` ?[^\\s${BLOOM_SPLIT_CHARS}]+`],
 ])
@@ -353,14 +362,21 @@ export class TokenizerModel extends Callable {
             case 'Unigram':
                 // @ts-ignore
                 return new Unigram(config, ...args);
             case 'BPE':
                 return new BPE(config);
             default:
+                // Some tokenizers, like for google-t5/t5-small, do not have a `type` field.
+                // In this case, we can infer the tokenizer type based on the structure of the `vocab` field.
                 if (config.vocab) {
-                    // @ts-ignore
-                    return new LegacyTokenizerModel(config, ...args);
+                    if (Array.isArray(config.vocab)) {
+                        // config.vocab is of type `[string, number][]`
+                        // @ts-ignore
+                        return new Unigram(config, ...args);
+                    } else {
+                        // @ts-ignore
+                        return new LegacyTokenizerModel(config, ...args);
+                    }
                 }
                 throw new Error(`Unknown TokenizerModel type: ${config.type}`);
         }
@@ -369,15 +385,15 @@ export class TokenizerModel extends Callable {
     /**
      * Internal function to call the TokenizerModel instance.
      * @param {string[]} tokens The tokens to encode.
-     * @returns {string[]} The encoded token IDs.
+     * @returns {string[]} The encoded tokens.
      */
     _call(tokens) {
-        let ids = this.encode(tokens);
+        tokens = this.encode(tokens);
         if (this.fuse_unk) {
             // Fuse unknown tokens
-            ids = fuse(ids, this.unk_token_id, this.tokens_to_ids);
+            tokens = fuse_unk(tokens, this.tokens_to_ids, this.unk_token_id);
         }
-        return ids;
+        return tokens;
     }
     /**
@@ -538,18 +554,18 @@ class Unigram extends TokenizerModel {
         this.unk_token = this.vocab[config.unk_id];
         this.tokens_to_ids = new Map(this.vocab.map((x, i) => [x, i]));
-        this.bosToken = ' '; // beginning of a sentence token
+        this.bos_token = ' '; // beginning of a sentence token
-        this.bosTokenId = this.tokens_to_ids.get(this.bosToken); // NOTE: may be undefined
-        this.eosToken = moreConfig.eos_token;
+        this.bos_token_id = this.tokens_to_ids.get(this.bos_token); // NOTE: may be undefined
+        this.eos_token = moreConfig.eos_token;
-        this.eosTokenId = this.tokens_to_ids.get(this.eosToken);
-        this.unkToken = this.vocab[this.unk_token_id];
+        this.eos_token_id = this.tokens_to_ids.get(this.eos_token);
+        this.unk_token = this.vocab[this.unk_token_id];
         this.minScore = min(this.scores)[0];
-        this.unkScore = this.minScore - 10.0;
-        this.scores[this.unk_token_id] = this.unkScore;
+        this.unk_score = this.minScore - 10.0;
+        this.scores[this.unk_token_id] = this.unk_score;
         this.trie = new CharTrie();
         this.trie.extend(this.vocab);
@@ -564,26 +580,27 @@ class Unigram extends TokenizerModel {
      * @param {TokenLattice} lattice The token lattice to populate with nodes.
      */
     populateNodes(lattice) {
-        const sentence = lattice.sentence;
-        const len = sentence.length;
+        const chars = lattice.chars;
+        const mblen = 1;
         let beginPos = 0;
-        while (beginPos < len) {
-            const mblen = 1;
+        while (beginPos < chars.length) {
             let hasSingleNode = false;
-            const tokens = [];
-            for (let token of this.trie.commonPrefixSearch(sentence.slice(beginPos))) {
+            const tokens = [];
+            const sliced = chars.slice(beginPos).join('');
+            const prefixedTokens = this.trie.commonPrefixSearch(sliced);
+            for (const token of prefixedTokens) {
                 tokens.push(token);
                 const tokenId = this.tokens_to_ids.get(token);
                 const tokenScore = this.scores[tokenId];
-                const n = token.length;
+                const n = len(token);
                 lattice.insert(beginPos, n, tokenScore, tokenId);
                 if (!hasSingleNode && n === mblen) {
                     hasSingleNode = true;
                 }
             }
             if (!hasSingleNode) {
-                lattice.insert(beginPos, mblen, this.unkScore, this.unk_token_id);
+                lattice.insert(beginPos, mblen, this.unk_score, this.unk_token_id);
             }
             beginPos += mblen;
         }
@@ -596,7 +613,7 @@ class Unigram extends TokenizerModel {
      * @returns {string[]} An array of subtokens obtained by encoding the input tokens using the unigram model.
      */
     tokenize(normalized) {
-        const lattice = new TokenLattice(normalized, this.bosTokenId, this.eosTokenId);
+        const lattice = new TokenLattice(normalized, this.bos_token_id, this.eos_token_id);
         this.populateNodes(lattice);
         return lattice.tokens();
     }
@@ -666,7 +683,7 @@ class BPE extends TokenizerModel {
      * Create a BPE instance.
      * @param {Object} config The configuration object for BPE.
      * @param {Object} config.vocab A mapping of tokens to ids.
-     * @param {string[]} config.merges An array of BPE merges as strings.
+     * @param {string[]|[string, string][]} config.merges An array of BPE merges as strings.
      * @param {string} config.unk_token The unknown token used for out of vocabulary words.
      * @param {string} config.end_of_word_suffix The suffix to place at the end of each word.
      * @param {string} [config.continuing_subword_suffix] The suffix to insert between words.
@@ -676,8 +693,6 @@ class BPE extends TokenizerModel {
     constructor(config) {
         super(config);
-        this.BPE_SPLIT_TOKEN = ' ';
         /** @type {Map<string, number>} */
         this.tokens_to_ids = objectToMap(config.vocab);
@@ -689,8 +704,15 @@ class BPE extends TokenizerModel {
             this.vocab[value] = key;
         }
-        this.bpe_ranks = new Map(config.merges.map((x, i) => [x, i]));
-        this.merges = config.merges.map(x => x.split(this.BPE_SPLIT_TOKEN));
+        // Tokenizers >= 0.20.0 serializes BPE merges as a [string, string][] instead of a string[],
+        // which resolves the ambiguity for merges containing spaces.
+        const use_new_merge_format = Array.isArray(config.merges[0]);
+        /** @type {[string, string][]} */
+        this.merges = use_new_merge_format
+            ? /** @type {[string, string][]} */(config.merges)
+            : (/** @type {string[]} */(config.merges)).map(x => /** @type {[string, string]} */(x.split(' ', 2)));
+        this.bpe_ranks = new Map(this.merges.map((x, i) => [JSON.stringify(x), i]));
         this.end_of_word_suffix = config.end_of_word_suffix;
@@ -850,7 +872,7 @@ class BPE extends TokenizerModel {
         // `score` is a measure of the merge priority: lower means higher priority
         // We use the BPE rank as a measure of priority (i.e., the local of the merge in the merges list)
         // We also add a fractional component to the score to break ties (with the earlier character having higher priority)
-        const rank = this.bpe_ranks.get(node.token + this.BPE_SPLIT_TOKEN + node.next.token);
+        const rank = this.bpe_ranks.get(JSON.stringify([node.token, node.next.token]));
         if (rank !== undefined) {
             node.score = rank + node.bias;
             queue.push(node);
@@ -875,15 +897,19 @@ class BPE extends TokenizerModel {
             for (const t of bpe_token_list) {
                 if (this.tokens_to_ids.has(t)) {
                     outputTokens.push(t);
-                } else {
-                    if (this.byte_fallback) {
-                        outputTokens.push(
-                            ...Array.from(this.text_encoder.encode(t))
-                                .map(x => `<0x${x.toString(16).toUpperCase().padStart(2, '0')}>`)
-                        );
+                } else if (this.byte_fallback) {
+                    const byteTokens = Array.from(this.text_encoder.encode(t))
+                        .map(x => `<0x${x.toString(16).toUpperCase().padStart(2, '0')}>`);
+                    if (byteTokens.every(x => this.tokens_to_ids.has(x))) {
+                        // Ensure the byte tokens are actually in the vocabulary, otherwise
+                        // we fall back to the unknown token. For more information, see
+                        // https://github.com/huggingface/transformers/issues/28096.
+                        outputTokens.push(...byteTokens);
                     } else {
                         outputTokens.push(this.unk_token);
                     }
+                } else {
+                    outputTokens.push(this.unk_token);
                 }
             }
         }
@@ -1207,7 +1233,8 @@ class BertNormalizer extends Normalizer {
      * @returns {string} The text with accents removed.
      */
     stripAccents(text) {
-        return text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
+        // "Mark, Nonspacing" (Mn)
+        return text.normalize('NFD').replace(/\p{Mn}/gu, '');
     }
@@ -2325,7 +2352,7 @@ class Precompiled extends Normalizer {
         // TODO: detect when a different `this.charsmap` is used.
         text = text.replace(/[\u0001-\u0008\u000B\u000E-\u001F\u007F\u008F\u009F]/gm, ''); // Remove control characters
-        text = text.replace(/[\u0009\u000A\u000C\u000D\u1680\u200B\u200C\u200E\u200F\u2028\u2029\u2581\uFEFF\uFFFD]/gm, '\u0020'); // Replace certain characters with a space
+        text = text.replace(/[\u0009\u000A\u000C\u000D\u00A0\u1680\u2000-\u200F\u2028\u2029\u202F\u205F\u2581\u3000\uFEFF\uFFFD]/gm, '\u0020'); // Replace certain characters with a space
         if (text.includes('\uFF5E')) {
             // To match the sentencepiece implementation 100%, we must handle a very strange edge-case.
@@ -2550,16 +2577,15 @@ export class PreTrainedTokenizer extends Callable {
             // Another slight hack to add `end_of_word_suffix` (if present) to the decoder
             // This is needed for cases where BPE model and ByteLevel decoder are used
-            // For more information, see https://github.com/xenova/transformers.js/issues/74
+            // For more information, see https://github.com/huggingface/transformers.js/issues/74
             // TODO: save this to the decoder when exporting?
             this.decoder.end_of_word_suffix = this.model.end_of_word_suffix;
         }
         this.added_tokens_regex = this.added_tokens.length > 0 ? new RegExp(
-            this.added_tokens
+            this.added_tokens.slice()
                 // Sort by length (desc) to avoid early partial matches
-                .toSorted((a, b) => b.content.length - a.content.length)
+                .sort((a, b) => b.content.length - a.content.length)
                 .map(x => `${x.lstrip ? '\\s*' : ''}(${escapeRegExp(x.content)})${x.rstrip ? '\\s*' : ''}`)
                 .join('|')
         ) : null;
@@ -3057,6 +3083,67 @@ export class PreTrainedTokenizer extends Callable {
         return decoded;
     }
+    /**
+     * Retrieve the chat template string used for tokenizing chat messages. This template is used
+     * internally by the `apply_chat_template` method and can also be used externally to retrieve the model's chat
+     * template for better generation tracking.
+     *
+     * @param {Object} options An optional object containing the following properties:
+     * @param {string} [options.chat_template=null]
+     * A Jinja template or the name of a template to use for this conversion.
+     * It is usually not necessary to pass anything to this argument,
+     * as the model's template will be used by default.
+     * @param {Object[]} [options.tools=null]
+     * A list of tools (callable functions) that will be accessible to the model. If the template does not
+     * support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
+     * giving the name, description and argument types for the tool. See our
+     * [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
+     * for more information.
+     * @returns {string} The chat template string.
+     */
+    get_chat_template({
+        chat_template = null,
+        tools = null,
+    } = {}) {
+        // First, handle the cases when the model has a dict of multiple templates
+        if (this.chat_template && typeof this.chat_template === 'object') {
+            const template_dict = this.chat_template;
+            if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
+                // The user can pass the name of a template to the chat template argument instead of an entire template
+                chat_template = template_dict[chat_template];
+            } else if (chat_template === null) {
+                if (tools !== null && 'tool_use' in template_dict) {
+                    chat_template = template_dict['tool_use'];
+                } else if ('default' in template_dict) {
+                    chat_template = template_dict['default'];
+                } else {
+                    throw Error(
+                        `This model has multiple chat templates with no default specified! Please either pass a chat ` +
+                        `template or the name of the template you wish to use to the 'chat_template' argument. Available ` +
+                        `template names are ${Object.keys(template_dict).sort()}.`
+                    )
+                }
+            }
+        } else if (chat_template === null) {
+            // These are the cases when the model has a single template
+            // priority: `chat_template` argument > `tokenizer.chat_template`
+            if (this.chat_template) {
+                chat_template = this.chat_template;
+            } else {
+                throw Error(
+                    "Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
+                    "argument was passed! For information about writing templates and setting the " +
+                    "tokenizer.chat_template attribute, please see the documentation at " +
+                    "https://huggingface.co/docs/transformers/main/en/chat_templating"
+                )
+            }
+        }
+        return chat_template;
+    }
     /**
      * Converts a list of message objects with `"role"` and `"content"` keys to a list of token
      * ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
@@ -3130,39 +3217,8 @@ export class PreTrainedTokenizer extends Callable {
         ...kwargs
     } = {}) {
-        // First, handle the cases when the model has a dict of multiple templates
-        if (
-            (this.chat_template && typeof this.chat_template === 'object')
-            || this.chat_template === null
-        ) {
-            const template_dict = this.chat_template;
+        chat_template = this.get_chat_template({ chat_template, tools });
-            if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
-                // The user can pass the name of a template to the chat template argument instead of an entire template
-                chat_template = template_dict[chat_template];
-            } else if (chat_template === null && 'default' in template_dict) {
-                chat_template = template_dict['default'];
-            } else if (chat_template === null) {
-                throw Error(
-                    `This model has multiple chat templates with no default specified! Please either pass a chat ` +
-                    `template or the name of the template you wish to use to the 'chat_template' argument. Available ` +
-                    `template names are ${Object.keys(template_dict).sort()}.`
-                )
-            }
-        } else {
-            // These are the cases when the model has a single template
-            // priority: `chat_template` argument > `tokenizer.chat_template`
-            if (this.chat_template) {
-                chat_template = this.chat_template;
-            } else {
-                throw Error(
-                    "Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
-                    "argument was passed! For information about writing templates and setting the " +
-                    "tokenizer.chat_template attribute, please see the documentation at " +
-                    "https://huggingface.co/docs/transformers/main/en/chat_templating"
-                )
-            }
-        }
         if (typeof chat_template !== 'string') {
             throw Error(`chat_template must be a string, but got ${typeof chat_template}`);
         }
@@ -3283,19 +3339,7 @@ export class MBart50Tokenizer extends MBartTokenizer { } // NOTE: extends MBartT
 export class RobertaTokenizer extends PreTrainedTokenizer { }
-export class BloomTokenizer extends PreTrainedTokenizer {
-    constructor(tokenizerJSON, tokenizerConfig) {
-        // Override the default (invalid) regex of the pretokenizer.
-        // For more information, see https://github.com/xenova/transformers.js/issues/94
-        const splitChars = '.,!?\u2026\u3002\uff0c\u3001\u0964\u06d4\u060c';
-        const patternObject = tokenizerJSON.pre_tokenizer?.pretokenizers[0]?.pattern;
-        if (patternObject && patternObject.Regex === ` ?[^(\\s|[${splitChars}])]+`) {
-            patternObject.Regex = ` ?[^\\s${splitChars}]+`;
-        }
-        super(tokenizerJSON, tokenizerConfig);
-    }
-}
+export class BloomTokenizer extends PreTrainedTokenizer { }
 const SPIECE_UNDERLINE = "▁";
@@ -4132,85 +4176,6 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
             newIndices.filter(x => x.length > 0),
         ]
     }
-    /**
-     * Helper function to build translation inputs for a `WhisperTokenizer`,
-     * depending on the language, task, and whether to predict timestamp tokens.
-     *
-     * Used to override the prefix tokens appended to the start of the label sequence.
-     *
-     * **Example: Get ids for a language**
-     * ```javascript
-     * // instantiate the tokenizer and set the prefix token to Spanish
-     * const tokenizer = await WhisperTokenizer.from_pretrained('Xenova/whisper-tiny');
-     * const forced_decoder_ids = tokenizer.get_decoder_prompt_ids({ language: 'spanish' });
-     * // [(1, 50262), (2, 50363)]
-     * ```
-     *
-     * @param {Object} options Options to generate the decoder prompt.
-     * @param {string} [options.language] The language of the transcription text.
-     * The corresponding language id token is appended to the start of the sequence for multilingual
-     * speech recognition and speech translation tasks, e.g. for "Spanish" the token "<|es|>" is appended
-     * to the start of sequence.
-     * @param {string} [options.task] Task identifier to append at the start of sequence (if any).
-     * This should be used for mulitlingual fine-tuning, with "transcribe" for speech recognition and
-     * "translate" for speech translation.
-     * @param {boolean} [options.no_timestamps] Whether to add the <|notimestamps|> token at the start of the sequence.
-     * @returns {number[][]} The decoder prompt ids.
-     */
-    get_decoder_prompt_ids({
-        language = null,
-        task = null,
-        no_timestamps = true,
-    } = {}) {
-        // <|lang_id|> <|task|> <|notimestamps|>
-        const forced_decoder_ids = [];
-        if (language) {
-            // User wishes to specify the language
-            const language_code = whisper_language_to_code(language);
-            const language_token_id = this.model.tokens_to_ids.get(`<|${language_code}|>`);
-            if (language_token_id === undefined) {
-                throw new Error(`Unable to find language "${language_code}" in model vocabulary. Please report this issue at ${GITHUB_ISSUE_URL}.`)
-            }
-            forced_decoder_ids.push(language_token_id);
-        } else {
-            // No token will be forced, which leaves the model to predict the language
-            forced_decoder_ids.push(null);
-        }
-        if (task) {
-            task = task.toLowerCase();
-            if (task !== 'transcribe' && task !== 'translate') {
-                throw new Error(`Task "${task}" is not supported. Must be one of: ["transcribe", "translate"]`);
-            }
-            const task_token_id = this.model.tokens_to_ids.get(`<|${task}|>`);
-            if (task_token_id === undefined) {
-                throw new Error(`Unable to find task "${task}" in model vocabulary. Please report this issue at ${GITHUB_ISSUE_URL}.`)
-            }
-            forced_decoder_ids.push(task_token_id);
-        } else {
-            // No token will be forced, which leaves the model to predict the task
-            forced_decoder_ids.push(null);
-        }
-        if (no_timestamps) {
-            const no_timestamps_id = this.model.tokens_to_ids.get(`<|notimestamps|>`);
-            if (no_timestamps_id === undefined) {
-                throw new Error(`Unable to find "<|notimestamps|>" in model vocabulary. Please report this issue at ${GITHUB_ISSUE_URL}.`);
-            }
-            forced_decoder_ids.push(no_timestamps_id);
-        }
-        return forced_decoder_ids.map((x, i) => [i + 1, x]).filter(x => x[1] !== null);
-    }
 }
 export class CodeGenTokenizer extends PreTrainedTokenizer { }
 export class CLIPTokenizer extends PreTrainedTokenizer { }

package/src/utils/constants.js CHANGED Viewed

@@ -1,2 +1,2 @@
-export const GITHUB_ISSUE_URL = 'https://github.com/xenova/transformers.js/issues/new/choose';
+export const GITHUB_ISSUE_URL = 'https://github.com/huggingface/transformers.js/issues/new/choose';

package/src/utils/core.js CHANGED Viewed

@@ -147,3 +147,15 @@ export function pick(o, props) {
         })
     );
 }
+/**
+ * Calculate the length of a string, taking multi-byte characters into account.
+ * This mimics the behavior of Python's `len` function.
+ * @param {string} s The string to calculate the length of.
+ * @returns {number} The length of the string.
+ */
+export function len(s) {
+    let length = 0;
+    for (const c of s) ++length;
+    return length;
+}

package/src/utils/data-structures.js CHANGED Viewed

@@ -229,7 +229,7 @@ export class CharTrie {
      * @param {string[]} texts The strings to add to the trie.
      */
     extend(texts) {
-        for (let text of texts) {
+        for (const text of texts) {
             this.push(text);
         }
     }
@@ -240,7 +240,7 @@ export class CharTrie {
      */
     push(text) {
         let node = this.root;
-        for (let ch of text) {
+        for (const ch of text) {
             let child = node.children.get(ch);
             if (child === undefined) {
                 child = CharTrieNode.default();
@@ -258,12 +258,14 @@ export class CharTrie {
      */
     *commonPrefixSearch(text) {
         let node = this.root;
+        if (node === undefined) return;
         let prefix = "";
-        for (let i = 0; i < text.length && node !== undefined; ++i) {
-            const ch = text[i];
+        for (const ch of text) {
             prefix += ch;
             node = node.children.get(ch);
-            if (node !== undefined && node.isLeaf) {
+            if (node === undefined) return;
+            if (node.isLeaf) {
                 yield prefix;
             }
         }
@@ -305,8 +307,8 @@ export class TokenLattice {
      * @param {number} eosTokenId The end-of-sequence token ID.
      */
     constructor(sentence, bosTokenId, eosTokenId) {
-        this.sentence = sentence;
-        this.len = sentence.length;
+        this.chars = Array.from(sentence);
+        this.len = this.chars.length;
         this.bosTokenId = bosTokenId;
         this.eosTokenId = eosTokenId;
         this.nodes = [];
@@ -340,7 +342,7 @@ export class TokenLattice {
     /**
      * Implements the Viterbi algorithm to compute the most likely sequence of tokens.
      *
-     * @returns {TokenLatticeNode[]} The array of nodes representing the most likely sequence of tokens.
+     * @returns {TokenLatticeNode[]} The most likely sequence of tokens.
      */
     viterbi() {
         const len = this.len;
@@ -394,11 +396,11 @@ export class TokenLattice {
      * @returns {string} The array of nodes representing the most likely sequence of tokens.
      */
     piece(node) {
-        return this.sentence.slice(node.pos, node.pos + node.length);
+        return this.chars.slice(node.pos, node.pos + node.length).join('');
     }
     /**
-     * @returns {Array} The array of nodes representing the most likely sequence of tokens.
+     * @returns {string[]} The most likely sequence of tokens.
      */
     tokens() {
         const nodes = this.viterbi();
@@ -406,7 +408,7 @@ export class TokenLattice {
     }
     /**
-     * @returns {Array} The array of nodes representing the most likely sequence of tokens.
+     * @returns {number[]} The most likely sequence of token ids.
      */
     tokenIds() {
         const nodes = this.viterbi();

package/src/utils/hub.js CHANGED Viewed

@@ -14,7 +14,7 @@ import { dispatchCallback } from './core.js';
 /**
  * @typedef {Object} PretrainedOptions Options for loading a pretrained model.
  * @property {function} [progress_callback=null] If specified, this function will be called during model construction, to provide the user with progress updates.
- * @property {Object} [config=null] Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
+ * @property {import('../configs.js').PretrainedConfig} [config=null] Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
  * - The model is a model provided by the library (loaded with the *model id* string of a pretrained model).
  * - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a configuration JSON file named *config.json* is found in the directory.
  * @property {string} [cache_dir=null] Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used.

package/src/utils/maths.js CHANGED Viewed

@@ -158,11 +158,20 @@ export function softmax(arr) {
  * @returns {T} The resulting log_softmax array.
  */
 export function log_softmax(arr) {
-    // Compute the softmax values
-    const softmaxArr = softmax(arr);
+    // Compute the maximum value in the array
+    const maxVal = max(arr)[0];
+    // Compute the sum of the exponentials
+    let sumExps = 0;
+    for(let i = 0; i < arr.length; ++i) {
+        sumExps += Math.exp(arr[i] - maxVal);
+    }
-    // Apply log formula to each element
-    const logSoftmaxArr = softmaxArr.map(x => Math.log(x));
+    // Compute the log of the sum
+    const logSum = Math.log(sumExps);
+    // Compute the softmax values
+    const logSoftmaxArr = arr.map(x => x - maxVal - logSum);
     return /** @type {T} */(logSoftmaxArr);
 }
@@ -217,7 +226,7 @@ export function magnitude(arr) {
 /**
  * Returns the value and index of the minimum element in an array.
  * @param {number[]|TypedArray} arr array of numbers.
- * @returns {number[]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
+ * @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
  * @throws {Error} If array is empty.
  */
 export function min(arr) {