npm - @huggingface/transformers - Versions diffs - 3.3.3 → 3.4.1 - Mend

@huggingface/transformers 3.3.3 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

package/README.md +13 -3
package/dist/ort-wasm-simd-threaded.jsep.mjs +124 -115
package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
package/dist/transformers.js +2778 -1592
package/dist/transformers.js.map +1 -1
package/dist/transformers.min.js +1 -1
package/dist/transformers.min.js.map +1 -1
package/dist/{transformers.cjs → transformers.node.cjs} +1699 -2530
package/dist/transformers.node.cjs.map +1 -0
package/dist/transformers.node.min.cjs +2 -0
package/dist/transformers.node.min.cjs.map +1 -0
package/dist/transformers.node.min.mjs +2 -0
package/dist/transformers.node.min.mjs.map +1 -0
package/dist/{transformers.mjs → transformers.node.mjs} +1738 -2510
package/dist/transformers.node.mjs.map +1 -0
package/dist/transformers.web.js +35876 -0
package/dist/transformers.web.js.map +1 -0
package/dist/transformers.web.min.js +2 -0
package/dist/transformers.web.min.js.map +1 -0
package/package.json +6 -6
package/src/backends/onnx.js +14 -15
package/src/configs.js +6 -1
package/src/env.js +1 -1
package/src/generation/streamers.js +4 -3
package/src/models/dac/feature_extraction_dac.js +3 -0
package/src/models/encodec/feature_extraction_encodec.js +32 -0
package/src/models/feature_extractors.js +3 -0
package/src/models/idefics3/image_processing_idefics3.js +1 -1
package/src/models/image_processors.js +1 -0
package/src/models/processors.js +2 -0
package/src/models/smolvlm/image_processing_smolvlm.js +2 -0
package/src/models/smolvlm/processing_smolvlm.js +2 -0
package/src/models/snac/feature_extraction_snac.js +3 -0
package/src/models/ultravox/processing_ultravox.js +54 -0
package/src/models/whisper/common_whisper.js +7 -1
package/src/models/whisper/feature_extraction_whisper.js +18 -10
package/src/models.js +546 -78
package/src/pipelines.js +246 -137
package/src/tokenizers.js +42 -28
package/src/transformers.js +1 -0
package/src/utils/audio.js +2 -0
package/src/utils/hub.js +140 -80
package/src/utils/image.js +9 -1
package/src/utils/maths.js +1 -1
package/src/utils/tensor.js +12 -5
package/src/utils/video.js +128 -0
package/types/backends/onnx.d.ts +2 -2
package/types/backends/onnx.d.ts.map +1 -1
package/types/configs.d.ts +1 -1
package/types/configs.d.ts.map +1 -1
package/types/generation/streamers.d.ts.map +1 -1
package/types/models/dac/feature_extraction_dac.d.ts +4 -0
package/types/models/dac/feature_extraction_dac.d.ts.map +1 -0
package/types/models/encodec/feature_extraction_encodec.d.ts +13 -0
package/types/models/encodec/feature_extraction_encodec.d.ts.map +1 -0
package/types/models/feature_extractors.d.ts +3 -0
package/types/models/florence2/processing_florence2.d.ts +1 -1
package/types/models/florence2/processing_florence2.d.ts.map +1 -1
package/types/models/image_processors.d.ts +1 -0
package/types/models/processors.d.ts +2 -0
package/types/models/smolvlm/image_processing_smolvlm.d.ts +2 -0
package/types/models/smolvlm/image_processing_smolvlm.d.ts.map +1 -0
package/types/models/smolvlm/processing_smolvlm.d.ts +2 -0
package/types/models/smolvlm/processing_smolvlm.d.ts.map +1 -0
package/types/models/snac/feature_extraction_snac.d.ts +4 -0
package/types/models/snac/feature_extraction_snac.d.ts.map +1 -0
package/types/models/ultravox/processing_ultravox.d.ts +16 -0
package/types/models/ultravox/processing_ultravox.d.ts.map +1 -0
package/types/models/whisper/common_whisper.d.ts.map +1 -1
package/types/models/whisper/feature_extraction_whisper.d.ts +3 -1
package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
package/types/models.d.ts +180 -4
package/types/models.d.ts.map +1 -1
package/types/pipelines.d.ts +51 -5
package/types/pipelines.d.ts.map +1 -1
package/types/tokenizers.d.ts.map +1 -1
package/types/transformers.d.ts +1 -0
package/types/tsconfig.tsbuildinfo +1 -1
package/types/utils/audio.d.ts.map +1 -1
package/types/utils/hub.d.ts +19 -7
package/types/utils/hub.d.ts.map +1 -1
package/types/utils/image.d.ts +2 -2
package/types/utils/image.d.ts.map +1 -1
package/types/utils/maths.d.ts +2 -2
package/types/utils/maths.d.ts.map +1 -1
package/types/utils/tensor.d.ts +17 -18
package/types/utils/tensor.d.ts.map +1 -1
package/types/utils/video.d.ts +37 -0
package/types/utils/video.d.ts.map +1 -0
package/dist/transformers.cjs.map +0 -1
package/dist/transformers.min.cjs +0 -2
package/dist/transformers.min.cjs.map +0 -1
package/dist/transformers.min.mjs +0 -2
package/dist/transformers.min.mjs.map +0 -1
package/dist/transformers.mjs.map +0 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/transformers",
-  "version": "3.3.3",
+  "version": "3.4.1",
   "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
   "main": "./src/transformers.js",
   "types": "./types/transformers.d.ts",
@@ -9,16 +9,16 @@
     "node": {
       "import": {
         "types": "./types/transformers.d.ts",
-        "default": "./dist/transformers.mjs"
+        "default": "./dist/transformers.node.mjs"
       },
       "require": {
         "types": "./types/transformers.d.ts",
-        "default": "./dist/transformers.cjs"
+        "default": "./dist/transformers.node.cjs"
       }
     },
     "default": {
       "types": "./types/transformers.d.ts",
-      "default": "./dist/transformers.js"
+      "default": "./dist/transformers.web.js"
     }
   },
   "scripts": {
@@ -57,7 +57,7 @@
   "dependencies": {
     "@huggingface/jinja": "^0.3.3",
     "onnxruntime-node": "1.20.1",
-    "onnxruntime-web": "1.21.0-dev.20250206-d981b153d3",
+    "onnxruntime-web": "1.22.0-dev.20250306-ccf8fdd9ea",
     "sharp": "^0.33.5"
   },
   "devDependencies": {
@@ -69,7 +69,7 @@
     "jest-environment-node": "^30.0.0-alpha.6",
     "jsdoc-to-markdown": "^9.1.1",
     "prettier": "3.4.2",
-    "typescript": "^5.7.2",
+    "typescript": "^5.8.2",
     "wavefile": "11.0.0",
     "webpack": "^5.97.1",
     "webpack-cli": "^5.1.4",

package/src/backends/onnx.js CHANGED Viewed

@@ -57,8 +57,8 @@ let ONNX;
 const ORT_SYMBOL = Symbol.for('onnxruntime');
 if (ORT_SYMBOL in globalThis) {
-  // If the JS runtime exposes their own ONNX runtime, use it
-  ONNX = globalThis[ORT_SYMBOL];
+    // If the JS runtime exposes their own ONNX runtime, use it
+    ONNX = globalThis[ORT_SYMBOL];
 } else if (apis.IS_NODE_ENV) {
     ONNX = ONNX_NODE.default ?? ONNX_NODE;
@@ -141,19 +141,19 @@ let wasmInitPromise = null;
 /**
  * Create an ONNX inference session.
- * @param {Uint8Array} buffer The ONNX model buffer.
+ * @param {Uint8Array|string} buffer_or_path The ONNX model buffer or path.
  * @param {import('onnxruntime-common').InferenceSession.SessionOptions} session_options ONNX inference session options.
  * @param {Object} session_config ONNX inference session configuration.
  * @returns {Promise<import('onnxruntime-common').InferenceSession & { config: Object}>} The ONNX inference session.
  */
-export async function createInferenceSession(buffer, session_options, session_config) {
+export async function createInferenceSession(buffer_or_path, session_options, session_config) {
     if (wasmInitPromise) {
         // A previous session has already initialized the WASM runtime
         // so we wait for it to resolve before creating this new session.
         await wasmInitPromise;
     }
-    const sessionPromise = InferenceSession.create(buffer, session_options);
+    const sessionPromise = InferenceSession.create(buffer_or_path, session_options);
     wasmInitPromise ??= sessionPromise;
     const session = await sessionPromise;
     session.config = session_config;
@@ -175,11 +175,15 @@ const ONNX_ENV = ONNX?.env;
 if (ONNX_ENV?.wasm) {
     // Initialize wasm backend with suitable default settings.
-    // (Optional) Set path to wasm files. This is needed when running in a web worker.
-    // https://onnxruntime.ai/docs/api/js/interfaces/Env.WebAssemblyFlags.html#wasmPaths
-    // We use remote wasm files by default to make it easier for newer users.
-    // In practice, users should probably self-host the necessary .wasm files.
-    ONNX_ENV.wasm.wasmPaths = `https://cdn.jsdelivr.net/npm/@huggingface/transformers@${env.version}/dist/`;
+    // (Optional) Set path to wasm files. This will override the default path search behavior of onnxruntime-web.
+    // By default, we only do this if we are not in a service worker and the wasmPaths are not already set.
+    if (
+        // @ts-ignore Cannot find name 'ServiceWorkerGlobalScope'.ts(2304)
+        !(typeof ServiceWorkerGlobalScope !== 'undefined' && self instanceof ServiceWorkerGlobalScope)
+        && !ONNX_ENV.wasm.wasmPaths
+    ) {
+        ONNX_ENV.wasm.wasmPaths = `https://cdn.jsdelivr.net/npm/@huggingface/transformers@${env.version}/dist/`;
+    }
     // TODO: Add support for loading WASM files from cached buffer when we upgrade to onnxruntime-web@1.19.0
     // https://github.com/microsoft/onnxruntime/pull/21534
@@ -187,11 +191,6 @@ if (ONNX_ENV?.wasm) {
     // Users may wish to proxy the WASM backend to prevent the UI from freezing,
     // However, this is not necessary when using WebGPU, so we default to false.
     ONNX_ENV.wasm.proxy = false;
-    // https://developer.mozilla.org/en-US/docs/Web/API/crossOriginIsolated
-    if (typeof crossOriginIsolated === 'undefined' || !crossOriginIsolated) {
-        ONNX_ENV.wasm.numThreads = 1;
-    }
 }
 if (ONNX_ENV?.webgpu) {

package/src/configs.js CHANGED Viewed

@@ -67,9 +67,12 @@ function getNormalizedConfig(config) {
         // Sub-configs
         case 'llava':
         case 'paligemma':
+        case 'gemma3':
         case 'florence2':
         case 'llava_onevision':
         case 'idefics3':
+        case 'ultravox':
+        case 'smolvlm':
             // @ts-expect-error TS2339
             init_normalized_config = getNormalizedConfig(config.text_config);
             break;
@@ -124,6 +127,7 @@ function getNormalizedConfig(config) {
             break;
         case 'gemma':
         case 'gemma2':
+        case 'gemma3_text':
         case 'glm':
         case 'helium':
             mapping['num_heads'] = 'num_key_value_heads';
@@ -173,6 +177,7 @@ function getNormalizedConfig(config) {
         case 'mbart':
         case 'marian':
         case 'whisper':
+        case 'lite-whisper':
         case 'm2m_100':
         case 'blenderbot':
         case 'blenderbot-small':
@@ -405,5 +410,5 @@ export class AutoConfig {
  * for more information.
  * @property {import('./utils/devices.js').DeviceType} [device] The default device to use for the model.
  * @property {import('./utils/dtypes.js').DataType|Record<string, import('./utils/dtypes.js').DataType>} [dtype] The default data type to use for the model.
- * @property {boolean|Record<string, boolean>} [use_external_data_format=false] Whether to load the model using the external data format (used for models >= 2GB in size).
+ * @property {import('./utils/hub.js').ExternalData|Record<string, import('./utils/hub.js').ExternalData>} [use_external_data_format=false] Whether to load the model using the external data format (used for models >= 2GB in size).
  */

package/src/env.js CHANGED Viewed

@@ -26,7 +26,7 @@ import fs from 'fs';
 import path from 'path';
 import url from 'url';
-const VERSION = '3.3.3';
+const VERSION = '3.4.1';
 // Check if various APIs are available (depends on environment)
 const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";

package/src/generation/streamers.js CHANGED Viewed

@@ -72,9 +72,10 @@ export class TextStreamer extends BaseStreamer {
             throw Error('TextStreamer only supports batch size of 1');
         }
-        if (this.skip_prompt && this.next_tokens_are_prompt) {
+        const is_prompt = this.next_tokens_are_prompt;
+        if (is_prompt) {
             this.next_tokens_are_prompt = false;
-            return;
+            if (this.skip_prompt) return;
         }
         const tokens = value[0];
@@ -85,7 +86,7 @@ export class TextStreamer extends BaseStreamer {
         const text = this.tokenizer.decode(this.token_cache, this.decode_kwargs);
         let printable_text;
-        if (text.endsWith('\n')) {
+        if (is_prompt || text.endsWith('\n')) {
             // After the symbol for a new line, we flush the cache.
             printable_text = text.slice(this.print_len);
             this.token_cache = [];

package/src/models/dac/feature_extraction_dac.js ADDED Viewed

@@ -0,0 +1,3 @@
+import { EncodecFeatureExtractor } from '../encodec/feature_extraction_encodec.js';
+export class DacFeatureExtractor extends EncodecFeatureExtractor { }

package/src/models/encodec/feature_extraction_encodec.js ADDED Viewed

@@ -0,0 +1,32 @@
+import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
+import { Tensor } from '../../utils/tensor.js';
+export class EncodecFeatureExtractor extends FeatureExtractor {
+    /**
+     * Asynchronously extracts input values from a given audio using the provided configuration.
+     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
+     * @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
+     */
+    async _call(audio) {
+        validate_audio_inputs(audio, 'EncodecFeatureExtractor');
+        if (audio instanceof Float64Array) {
+            audio = new Float32Array(audio);
+        }
+        const num_channels = this.config.feature_size;
+        if (audio.length % num_channels !== 0) {
+            throw new Error(`The length of the audio data must be a multiple of the number of channels (${num_channels}).`);
+        }
+        const shape = [
+            1,                              /* batch_size */
+            num_channels,                   /* num_channels */
+            audio.length / num_channels,    /* num_samples */
+        ];
+        return {
+            input_values: new Tensor('float32', audio, shape),
+        };
+    }
+}

package/src/models/feature_extractors.js CHANGED Viewed

@@ -1,9 +1,12 @@
 export * from './audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js';
+export * from './encodec/feature_extraction_encodec.js';
 export * from './clap/feature_extraction_clap.js';
+export * from './dac/feature_extraction_dac.js';
 export * from './moonshine/feature_extraction_moonshine.js';
 export * from './pyannote/feature_extraction_pyannote.js';
 export * from './seamless_m4t/feature_extraction_seamless_m4t.js';
+export * from './snac/feature_extraction_snac.js';
 export * from './speecht5/feature_extraction_speecht5.js';
 export * from './wav2vec2/feature_extraction_wav2vec2.js';
 export * from './wespeaker/feature_extraction_wespeaker.js';

package/src/models/idefics3/image_processing_idefics3.js CHANGED Viewed

@@ -147,7 +147,7 @@ export class Idefics3ImageProcessor extends ImageProcessor {
                     const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
                     const end_offset = (i + 1) * pixel_attention_mask_stride;
-                    // @ts-expect-error
+                    // @ts-ignore
                     pixel_attention_mask_data.fill(false, start_offset, end_offset);
                 }
             }

package/src/models/image_processors.js CHANGED Viewed

@@ -32,6 +32,7 @@ export * from './rt_detr/image_processing_rt_detr.js'
 export * from './sam/image_processing_sam.js'
 export * from './segformer/image_processing_segformer.js'
 export * from './siglip/image_processing_siglip.js'
+export * from './smolvlm/image_processing_smolvlm.js'
 export * from './swin2sr/image_processing_swin2sr.js'
 export * from './vit/image_processing_vit.js'
 export * from './vitmatte/image_processing_vitmatte.js'

package/src/models/processors.js CHANGED Viewed

@@ -11,7 +11,9 @@ export * from './paligemma/processing_paligemma.js';
 export * from './pyannote/processing_pyannote.js';
 export * from './qwen2_vl/processing_qwen2_vl.js';
 export * from './sam/processing_sam.js';
+export * from './smolvlm/processing_smolvlm.js';
 export * from './speecht5/processing_speecht5.js';
+export * from './ultravox/processing_ultravox.js';
 export * from './wav2vec2/processing_wav2vec2.js';
 export * from './wav2vec2_with_lm/processing_wav2vec2_with_lm.js';
 export * from './whisper/processing_whisper.js';

package/src/models/smolvlm/image_processing_smolvlm.js ADDED Viewed

@@ -0,0 +1,2 @@
+
+export { Idefics3ImageProcessor as SmolVLMImageProcessor } from "../idefics3/image_processing_idefics3.js";

package/src/models/smolvlm/processing_smolvlm.js ADDED Viewed

@@ -0,0 +1,2 @@
+
+export { Idefics3Processor as SmolVLMProcessor } from "../idefics3/processing_idefics3.js";

package/src/models/snac/feature_extraction_snac.js ADDED Viewed

@@ -0,0 +1,3 @@
+import { DacFeatureExtractor } from '../dac/feature_extraction_dac.js';
+export class SnacFeatureExtractor extends DacFeatureExtractor { }

package/src/models/ultravox/processing_ultravox.js ADDED Viewed

@@ -0,0 +1,54 @@
+import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js"
+import { AutoTokenizer } from "../../tokenizers.js"
+import { Processor } from "../../base/processing_utils.js"
+/**
+ * Represents a UltravoxProcessor that extracts features from an audio input.
+ */
+export class UltravoxProcessor extends Processor {
+    static tokenizer_class = AutoTokenizer
+    static feature_extractor_class = AutoFeatureExtractor
+    static uses_processor_config = true;
+    /**
+     * @param {string} text The text input to process.
+     * @param {Float32Array} audio The audio input to process.
+     */
+    async _call(text, audio = null, kwargs = {}) {
+        // TODO: Support batched inputs
+        if (Array.isArray(text)) {
+            throw new Error("Batched inputs are not supported yet.");
+        }
+        let audio_inputs = {};
+        if (audio) {
+            const audio_len = audio.length;
+            const { input_features } = await this.feature_extractor(audio, {
+                ...kwargs,
+                max_length: audio_len,
+            });
+            const nb_encoder_frames = Math.round(audio_len / this.config.encoder_ds_factor + 1e-4);
+            // NOTE: The python version appears to have an off-by-one error.
+            const audio_embed_frames = 1 + Math.ceil(nb_encoder_frames / this.config.stack_factor);
+            audio_inputs["audio_token_len"] = [audio_embed_frames];
+            audio_inputs["audio_values"] = input_features;
+            const image_token = this.config.audio_placeholder;
+            if (!text.includes(image_token)) {
+                throw new Error(`The input text does not contain the image token ${image_token}.`);
+            }
+            text = text.replaceAll(image_token, image_token.repeat(audio_embed_frames));
+        }
+        const text_inputs = this.tokenizer(text, {
+            add_special_tokens: false,
+            ...kwargs,
+        });
+        return {
+            ...text_inputs,
+            ...audio_inputs,
+        }
+    }
+}

package/src/models/whisper/common_whisper.js CHANGED Viewed

@@ -135,6 +135,12 @@ export function whisper_language_to_code(language) {
     if (language_code === undefined) {
         // User provided something that is not a language name
+        // Perhaps the user passed the special token itself
+        const language_special_token = language.match(/^<\|([a-z]{2})\|>$/);
+        if (language_special_token) {
+            language = language_special_token[1];
+        }
         if (WHISPER_LANGUAGE_MAPPING.has(language)) {
             // User provided the language code directly (e.g., "en")
             language_code = language;
@@ -144,7 +150,7 @@ export function whisper_language_to_code(language) {
             const is_language_code = language.length === 2;
             const langs = is_language_code ? WHISPER_LANGUAGE_MAPPING.keys() : WHISPER_LANGUAGE_MAPPING.values();
-            throw new Error(`Language "${language}" is not supported. Must be one of: ${JSON.stringify(langs)}`);
+            throw new Error(`Language "${language}" is not supported. Must be one of: ${JSON.stringify(Array.from(langs))}`);
         }
     }
     return language_code;

package/src/models/whisper/feature_extraction_whisper.js CHANGED Viewed

@@ -39,7 +39,10 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
                 log_mel: 'log10',
                 // Custom
-                max_num_frames: this.config.nb_max_frames, // 3000
+                max_num_frames: Math.min(
+                    Math.floor(waveform.length / this.config.hop_length),
+                    this.config.nb_max_frames, // 3000
+                )
             }
         )
@@ -58,20 +61,25 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
      * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
      * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
      */
-    async _call(audio) {
+    async _call(audio, {
+        max_length = null,
+    } = {}) {
         validate_audio_inputs(audio, 'WhisperFeatureExtractor');
         let waveform;
-        if (audio.length > this.config.n_samples) {
-            console.warn(
-                "Attempting to extract features for audio longer than 30 seconds. " +
-                "If using a pipeline to extract transcript from a long audio clip, " +
-                "remember to specify `chunk_length_s` and/or `stride_length_s`."
-            );
-            waveform = audio.slice(0, this.config.n_samples);
+        const length = max_length ?? this.config.n_samples;
+        if (audio.length > length) {
+            if (audio.length > this.config.n_samples) {
+                console.warn(
+                    "Attempting to extract features for audio longer than 30 seconds. " +
+                    "If using a pipeline to extract transcript from a long audio clip, " +
+                    "remember to specify `chunk_length_s` and/or `stride_length_s`."
+                );
+            }
+            waveform = audio.slice(0, length);
         } else {
             // pad with zeros
-            waveform = new Float32Array(this.config.n_samples);
+            waveform = new Float32Array(length);
             waveform.set(audio);
         }