npm - @huggingface/transformers - Versions diffs - 4.0.0-next.5 → 4.0.0-next.7 - Mend

@huggingface/transformers 4.0.0-next.5 → 4.0.0-next.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (197) hide show

package/README.md +12 -4
package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
package/dist/transformers.js +2189 -1015
package/dist/transformers.min.js +16 -16
package/dist/transformers.node.cjs +2234 -1029
package/dist/transformers.node.min.cjs +20 -20
package/dist/transformers.node.min.mjs +20 -20
package/dist/transformers.node.mjs +2194 -1017
package/dist/transformers.web.js +2175 -1001
package/dist/transformers.web.min.js +18 -18
package/package.json +4 -4
package/src/backends/onnx.js +77 -58
package/src/backends/utils/cacheWasm.js +22 -43
package/src/cache_utils.js +62 -0
package/src/configs.js +32 -5
package/src/env.js +36 -6
package/src/image_processors_utils.js +3 -3
package/src/models/auto/modeling_auto.js +14 -1
package/src/models/chatterbox/modeling_chatterbox.js +1 -1
package/src/models/detr/image_processing_detr.js +1 -1
package/src/models/feature_extractors.js +2 -0
package/src/models/gemma3n/modeling_gemma3n.js +2 -0
package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
package/src/models/granite_speech/modeling_granite_speech.js +5 -0
package/src/models/granite_speech/processing_granite_speech.js +62 -0
package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
package/src/models/idefics3/modeling_idefics3.js +5 -32
package/src/models/image_processors.js +1 -0
package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
package/src/models/llava/modeling_llava.js +1 -1
package/src/models/mistral3/modeling_mistral3.js +2 -2
package/src/models/modeling_utils.js +234 -292
package/src/models/models.js +9 -0
package/src/models/olmo_hybrid/modeling_olmo_hybrid.js +5 -0
package/src/models/paligemma/modeling_paligemma.js +2 -25
package/src/models/processors.js +3 -0
package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
package/src/models/qwen2_moe/modeling_qwen2_moe.js +5 -0
package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
package/src/models/qwen2_vl/modeling_qwen2_vl.js +36 -3
package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
package/src/models/qwen3_moe/modeling_qwen3_moe.js +5 -0
package/src/models/qwen3_next/modeling_qwen3_next.js +5 -0
package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +4 -0
package/src/models/registry.js +39 -4
package/src/models/sam/image_processing_sam.js +1 -1
package/src/models/session.js +17 -6
package/src/models/smolvlm/modeling_smolvlm.js +7 -0
package/src/models/ultravox/modeling_ultravox.js +1 -3
package/src/models/voxtral/modeling_voxtral.js +3 -0
package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
package/src/models/whisper/feature_extraction_whisper.js +2 -12
package/src/pipelines/index.js +2 -84
package/src/pipelines.js +40 -77
package/src/transformers.js +2 -0
package/src/utils/audio.js +18 -2
package/src/utils/cache/CrossOriginStorageCache.js +251 -0
package/src/utils/cache/FileCache.js +128 -0
package/src/utils/cache/cross-origin-storage.d.ts +38 -0
package/src/utils/cache.js +8 -3
package/src/utils/hub/{files.js → FileResponse.js} +0 -105
package/src/utils/hub/utils.js +35 -1
package/src/utils/hub.js +6 -5
package/src/utils/image.js +12 -13
package/src/utils/lru_cache.js +67 -0
package/src/utils/memoize_promise.js +45 -0
package/src/utils/model_registry/ModelRegistry.js +70 -23
package/src/utils/model_registry/get_file_metadata.js +14 -2
package/src/utils/model_registry/get_model_files.js +63 -78
package/src/utils/model_registry/get_pipeline_files.js +15 -24
package/src/utils/model_registry/is_cached.js +81 -4
package/src/utils/tensor.js +18 -2
package/types/backends/onnx.d.ts.map +1 -1
package/types/backends/utils/cacheWasm.d.ts +3 -17
package/types/backends/utils/cacheWasm.d.ts.map +1 -1
package/types/cache_utils.d.ts +29 -0
package/types/cache_utils.d.ts.map +1 -0
package/types/configs.d.ts.map +1 -1
package/types/env.d.ts +18 -3
package/types/env.d.ts.map +1 -1
package/types/image_processors_utils.d.ts +17 -1
package/types/image_processors_utils.d.ts.map +1 -1
package/types/models/auto/modeling_auto.d.ts +6 -0
package/types/models/auto/modeling_auto.d.ts.map +1 -1
package/types/models/detr/image_processing_detr.d.ts +1 -1
package/types/models/feature_extractors.d.ts +2 -0
package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
package/types/models/image_processors.d.ts +1 -0
package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
package/types/models/modeling_utils.d.ts +44 -24
package/types/models/modeling_utils.d.ts.map +1 -1
package/types/models/models.d.ts +9 -0
package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts +8 -0
package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts.map +1 -0
package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
package/types/models/processors.d.ts +3 -0
package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts +8 -0
package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts.map +1 -0
package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts +8 -0
package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts.map +1 -0
package/types/models/qwen3_next/modeling_qwen3_next.d.ts +8 -0
package/types/models/qwen3_next/modeling_qwen3_next.d.ts.map +1 -0
package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +7 -0
package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -0
package/types/models/registry.d.ts +2 -1
package/types/models/registry.d.ts.map +1 -1
package/types/models/sam/image_processing_sam.d.ts +1 -1
package/types/models/session.d.ts +3 -2
package/types/models/session.d.ts.map +1 -1
package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
package/types/pipelines/index.d.ts +0 -34
package/types/pipelines/index.d.ts.map +1 -1
package/types/pipelines.d.ts.map +1 -1
package/types/transformers.d.ts +1 -0
package/types/transformers.d.ts.map +1 -1
package/types/utils/audio.d.ts +5 -2
package/types/utils/audio.d.ts.map +1 -1
package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
package/types/utils/cache/FileCache.d.ts +39 -0
package/types/utils/cache/FileCache.d.ts.map +1 -0
package/types/utils/cache.d.ts +4 -4
package/types/utils/cache.d.ts.map +1 -1
package/types/utils/dtypes.d.ts +1 -1
package/types/utils/hub/{files.d.ts → FileResponse.d.ts} +1 -38
package/types/utils/hub/FileResponse.d.ts.map +1 -0
package/types/utils/hub/utils.d.ts +17 -2
package/types/utils/hub/utils.d.ts.map +1 -1
package/types/utils/hub.d.ts +7 -7
package/types/utils/hub.d.ts.map +1 -1
package/types/utils/image.d.ts +1 -1
package/types/utils/image.d.ts.map +1 -1
package/types/utils/lru_cache.d.ts +38 -0
package/types/utils/lru_cache.d.ts.map +1 -0
package/types/utils/memoize_promise.d.ts +14 -0
package/types/utils/memoize_promise.d.ts.map +1 -0
package/types/utils/model_registry/ModelRegistry.d.ts +66 -6
package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
package/types/utils/model_registry/get_model_files.d.ts +1 -0
package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
package/types/utils/model_registry/get_pipeline_files.d.ts +2 -1
package/types/utils/model_registry/get_pipeline_files.d.ts.map +1 -1
package/types/utils/model_registry/is_cached.d.ts +47 -4
package/types/utils/model_registry/is_cached.d.ts.map +1 -1
package/types/utils/tensor.d.ts.map +1 -1
package/src/utils/data-structures.js +0 -572
package/types/utils/data-structures.d.ts +0 -294
package/types/utils/data-structures.d.ts.map +0 -1
package/types/utils/hub/files.d.ts.map +0 -1

package/src/models/voxtral_realtime/modeling_voxtral_realtime.js ADDED Viewed

@@ -0,0 +1,239 @@
+import { PreTrainedModel } from '../modeling_utils.js';
+import { sessionRun } from '../session.js';
+import { getCacheShapes } from '../../configs.js';
+import { Tensor, ones } from '../../utils/tensor.js';
+import { DataTypeMap } from '../../utils/dtypes.js';
+import { pick } from '../../utils/core.js';
+import { DynamicCache } from '../../cache_utils.js';
+import { StoppingCriteria, StoppingCriteriaList } from '../../generation/stopping_criteria.js';
+// Causal conv padding constants. NOTE(review): presumably the left padding (in frames) of the encoder's two conv layers — confirm against the exported encoder.
+const CONV1_LEFT_PAD = 2; // also the size of the last axis of the encoder padding cache (see createEncoderState)
+const CONV2_LEFT_PAD = 1; // used in the encoder output-length formula (see encodeChunk)
+/**
+ * WeakMap to hold encoder streaming states for each model instance during generation.
+ * This allows the state to be accessed and modified across the generation process
+ * without exposing it on the model instance itself.
+ *
+ * An entry is registered by `generate()` and removed again in its `finally` block;
+ * `forward()` looks the entry up and skips all audio handling when none exists.
+ * Because the map is keyed by model instance, a second `generate()` call on the
+ * same instance replaces the entry of a call that is still in flight.
+ * @private
+ * @type {WeakMap<VoxtralRealtimeForConditionalGeneration, Object>}
+ */
+const states = new WeakMap();
+/**
+ * Creates encoder streaming state for a VoxtralRealtime generation session.
+ *
+ * Allocates a fresh encoder KV cache (one tensor per entry returned by
+ * `getCacheShapes(audio_config, { batch_size: 1 })`) and a padding cache of shape
+ * `[1, num_mel_bins + hidden_size, CONV1_LEFT_PAD]`, then obtains an iterator over
+ * `input_features`, preferring `Symbol.asyncIterator` over `Symbol.iterator`.
+ * All counters in the returned state start at zero.
+ *
+ * @param {VoxtralRealtimeForConditionalGeneration} model Model whose `config` and `sessions['audio_encoder']` are read.
+ * @param {Iterable<Tensor>|AsyncIterable<Tensor>} input_features Source of audio feature chunks; pulled one chunk at a time by `fillAudioBuffer`.
+ * @returns {Object} Encoder state object.
+ * @throws {Error} If `input_features` exposes neither an async nor a sync iterator.
+ * @private
+ */
+function createEncoderState(model, input_features) {
+    const { text_config, audio_config } = /** @type {any} */ (model.config);
+    const encoder_session = model.sessions['audio_encoder'];
+    const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;
+    const PADDING_CACHE_CHANNELS = num_mel_bins + enc_hidden_size; // channel axis of the padding cache
+    // Initialize encoder KV cache (dtype comes from the encoder session config, defaulting to 'float32')
+    const enc_kv_cache = new DynamicCache();
+    const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? 'float32';
+    const enc_cls = enc_dtype === 'float16' ? DataTypeMap.float16 : DataTypeMap.float32; // any dtype other than 'float16' is backed by the float32 array type
+    const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
+    for (const name in enc_shapes) {
+        const size = enc_shapes[name].reduce((a, b) => a * b, 1); // element count = product of the dims
+        enc_kv_cache[name] = new Tensor(enc_dtype, new enc_cls(size), enc_shapes[name]);
+    }
+    const enc_padding_cache = new Tensor(enc_dtype, new enc_cls(PADDING_CACHE_CHANNELS * CONV1_LEFT_PAD), [
+        1,
+        PADDING_CACHE_CHANNELS,
+        CONV1_LEFT_PAD,
+    ]);
+    // Set up iterator from input_features (an async iterator takes precedence over a sync one)
+    const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
+    if (!chunks_iter) {
+        throw new Error('input_features must be iterable or async iterable');
+    }
+    return {
+        encoder_session,
+        enc_kv_cache,
+        enc_padding_cache,
+        enc_past_seq_len: 0, // encoder positions produced so far
+        audio_embed_queue: [], // FIFO of { data, tokens } entries pushed by fillAudioBuffer
+        audio_embed_total_tokens: 0, // cumulative number of tokens ever pushed to the queue
+        audio_queue_offset: 0, // tokens already consumed from the front queue entry
+        audio_consumed: 0, // cumulative number of tokens added to text embeddings
+        stream_exhausted: false, // set once chunks_iter reports done
+        chunks_iter,
+        text_hidden_size: text_config.hidden_size,
+    };
+}
+/**
+ * Encodes one audio chunk through the audio encoder.
+ *
+ * Side effects on `s`: replaces `enc_padding_cache` with the returned
+ * `present_padding_cache`, stores each `present.*` output under the matching
+ * `past_key_values.*` name in `enc_kv_cache` (disposing the previous tensor only
+ * when its `location` is `'gpu-buffer'`), and advances `enc_past_seq_len` by the
+ * number of encoder positions computed for this chunk.
+ * @param {Object} s Encoder state.
+ * @param {Tensor} chunk_features Mel spectrogram chunk [1, num_mel_bins, seq_len].
+ * @returns {Promise<Tensor>} Audio embeddings.
+ * @private
+ */
+async function encodeChunk(s, chunk_features) {
+    const audio_seq_len = chunk_features.dims[2];
+    const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1; // NOTE(review): has the form of a kernel-3, stride-2 conv output length — confirm against the encoder
+    const position_ids = new Tensor(
+        'int64',
+        BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
+        [1, conv2_output_len],
+    ); // positions continue from where the previous chunk stopped
+    const total_seq_len = s.enc_past_seq_len + conv2_output_len;
+    const attention_mask = ones([1, total_seq_len]); // mask spans previously encoded plus new positions
+    const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
+        input_features: chunk_features,
+        attention_mask,
+        position_ids,
+        past_padding_cache: s.enc_padding_cache,
+        ...s.enc_kv_cache,
+    });
+    // Dispose previous padding cache (only when it is a GPU buffer) and update
+    if (s.enc_padding_cache.location === 'gpu-buffer') {
+        s.enc_padding_cache.dispose();
+    }
+    s.enc_padding_cache = present_padding_cache;
+    // Update encoder KV cache, disposing previous GPU-buffer tensors; outputs not named `present.*` are ignored
+    for (const name in present_cache) {
+        if (name.startsWith('present.')) {
+            const pastName = name.replace('present', 'past_key_values');
+            const prev = s.enc_kv_cache[pastName];
+            if (prev?.location === 'gpu-buffer') {
+                prev.dispose();
+            }
+            s.enc_kv_cache[pastName] = present_cache[name];
+        }
+    }
+    s.enc_past_seq_len = total_seq_len;
+    return audio_embeds;
+}
+/**
+ * Fills the audio embedding buffer until it has enough tokens.
+ *
+ * Pulls chunks from `s.chunks_iter` one at a time, encodes each with `encodeChunk`,
+ * and appends `{ data, tokens }` to `s.audio_embed_queue`. The loop ends once
+ * `s.audio_embed_total_tokens` reaches `needed` or the iterator reports `done`; in
+ * the latter case `s.stream_exhausted` is set and fewer tokens than requested may
+ * be buffered.
+ * @param {Object} s Encoder state.
+ * @param {number} needed Total number of audio tokens needed, counted cumulatively (it is compared against the running total, which is never decremented).
+ * @returns {Promise<void>}
+ * @private
+ */
+async function fillAudioBuffer(s, needed) {
+    while (s.audio_embed_total_tokens < needed && !s.stream_exhausted) {
+        const result = await s.chunks_iter.next(); // `await` handles both sync and async iterators
+        if (result.done) {
+            s.stream_exhausted = true;
+            break;
+        }
+        const new_embeds = await encodeChunk(s, result.value);
+        s.audio_embed_queue.push({ data: new_embeds.data, tokens: new_embeds.dims[1] }); // dims[1] is treated as the token count
+        s.audio_embed_total_tokens += new_embeds.dims[1];
+    }
+}
+/**
+ * Adds audio embeddings to text embeddings from the queue.
+ *
+ * Audio values are summed element-wise into `inputs_embeds.data`, starting at the
+ * first token position. Queue entries are consumed front to back; a partially
+ * consumed entry stays at the front with `s.audio_queue_offset` recording how many
+ * of its tokens are already used. If the queue runs dry first, the remaining token
+ * positions are left untouched. `s.audio_consumed` grows by the number of tokens
+ * actually added. Returns immediately (no state change) when the queue is empty.
+ *
+ * NOTE(review): the indexing assumes each token occupies `s.text_hidden_size`
+ * contiguous values in both buffers — confirm against the session outputs.
+ * @param {Object} s Encoder state.
+ * @param {Tensor} inputs_embeds Text embeddings tensor (modified in-place).
+ * @param {number} current_len Number of tokens to consume.
+ * @returns {void}
+ * @private
+ */
+function addAudioEmbeddings(s, inputs_embeds, current_len) {
+    if (s.audio_embed_queue.length === 0) return;
+    const embed_data = inputs_embeds.data;
+    let embed_write_pos = 0; // next token position to write in inputs_embeds
+    let remaining = current_len; // tokens still to be filled in this call
+    while (remaining > 0 && s.audio_embed_queue.length > 0) {
+        const front = s.audio_embed_queue[0];
+        const available = front.tokens - s.audio_queue_offset; // unconsumed tokens left in the front entry
+        const n = Math.min(remaining, available);
+        const src_offset = s.audio_queue_offset * s.text_hidden_size;
+        for (let i = 0; i < n * s.text_hidden_size; ++i) {
+            embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
+        }
+        embed_write_pos += n;
+        remaining -= n;
+        s.audio_queue_offset += n;
+        if (s.audio_queue_offset >= front.tokens) {
+            s.audio_embed_queue.shift(); // front entry fully consumed
+            s.audio_queue_offset = 0;
+        }
+    }
+    s.audio_consumed += current_len - remaining;
+}
+/**
+ * Stopping criterion that triggers when the audio stream is exhausted
+ * and all buffered audio embeddings have been consumed.
+ *
+ * It holds a reference to the live encoder state object, so each call reflects
+ * the state as last mutated by `fillAudioBuffer` / `addAudioEmbeddings`.
+ * `_call` returns one boolean per element of `input_ids`, all carrying the same value.
+ * @private
+ */
+class AudioExhaustedCriteria extends StoppingCriteria {
+    constructor(enc_state) {
+        super();
+        this._s = enc_state; // shared by reference, not copied
+    }
+    _call(input_ids) {
+        const done = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
+        return input_ids.map(() => done);
+    }
+}
+export class VoxtralRealtimePreTrainedModel extends PreTrainedModel { // shared base class for VoxtralRealtime models
+    forward_params = ['input_ids', 'attention_mask', 'position_ids', 'past_key_values']; // input names declared for forward(); how PreTrainedModel consumes this list is not visible here
+}
+export class VoxtralRealtimeForConditionalGeneration extends VoxtralRealtimePreTrainedModel {
+    async forward({ input_ids, past_key_values, ...kwargs }) { // one decoder step: embed tokens, add buffered audio embeddings, run the merged decoder
+        const current_len = input_ids.dims[1]; // number of token positions in this step
+        const enc = states.get(this); // present only while generate() is running on this instance
+        if (enc) {
+            // Encode further audio chunks until enough audio tokens are buffered to cover this step (or the stream ends)
+            await fillAudioBuffer(enc, enc.audio_consumed + current_len);
+        }
+        const { inputs_embeds } = await sessionRun(this.sessions['embed_tokens'], { input_ids });
+        if (enc) {
+            addAudioEmbeddings(enc, inputs_embeds, current_len); // mutates inputs_embeds in place
+        }
+        const decoder_feeds = { inputs_embeds, ...kwargs };
+        this.addPastKeyValues(decoder_feeds, past_key_values);
+        const session = this.sessions['decoder_model_merged'];
+        const fixed = pick(decoder_feeds, session.inputNames); // keep only the inputs the decoder session declares
+        return await sessionRun(session, fixed);
+    }
+    async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) { // input_features is consumed here and is not forwarded to super.generate()
+        if (!input_features) {
+            throw new Error('input_features (generator/iterable) must be provided');
+        }
+        const enc_state = createEncoderState(this, input_features);
+        states.set(this, enc_state); // makes the state visible to forward() for the duration of this call
+        const stopping_criteria = new StoppingCriteriaList();
+        stopping_criteria.push(new AudioExhaustedCriteria(enc_state)); // always registered, ahead of any caller-supplied criteria
+        if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
+        try {
+            return await super.generate({ ...kwargs, stopping_criteria });
+        } finally {
+            // Cleanup encoder state (runs on success and on error). NOTE(review): enc_padding_cache is not disposed here — confirm whether that is intended.
+            enc_state.enc_kv_cache.dispose();
+            states.delete(this);
+        }
+    }
+}

package/src/models/voxtral_realtime/processing_voxtral_realtime.js ADDED Viewed

@@ -0,0 +1,113 @@
+import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
+import { AutoTokenizer } from '../auto/tokenization_auto.js';
+import { Processor } from '../../processing_utils.js';
+import { Tensor } from '../../utils/tensor.js';
+import { validate_audio_inputs } from '../../feature_extraction_utils.js';
+// Voxtral Realtime audio config constants (from mistral_common AudioConfig)
+const NUM_LEFT_PAD_TOKENS = 32; // tokens' worth of silence prepended to the first streaming chunk
+const NUM_DELAY_TOKENS = 6; // extra [STREAMING_PAD] tokens in the prompt; also counted in the first-chunk and right-pad sizes
+const AUDIO_LENGTH_PER_TOK = 8; // mel frames per text token
+const OFFLINE_STREAMING_BUFFER_TOKENS = 10; // additional right-pad tokens used in non-streaming mode
+/** Token ID for [STREAMING_PAD] in the Voxtral tokenizer. */
+const STREAMING_PAD_TOKEN_ID = 32; // hard-coded; NOTE(review): confirm it matches the tokenizer vocabulary in use
+export class VoxtralRealtimeProcessor extends Processor {
+    static tokenizer_class = AutoTokenizer;
+    static feature_extractor_class = AutoFeatureExtractor;
+    static uses_processor_config = false;
+    /**
+     * Number of mel frames in the first audio chunk.
+     * Computed as `(NUM_DELAY_TOKENS + 1) * AUDIO_LENGTH_PER_TOK`, i.e. 56 with the constants in this file.
+     */
+    get num_mel_frames_first_audio_chunk() {
+        return (NUM_DELAY_TOKENS + 1) * AUDIO_LENGTH_PER_TOK;
+    }
+    /**
+     * Number of raw audio samples in the first audio chunk.
+     * Computed as `(num_mel_frames_first_audio_chunk - 1) * hop_length + floor(n_fft / 2)`,
+     * with `hop_length` and `n_fft` read from the feature extractor config.
+     */
+    get num_samples_first_audio_chunk() {
+        const { hop_length, n_fft } = this.feature_extractor.config;
+        return (this.num_mel_frames_first_audio_chunk - 1) * hop_length + Math.floor(n_fft / 2);
+    }
+    /**
+     * Number of raw audio samples per subsequent audio chunk.
+     * Computed as `AUDIO_LENGTH_PER_TOK * hop_length + n_fft`.
+     */
+    get num_samples_per_audio_chunk() {
+        const { hop_length, n_fft } = this.feature_extractor.config;
+        return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
+    }
+    /**
+     * Number of right-pad tokens for non-streaming mode.
+     * Computed as `NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS`, i.e. 17 with the constants in this file.
+     */
+    get num_right_pad_tokens() {
+        return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
+    }
+    /** Number of mel frames per text token. */
+    get audio_length_per_tok() {
+        return AUDIO_LENGTH_PER_TOK;
+    }
+    /**
+     * Number of raw audio samples per token.
+     * Computed as `AUDIO_LENGTH_PER_TOK * hop_length`.
+     */
+    get raw_audio_length_per_tok() {
+        return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
+    }
+    /**
+     * Process audio input for VoxtralRealtime.
+     *
+     * In streaming mode with `is_first_audio_chunk=true`, the audio is left-padded
+     * with silence and mel features are extracted with `center=true`.
+     * Returns `{ input_ids, input_features }`.
+     * The padding is `NUM_LEFT_PAD_TOKENS * raw_audio_length_per_tok` zero samples, and
+     * `input_ids` has shape `[1, 1 + NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS]`: token id 1
+     * (BOS) followed by `STREAMING_PAD_TOKEN_ID` in every remaining position.
+     *
+     * In streaming mode with `is_first_audio_chunk=false`, the audio chunk is
+     * processed with `center=false` and only `{ input_features }` is returned.
+     *
+     * In non-streaming mode, the audio is right-padded to ensure the model
+     * transcribes the full audio, then processed with `center=true`.
+     * Returns `{ input_features }`.
+     * The padding is `num_right_pad_tokens * raw_audio_length_per_tok` zero samples.
+     *
+     * In every mode the keys other than `input_ids` are whatever the feature
+     * extractor returns. Padded audio is copied into a new `Float32Array`; the
+     * caller's buffer is not modified by this method.
+     *
+     * @param {Float32Array|Float64Array} audio The audio waveform.
+     * @param {Object} [options]
+     * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
+     * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
+     * @returns {Promise<Object>}
+     * @throws {Error} If `is_streaming` is `false` while `is_first_audio_chunk` is `false`.
+     */
+    async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
+        validate_audio_inputs(audio, 'VoxtralRealtimeProcessor');
+        if (!is_streaming && !is_first_audio_chunk) {
+            throw new Error('In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.');
+        }
+        if (is_first_audio_chunk) {
+            if (is_streaming) {
+                // Streaming first chunk: left-pad audio with silence, extract mel with center=true, build input_ids
+                const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
+                const padded_audio = new Float32Array(num_left_pad_samples + audio.length); // zero-initialised, so the padding is silence
+                padded_audio.set(audio, num_left_pad_samples);
+                const audio_encoding = await this.feature_extractor(padded_audio, { center: true });
+                // Build input_ids: BOS + (num_left_pad_tokens + num_delay_tokens) * [STREAMING_PAD]
+                const num_pad_tokens = NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
+                const num_input_tokens = 1 + num_pad_tokens;
+                const input_ids_data = new BigInt64Array(num_input_tokens).fill(BigInt(STREAMING_PAD_TOKEN_ID));
+                input_ids_data[0] = 1n; // BOS
+                const input_ids = new Tensor('int64', input_ids_data, [1, num_input_tokens]);
+                return {
+                    input_ids,
+                    ...audio_encoding,
+                };
+            } else {
+                // Non-streaming: right-pad audio to ensure full transcription, extract mel with center=true
+                const right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
+                const padded_audio = new Float32Array(audio.length + right_pad_samples); // trailing samples stay zero
+                padded_audio.set(audio);
+                return await this.feature_extractor(padded_audio, { center: true });
+            }
+        } else {
+            // Subsequent streaming chunks: extract mel with center=false (audio is passed through without padding)
+            return await this.feature_extractor(audio, { center: false });
+        }
+    }
+}

package/src/models/whisper/feature_extraction_whisper.js CHANGED Viewed

@@ -1,7 +1,6 @@
 import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js';
 import { Tensor } from '../../utils/tensor.js';
 import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
-import { max } from '../../utils/maths.js';
 import { logger } from '../../utils/logger.js';
 export class WhisperFeatureExtractor extends FeatureExtractor {
@@ -28,7 +27,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
      * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
      */
     async _extract_fbank_features(waveform) {
-        const features = await spectrogram(
+        return await spectrogram(
             waveform,
             this.window, // window
             this.config.n_fft, // frame_length
@@ -36,7 +35,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
             {
                 power: 2.0,
                 mel_filters: this.config.mel_filters,
-                log_mel: 'log10',
+                log_mel: 'log10_max_norm',
                 // Custom
                 max_num_frames: Math.min(
@@ -45,15 +44,6 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
                 ),
             },
         );
-        const data = features.data;
-        const maxValue = max(/** @type {Float32Array} */ (data))[0];
-        for (let i = 0; i < data.length; ++i) {
-            data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
-        }
-        return features;
     }
     /**

package/src/pipelines/index.js CHANGED Viewed

@@ -1,11 +1,10 @@
 /**
  * @file Pipeline task configurations and aliases
  *
- * Defines which components (tokenizer, processor, model) each pipeline task needs.
+ * Defines which pipeline class and model class(es) each pipeline task needs.
+ * Tokenizer and processor loading is determined automatically from the model's files.
  */
-import { AutoTokenizer } from '../models/auto/tokenization_auto.js';
-import { AutoProcessor } from '../models/auto/processing_auto.js';
 import {
     AutoModel,
     AutoModelForSequenceClassification,
@@ -60,41 +59,30 @@ import { ImageFeatureExtractionPipeline } from './image-feature-extraction.js';
 export const SUPPORTED_TASKS = Object.freeze({
     'text-classification': {
-        tokenizer: AutoTokenizer,
         pipeline: TextClassificationPipeline,
         model: AutoModelForSequenceClassification,
         default: {
-            // TODO: replace with original
-            // "model": "distilbert-base-uncased-finetuned-sst-2-english",
             model: 'Xenova/distilbert-base-uncased-finetuned-sst-2-english',
         },
         type: 'text',
     },
     'token-classification': {
-        tokenizer: AutoTokenizer,
         pipeline: TokenClassificationPipeline,
         model: AutoModelForTokenClassification,
         default: {
-            // TODO: replace with original
-            // "model": "Davlan/bert-base-multilingual-cased-ner-hrl",
             model: 'Xenova/bert-base-multilingual-cased-ner-hrl',
         },
         type: 'text',
     },
     'question-answering': {
-        tokenizer: AutoTokenizer,
         pipeline: QuestionAnsweringPipeline,
         model: AutoModelForQuestionAnswering,
         default: {
-            // TODO: replace with original
-            // "model": "distilbert-base-cased-distilled-squad",
             model: 'Xenova/distilbert-base-cased-distilled-squad',
         },
         type: 'text',
     },
     'fill-mask': {
-        tokenizer: AutoTokenizer,
         pipeline: FillMaskPipeline,
         model: AutoModelForMaskedLM,
         default: {
@@ -104,40 +92,30 @@ export const SUPPORTED_TASKS = Object.freeze({
         type: 'text',
     },
     summarization: {
-        tokenizer: AutoTokenizer,
         pipeline: SummarizationPipeline,
         model: AutoModelForSeq2SeqLM,
         default: {
-            // TODO: replace with original
-            // "model": "sshleifer/distilbart-cnn-6-6",
             model: 'Xenova/distilbart-cnn-6-6',
         },
         type: 'text',
     },
     translation: {
-        tokenizer: AutoTokenizer,
         pipeline: TranslationPipeline,
         model: AutoModelForSeq2SeqLM,
         default: {
-            // TODO: replace with original
-            // "model": "t5-small",
             model: 'Xenova/t5-small',
         },
         type: 'text',
     },
     'text2text-generation': {
-        tokenizer: AutoTokenizer,
         pipeline: Text2TextGenerationPipeline,
         model: AutoModelForSeq2SeqLM,
         default: {
-            // TODO: replace with original
-            // "model": "google/flan-t5-small",
             model: 'Xenova/flan-t5-small',
         },
         type: 'text',
     },
     'text-generation': {
-        tokenizer: AutoTokenizer,
         pipeline: TextGenerationPipeline,
         model: AutoModelForCausalLM,
         default: {
@@ -147,12 +125,9 @@ export const SUPPORTED_TASKS = Object.freeze({
         type: 'text',
     },
     'zero-shot-classification': {
-        tokenizer: AutoTokenizer,
         pipeline: ZeroShotClassificationPipeline,
         model: AutoModelForSequenceClassification,
         default: {
-            // TODO: replace with original
-            // "model": "typeform/distilbert-base-uncased-mnli",
             model: 'Xenova/distilbert-base-uncased-mnli',
         },
         type: 'text',
@@ -160,43 +135,30 @@ export const SUPPORTED_TASKS = Object.freeze({
     'audio-classification': {
         pipeline: AudioClassificationPipeline,
         model: AutoModelForAudioClassification,
-        processor: AutoProcessor,
         default: {
-            // TODO: replace with original
-            // "model": "superb/wav2vec2-base-superb-ks",
             model: 'Xenova/wav2vec2-base-superb-ks',
         },
         type: 'audio',
     },
     'zero-shot-audio-classification': {
-        tokenizer: AutoTokenizer,
         pipeline: ZeroShotAudioClassificationPipeline,
         model: AutoModel,
-        processor: AutoProcessor,
         default: {
-            // TODO: replace with original
-            // "model": "laion/clap-htsat-fused",
             model: 'Xenova/clap-htsat-unfused',
         },
         type: 'multimodal',
     },
     'automatic-speech-recognition': {
-        tokenizer: AutoTokenizer,
         pipeline: AutomaticSpeechRecognitionPipeline,
         model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
-        processor: AutoProcessor,
         default: {
-            // TODO: replace with original
-            // "model": "openai/whisper-tiny.en",
             model: 'Xenova/whisper-tiny.en',
         },
         type: 'multimodal',
     },
     'text-to-audio': {
-        tokenizer: AutoTokenizer,
         pipeline: TextToAudioPipeline,
         model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
-        processor: [AutoProcessor, /* Some don't use a processor */ null],
         default: {
             model: 'onnx-community/Supertonic-TTS-ONNX',
             dtype: 'fp32',
@@ -204,129 +166,86 @@ export const SUPPORTED_TASKS = Object.freeze({
         type: 'text',
     },
     'image-to-text': {
-        tokenizer: AutoTokenizer,
         pipeline: ImageToTextPipeline,
         model: AutoModelForVision2Seq,
-        processor: AutoProcessor,
         default: {
-            // TODO: replace with original
-            // "model": "nlpconnect/vit-gpt2-image-captioning",
             model: 'Xenova/vit-gpt2-image-captioning',
         },
         type: 'multimodal',
     },
     'image-classification': {
-        // no tokenizer
         pipeline: ImageClassificationPipeline,
         model: AutoModelForImageClassification,
-        processor: AutoProcessor,
         default: {
-            // TODO: replace with original
-            // "model": "google/vit-base-patch16-224",
             model: 'Xenova/vit-base-patch16-224',
         },
         type: 'multimodal',
     },
     'image-segmentation': {
-        // no tokenizer
         pipeline: ImageSegmentationPipeline,
         model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
-        processor: AutoProcessor,
         default: {
-            // TODO: replace with original
-            // "model": "facebook/detr-resnet-50-panoptic",
             model: 'Xenova/detr-resnet-50-panoptic',
         },
         type: 'multimodal',
     },
     'background-removal': {
-        // no tokenizer
         pipeline: BackgroundRemovalPipeline,
         model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
-        processor: AutoProcessor,
         default: {
             model: 'Xenova/modnet',
         },
         type: 'image',
     },
     'zero-shot-image-classification': {
-        tokenizer: AutoTokenizer,
         pipeline: ZeroShotImageClassificationPipeline,
         model: AutoModel,
-        processor: AutoProcessor,
         default: {
-            // TODO: replace with original
-            // "model": "openai/clip-vit-base-patch32",
             model: 'Xenova/clip-vit-base-patch32',
         },
         type: 'multimodal',
     },
     'object-detection': {
-        // no tokenizer
         pipeline: ObjectDetectionPipeline,
         model: AutoModelForObjectDetection,
-        processor: AutoProcessor,
         default: {
-            // TODO: replace with original
-            // "model": "facebook/detr-resnet-50",
             model: 'Xenova/detr-resnet-50',
         },
         type: 'multimodal',
     },
     'zero-shot-object-detection': {
-        tokenizer: AutoTokenizer,
         pipeline: ZeroShotObjectDetectionPipeline,
         model: AutoModelForZeroShotObjectDetection,
-        processor: AutoProcessor,
         default: {
-            // TODO: replace with original
-            // "model": "google/owlvit-base-patch32",
             model: 'Xenova/owlvit-base-patch32',
         },
         type: 'multimodal',
     },
     'document-question-answering': {
-        tokenizer: AutoTokenizer,
         pipeline: DocumentQuestionAnsweringPipeline,
         model: AutoModelForDocumentQuestionAnswering,
-        processor: AutoProcessor,
         default: {
-            // TODO: replace with original
-            // "model": "naver-clova-ix/donut-base-finetuned-docvqa",
             model: 'Xenova/donut-base-finetuned-docvqa',
         },
         type: 'multimodal',
     },
     'image-to-image': {
-        // no tokenizer
         pipeline: ImageToImagePipeline,
         model: AutoModelForImageToImage,
-        processor: AutoProcessor,
         default: {
-            // TODO: replace with original
-            // "model": "caidas/swin2SR-classical-sr-x2-64",
             model: 'Xenova/swin2SR-classical-sr-x2-64',
         },
         type: 'image',
     },
     'depth-estimation': {
-        // no tokenizer
         pipeline: DepthEstimationPipeline,
         model: AutoModelForDepthEstimation,
-        processor: AutoProcessor,
         default: {
             model: 'onnx-community/depth-anything-v2-small',
         },
         type: 'image',
     },
-    // This task serves as a useful interface for dealing with sentence-transformers (https://huggingface.co/sentence-transformers).
     'feature-extraction': {
-        tokenizer: AutoTokenizer,
         pipeline: FeatureExtractionPipeline,
         model: AutoModel,
         default: {
@@ -336,7 +255,6 @@ export const SUPPORTED_TASKS = Object.freeze({
         type: 'text',
     },
     'image-feature-extraction': {
-        processor: AutoProcessor,
         pipeline: ImageFeatureExtractionPipeline,
         model: [AutoModelForImageFeatureExtraction, AutoModel],
         default: {