@huggingface/transformers 3.1.1 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. package/README.md +10 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +1062 -183
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +2239 -1232
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +1 -358
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +1 -421
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +1 -358
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +1082 -181
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +11 -16
  16. package/src/backends/onnx.js +2 -7
  17. package/src/base/image_processors_utils.js +3 -1
  18. package/src/configs.js +11 -2
  19. package/src/env.js +1 -1
  20. package/src/models/feature_extractors.js +1 -0
  21. package/src/models/idefics3/image_processing_idefics3.js +24 -13
  22. package/src/models/image_processors.js +1 -0
  23. package/src/models/moonshine/feature_extraction_moonshine.js +26 -0
  24. package/src/models/moonshine/processing_moonshine.js +20 -0
  25. package/src/models/paligemma/processing_paligemma.js +82 -0
  26. package/src/models/phi3_v/image_processing_phi3_v.js +163 -0
  27. package/src/models/phi3_v/processing_phi3_v.js +53 -0
  28. package/src/models/processors.js +3 -0
  29. package/src/models/pyannote/feature_extraction_pyannote.js +56 -0
  30. package/src/models/pyannote/processing_pyannote.js +7 -54
  31. package/src/models.js +233 -35
  32. package/src/ops/registry.js +11 -0
  33. package/src/pipelines.js +30 -0
  34. package/src/tokenizers.js +12 -1
  35. package/src/utils/core.js +39 -9
  36. package/src/utils/hub.js +8 -12
  37. package/src/utils/image.js +40 -0
  38. package/src/utils/tensor.js +51 -1
  39. package/types/backends/onnx.d.ts +2 -2
  40. package/types/backends/onnx.d.ts.map +1 -1
  41. package/types/base/feature_extraction_utils.d.ts +1 -1
  42. package/types/base/feature_extraction_utils.d.ts.map +1 -1
  43. package/types/base/image_processors_utils.d.ts +4 -4
  44. package/types/base/image_processors_utils.d.ts.map +1 -1
  45. package/types/base/processing_utils.d.ts +4 -4
  46. package/types/base/processing_utils.d.ts.map +1 -1
  47. package/types/configs.d.ts +7 -7
  48. package/types/configs.d.ts.map +1 -1
  49. package/types/env.d.ts +1 -1
  50. package/types/env.d.ts.map +1 -1
  51. package/types/generation/configuration_utils.d.ts +2 -2
  52. package/types/generation/logits_process.d.ts +2 -2
  53. package/types/generation/logits_process.d.ts.map +1 -1
  54. package/types/generation/logits_sampler.d.ts.map +1 -1
  55. package/types/generation/parameters.d.ts +5 -5
  56. package/types/generation/stopping_criteria.d.ts +1 -1
  57. package/types/generation/stopping_criteria.d.ts.map +1 -1
  58. package/types/generation/streamers.d.ts +2 -2
  59. package/types/generation/streamers.d.ts.map +1 -1
  60. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +1 -1
  61. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -1
  62. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
  63. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  64. package/types/models/auto/processing_auto.d.ts +1 -1
  65. package/types/models/auto/processing_auto.d.ts.map +1 -1
  66. package/types/models/clap/feature_extraction_clap.d.ts +1 -1
  67. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
  68. package/types/models/detr/image_processing_detr.d.ts +11 -11
  69. package/types/models/detr/image_processing_detr.d.ts.map +1 -1
  70. package/types/models/donut/image_processing_donut.d.ts +1 -1
  71. package/types/models/donut/image_processing_donut.d.ts.map +1 -1
  72. package/types/models/feature_extractors.d.ts +1 -0
  73. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  74. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
  75. package/types/models/idefics3/processing_idefics3.d.ts.map +1 -1
  76. package/types/models/image_processors.d.ts +1 -0
  77. package/types/models/janus/image_processing_janus.d.ts +1 -1
  78. package/types/models/janus/image_processing_janus.d.ts.map +1 -1
  79. package/types/models/janus/processing_janus.d.ts.map +1 -1
  80. package/types/models/maskformer/image_processing_maskformer.d.ts +8 -8
  81. package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -1
  82. package/types/models/mgp_str/processing_mgp_str.d.ts +2 -2
  83. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
  84. package/types/models/moonshine/feature_extraction_moonshine.d.ts +13 -0
  85. package/types/models/moonshine/feature_extraction_moonshine.d.ts.map +1 -0
  86. package/types/models/moonshine/processing_moonshine.d.ts +17 -0
  87. package/types/models/moonshine/processing_moonshine.d.ts.map +1 -0
  88. package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -1
  89. package/types/models/paligemma/processing_paligemma.d.ts +12 -0
  90. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -0
  91. package/types/models/phi3_v/image_processing_phi3_v.d.ts +17 -0
  92. package/types/models/phi3_v/image_processing_phi3_v.d.ts.map +1 -0
  93. package/types/models/phi3_v/processing_phi3_v.d.ts +17 -0
  94. package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -0
  95. package/types/models/processors.d.ts +3 -0
  96. package/types/models/pyannote/feature_extraction_pyannote.d.ts +18 -0
  97. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  98. package/types/models/pyannote/processing_pyannote.d.ts +4 -15
  99. package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
  100. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  101. package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -1
  102. package/types/models/sam/image_processing_sam.d.ts.map +1 -1
  103. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +1 -1
  104. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -1
  105. package/types/models/segformer/image_processing_segformer.d.ts.map +1 -1
  106. package/types/models/speecht5/processing_speecht5.d.ts.map +1 -1
  107. package/types/models/swin2sr/image_processing_swin2sr.d.ts +1 -1
  108. package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -1
  109. package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -1
  110. package/types/models/vitpose/image_processing_vitpose.d.ts +1 -1
  111. package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -1
  112. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -1
  113. package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -1
  114. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +1 -1
  115. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -1
  116. package/types/models/whisper/feature_extraction_whisper.d.ts +1 -1
  117. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  118. package/types/models/whisper/generation_whisper.d.ts.map +1 -1
  119. package/types/models/whisper/processing_whisper.d.ts.map +1 -1
  120. package/types/models/yolos/image_processing_yolos.d.ts.map +1 -1
  121. package/types/models.d.ts +61 -5
  122. package/types/models.d.ts.map +1 -1
  123. package/types/ops/registry.d.ts +1 -0
  124. package/types/ops/registry.d.ts.map +1 -1
  125. package/types/pipelines.d.ts +31 -51
  126. package/types/pipelines.d.ts.map +1 -1
  127. package/types/tokenizers.d.ts +10 -6
  128. package/types/tokenizers.d.ts.map +1 -1
  129. package/types/utils/audio.d.ts.map +1 -1
  130. package/types/utils/constants.d.ts.map +1 -1
  131. package/types/utils/core.d.ts +87 -22
  132. package/types/utils/core.d.ts.map +1 -1
  133. package/types/utils/data-structures.d.ts.map +1 -1
  134. package/types/utils/devices.d.ts.map +1 -1
  135. package/types/utils/dtypes.d.ts.map +1 -1
  136. package/types/utils/generic.d.ts.map +1 -1
  137. package/types/utils/hub.d.ts +3 -3
  138. package/types/utils/hub.d.ts.map +1 -1
  139. package/types/utils/image.d.ts +10 -1
  140. package/types/utils/image.d.ts.map +1 -1
  141. package/types/utils/maths.d.ts +10 -10
  142. package/types/utils/maths.d.ts.map +1 -1
  143. package/types/utils/tensor.d.ts +22 -6
  144. package/types/utils/tensor.d.ts.map +1 -1
@@ -1,9 +1,8 @@
1
1
  import { Processor } from '../../base/processing_utils.js';
2
- import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
3
- import { max, softmax } from '../../utils/maths.js';
2
+ import { PyAnnoteFeatureExtractor } from './feature_extraction_pyannote.js';
4
3
 
5
4
  export class PyAnnoteProcessor extends Processor {
6
- static feature_extractor_class = AutoFeatureExtractor
5
+ static feature_extractor_class = PyAnnoteFeatureExtractor
7
6
 
8
7
  /**
9
8
  * Calls the feature_extractor function with the given audio input.
@@ -14,58 +13,12 @@ export class PyAnnoteProcessor extends Processor {
14
13
  return await this.feature_extractor(audio)
15
14
  }
16
15
 
17
- /**
18
- * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
19
- * @param {number} samples The number of frames in the audio.
20
- * @returns {number} The number of frames in the audio.
21
- */
22
- samples_to_frames(samples) {
23
- return ((samples - this.config.offset) / this.config.step);
16
+ /** @type {PyAnnoteFeatureExtractor['post_process_speaker_diarization']} */
17
+ post_process_speaker_diarization(...args) {
18
+ return /** @type {PyAnnoteFeatureExtractor} */(this.feature_extractor).post_process_speaker_diarization(...args);
24
19
  }
25
20
 
26
- /**
27
- * Post-processes the speaker diarization logits output by the model.
28
- * @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
29
- * @param {number} num_samples Number of samples in the input audio.
30
- * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
31
- */
32
- post_process_speaker_diarization(logits, num_samples) {
33
- const ratio = (
34
- num_samples / this.samples_to_frames(num_samples)
35
- ) / this.config.sampling_rate;
36
-
37
- const results = [];
38
- for (const scores of logits.tolist()) {
39
- const accumulated_segments = [];
40
-
41
- let current_speaker = -1;
42
- for (let i = 0; i < scores.length; ++i) {
43
- const probabilities = softmax(scores[i]);
44
- const [score, id] = max(probabilities);
45
- const [start, end] = [i, i + 1];
46
-
47
- if (id !== current_speaker) {
48
- // Speaker has changed
49
- current_speaker = id;
50
- accumulated_segments.push({ id, start, end, score });
51
- } else {
52
- // Continue the current segment
53
- accumulated_segments.at(-1).end = end;
54
- accumulated_segments.at(-1).score += score;
55
- }
56
- }
57
-
58
- results.push(accumulated_segments.map(
59
- // Convert frame-space to time-space
60
- // and compute the confidence
61
- ({ id, start, end, score }) => ({
62
- id,
63
- start: start * ratio,
64
- end: end * ratio,
65
- confidence: score / (end - start),
66
- })
67
- ));
68
- }
69
- return results;
21
+ get sampling_rate() {
22
+ return this.feature_extractor.config.sampling_rate;
70
23
  }
71
24
  }
package/src/models.js CHANGED
@@ -131,6 +131,7 @@ const MODEL_TYPES = {
131
131
  ImageTextToText: 6,
132
132
  Musicgen: 7,
133
133
  MultiModality: 8,
134
+ Phi3V: 9,
134
135
  }
135
136
  //////////////////////////////////////////////////
136
137
 
@@ -558,7 +559,9 @@ async function decoderForward(self, model_inputs, is_encoder_decoder = false) {
558
559
  new_model_inputs.use_cache_branch = boolTensor(!!past_key_values);
559
560
  }
560
561
  if (session.inputNames.includes('position_ids') && new_model_inputs.attention_mask && !new_model_inputs.position_ids) {
561
- new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values);
562
+ // NOTE: Handle a special case for paligemma models, where positions are 1-indexed
563
+ const start_index = self.config.model_type === 'paligemma' ? 1 : 0;
564
+ new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values, start_index);
562
565
  }
563
566
 
564
567
  // Unpack the `past_key_values` object into model inputs
@@ -694,14 +697,14 @@ async function imageTextToTextForward(self, {
694
697
  * @param {Tensor} attention_mask
695
698
  * @returns {{data: BigInt64Array, dims: number[]}}
696
699
  */
697
- function cumsum_masked_fill(attention_mask) {
700
+ function cumsum_masked_fill(attention_mask, start_index = 0) {
698
701
  const [bz, seq_len] = attention_mask.dims;
699
702
  const attn_mask_data = attention_mask.data;
700
703
 
701
704
  const data = new BigInt64Array(attn_mask_data.length);
702
705
  for (let i = 0; i < bz; ++i) {
703
706
  const start = i * seq_len;
704
- let sum = BigInt(0);
707
+ let sum = BigInt(start_index);
705
708
  for (let j = 0; j < seq_len; ++j) {
706
709
  const index = start + j;
707
710
  if (attn_mask_data[index] === 0n) {
@@ -728,10 +731,10 @@ function cumsum_masked_fill(attention_mask) {
728
731
  * position_ids = position_ids[:, -input_ids.shape[1] :]
729
732
  * ```
730
733
  */
731
- function createPositionIds(model_inputs, past_key_values = null) {
734
+ function createPositionIds(model_inputs, past_key_values = null, start_index = 0) {
732
735
  const { input_ids, inputs_embeds, attention_mask } = model_inputs;
733
736
 
734
- const { data, dims } = cumsum_masked_fill(attention_mask);
737
+ const { data, dims } = cumsum_masked_fill(attention_mask, start_index);
735
738
  let position_ids = new Tensor('int64', data, dims);
736
739
  if (past_key_values) {
737
740
  const offset = -(input_ids ?? inputs_embeds).dims.at(1);
@@ -904,6 +907,10 @@ export class PreTrainedModel extends Callable {
904
907
  this._forward = imageTextToTextForward;
905
908
  this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
906
909
  break;
910
+ case MODEL_TYPES.Phi3V:
911
+ this.can_generate = true;
912
+ this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
913
+ break;
907
914
 
908
915
  case MODEL_TYPES.MultiModality:
909
916
  this.can_generate = true;
@@ -1068,6 +1075,18 @@ export class PreTrainedModel extends Callable {
1068
1075
  }, options),
1069
1076
  ]);
1070
1077
 
1078
+ } else if (modelType === MODEL_TYPES.Phi3V) {
1079
+ info = await Promise.all([
1080
+ constructSessions(pretrained_model_name_or_path, {
1081
+ prepare_inputs_embeds: 'prepare_inputs_embeds',
1082
+ model: 'model',
1083
+ vision_encoder: 'vision_encoder',
1084
+ }, options),
1085
+ getOptionalConfigs(pretrained_model_name_or_path, {
1086
+ generation_config: 'generation_config.json',
1087
+ }, options),
1088
+ ]);
1089
+
1071
1090
  } else { // should be MODEL_TYPES.EncoderOnly
1072
1091
  if (modelType !== MODEL_TYPES.EncoderOnly) {
1073
1092
  const type = modelName ?? config?.model_type;
@@ -3340,6 +3359,29 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
3340
3359
  }
3341
3360
  //////////////////////////////////////////////////
3342
3361
 
3362
+
3363
+ //////////////////////////////////////////////////
3364
+ // Moonshine models
3365
+ export class MoonshinePreTrainedModel extends PreTrainedModel {
3366
+
3367
+ requires_attention_mask = false;
3368
+ main_input_name = 'input_values';
3369
+ forward_params = [
3370
+ 'input_values',
3371
+ 'decoder_input_ids',
3372
+ 'past_key_values',
3373
+ ];
3374
+ };
3375
+
3376
+ /**
3377
+ * MoonshineModel class for training Moonshine models without a language model head.
3378
+ */
3379
+ export class MoonshineModel extends MoonshinePreTrainedModel { }
3380
+
3381
+ export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel { }
3382
+ //////////////////////////////////////////////////
3383
+
3384
+
3343
3385
  //////////////////////////////////////////////////
3344
3386
  /**
3345
3387
  * Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks
@@ -3548,6 +3590,30 @@ export class Florence2ForConditionalGeneration extends Florence2PreTrainedModel
3548
3590
  }
3549
3591
  }
3550
3592
 
3593
+ export class PaliGemmaPreTrainedModel extends PreTrainedModel {
3594
+ forward_params = [
3595
+ 'input_ids',
3596
+ // 'inputs_embeds',
3597
+ 'attention_mask',
3598
+ 'pixel_values',
3599
+ 'position_ids',
3600
+ 'past_key_values',
3601
+ ];
3602
+ }
3603
+
3604
+ export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel {
3605
+ _merge_input_ids_with_image_features(kwargs) {
3606
+ const vision_hidden_size = kwargs.image_features.dims.at(-1);
3607
+ const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
3608
+
3609
+ return default_merge_input_ids_with_image_features({
3610
+ // @ts-ignore
3611
+ image_token_id: this.config.image_token_index,
3612
+ ...kwargs,
3613
+ image_features: reshaped_image_hidden_states,
3614
+ })
3615
+ }
3616
+ }
3551
3617
 
3552
3618
  //////////////////////////////////////////////////
3553
3619
  // Idefics3 Models
@@ -3586,6 +3652,77 @@ export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
3586
3652
  }
3587
3653
  //////////////////////////////////////////////////
3588
3654
 
3655
+ export class Phi3VPreTrainedModel extends PreTrainedModel {
3656
+ forward_params = [
3657
+ 'input_ids',
3658
+ 'inputs_embeds',
3659
+ 'attention_mask',
3660
+ 'position_ids',
3661
+ 'pixel_values',
3662
+ 'image_sizes',
3663
+ 'past_key_values',
3664
+ ];
3665
+ }
3666
+ export class Phi3VForCausalLM extends Phi3VPreTrainedModel {
3667
+
3668
+ async forward({
3669
+ // Produced by the tokenizer/processor:
3670
+ input_ids = null,
3671
+ attention_mask = null,
3672
+ pixel_values = null,
3673
+ image_sizes = null,
3674
+
3675
+ // Used during generation:
3676
+ position_ids = null,
3677
+ inputs_embeds = null,
3678
+ past_key_values = null,
3679
+
3680
+ // Generic generation parameters
3681
+ generation_config = null,
3682
+ logits_processor = null,
3683
+
3684
+ // TODO: needed?
3685
+ ...kwargs
3686
+ }) {
3687
+ if (!inputs_embeds) {
3688
+ let image_features;
3689
+ if (pixel_values && input_ids.dims[1] !== 1) {
3690
+ if (!image_sizes) {
3691
+ throw new Error('`image_sizes` must be provided when `pixel_values` is provided.');
3692
+ }
3693
+
3694
+ // Encode the image
3695
+ ({ image_features } = await sessionRun(this.sessions['vision_encoder'], {
3696
+ pixel_values,
3697
+ image_sizes,
3698
+ }));
3699
+ } else {
3700
+ const hidden_size = this.config.normalized_config.hidden_size;
3701
+ image_features = new Tensor(
3702
+ 'float32',
3703
+ [],
3704
+ [0, hidden_size],
3705
+ );
3706
+ }
3707
+
3708
+ ({ inputs_embeds } = await sessionRun(this.sessions['prepare_inputs_embeds'], {
3709
+ input_ids,
3710
+ image_features,
3711
+ }));
3712
+ }
3713
+
3714
+ const outputs = await decoderForward(this, {
3715
+ inputs_embeds,
3716
+ past_key_values,
3717
+ attention_mask,
3718
+ position_ids,
3719
+ generation_config,
3720
+ logits_processor,
3721
+ }, false);
3722
+ return outputs;
3723
+ }
3724
+ }
3725
+
3589
3726
  //////////////////////////////////////////////////
3590
3727
  export class CLIPPreTrainedModel extends PreTrainedModel { }
3591
3728
 
@@ -3640,9 +3777,11 @@ export class CLIPModel extends CLIPPreTrainedModel { }
3640
3777
  export class CLIPTextModel extends CLIPPreTrainedModel {
3641
3778
  /** @type {typeof PreTrainedModel.from_pretrained} */
3642
3779
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3643
- // Update default model file name if not provided
3644
- options.model_file_name ??= 'text_model';
3645
- return super.from_pretrained(pretrained_model_name_or_path, options);
3780
+ return super.from_pretrained(pretrained_model_name_or_path, {
3781
+ // Update default model file name if not provided
3782
+ model_file_name: 'text_model',
3783
+ ...options,
3784
+ });
3646
3785
  }
3647
3786
  }
3648
3787
 
@@ -3675,9 +3814,11 @@ export class CLIPTextModel extends CLIPPreTrainedModel {
3675
3814
  export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
3676
3815
  /** @type {typeof PreTrainedModel.from_pretrained} */
3677
3816
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3678
- // Update default model file name if not provided
3679
- options.model_file_name ??= 'text_model';
3680
- return super.from_pretrained(pretrained_model_name_or_path, options);
3817
+ return super.from_pretrained(pretrained_model_name_or_path, {
3818
+ // Update default model file name if not provided
3819
+ model_file_name: 'text_model',
3820
+ ...options,
3821
+ });
3681
3822
  }
3682
3823
  }
3683
3824
 
@@ -3687,9 +3828,11 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
3687
3828
  export class CLIPVisionModel extends CLIPPreTrainedModel {
3688
3829
  /** @type {typeof PreTrainedModel.from_pretrained} */
3689
3830
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3690
- // Update default model file name if not provided
3691
- options.model_file_name ??= 'vision_model';
3692
- return super.from_pretrained(pretrained_model_name_or_path, options);
3831
+ return super.from_pretrained(pretrained_model_name_or_path, {
3832
+ // Update default model file name if not provided
3833
+ model_file_name: 'vision_model',
3834
+ ...options,
3835
+ });
3693
3836
  }
3694
3837
  }
3695
3838
 
@@ -3722,9 +3865,11 @@ export class CLIPVisionModel extends CLIPPreTrainedModel {
3722
3865
  export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel {
3723
3866
  /** @type {typeof PreTrainedModel.from_pretrained} */
3724
3867
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3725
- // Update default model file name if not provided
3726
- options.model_file_name ??= 'vision_model';
3727
- return super.from_pretrained(pretrained_model_name_or_path, options);
3868
+ return super.from_pretrained(pretrained_model_name_or_path, {
3869
+ // Update default model file name if not provided
3870
+ model_file_name: 'vision_model',
3871
+ ...options,
3872
+ });
3728
3873
  }
3729
3874
  }
3730
3875
  //////////////////////////////////////////////////
@@ -3808,9 +3953,11 @@ export class SiglipModel extends SiglipPreTrainedModel { }
3808
3953
  export class SiglipTextModel extends SiglipPreTrainedModel {
3809
3954
  /** @type {typeof PreTrainedModel.from_pretrained} */
3810
3955
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3811
- // Update default model file name if not provided
3812
- options.model_file_name ??= 'text_model';
3813
- return super.from_pretrained(pretrained_model_name_or_path, options);
3956
+ return super.from_pretrained(pretrained_model_name_or_path, {
3957
+ // Update default model file name if not provided
3958
+ model_file_name: 'text_model',
3959
+ ...options,
3960
+ });
3814
3961
  }
3815
3962
  }
3816
3963
 
@@ -3843,9 +3990,11 @@ export class SiglipTextModel extends SiglipPreTrainedModel {
3843
3990
  export class SiglipVisionModel extends CLIPPreTrainedModel {
3844
3991
  /** @type {typeof PreTrainedModel.from_pretrained} */
3845
3992
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3846
- // Update default model file name if not provided
3847
- options.model_file_name ??= 'vision_model';
3848
- return super.from_pretrained(pretrained_model_name_or_path, options);
3993
+ return super.from_pretrained(pretrained_model_name_or_path, {
3994
+ // Update default model file name if not provided
3995
+ model_file_name: 'vision_model',
3996
+ ...options,
3997
+ });
3849
3998
  }
3850
3999
  }
3851
4000
  //////////////////////////////////////////////////
@@ -3900,18 +4049,22 @@ export class JinaCLIPModel extends JinaCLIPPreTrainedModel {
3900
4049
  export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel {
3901
4050
  /** @type {typeof PreTrainedModel.from_pretrained} */
3902
4051
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3903
- // Update default model file name if not provided
3904
- options.model_file_name ??= 'text_model';
3905
- return super.from_pretrained(pretrained_model_name_or_path, options);
4052
+ return super.from_pretrained(pretrained_model_name_or_path, {
4053
+ // Update default model file name if not provided
4054
+ model_file_name: 'text_model',
4055
+ ...options,
4056
+ });
3906
4057
  }
3907
4058
  }
3908
4059
 
3909
4060
  export class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel {
3910
4061
  /** @type {typeof PreTrainedModel.from_pretrained} */
3911
4062
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3912
- // Update default model file name if not provided
3913
- options.model_file_name ??= 'vision_model';
3914
- return super.from_pretrained(pretrained_model_name_or_path, options);
4063
+ return super.from_pretrained(pretrained_model_name_or_path, {
4064
+ // Update default model file name if not provided
4065
+ model_file_name: 'vision_model',
4066
+ ...options,
4067
+ });
3915
4068
  }
3916
4069
  }
3917
4070
  //////////////////////////////////////////////////
@@ -4071,6 +4224,14 @@ export class LlamaForCausalLM extends LlamaPreTrainedModel { }
4071
4224
  //////////////////////////////////////////////////
4072
4225
 
4073
4226
 
4227
+ //////////////////////////////////////////////////
4228
+ // EXAONE models
4229
+ export class ExaonePreTrainedModel extends PreTrainedModel { }
4230
+ export class ExaoneModel extends ExaonePreTrainedModel { }
4231
+ export class ExaoneForCausalLM extends ExaonePreTrainedModel { }
4232
+ //////////////////////////////////////////////////
4233
+
4234
+
4074
4235
  //////////////////////////////////////////////////
4075
4236
  // MobileLLM models
4076
4237
  export class MobileLLMPreTrainedModel extends PreTrainedModel { }
@@ -4086,6 +4247,13 @@ export class OlmoModel extends OlmoPreTrainedModel { }
4086
4247
  export class OlmoForCausalLM extends OlmoPreTrainedModel { }
4087
4248
  //////////////////////////////////////////////////
4088
4249
 
4250
+ //////////////////////////////////////////////////
4251
+ // OLMo2 models
4252
+ export class Olmo2PreTrainedModel extends PreTrainedModel { }
4253
+ export class Olmo2Model extends Olmo2PreTrainedModel { }
4254
+ export class Olmo2ForCausalLM extends Olmo2PreTrainedModel { }
4255
+ //////////////////////////////////////////////////
4256
+
4089
4257
 
4090
4258
  //////////////////////////////////////////////////
4091
4259
  // Granite models
@@ -4502,6 +4670,20 @@ export class ViTForImageClassification extends ViTPreTrainedModel {
4502
4670
  //////////////////////////////////////////////////
4503
4671
 
4504
4672
 
4673
+ //////////////////////////////////////////////////
4674
+ export class IJepaPreTrainedModel extends PreTrainedModel { }
4675
+ export class IJepaModel extends IJepaPreTrainedModel { }
4676
+ export class IJepaForImageClassification extends IJepaPreTrainedModel {
4677
+ /**
4678
+ * @param {any} model_inputs
4679
+ */
4680
+ async _call(model_inputs) {
4681
+ return new SequenceClassifierOutput(await super._call(model_inputs));
4682
+ }
4683
+ }
4684
+ //////////////////////////////////////////////////
4685
+
4686
+
4505
4687
  //////////////////////////////////////////////////
4506
4688
  export class VitPosePreTrainedModel extends PreTrainedModel { }
4507
4689
 
@@ -6112,9 +6294,11 @@ export class ClapModel extends ClapPreTrainedModel { }
6112
6294
  export class ClapTextModelWithProjection extends ClapPreTrainedModel {
6113
6295
  /** @type {typeof PreTrainedModel.from_pretrained} */
6114
6296
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
6115
- // Update default model file name if not provided
6116
- options.model_file_name ??= 'text_model';
6117
- return super.from_pretrained(pretrained_model_name_or_path, options);
6297
+ return super.from_pretrained(pretrained_model_name_or_path, {
6298
+ // Update default model file name if not provided
6299
+ model_file_name: 'text_model',
6300
+ ...options,
6301
+ });
6118
6302
  }
6119
6303
  }
6120
6304
 
@@ -6147,9 +6331,11 @@ export class ClapTextModelWithProjection extends ClapPreTrainedModel {
6147
6331
  export class ClapAudioModelWithProjection extends ClapPreTrainedModel {
6148
6332
  /** @type {typeof PreTrainedModel.from_pretrained} */
6149
6333
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
6150
- // Update default model file name if not provided
6151
- options.model_file_name ??= 'audio_model';
6152
- return super.from_pretrained(pretrained_model_name_or_path, options);
6334
+ return super.from_pretrained(pretrained_model_name_or_path, {
6335
+ // Update default model file name if not provided
6336
+ model_file_name: 'audio_model',
6337
+ ...options,
6338
+ });
6153
6339
  }
6154
6340
  }
6155
6341
  //////////////////////////////////////////////////
@@ -6772,6 +6958,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
6772
6958
  ['rt_detr', ['RTDetrModel', RTDetrModel]],
6773
6959
  ['table-transformer', ['TableTransformerModel', TableTransformerModel]],
6774
6960
  ['vit', ['ViTModel', ViTModel]],
6961
+ ['ijepa', ['IJepaModel', IJepaModel]],
6775
6962
  ['pvt', ['PvtModel', PvtModel]],
6776
6963
  ['vit_msn', ['ViTMSNModel', ViTMSNModel]],
6777
6964
  ['vit_mae', ['ViTMAEModel', ViTMAEModel]],
@@ -6835,7 +7022,9 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
6835
7022
  ['gpt_neox', ['GPTNeoXModel', GPTNeoXModel]],
6836
7023
  ['codegen', ['CodeGenModel', CodeGenModel]],
6837
7024
  ['llama', ['LlamaModel', LlamaModel]],
7025
+ ['exaone', ['ExaoneModel', ExaoneModel]],
6838
7026
  ['olmo', ['OlmoModel', OlmoModel]],
7027
+ ['olmo2', ['Olmo2Model', Olmo2Model]],
6839
7028
  ['mobilellm', ['MobileLLMModel', MobileLLMModel]],
6840
7029
  ['granite', ['GraniteModel', GraniteModel]],
6841
7030
  ['cohere', ['CohereModel', CohereModel]],
@@ -6856,6 +7045,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
6856
7045
  const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([
6857
7046
  ['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]],
6858
7047
  ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]],
7048
+ ['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]],
6859
7049
  ]);
6860
7050
 
6861
7051
  const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([
@@ -6926,7 +7116,9 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
6926
7116
  ['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]],
6927
7117
  ['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]],
6928
7118
  ['llama', ['LlamaForCausalLM', LlamaForCausalLM]],
7119
+ ['exaone', ['ExaoneForCausalLM', ExaoneForCausalLM]],
6929
7120
  ['olmo', ['OlmoForCausalLM', OlmoForCausalLM]],
7121
+ ['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]],
6930
7122
  ['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]],
6931
7123
  ['granite', ['GraniteForCausalLM', GraniteForCausalLM]],
6932
7124
  ['cohere', ['CohereForCausalLM', CohereForCausalLM]],
@@ -6944,6 +7136,9 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
6944
7136
  ['falcon', ['FalconForCausalLM', FalconForCausalLM]],
6945
7137
  ['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]],
6946
7138
  ['stablelm', ['StableLmForCausalLM', StableLmForCausalLM]],
7139
+
7140
+ // Also image-text-to-text
7141
+ ['phi3_v', ['Phi3VForCausalLM', Phi3VForCausalLM]],
6947
7142
  ]);
6948
7143
 
6949
7144
  const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([
@@ -7000,6 +7195,7 @@ const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
7000
7195
  ['florence2', ['Florence2ForConditionalGeneration', Florence2ForConditionalGeneration]],
7001
7196
  ['qwen2-vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]],
7002
7197
  ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
7198
+ ['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]],
7003
7199
  ]);
7004
7200
 
7005
7201
  const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
@@ -7008,6 +7204,7 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
7008
7204
 
7009
7205
  const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
7010
7206
  ['vit', ['ViTForImageClassification', ViTForImageClassification]],
7207
+ ['ijepa', ['IJepaForImageClassification', IJepaForImageClassification]],
7011
7208
  ['pvt', ['PvtForImageClassification', PvtForImageClassification]],
7012
7209
  ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]],
7013
7210
  ['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]],
@@ -7179,6 +7376,7 @@ const CUSTOM_MAPPING = [
7179
7376
  // OVERRIDE:
7180
7377
  // TODO: Refactor to allow class to specify model
7181
7378
  ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration, MODEL_TYPES.Musicgen],
7379
+ ['Phi3VForCausalLM', Phi3VForCausalLM, MODEL_TYPES.Phi3V],
7182
7380
 
7183
7381
  ['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly],
7184
7382
  ['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly],
@@ -100,4 +100,15 @@ export class TensorOpRegistry {
100
100
  }
101
101
  return this._top_k;
102
102
  }
103
+
104
+ static get slice() {
105
+ if (!this._slice) {
106
+ this._slice = wrap(
107
+ [8, 7, 18, 0, 58, 96, 10, 25, 10, 1, 120, 10, 1, 115, 10, 1, 101, 10, 1, 97, 10, 1, 116, 18, 1, 121, 34, 5, 83, 108, 105, 99, 101, 18, 1, 114, 90, 9, 10, 1, 120, 18, 4, 10, 2, 8, 1, 90, 9, 10, 1, 115, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 101, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 97, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 116, 18, 4, 10, 2, 8, 7, 98, 9, 10, 1, 121, 18, 4, 10, 2, 8, 1, 66, 2, 16, 13],
108
+ this.session_options,
109
+ 'y',
110
+ )
111
+ }
112
+ return this._slice;
113
+ }
103
114
  }
package/src/pipelines.js CHANGED
@@ -1729,6 +1729,8 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
1729
1729
  case 'unispeech-sat':
1730
1730
  case 'hubert':
1731
1731
  return this._call_wav2vec2(audio, kwargs)
1732
+ case 'moonshine':
1733
+ return this._call_moonshine(audio, kwargs)
1732
1734
  default:
1733
1735
  throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)
1734
1736
  }
@@ -1882,6 +1884,34 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
1882
1884
  }
1883
1885
  return single ? toReturn[0] : toReturn;
1884
1886
  }
1887
+
1888
+ /**
1889
+ * @type {AutomaticSpeechRecognitionPipelineCallback}
1890
+ * @private
1891
+ */
1892
+ async _call_moonshine(audio, kwargs) {
1893
+ const single = !Array.isArray(audio);
1894
+ if (single) {
1895
+ audio = [/** @type {AudioInput} */ (audio)];
1896
+ }
1897
+ const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
1898
+ const preparedAudios = await prepareAudios(audio, sampling_rate);
1899
+ const toReturn = [];
1900
+ for (const aud of preparedAudios) {
1901
+ const inputs = await this.processor(aud);
1902
+
1903
+ // According to the [paper](https://arxiv.org/pdf/2410.15608):
1904
+ // "We use greedy decoding, with a heuristic limit of 6 output tokens
1905
+ // per second of audio to avoid repeated output sequences."
1906
+ const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
1907
+ const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
1908
+
1909
+ const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
1910
+ toReturn.push({ text });
1911
+ }
1912
+ return single ? toReturn[0] : toReturn;
1913
+ }
1914
+
1885
1915
  }
1886
1916
 
1887
1917
  /**
package/src/tokenizers.js CHANGED
@@ -2605,6 +2605,12 @@ export class PreTrainedTokenizer extends Callable {
2605
2605
  this.unk_token = this.getToken('unk_token');
2606
2606
  this.unk_token_id = this.model.tokens_to_ids.get(this.unk_token);
2607
2607
 
2608
+ this.bos_token = this.getToken('bos_token');
2609
+ this.bos_token_id = this.model.tokens_to_ids.get(this.bos_token);
2610
+
2611
+ this.eos_token = this.getToken('eos_token');
2612
+ this.eos_token_id = this.model.tokens_to_ids.get(this.eos_token);
2613
+
2608
2614
  this.model_max_length = tokenizerConfig.model_max_length;
2609
2615
 
2610
2616
  /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */
@@ -3577,6 +3583,11 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
3577
3583
  let chunk = new_chunk();
3578
3584
  let time_offset = 0.0;
3579
3585
  const timestamp_begin = this.timestamp_begin;
3586
+ // Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments.
3587
+ // We can calculate the last time stamp token as timestamp_begin plus the number of tokens
3588
+ // tokens from 0.00 to 30.00 which is 1500.
3589
+ const total_timestamp_tokens = 1500; // (30.00 - 0.00) / 0.02
3590
+ const timestamp_end = timestamp_begin + total_timestamp_tokens;
3580
3591
 
3581
3592
  let previous_tokens = [];
3582
3593
  let previous_token_timestamps = [];
@@ -3664,7 +3675,7 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
3664
3675
  } else {
3665
3676
  // 2/ This is a regular special token, ignoring it
3666
3677
  }
3667
- } else if (token >= timestamp_begin) {
3678
+ } else if (token >= timestamp_begin && token <= timestamp_end) {
3668
3679
  // 3/ Timestamp token
3669
3680
  const time = (token - timestamp_begin) * time_precision + time_offset;
3670
3681
  const rounded_time = round(time, 2);