npm - @huggingface/transformers - Versions diffs - 3.1.0 → 3.1.2 - Mend

@huggingface/transformers 3.1.0 → 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

package/README.md +7 -3
package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
package/dist/transformers.cjs +965 -195
package/dist/transformers.cjs.map +1 -1
package/dist/transformers.js +2251 -1360
package/dist/transformers.js.map +1 -1
package/dist/transformers.min.cjs +1 -352
package/dist/transformers.min.cjs.map +1 -1
package/dist/transformers.min.js +1 -415
package/dist/transformers.min.js.map +1 -1
package/dist/transformers.min.mjs +1 -352
package/dist/transformers.min.mjs.map +1 -1
package/dist/transformers.mjs +979 -194
package/dist/transformers.mjs.map +1 -1
package/package.json +11 -16
package/src/backends/onnx.js +2 -7
package/src/configs.js +3 -1
package/src/env.js +6 -6
package/src/generation/configuration_utils.js +7 -0
package/src/generation/logits_process.js +22 -16
package/src/generation/streamers.js +7 -2
package/src/models/idefics3/image_processing_idefics3.js +219 -0
package/src/models/idefics3/processing_idefics3.js +136 -0
package/src/models/image_processors.js +1 -0
package/src/models/paligemma/processing_paligemma.js +82 -0
package/src/models/processors.js +2 -0
package/src/models.js +169 -39
package/src/tokenizers.js +12 -1
package/src/utils/core.js +53 -9
package/src/utils/dtypes.js +2 -1
package/src/utils/hub.js +8 -12
package/src/utils/image.js +59 -16
package/src/utils/tensor.js +6 -1
package/types/backends/onnx.d.ts +2 -2
package/types/backends/onnx.d.ts.map +1 -1
package/types/base/feature_extraction_utils.d.ts +1 -1
package/types/base/feature_extraction_utils.d.ts.map +1 -1
package/types/base/image_processors_utils.d.ts +2 -2
package/types/base/image_processors_utils.d.ts.map +1 -1
package/types/base/processing_utils.d.ts +4 -4
package/types/base/processing_utils.d.ts.map +1 -1
package/types/configs.d.ts +7 -7
package/types/configs.d.ts.map +1 -1
package/types/env.d.ts +2 -2
package/types/env.d.ts.map +1 -1
package/types/generation/configuration_utils.d.ts +7 -1
package/types/generation/configuration_utils.d.ts.map +1 -1
package/types/generation/logits_process.d.ts +32 -22
package/types/generation/logits_process.d.ts.map +1 -1
package/types/generation/logits_sampler.d.ts.map +1 -1
package/types/generation/parameters.d.ts +5 -5
package/types/generation/stopping_criteria.d.ts +1 -1
package/types/generation/stopping_criteria.d.ts.map +1 -1
package/types/generation/streamers.d.ts +15 -10
package/types/generation/streamers.d.ts.map +1 -1
package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +1 -1
package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -1
package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
package/types/models/auto/image_processing_auto.d.ts.map +1 -1
package/types/models/auto/processing_auto.d.ts +1 -1
package/types/models/auto/processing_auto.d.ts.map +1 -1
package/types/models/clap/feature_extraction_clap.d.ts +1 -1
package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
package/types/models/detr/image_processing_detr.d.ts +11 -11
package/types/models/detr/image_processing_detr.d.ts.map +1 -1
package/types/models/donut/image_processing_donut.d.ts +1 -1
package/types/models/donut/image_processing_donut.d.ts.map +1 -1
package/types/models/florence2/processing_florence2.d.ts.map +1 -1
package/types/models/idefics3/image_processing_idefics3.d.ts +40 -0
package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -0
package/types/models/idefics3/processing_idefics3.d.ts +19 -0
package/types/models/idefics3/processing_idefics3.d.ts.map +1 -0
package/types/models/image_processors.d.ts +1 -0
package/types/models/janus/image_processing_janus.d.ts +1 -1
package/types/models/janus/image_processing_janus.d.ts.map +1 -1
package/types/models/janus/processing_janus.d.ts.map +1 -1
package/types/models/maskformer/image_processing_maskformer.d.ts +8 -8
package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -1
package/types/models/mgp_str/processing_mgp_str.d.ts +2 -2
package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -1
package/types/models/paligemma/processing_paligemma.d.ts +12 -0
package/types/models/paligemma/processing_paligemma.d.ts.map +1 -0
package/types/models/processors.d.ts +2 -0
package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
package/types/models/pyannote/processing_pyannote.d.ts +1 -1
package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -1
package/types/models/sam/image_processing_sam.d.ts.map +1 -1
package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +1 -1
package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -1
package/types/models/segformer/image_processing_segformer.d.ts.map +1 -1
package/types/models/speecht5/processing_speecht5.d.ts.map +1 -1
package/types/models/swin2sr/image_processing_swin2sr.d.ts +1 -1
package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -1
package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -1
package/types/models/vitpose/image_processing_vitpose.d.ts +1 -1
package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -1
package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -1
package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -1
package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +1 -1
package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -1
package/types/models/whisper/feature_extraction_whisper.d.ts +1 -1
package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
package/types/models/whisper/generation_whisper.d.ts.map +1 -1
package/types/models/whisper/processing_whisper.d.ts.map +1 -1
package/types/models/yolos/image_processing_yolos.d.ts.map +1 -1
package/types/models.d.ts +44 -10
package/types/models.d.ts.map +1 -1
package/types/ops/registry.d.ts.map +1 -1
package/types/pipelines.d.ts +26 -51
package/types/pipelines.d.ts.map +1 -1
package/types/tokenizers.d.ts +10 -6
package/types/tokenizers.d.ts.map +1 -1
package/types/utils/audio.d.ts.map +1 -1
package/types/utils/constants.d.ts.map +1 -1
package/types/utils/core.d.ts +94 -22
package/types/utils/core.d.ts.map +1 -1
package/types/utils/data-structures.d.ts.map +1 -1
package/types/utils/devices.d.ts.map +1 -1
package/types/utils/dtypes.d.ts +3 -2
package/types/utils/dtypes.d.ts.map +1 -1
package/types/utils/generic.d.ts.map +1 -1
package/types/utils/hub.d.ts +3 -3
package/types/utils/hub.d.ts.map +1 -1
package/types/utils/image.d.ts +14 -1
package/types/utils/image.d.ts.map +1 -1
package/types/utils/maths.d.ts +10 -10
package/types/utils/maths.d.ts.map +1 -1
package/types/utils/tensor.d.ts +10 -8
package/types/utils/tensor.d.ts.map +1 -1

package/src/models/paligemma/processing_paligemma.js ADDED Viewed

@@ -0,0 +1,82 @@
+import { Processor } from "../../base/processing_utils.js";
+import { AutoImageProcessor } from "../auto/image_processing_auto.js";
+import { AutoTokenizer } from "../../tokenizers.js";
+const IMAGE_TOKEN = "<image>";
+/**
+ * Build the model input string for a prompt that contains no explicit image tokens.
+ * The result is `image_token` repeated `image_seq_len * num_images` times, followed by
+ * `bos_token`, then `prompt`, then a trailing newline.
+ * @param {string} prompt The text prompt.
+ * @param {string} bos_token The beginning-of-sequence token string.
+ * @param {number} image_seq_len The number of image tokens used per image.
+ * @param {string} image_token The image placeholder token string.
+ * @param {number} num_images The number of images.
+ * @returns {string} The assembled input string.
+ */
+function build_string_from_input(
+    prompt,
+    bos_token,
+    image_seq_len,
+    image_token,
+    num_images,
+) {
+    return `${image_token.repeat(image_seq_len * num_images)}${bos_token}${prompt}\n`
+}
+/**
+ * Processor for PaliGemma: builds the text prompt(s) with expanded image tokens,
+ * tokenizes them, runs the image processor, and merges both results.
+ */
+export class PaliGemmaProcessor extends Processor {
+    static tokenizer_class = AutoTokenizer
+    static image_processor_class = AutoImageProcessor
+    static uses_processor_config = false;
+    /**
+     * @typedef {import('../../utils/image.js').RawImage} RawImage
+     */
+    /**
+     * Prepare model inputs from image(s) and optional text.
+     * @param {RawImage|RawImage[]} images A single image or an array of images.
+     * @param {string|string[]|null} [text=null] A prompt or array of prompts. A falsy value is
+     * replaced by the empty string (with a console warning).
+     * @param {Object} [kwargs={}] Options forwarded unchanged to both the tokenizer and the image processor.
+     * @returns {Promise<Object>} The image processor outputs merged with the tokenizer outputs;
+     * on a key collision the tokenizer's value wins because it is spread last.
+     */
+    // `images` is required, `text` is optional
+    async _call(/** @type {RawImage|RawImage[]} */ images, text = null, kwargs = {}) {
+        if (!text) {
+            console.warn(
+                "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."
+            )
+            text = ""
+        }
+        // Normalize both inputs to arrays so the rest of the method handles a single code path.
+        if (!Array.isArray(images)) {
+            images = [images]
+        }
+        if (!Array.isArray(text)) {
+            text = [text]
+        }
+        const bos_token = this.tokenizer.bos_token;
+        const image_seq_length = this.image_processor.config.image_seq_length;
+        let input_strings;
+        // If ANY prompt already contains the image token, every prompt is handled by this branch.
+        if (text.some((t) => t.includes(IMAGE_TOKEN))) {
+            input_strings = text.map(
+                sample => {
+                    // Expand each image token into `image_seq_length` copies, then insert the BOS token
+                    // right after the last image token (or at the very start if the sample has none),
+                    // and terminate the prompt with a newline.
+                    const expanded_sample = sample.replaceAll(IMAGE_TOKEN, IMAGE_TOKEN.repeat(image_seq_length));
+                    const bos_rfind_index = expanded_sample.lastIndexOf(IMAGE_TOKEN);
+                    const bos_index = bos_rfind_index === -1 ? 0 : bos_rfind_index + IMAGE_TOKEN.length;
+                    return expanded_sample.slice(0, bos_index) + bos_token + expanded_sample.slice(bos_index) + "\n";
+                }
+            )
+        } else {
+            console.warn(
+                "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special " +
+                "image tokens in the text, as many tokens as there are images per each text. It is recommended to " +
+                "add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images " +
+                "each text has and add special tokens."
+            )
+            // No prompt contains an image token: prefix EVERY prompt with tokens for ALL supplied images.
+            input_strings = text.map(
+                sample => build_string_from_input(
+                    sample,
+                    bos_token,
+                    image_seq_length,
+                    IMAGE_TOKEN,
+                    images.length,
+                )
+            )
+        }
+        // The tokenizer call is not awaited while the image processor call is.
+        const text_inputs = this.tokenizer(input_strings, kwargs);
+        const image_inputs = await this.image_processor(images, kwargs);
+        return {
+            ...image_inputs,
+            ...text_inputs,
+        }
+    }
+}

package/src/models/processors.js CHANGED Viewed

@@ -1,8 +1,10 @@
 export * from './florence2/processing_florence2.js';
 export * from './mgp_str/processing_mgp_str.js';
+export * from './idefics3/processing_idefics3.js';
 export * from './janus/processing_janus.js';
 export * from './jina_clip/processing_jina_clip.js';
 export * from './owlvit/processing_owlvit.js';
+export * from './paligemma/processing_paligemma.js';
 export * from './pyannote/processing_pyannote.js';
 export * from './qwen2_vl/processing_qwen2_vl.js';
 export * from './sam/processing_sam.js';

package/src/models.js CHANGED Viewed

@@ -182,6 +182,22 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
         }
     }
+    if (dtype === DATA_TYPES.auto) {
+        // Try to choose the auto dtype based on the custom config
+        let config_dtype = custom_config.dtype;
+        if (typeof config_dtype !== 'string') {
+            config_dtype = config_dtype[fileName];
+        }
+        if (config_dtype && config_dtype !== DATA_TYPES.auto && DATA_TYPES.hasOwnProperty(config_dtype)) {
+            // Defined by the custom config, and is not "auto"
+            dtype = config_dtype;
+        } else {
+            // Choose default dtype based on device, falling back to fp32
+            dtype = DEFAULT_DEVICE_DTYPE_MAPPING[selectedDevice] ?? DATA_TYPES.fp32;
+        }
+    }
     const selectedDtype = /** @type {import("./utils/dtypes.js").DataType} */(dtype);
     if (!DEFAULT_DTYPE_SUFFIX_MAPPING.hasOwnProperty(selectedDtype)) {
@@ -387,9 +403,17 @@ async function sessionRun(session, inputs) {
         output = replaceTensors(output);
         return output;
     } catch (e) {
+        // Error messages can be long (nested) and uninformative. For this reason,
+        // we apply minor formatting to show the most important information
+        const formatted = Object.fromEntries(Object.entries(checkedInputs)
+            .map(([k, { type, dims, data }]) => [k, {
+                // Extract these properties from the underlying ORT tensor
+                type, dims, data,
+            }]));
         // This usually occurs when the inputs are of the wrong type.
         console.error(`An error occurred during model execution: "${e}".`);
-        console.error('Inputs given to model:', checkedInputs)
+        console.error('Inputs given to model:', formatted);
         throw e;
     }
 }
@@ -534,7 +558,9 @@ async function decoderForward(self, model_inputs, is_encoder_decoder = false) {
         new_model_inputs.use_cache_branch = boolTensor(!!past_key_values);
     }
     if (session.inputNames.includes('position_ids') && new_model_inputs.attention_mask && !new_model_inputs.position_ids) {
-        new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values);
+        // NOTE: Handle a special case for paligemma models, where positions are 1-indexed
+        const start_index = self.config.model_type === 'paligemma' ? 1 : 0;
+        new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values, start_index);
     }
     // Unpack the `past_key_values` object into model inputs
@@ -546,6 +572,39 @@ async function decoderForward(self, model_inputs, is_encoder_decoder = false) {
 }
+/**
+ * Overwrite the embeddings located at image-token positions with the supplied image features.
+ * Image features are consumed in order: batch item by batch item, and within each item in
+ * increasing token position.
+ * @param {Object} params
+ * @param {number} params.image_token_id The id that marks an image placeholder in `input_ids`.
+ * @param {Object} params.inputs_embeds Embeddings indexed as `inputs_embeds[batch][position]`; written to and returned.
+ * @param {Object} params.image_features Image features indexed along the first dimension, one row per image token.
+ * @param {Object} params.input_ids Token ids; `tolist()` must yield one array of ids per batch item.
+ * @param {Object} params.attention_mask Returned unchanged.
+ * @returns {{inputs_embeds: Object, attention_mask: Object}} The same `inputs_embeds` and `attention_mask` objects.
+ * @throws {Error} If the number of image tokens differs from `image_features.dims[0]`.
+ */
+function default_merge_input_ids_with_image_features({
+    image_token_id,
+    inputs_embeds,
+    image_features,
+    input_ids,
+    attention_mask,
+}) {
+    // For every batch item, collect the positions whose id equals the image token id.
+    // NOTE(review): the loose `==` looks intentional — ids read from an int64 tensor would be BigInt
+    // while the config id is a Number, and `===` never matches across those types. Confirm before changing.
+    const image_tokens = input_ids.tolist().map(ids =>
+        ids.reduce((acc, x, idx) => {
+            if (x == image_token_id) acc.push(idx);
+            return acc;
+        }, [])
+    );
+    const n_image_tokens = image_tokens.reduce((acc, x) => acc + x.length, 0);
+    const n_image_features = image_features.dims[0];
+    if (n_image_tokens !== n_image_features) {
+        throw new Error(`Image features and image tokens do not match: tokens: ${n_image_tokens}, features ${n_image_features}`);
+    }
+    // Equivalent to performing a masked_scatter
+    // `img` is a running index into `image_features`, shared across the whole batch.
+    let img = 0;
+    for (let i = 0; i < image_tokens.length; ++i) {
+        const tokens = image_tokens[i];
+        const embeds = inputs_embeds[i];
+        for (let j = 0; j < tokens.length; ++j) {
+            // Presumably indexing yields a view, so this write lands in `inputs_embeds` itself — TODO confirm.
+            embeds[tokens[j]].data.set(image_features[img++].data)
+        }
+    }
+    return { inputs_embeds, attention_mask }
+}
 /**
  * Forward pass of an image-text-to-text model.
  * @param {Object} self The image-text-to-text model.
@@ -637,14 +696,14 @@ async function imageTextToTextForward(self, {
  * @param {Tensor} attention_mask
  * @returns {{data: BigInt64Array, dims: number[]}}
  */
-function cumsum_masked_fill(attention_mask) {
+function cumsum_masked_fill(attention_mask, start_index = 0) {
     const [bz, seq_len] = attention_mask.dims;
     const attn_mask_data = attention_mask.data;
     const data = new BigInt64Array(attn_mask_data.length);
     for (let i = 0; i < bz; ++i) {
         const start = i * seq_len;
-        let sum = BigInt(0);
+        let sum = BigInt(start_index);
         for (let j = 0; j < seq_len; ++j) {
             const index = start + j;
             if (attn_mask_data[index] === 0n) {
@@ -671,10 +730,10 @@ function cumsum_masked_fill(attention_mask) {
  *     position_ids = position_ids[:, -input_ids.shape[1] :]
  * ```
  */
-function createPositionIds(model_inputs, past_key_values = null) {
+function createPositionIds(model_inputs, past_key_values = null, start_index = 0) {
     const { input_ids, inputs_embeds, attention_mask } = model_inputs;
-    const { data, dims } = cumsum_masked_fill(attention_mask);
+    const { data, dims } = cumsum_masked_fill(attention_mask, start_index);
     let position_ids = new Tensor('int64', data, dims);
     if (past_key_values) {
         const offset = -(input_ids ?? inputs_embeds).dims.at(1);
@@ -1013,7 +1072,10 @@ export class PreTrainedModel extends Callable {
         } else { // should be MODEL_TYPES.EncoderOnly
             if (modelType !== MODEL_TYPES.EncoderOnly) {
-                console.warn(`Model type for '${modelName ?? config?.model_type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`)
+                const type = modelName ?? config?.model_type;
+                if (type !== 'custom') {
+                    console.warn(`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`)
+                }
             }
             info = await Promise.all([
                 constructSessions(pretrained_model_name_or_path, {
@@ -1757,7 +1819,7 @@ export class PreTrainedModel extends Callable {
             const dtype = session?.config?.kv_cache_dtype ?? 'float32';
             const empty = (dtype === 'float16') ? new Uint16Array() : [];
-            const batch_size = (decoderFeeds[this.main_input_name] ?? decoderFeeds.attention_mask).dims?.[0] ?? 1;
+            const batch_size = (decoderFeeds[this.main_input_name] ?? decoderFeeds.attention_mask)?.dims?.[0] ?? 1;
             const shapes = getKeyValueShapes(this.config, { batch_size });
             for (const name in shapes) {
@@ -3304,8 +3366,8 @@ export class VisionEncoderDecoderModel extends PreTrainedModel {
 export class LlavaPreTrainedModel extends PreTrainedModel {
     forward_params = [
         'input_ids',
-        'pixel_values',
         'attention_mask',
+        'pixel_values',
         'position_ids',
         'past_key_values',
     ];
@@ -3487,6 +3549,70 @@ export class Florence2ForConditionalGeneration extends Florence2PreTrainedModel
         return decoder_outputs;
     }
 }
+/**
+ * Base class for PaliGemma models; declares the input names listed in `forward_params`.
+ */
+export class PaliGemmaPreTrainedModel extends PreTrainedModel {
+    forward_params = [
+        'input_ids',
+        // 'inputs_embeds',
+        'attention_mask',
+        'pixel_values',
+        'position_ids',
+        'past_key_values',
+    ];
+}
+/**
+ * PaliGemma model for conditional generation (registered under `'paligemma'` in the
+ * image-text-to-text mapping).
+ */
+export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel {
+    /**
+     * Merge image features into the text embeddings at the image-token positions.
+     * @param {Object} kwargs Must contain `image_features`; every entry is forwarded to
+     * `default_merge_input_ids_with_image_features`.
+     * @returns {Object} The result of `default_merge_input_ids_with_image_features`.
+     */
+    _merge_input_ids_with_image_features(kwargs) {
+        // Flatten the image features to 2D, (-1, hidden_size), so there is one row per image token.
+        const vision_hidden_size = kwargs.image_features.dims.at(-1);
+        const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
+        return default_merge_input_ids_with_image_features({
+            // @ts-ignore
+            image_token_id: this.config.image_token_index,
+            ...kwargs,
+            // Listed after the spread so the reshaped features override `kwargs.image_features`.
+            image_features: reshaped_image_hidden_states,
+        })
+    }
+}
+//////////////////////////////////////////////////
+// Idefics3 Models
+/**
+ * Base class for Idefics3 models; declares the input names listed in `forward_params`.
+ */
+export class Idefics3PreTrainedModel extends PreTrainedModel {
+    forward_params = [
+        'input_ids',
+        'attention_mask',
+        'pixel_values',
+        'pixel_attention_mask',
+        'position_ids',
+        'past_key_values',
+    ];
+}
+/**
+ * The Idefics3 model which consists of a vision backbone and a language model.
+ */
+export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
+    /**
+     * Run the `vision_encoder` session and return its `image_features` output.
+     * @param {Object} inputs
+     * @param {Object} inputs.pixel_values Passed through to the vision encoder session.
+     * @param {Object} inputs.pixel_attention_mask Passed through to the vision encoder session.
+     * @returns {Promise<Object>} The `image_features` output of the session.
+     */
+    async encode_image({ pixel_values, pixel_attention_mask }) {
+        const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, pixel_attention_mask })).image_features;
+        return features;
+    }
+    /**
+     * Merge image features into the text embeddings at the image-token positions.
+     * @param {Object} kwargs Must contain `image_features`; every entry is forwarded to
+     * `default_merge_input_ids_with_image_features`.
+     * @returns {Object} The result of `default_merge_input_ids_with_image_features`.
+     */
+    _merge_input_ids_with_image_features(kwargs) {
+        // Flatten the image features to 2D, (-1, hidden_size), so there is one row per image token.
+        const vision_hidden_size = kwargs.image_features.dims.at(-1);
+        const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
+        return default_merge_input_ids_with_image_features({
+            // @ts-ignore
+            image_token_id: this.config.image_token_id,
+            ...kwargs,
+            // Listed after the spread so the reshaped features override `kwargs.image_features`.
+            image_features: reshaped_image_hidden_states,
+        })
+    }
+}
+//////////////////////////////////////////////////
+//////////////////////////////////////////////////
 export class CLIPPreTrainedModel extends PreTrainedModel { }
 /**
@@ -3986,6 +4112,13 @@ export class OlmoModel extends OlmoPreTrainedModel { }
 export class OlmoForCausalLM extends OlmoPreTrainedModel { }
 //////////////////////////////////////////////////
+//////////////////////////////////////////////////
+// OLMo2 models
+/** Base class for OLMo2 models; adds nothing beyond `PreTrainedModel`. */
+export class Olmo2PreTrainedModel extends PreTrainedModel { }
+/** OLMo2 model class, registered under `'olmo2'` in the decoder-only model mapping. */
+export class Olmo2Model extends Olmo2PreTrainedModel { }
+/** OLMo2 model class registered under `'olmo2'` in the causal-LM mapping. */
+export class Olmo2ForCausalLM extends Olmo2PreTrainedModel { }
+//////////////////////////////////////////////////
 //////////////////////////////////////////////////
 // Granite models
@@ -4280,36 +4413,12 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
         return features;
     }
-    _merge_input_ids_with_image_features({
-        inputs_embeds,
-        image_features,
-        input_ids,
-        attention_mask,
-    }) {
-        // @ts-ignore
-        const { image_token_id } = this.config;
-        const image_tokens = input_ids.tolist().map(ids =>
-            ids.reduce((acc, x, idx) => {
-                if (x == image_token_id) acc.push(idx);
-                return acc;
-            }, [])
-        );
-        const n_image_tokens = image_tokens.reduce((acc, x) => acc + x.length, 0);
-        const n_image_features = image_features.dims[0];
-        if (n_image_tokens !== n_image_features) {
-            throw new Error(`Image features and image tokens do not match: tokens: ${n_image_tokens}, features ${n_image_features}`);
-        }
-        // Equivalent to performing a masked_scatter
-        let img = 0;
-        for (let i = 0; i < image_tokens.length; ++i) {
-            const tokens = image_tokens[i];
-            const embeds = inputs_embeds[i];
-            for (let j = 0; j < tokens.length; ++j) {
-                embeds[tokens[j]].data.set(image_features[img++].data)
-            }
-        }
-        return { inputs_embeds, attention_mask }
+    /**
+     * Merge image features into the text embeddings at the image-token positions, using
+     * `config.image_token_id` as the placeholder id.
+     * @param {Object} kwargs Forwarded unchanged to `default_merge_input_ids_with_image_features`.
+     * @returns {Object} The result of `default_merge_input_ids_with_image_features`.
+     */
+    _merge_input_ids_with_image_features(kwargs) {
+        return default_merge_input_ids_with_image_features({
+            // @ts-ignore
+            image_token_id: this.config.image_token_id,
+            ...kwargs
+        })
+    }
     prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
@@ -4426,6 +4535,20 @@ export class ViTForImageClassification extends ViTPreTrainedModel {
 //////////////////////////////////////////////////
+//////////////////////////////////////////////////
+/** Base class for I-JEPA models; adds nothing beyond `PreTrainedModel`. */
+export class IJepaPreTrainedModel extends PreTrainedModel { }
+/** I-JEPA model class, registered under `'ijepa'` in the encoder-only model mapping. */
+export class IJepaModel extends IJepaPreTrainedModel { }
+/** I-JEPA model class registered under `'ijepa'` in the image-classification mapping. */
+export class IJepaForImageClassification extends IJepaPreTrainedModel {
+    /**
+     * Run the model and wrap the raw outputs in a `SequenceClassifierOutput`.
+     * @param {any} model_inputs
+     * @returns {Promise<SequenceClassifierOutput>} The wrapped outputs of the parent `_call`.
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
 //////////////////////////////////////////////////
 export class VitPosePreTrainedModel extends PreTrainedModel { }
@@ -6696,6 +6819,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
     ['rt_detr', ['RTDetrModel', RTDetrModel]],
     ['table-transformer', ['TableTransformerModel', TableTransformerModel]],
     ['vit', ['ViTModel', ViTModel]],
+    ['ijepa', ['IJepaModel', IJepaModel]],
     ['pvt', ['PvtModel', PvtModel]],
     ['vit_msn', ['ViTMSNModel', ViTMSNModel]],
     ['vit_mae', ['ViTMAEModel', ViTMAEModel]],
@@ -6760,6 +6884,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
     ['codegen', ['CodeGenModel', CodeGenModel]],
     ['llama', ['LlamaModel', LlamaModel]],
     ['olmo', ['OlmoModel', OlmoModel]],
+    ['olmo2', ['Olmo2Model', Olmo2Model]],
     ['mobilellm', ['MobileLLMModel', MobileLLMModel]],
     ['granite', ['GraniteModel', GraniteModel]],
     ['cohere', ['CohereModel', CohereModel]],
@@ -6851,6 +6976,7 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
     ['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]],
     ['llama', ['LlamaForCausalLM', LlamaForCausalLM]],
     ['olmo', ['OlmoForCausalLM', OlmoForCausalLM]],
+    ['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]],
     ['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]],
     ['granite', ['GraniteForCausalLM', GraniteForCausalLM]],
     ['cohere', ['CohereForCausalLM', CohereForCausalLM]],
@@ -6914,6 +7040,7 @@ const MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
 const MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = new Map([
     ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]],
+    ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
 ]);
 const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
@@ -6922,6 +7049,8 @@ const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
     ['moondream1', ['Moondream1ForConditionalGeneration', Moondream1ForConditionalGeneration]],
     ['florence2', ['Florence2ForConditionalGeneration', Florence2ForConditionalGeneration]],
     ['qwen2-vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]],
+    ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
+    ['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]],
 ]);
 const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
@@ -6930,6 +7059,7 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
 const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['vit', ['ViTForImageClassification', ViTForImageClassification]],
+    ['ijepa', ['IJepaForImageClassification', IJepaForImageClassification]],
     ['pvt', ['PvtForImageClassification', PvtForImageClassification]],
     ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]],
     ['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]],

package/src/tokenizers.js CHANGED Viewed

@@ -2605,6 +2605,12 @@ export class PreTrainedTokenizer extends Callable {
         this.unk_token = this.getToken('unk_token');
         this.unk_token_id = this.model.tokens_to_ids.get(this.unk_token);
+        this.bos_token = this.getToken('bos_token');
+        this.bos_token_id = this.model.tokens_to_ids.get(this.bos_token);
+        this.eos_token = this.getToken('eos_token');
+        this.eos_token_id = this.model.tokens_to_ids.get(this.eos_token);
         this.model_max_length = tokenizerConfig.model_max_length;
         /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */
@@ -3577,6 +3583,11 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
         let chunk = new_chunk();
         let time_offset = 0.0;
         const timestamp_begin = this.timestamp_begin;
+        // Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments.
+        // We can calculate the last timestamp token as timestamp_begin plus the number of
+        // tokens from 0.00 to 30.00, which is 1500.
+        const total_timestamp_tokens = 1500; // (30.00 - 0.00) / 0.02
+        const timestamp_end = timestamp_begin + total_timestamp_tokens;
         let previous_tokens = [];
         let previous_token_timestamps = [];
@@ -3664,7 +3675,7 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
                     } else {
                         // 2/ This is a regular special token, ignoring it
                     }
-                } else if (token >= timestamp_begin) {
+                } else if (token >= timestamp_begin && token <= timestamp_end) {
                     // 3/ Timestamp token
                     const time = (token - timestamp_begin) * time_precision + time_offset;
                     const rounded_time = round(time, 2);

package/src/utils/core.js CHANGED Viewed

@@ -9,15 +9,45 @@
  */
 /**
- * @typedef {Object} ProgressInfo
- * @property {'initiate' | 'download' | 'progress' | 'done'} status The status of the progress item.
- * @property {string} name This can be either:
- * - a string, the *model id* of a model repo on huggingface.co.
- * - a path to a *directory* potentially containing the file.
- * @property {string} file The name of the file
- * @property {number} [progress] A number between 0 and 100. Only available for the 'progress' status.
- * @property {number} [loaded] The number of bytes loaded. Only available for the 'progress' status.
- * @property {number} [total] The total number of bytes to be loaded. Only available for the 'progress' status.
+ * @typedef {Object} InitiateProgressInfo
+ * @property {'initiate'} status
+ * @property {string} name The model id or directory path.
+ * @property {string} file The name of the file.
+ */
+/**
+ * @typedef {Object} DownloadProgressInfo
+ * @property {'download'} status
+ * @property {string} name The model id or directory path.
+ * @property {string} file The name of the file.
+ */
+/**
+ * @typedef {Object} ProgressStatusInfo
+ * @property {'progress'} status
+ * @property {string} name The model id or directory path.
+ * @property {string} file The name of the file.
+ * @property {number} progress A number between 0 and 100.
+ * @property {number} loaded The number of bytes loaded.
+ * @property {number} total The total number of bytes to be loaded.
+ */
+/**
+ * @typedef {Object} DoneProgressInfo
+ * @property {'done'} status
+ * @property {string} name The model id or directory path.
+ * @property {string} file The name of the file.
+ */
+/**
+ * @typedef {Object} ReadyProgressInfo
+ * @property {'ready'} status
+ * @property {string} task The loaded task.
+ * @property {string} model The loaded model.
+ */
+/**
+ * @typedef {InitiateProgressInfo | DownloadProgressInfo | ProgressStatusInfo | DoneProgressInfo | ReadyProgressInfo} ProgressInfo
  */
 /**
@@ -187,3 +217,17 @@ export function len(s) {
     for (const c of s) ++length;
     return length;
 }
+/**
+ * Count the occurrences of a value in an array or string.
+ * This mimics the behavior of Python's `count` method for single elements: items are compared
+ * with strict equality (`===`), so `NaN` never matches. A string is iterated by code point,
+ * so only a single-character `value` can match (unlike Python's substring-counting `str.count`).
+ * @param {any[]|string} arr The array or string to search.
+ * @param {any} value The value to count.
+ * @returns {number} The number of items strictly equal to `value`.
+ */
+export function count(arr, value) {
+    // Note: this local intentionally-or-not shadows the function's own name inside the body.
+    let count = 0;
+    for (const v of arr) {
+        if (v === value) ++count;
+    }
+    return count;
+}

package/src/utils/dtypes.js CHANGED Viewed

@@ -31,6 +31,7 @@ export const isWebGpuFp16Supported = (function () {
 })();
 export const DATA_TYPES = Object.freeze({
+    auto: 'auto', // Auto-detect based on environment
     fp32: 'fp32',
     fp16: 'fp16',
     q8: 'q8',
@@ -47,7 +48,7 @@ export const DEFAULT_DEVICE_DTYPE_MAPPING = Object.freeze({
     [DEVICE_TYPES.wasm]: DATA_TYPES.q8,
 });
-/** @type {Record<DataType, string>} */
+/** @type {Record<Exclude<DataType, "auto">, string>} */
 export const DEFAULT_DTYPE_SUFFIX_MAPPING = Object.freeze({
     [DATA_TYPES.fp32]: '',
     [DATA_TYPES.fp16]: '_fp16',

package/src/utils/hub.js CHANGED Viewed

@@ -504,13 +504,6 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti
         file: filename
     })
-    /** @type {import('./core.js').ProgressInfo} */
-    const progressInfo = {
-        status: 'progress',
-        name: path_or_repo_id,
-        file: filename
-    }
     /** @type {Uint8Array} */
     let buffer;
@@ -530,7 +523,9 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti
         // For completeness, we still fire the final progress callback
         dispatchCallback(options.progress_callback, {
-            ...progressInfo,
+            status: 'progress',
+            name: path_or_repo_id,
+            file: filename,
             progress: 100,
             loaded: buffer.length,
             total: buffer.length,
@@ -538,7 +533,9 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti
     } else {
         buffer = await readResponse(response, data => {
             dispatchCallback(options.progress_callback, {
-                ...progressInfo,
+                status: 'progress',
+                name: path_or_repo_id,
+                file: filename,
                 ...data,
             })
         })
@@ -595,12 +592,11 @@ export async function getModelJSON(modelPath, fileName, fatal = true, options =
     return JSON.parse(jsonData);
 }
 /**
  * Read and track progress when reading a Response object
  *
- * @param {any} response The Response object to read
- * @param {function} progress_callback The function to call with progress updates
+ * @param {Response|FileResponse} response The Response object to read
+ * @param {(data: {progress: number, loaded: number, total: number}) => void} progress_callback The function to call with progress updates
  * @returns {Promise<Uint8Array>} A Promise that resolves with the Uint8Array buffer
  */
 async function readResponse(response, progress_callback) {