npm - @huggingface/transformers - Versions diffs - 3.1.0 → 3.1.1 - Mend

@huggingface/transformers 3.1.0 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54)

package/README.md +3 -2
package/dist/transformers.cjs +678 -153
package/dist/transformers.cjs.map +1 -1
package/dist/transformers.js +682 -154
package/dist/transformers.js.map +1 -1
package/dist/transformers.min.cjs +24 -18
package/dist/transformers.min.cjs.map +1 -1
package/dist/transformers.min.js +19 -13
package/dist/transformers.min.js.map +1 -1
package/dist/transformers.min.mjs +30 -24
package/dist/transformers.min.mjs.map +1 -1
package/dist/transformers.mjs +682 -154
package/dist/transformers.mjs.map +1 -1
package/package.json +1 -1
package/src/configs.js +2 -1
package/src/env.js +6 -6
package/src/generation/configuration_utils.js +7 -0
package/src/generation/logits_process.js +22 -16
package/src/generation/streamers.js +7 -2
package/src/models/idefics3/image_processing_idefics3.js +219 -0
package/src/models/idefics3/processing_idefics3.js +136 -0
package/src/models/image_processors.js +1 -0
package/src/models/processors.js +1 -0
package/src/models.js +112 -34
package/src/utils/core.js +14 -0
package/src/utils/dtypes.js +2 -1
package/src/utils/image.js +19 -16
package/src/utils/tensor.js +6 -1
package/types/configs.d.ts +1 -1
package/types/configs.d.ts.map +1 -1
package/types/env.d.ts +1 -1
package/types/env.d.ts.map +1 -1
package/types/generation/configuration_utils.d.ts +6 -0
package/types/generation/configuration_utils.d.ts.map +1 -1
package/types/generation/logits_process.d.ts +30 -20
package/types/generation/logits_process.d.ts.map +1 -1
package/types/generation/streamers.d.ts +13 -8
package/types/generation/streamers.d.ts.map +1 -1
package/types/models/idefics3/image_processing_idefics3.d.ts +40 -0
package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -0
package/types/models/idefics3/processing_idefics3.d.ts +19 -0
package/types/models/idefics3/processing_idefics3.d.ts.map +1 -0
package/types/models/image_processors.d.ts +1 -0
package/types/models/processors.d.ts +1 -0
package/types/models.d.ts +16 -6
package/types/models.d.ts.map +1 -1
package/types/utils/core.d.ts +7 -0
package/types/utils/core.d.ts.map +1 -1
package/types/utils/dtypes.d.ts +3 -2
package/types/utils/dtypes.d.ts.map +1 -1
package/types/utils/image.d.ts +4 -0
package/types/utils/image.d.ts.map +1 -1
package/types/utils/tensor.d.ts +5 -3
package/types/utils/tensor.d.ts.map +1 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/transformers",
-  "version": "3.1.0",
+  "version": "3.1.1",
   "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
   "main": "./src/transformers.js",
   "types": "./types/transformers.d.ts",

package/src/configs.js CHANGED Viewed

@@ -69,6 +69,7 @@ function getNormalizedConfig(config) {
         case 'paligemma':
         case 'florence2':
         case 'llava_onevision':
+        case 'idefics3':
             init_normalized_config = getNormalizedConfig(config.text_config);
             break;
         case 'moondream1':
@@ -382,6 +383,6 @@ export class AutoConfig {
  * See https://onnxruntime.ai/docs/tutorials/web/env-flags-and-session-options.html#freedimensionoverrides
  * for more information.
  * @property {import('./utils/devices.js').DeviceType} [device] The default device to use for the model.
- * @property {import('./utils/dtypes.js').DataType} [dtype] The default data type to use for the model.
+ * @property {import('./utils/dtypes.js').DataType|Record<string, import('./utils/dtypes.js').DataType>} [dtype] The default data type to use for the model.
  * @property {boolean|Record<string, boolean>} [use_external_data_format=false] Whether to load the model using the external data format (used for models >= 2GB in size).
  */

package/src/env.js CHANGED Viewed

@@ -26,12 +26,12 @@ import fs from 'fs';
 import path from 'path';
 import url from 'url';
-const VERSION = '3.1.0';
+const VERSION = '3.1.1';
 // Check if various APIs are available (depends on environment)
-const IS_BROWSER_ENV = typeof self !== 'undefined';
-const IS_WEBWORKER_ENV = IS_BROWSER_ENV && self.constructor.name === 'DedicatedWorkerGlobalScope';
-const IS_WEB_CACHE_AVAILABLE = IS_BROWSER_ENV && 'caches' in self;
+const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
+const IS_WEBWORKER_ENV = typeof self !== "undefined"  && self.constructor?.name === 'DedicatedWorkerGlobalScope';
+const IS_WEB_CACHE_AVAILABLE = typeof self !== "undefined" && 'caches' in self;
 const IS_WEBGPU_AVAILABLE = typeof navigator !== 'undefined' && 'gpu' in navigator;
 const IS_WEBNN_AVAILABLE = typeof navigator !== 'undefined' && 'ml' in navigator;
@@ -44,7 +44,7 @@ const IS_PATH_AVAILABLE = !isEmpty(path);
  * A read-only object containing information about the APIs available in the current environment.
  */
 export const apis = Object.freeze({
-    /** Whether we are running in a browser environment */
+    /** Whether we are running in a browser environment (and not a web worker) */
     IS_BROWSER_ENV,
     /** Whether we are running in a web worker environment */
@@ -137,7 +137,7 @@ export const env = {
     remoteHost: 'https://huggingface.co/',
     remotePathTemplate: '{model}/resolve/{revision}/',
-    allowLocalModels: !IS_BROWSER_ENV,
+    allowLocalModels: !(IS_BROWSER_ENV || IS_WEBWORKER_ENV),
     localModelPath: localModelPath,
     useFS: IS_FS_AVAILABLE,

package/src/generation/configuration_utils.js CHANGED Viewed

@@ -259,6 +259,13 @@ export class GenerationConfig {
      */
     suppress_tokens = null;
+    /**
+     * A streamer that will be used to stream the generation.
+     * @type {import('./streamers.js').TextStreamer}
+     * @default null
+     */
+    streamer = null;
     /**
      * A list of tokens that will be suppressed at the beginning of the generation.
      * The `SuppressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled.

package/src/generation/logits_process.js CHANGED Viewed

@@ -151,7 +151,7 @@ export class ForcedBOSTokenLogitsProcessor extends LogitsProcessor {
      * Apply the BOS token forcing to the logits.
      * @param {bigint[][]} input_ids The input IDs.
      * @param {Tensor} logits The logits.
-     * @returns {Object} The logits with BOS token forcing.
+     * @returns {Tensor} The logits with BOS token forcing.
      */
     _call(input_ids, logits) {
         for (let i = 0; i < input_ids.length; ++i) {
@@ -221,7 +221,7 @@ export class SuppressTokensAtBeginLogitsProcessor extends LogitsProcessor {
      * Apply the BOS token forcing to the logits.
      * @param {bigint[][]} input_ids The input IDs.
      * @param {Tensor} logits The logits.
-     * @returns {Object} The logits with BOS token forcing.
+     * @returns {Tensor} The logits with BOS token forcing.
      */
     _call(input_ids, logits) {
         for (let i = 0; i < input_ids.length; ++i) {
@@ -391,7 +391,7 @@ export class NoRepeatNGramLogitsProcessor extends LogitsProcessor {
      * Apply the no-repeat-ngram processor to the logits.
      * @param {bigint[][]} input_ids The input IDs.
      * @param {Tensor} logits The logits.
-     * @returns {Object} The logits with no-repeat-ngram processing.
+     * @returns {Tensor} The logits with no-repeat-ngram processing.
      */
     _call(input_ids, logits) {
         for (let i = 0; i < input_ids.length; ++i) {
@@ -406,12 +406,22 @@ export class NoRepeatNGramLogitsProcessor extends LogitsProcessor {
 }
 /**
- * A logits processor that penalises repeated output tokens.
+ * A logits processor that prevents the repetition of previous tokens through a penalty.
+ * This penalty is applied at most once per token. Note that, for decoder-only models like most LLMs,
+ * the considered tokens include the prompt.
+ *
+ * In the original [paper](https://arxiv.org/pdf/1909.05858.pdf), the authors suggest the use of a
+ * penalty of around 1.2 to achieve a good balance between truthful generation and lack of repetition.
+ * To penalize and reduce repetition, use `penalty` values above 1.0, where a higher value penalizes
+ * more strongly. To reward and encourage repetition, use `penalty` values between 0.0 and 1.0, where
+ * a lower value rewards more strongly.
  */
 export class RepetitionPenaltyLogitsProcessor extends LogitsProcessor {
     /**
      * Create a RepetitionPenaltyLogitsProcessor.
-     * @param {number} penalty The penalty to apply for repeated tokens.
+     * @param {number} penalty The parameter for repetition penalty.
+     * - 1.0 means no penalty. Above 1.0 penalizes previously generated tokens.
+     * - Between 0.0 and 1.0 rewards previously generated tokens.
      */
     constructor(penalty) {
         super();
@@ -422,16 +432,12 @@ export class RepetitionPenaltyLogitsProcessor extends LogitsProcessor {
      * Apply the repetition penalty to the logits.
      * @param {bigint[][]} input_ids The input IDs.
      * @param {Tensor} logits The logits.
-     * @returns {Object} The logits with repetition penalty processing.
+     * @returns {Tensor} The logits with repetition penalty processing.
      */
     _call(input_ids, logits) {
-        // Modify the logits corresponding to each element in `input_ids`.
-        // As a consequence, the logits corresponding to tokens that appear
-        // many times in the output will be penalised more.
         for (let i = 0; i < input_ids.length; ++i) {
             const batch_logits_data = /** @type {Float32Array} */(logits[i].data);
-            for (const input_id of input_ids[i]) {
+            for (const input_id of new Set(input_ids[i])) {
                 const token = Number(input_id);
                 if (batch_logits_data[token] < 0) {
                     batch_logits_data[token] *= this.penalty;
@@ -464,7 +470,7 @@ export class MinLengthLogitsProcessor extends LogitsProcessor {
      * Apply logit processor.
      * @param {bigint[][]} input_ids The input IDs.
      * @param {Tensor} logits The logits.
-     * @returns {Object} The processed logits.
+     * @returns {Tensor} The processed logits.
      */
     _call(input_ids, logits) {
         for (let i = 0; i < input_ids.length; ++i) {
@@ -502,7 +508,7 @@ export class MinNewTokensLengthLogitsProcessor extends LogitsProcessor {
      * Apply logit processor.
      * @param {bigint[][]} input_ids The input IDs.
      * @param {Tensor} logits The logits.
-     * @returns {Object} The processed logits.
+     * @returns {Tensor} The processed logits.
      */
     _call(input_ids, logits) {
         for (let i = 0; i < input_ids.length; ++i) {
@@ -535,7 +541,7 @@ export class NoBadWordsLogitsProcessor extends LogitsProcessor {
      * Apply logit processor.
      * @param {bigint[][]} input_ids The input IDs.
      * @param {Tensor} logits The logits.
-     * @returns {Object} The processed logits.
+     * @returns {Tensor} The processed logits.
      */
     _call(input_ids, logits) {
         for (let i = 0; i < input_ids.length; ++i) {
@@ -596,7 +602,7 @@ export class ClassifierFreeGuidanceLogitsProcessor extends LogitsProcessor {
      * Apply logit processor.
      * @param {bigint[][]} input_ids The input IDs.
      * @param {Tensor} logits The logits.
-     * @returns {Object} The processed logits.
+     * @returns {Tensor} The processed logits.
      */
     _call(input_ids, logits) {
         if (logits.dims[0] !== 2 * input_ids.length) {
@@ -650,7 +656,7 @@ export class TemperatureLogitsWarper extends LogitsWarper {
      * Apply logit warper.
      * @param {bigint[][]} input_ids The input IDs.
      * @param {Tensor} logits The logits.
-     * @returns {Object} The processed logits.
+     * @returns {Tensor} The processed logits.
      */
     _call(input_ids, logits) {
         const batch_logits_data = /** @type {Float32Array} */(logits.data);

package/src/generation/streamers.js CHANGED Viewed

@@ -34,7 +34,12 @@ const stdout_write = apis.IS_PROCESS_AVAILABLE
 export class TextStreamer extends BaseStreamer {
     /**
      *
-     * @param {import('../tokenizers.js').PreTrainedTokenizer} tokenizer
+     * @param {import('../tokenizers.js').PreTrainedTokenizer} tokenizer
+     * @param {Object} options
+     * @param {boolean} [options.skip_prompt=false] Whether to skip the prompt tokens
+     * @param {function(string): void} [options.callback_function=null] Function to call when a piece of text is ready to display
+     * @param {function(bigint[]): void} [options.token_callback_function=null] Function to call when a new token is generated
+     * @param {Object} [options.decode_kwargs={}] Additional keyword arguments to pass to the tokenizer's decode method
      */
     constructor(tokenizer, {
         skip_prompt = false,
@@ -143,7 +148,7 @@ export class WhisperTextStreamer extends TextStreamer {
      * @param {Object} options
      * @param {boolean} [options.skip_prompt=false] Whether to skip the prompt tokens
      * @param {function(string): void} [options.callback_function=null] Function to call when a piece of text is ready to display
-     * @param {function(string): void} [options.token_callback_function=null] Function to call when a new token is generated
+     * @param {function(bigint[]): void} [options.token_callback_function=null] Function to call when a new token is generated
      * @param {function(number): void} [options.on_chunk_start=null] Function to call when a new chunk starts
      * @param {function(number): void} [options.on_chunk_end=null] Function to call when a chunk ends
      * @param {function(): void} [options.on_finalize=null] Function to call when the stream is finalized

package/src/models/idefics3/image_processing_idefics3.js ADDED Viewed

@@ -0,0 +1,219 @@
+import {
+    ImageProcessor,
+} from "../../base/image_processors_utils.js";
+import { cat, full, interpolate_4d, stack } from "../../utils/tensor.js";
+/**
+ * Image processor for Idefics3. Each image is resized and, when image splitting is enabled,
+ * cut into a grid of crops followed by a copy of the whole image resized to
+ * `longest_edge` x `longest_edge`.
+ */
+export class Idefics3ImageProcessor extends ImageProcessor {
+    /**
+     * @param {Object} config Image processor configuration. `do_image_splitting` is read with a
+     * default of `true`; `max_image_size` is stored as-is (its `longest_edge` property is used later).
+     */
+    constructor(config) {
+        super(config);
+        this.do_image_splitting = config.do_image_splitting ?? true;
+        this.max_image_size = config.max_image_size;
+    }
+    /**
+     * @typedef {import('../../utils/image.js').RawImage} RawImage
+     * @typedef {import('../../utils/tensor.js').Tensor} Tensor
+     */
+    /**
+     * Calculate the size to resize an image to, so that both sides are multiples of `vision_encoder_max_size`.
+     * The longer side is rounded up to a multiple first; the other side is derived from the original aspect
+     * ratio and then rounded up to a multiple as well, so the aspect ratio is only approximately preserved.
+     * @param {Tensor} pixel_values Tensor of the image to resize. Height and width are read from its last two dimensions.
+     * @param {number} vision_encoder_max_size Maximum size of the output image. If the image is larger than this size,
+     * it will be split into patches of this size, and the original image will be concatenated with the patches, resized to max_size.
+     * @returns {{height: number, width: number}} The target size.
+     */
+    get_resize_for_vision_encoder(pixel_values, vision_encoder_max_size) {
+        let [height, width] = pixel_values.dims.slice(-2);
+        const aspect_ratio = width / height;
+        if (width >= height) {
+            // Landscape or square: fix the width first, then derive the height from it
+            width = Math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size;
+            height = Math.floor(width / aspect_ratio);
+            height = Math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size;
+        } else {
+            // Portrait: fix the height first, then derive the width from it
+            height = Math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size;
+            width = Math.floor(height * aspect_ratio);
+            width = Math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size;
+        }
+        return { height, width };
+    }
+    /**
+     * Preprocess one or more images into patch tensors.
+     * @param {RawImage|RawImage[]|RawImage[][]} images A single image, a list of images (treated as one sample),
+     * or a list of samples that each contain a list of images.
+     * @param {Object} [options]
+     * @param {boolean|null} [options.do_image_splitting=null] Overrides `this.do_image_splitting` when not null/undefined.
+     * @param {boolean} [options.return_row_col_info=false] Whether to also return `rows` and `cols`, the split-grid size of every image.
+     * @returns {Promise<Object>} An object with `pixel_values`, `pixel_attention_mask`, `original_sizes`,
+     * `reshaped_input_sizes` and, if requested, `rows` and `cols`.
+     * @throws {Error} If `images` is an empty array or its first element is falsy.
+     */
+    async _call(images, {
+        do_image_splitting = null,
+        return_row_col_info = false,
+    } = {}) {
+        // Normalize the input to a list of samples, each holding a list of images
+        /** @type {RawImage[][]} */
+        let batched_2d_images;
+        if (!Array.isArray(images)) {
+            batched_2d_images = [[images]];
+        } else {
+            if (images.length === 0 || !images[0]) {
+                throw new Error("No images provided.");
+            }
+            if (!Array.isArray(images[0])) {
+                batched_2d_images = [/** @type {RawImage[]} */(images)];
+            } else {
+                batched_2d_images = /** @type {RawImage[][]} */(images);
+            }
+        }
+        // List of tensors, each with shape [patches, channels, height, width]
+        let all_pixel_values = [];
+        // Per sample, the number of split rows/columns of each image (0 when an image was not split)
+        let images_list_rows = [];
+        let images_list_cols = [];
+        const original_sizes = [];
+        const reshaped_input_sizes = [];
+        for (const image_batch of batched_2d_images) {
+            let images_list = await Promise.all(image_batch.map(x => this.preprocess(x)));
+            // Original sizes of images
+            original_sizes.push(...images_list.map(x => x.original_size));
+            // Reshaped sizes of images, before padding or cropping
+            reshaped_input_sizes.push(...images_list.map(x => x.reshaped_input_size));
+            // Convert images to 4D tensors for easier processing
+            images_list.forEach(x => x.pixel_values.unsqueeze_(0));
+            const { longest_edge } = this.max_image_size;
+            /** @type {Tensor[]} */
+            let images_tensor;
+            if (do_image_splitting ?? this.do_image_splitting) {
+                let image_rows = new Array(images_list.length);
+                let image_cols = new Array(images_list.length);
+                // We first resize each image so that its height and width are multiples of `longest_edge`
+                // (see `get_resize_for_vision_encoder`), then split it into crops plus a global image
+                images_tensor = await Promise.all(images_list.map(async (x, i) => {
+                    const new_size = this.get_resize_for_vision_encoder(x.pixel_values, longest_edge);
+                    const resized = await interpolate_4d(x.pixel_values, {
+                        size: [new_size.height, new_size.width],
+                    });
+                    const { frames, num_splits_h, num_splits_w } = await this.split_image(resized, this.max_image_size);
+                    image_rows[i] = num_splits_h;
+                    image_cols[i] = num_splits_w;
+                    return cat(frames, 0);
+                }));
+                images_list_rows.push(image_rows);
+                images_list_cols.push(image_cols);
+            } else {
+                // No splitting: every image becomes a single `longest_edge` x `longest_edge` patch
+                /** @type {[number, number]} */
+                const size = [longest_edge, longest_edge];
+                images_tensor = await Promise.all(
+                    images_list.map(x => interpolate_4d(x.pixel_values, { size }))
+                );
+                images_list_rows.push(new Array(images_list.length).fill(0));
+                images_list_cols.push(new Array(images_list.length).fill(0));
+            }
+            // Concatenate the patches of all images of this sample along the first dimension
+            all_pixel_values.push(cat(images_tensor, 0));
+        }
+        const batch_size = all_pixel_values.length;
+        // NOTE: channels, height and width are taken from the first sample and reused for every sample below
+        const [n, c, h, w] = all_pixel_values[0].dims;
+        // Stack pixel values
+        let pixel_values;
+        let pixel_attention_mask;
+        if (batch_size === 1) {
+            // A single sample needs no padding: add the batch dimension in place
+            pixel_values = all_pixel_values[0].unsqueeze_(0);
+            pixel_attention_mask = full([batch_size, n, h, w], true);
+        } else {
+            // Add padding (if necessary) to images with less patches than the maximum number of patches
+            const max_num_patches = Math.max(...all_pixel_values.map(x => x.dims.at(0)));
+            pixel_attention_mask = full([batch_size, max_num_patches, h, w], true);
+            const pixel_attention_mask_data = pixel_attention_mask.data;
+            // Number of mask elements that belong to one sample
+            const pixel_attention_mask_stride = max_num_patches * h * w;
+            for (let i = 0; i < batch_size; ++i) {
+                const num_patches = all_pixel_values[i].dims[0];
+                if (num_patches < max_num_patches) {
+                    all_pixel_values[i] = cat([
+                        all_pixel_values[i],
+                        full([max_num_patches - num_patches, c, h, w], 0),
+                    ], 0);
+                    // Set the mask entries of the padding patches of sample `i` to false
+                    const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
+                    const end_offset = (i + 1) * pixel_attention_mask_stride;
+                    pixel_attention_mask_data.fill(false, start_offset, end_offset);
+                }
+            }
+            pixel_values = stack(all_pixel_values, 0);
+        }
+        return {
+            pixel_values,
+            pixel_attention_mask,
+            original_sizes,
+            reshaped_input_sizes,
+            ...(
+                return_row_col_info
+                    ? { rows: images_list_rows, cols: images_list_cols }
+                    : {}
+            ),
+        }
+    }
+    /**
+     * Split an image into a grid of crops whose sides do not exceed `longest_edge`.
+     * When the image exceeds `longest_edge` in either dimension, the returned frames are the crops in
+     * row-major order followed by the whole image resized to `longest_edge` x `longest_edge`.
+     * Otherwise the image itself is the only frame and both split counts are 0.
+     * @param {Tensor} pixel_values Image tensor; height and width are read from its last two dimensions.
+     * The `slice` call below indexes four dimensions.
+     * @param {Object} max_image_size
+     * @param {number} max_image_size.longest_edge Maximum height and width of a single frame.
+     * @returns {Promise<{frames: Tensor[], num_splits_h: number, num_splits_w: number}>}
+     */
+    async split_image(pixel_values, { longest_edge }) {
+        const max_height = longest_edge;
+        const max_width = longest_edge;
+        const frames = [];
+        const [height, width] = pixel_values.dims.slice(-2);
+        let num_splits_h = 0, num_splits_w = 0;
+        if (height > max_height || width > max_width) {
+            // Calculate the number of splits
+            num_splits_h = Math.ceil(height / max_height);
+            num_splits_w = Math.ceil(width / max_width);
+            // Calculate the optimal width and height for the sub-images
+            const optimal_height = Math.ceil(height / num_splits_h);
+            const optimal_width = Math.ceil(width / num_splits_w);
+            // Iterate through each row and column
+            for (let r = 0; r < num_splits_h; r++) {
+                for (let c = 0; c < num_splits_w; c++) {
+                    // Calculate the starting point of the crop
+                    const start_x = c * optimal_width;
+                    const start_y = r * optimal_height;
+                    // Calculate the ending point of the crop
+                    const end_x = Math.min(start_x + optimal_width, width);
+                    const end_y = Math.min(start_y + optimal_height, height);
+                    // Crop the image
+                    frames.push(pixel_values.slice(null, null, [start_y, end_y], [start_x, end_x]));
+                }
+            }
+            // Resize the global image to match max dimensions for memory efficiency
+            const global_image_height = max_height;
+            const global_image_width = max_width;
+            if (height !== global_image_height || width !== global_image_width) {
+                pixel_values = await interpolate_4d(pixel_values, {
+                    size: [global_image_height, global_image_width],
+                })
+            }
+        }
+        // The (possibly resized) whole image is always the last frame
+        frames.push(pixel_values);
+        return { frames, num_splits_h, num_splits_w };
+    }
+}

package/src/models/idefics3/processing_idefics3.js ADDED Viewed

@@ -0,0 +1,136 @@
+import { Processor } from "../../base/processing_utils.js";
+import { AutoImageProcessor } from "../auto/image_processing_auto.js";
+import { AutoTokenizer } from "../../tokenizers.js";
+import { RawImage } from "../../utils/image.js";
+import { count } from "../../utils/core.js";
+/**
+ * Build the prompt for an image that was split into a grid of patches.
+ * Every grid cell contributes the fake token, a `<row_R_col_C>` tag (1-based) and `image_seq_len`
+ * image tokens; every grid row ends with a newline. The global image block follows after one
+ * more newline.
+ * @private
+ */
+function _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token) {
+    const image_tokens = image_token.repeat(image_seq_len);
+    const grid_lines = [];
+    for (let row = 1; row <= image_rows; ++row) {
+        let line = "";
+        for (let col = 1; col <= image_cols; ++col) {
+            line += `${fake_token_around_image}<row_${row}_col_${col}>${image_tokens}`;
+        }
+        grid_lines.push(`${line}\n`);
+    }
+    const global_block = `\n${fake_token_around_image}${global_img_token}${image_tokens}${fake_token_around_image}`;
+    return grid_lines.join("") + global_block;
+}
+/**
+ * Build the prompt for an image that was not split: the global image token followed by
+ * `image_seq_len` image tokens, wrapped in fake tokens on both sides.
+ * @private
+ */
+function _prompt_single_image(image_seq_len, fake_token_around_image, image_token, global_img_token) {
+    const image_tokens = image_token.repeat(image_seq_len);
+    return `${fake_token_around_image}${global_img_token}${image_tokens}${fake_token_around_image}`;
+}
+/**
+ * Pick the prompt builder for one image: the single-image prompt when both split counts
+ * are 0, otherwise the split-image prompt.
+ */
+function get_image_prompt_string(image_rows, image_cols, image_seq_len, fake_token_around_image, image_token, global_img_token) {
+    const is_unsplit = image_rows === 0 && image_cols === 0;
+    return is_unsplit
+        ? _prompt_single_image(image_seq_len, fake_token_around_image, image_token, global_img_token)
+        : _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token);
+}
+/**
+ * Processor that pairs an image processor with a tokenizer. Every image token in the text is
+ * replaced by the expanded prompt of the corresponding image before the text is tokenized.
+ */
+export class Idefics3Processor extends Processor {
+    static image_processor_class = AutoImageProcessor
+    static tokenizer_class = AutoTokenizer
+    static uses_processor_config = true;
+    fake_image_token = "<fake_token_around_image>";
+    image_token = "<image>";
+    global_img_token = "<global-img>";
+    /**
+     * Expand the image tokens in `text`, tokenize the result and merge it with the image inputs.
+     * @param {string|string[]} text One text sample or a list of samples.
+     * @param {RawImage|RawImage[]|RawImage[][]} images The images referenced by the text. When omitted,
+     * the text is tokenized as-is and must not contain image tokens.
+     * @param {Object} [options] Options forwarded to the image processor. `return_row_col_info`
+     * defaults to `true` here; the object passed by the caller is not modified.
+     * @returns {Promise<any>} The tokenizer outputs merged with the image processor outputs (if any).
+     * @throws {Error} If the image tokens in a text sample do not match the images provided for it.
+     */
+    async _call(text, images = null, options = {}) {
+        // NOTE: We assume text is present
+        const samples = Array.isArray(text) ? text : [text];
+        if (!images) {
+            // Without images there is nothing to expand. Leftover image tokens would reach the
+            // tokenizer unexpanded, so reject them instead of producing a broken prompt.
+            const n_image_tokens = samples.reduce(
+                (total, sample) => total + sample.split(this.image_token).length - 1,
+                0,
+            );
+            if (n_image_tokens > 0) {
+                throw new Error(`Found ${n_image_tokens} ${this.image_token} tokens in the text but no images were passed.`);
+            }
+            return { ...this.tokenizer(samples) };
+        }
+        // Apply the default on a copy so the caller's `options` object is left untouched
+        const image_inputs = await this.image_processor(images, {
+            ...options,
+            return_row_col_info: options.return_row_col_info ?? true,
+        });
+        const image_rows = image_inputs.rows ?? [new Array(samples.length).fill(0)];
+        const image_cols = image_inputs.cols ?? [new Array(samples.length).fill(0)];
+        const image_seq_len = this.config.image_seq_len;
+        const prompt_strings = samples.map((sample, i) => {
+            const sample_rows = image_rows[i];
+            const sample_cols = image_cols[i];
+            if (!sample_rows || !sample_cols) {
+                throw new Error(`No images were provided for text sample ${i}.`);
+            }
+            // `split` always yields one more piece than there are image tokens in the sample
+            const split_sample = sample.split(this.image_token);
+            const n_image_tokens = split_sample.length - 1;
+            if (n_image_tokens === 0 && sample_rows.length > 0) {
+                throw new Error("The image token should be present in the text.");
+            }
+            if (n_image_tokens !== sample_rows.length) {
+                throw new Error(
+                    `The number of image tokens in text sample ${i} (${n_image_tokens}) and the number of images (${sample_rows.length}) should be the same.`
+                );
+            }
+            // Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`,
+            // placing each image prompt string where its image token was
+            let new_sample = split_sample[0];
+            sample_rows.forEach((n_rows, j) => {
+                const image_prompt_string = get_image_prompt_string(
+                    n_rows,
+                    sample_cols[j],
+                    image_seq_len,
+                    this.fake_image_token,
+                    this.image_token,
+                    this.global_img_token,
+                );
+                new_sample += image_prompt_string + split_sample[j + 1];
+            });
+            return new_sample;
+        });
+        const text_inputs = this.tokenizer(prompt_strings);
+        return {
+            ...text_inputs,
+            ...image_inputs,
+        }
+    }
+}

package/src/models/image_processors.js CHANGED Viewed

@@ -10,6 +10,7 @@ export * from './donut/image_processing_donut.js'
 export * from './dpt/image_processing_dpt.js'
 export * from './efficientnet/image_processing_efficientnet.js'
 export * from './glpn/image_processing_glpn.js'
+export * from './idefics3/image_processing_idefics3.js'
 export * from './janus/image_processing_janus.js'
 export * from './jina_clip/image_processing_jina_clip.js'
 export * from './llava_onevision/image_processing_llava_onevision.js'

package/src/models/processors.js CHANGED Viewed

@@ -1,5 +1,6 @@
 export * from './florence2/processing_florence2.js';
 export * from './mgp_str/processing_mgp_str.js';
+export * from './idefics3/processing_idefics3.js';
 export * from './janus/processing_janus.js';
 export * from './jina_clip/processing_jina_clip.js';
 export * from './owlvit/processing_owlvit.js';