npm - @huggingface/transformers - Versions diffs - 3.2.3 → 3.2.4 - Mend

@huggingface/transformers 3.2.3 → 3.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82)

package/README.md +2 -2
package/dist/transformers.cjs +203 -92
package/dist/transformers.cjs.map +1 -1
package/dist/transformers.js +203 -92
package/dist/transformers.js.map +1 -1
package/dist/transformers.min.cjs +1 -1
package/dist/transformers.min.cjs.map +1 -1
package/dist/transformers.min.js +1 -1
package/dist/transformers.min.js.map +1 -1
package/dist/transformers.min.mjs +1 -1
package/dist/transformers.min.mjs.map +1 -1
package/dist/transformers.mjs +203 -92
package/dist/transformers.mjs.map +1 -1
package/package.json +2 -2
package/src/base/feature_extraction_utils.js +9 -9
package/src/base/image_processors_utils.js +11 -0
package/src/base/processing_utils.js +13 -3
package/src/configs.js +5 -0
package/src/env.js +1 -1
package/src/models/auto/feature_extraction_auto.js +0 -16
package/src/models/auto/processing_auto.js +0 -16
package/src/models/convnext/image_processing_convnext.js +1 -0
package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
package/src/models/florence2/processing_florence2.js +3 -0
package/src/models/idefics3/image_processing_idefics3.js +2 -0
package/src/models/janus/image_processing_janus.js +1 -0
package/src/models/mgp_str/processing_mgp_str.js +2 -0
package/src/models/paligemma/processing_paligemma.js +1 -0
package/src/models/phi3_v/processing_phi3_v.js +1 -1
package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
package/src/models/whisper/feature_extraction_whisper.js +1 -1
package/src/models.js +50 -15
package/src/ops/registry.js +10 -0
package/src/pipelines.js +34 -7
package/src/tokenizers.js +4 -7
package/src/utils/dtypes.js +2 -0
package/src/utils/hub.js +1 -1
package/src/utils/maths.js +8 -6
package/src/utils/tensor.js +42 -10
package/types/base/feature_extraction_utils.d.ts +7 -7
package/types/base/image_processors_utils.d.ts.map +1 -1
package/types/base/processing_utils.d.ts +17 -19
package/types/base/processing_utils.d.ts.map +1 -1
package/types/configs.d.ts.map +1 -1
package/types/generation/parameters.d.ts +1 -1
package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
package/types/models/auto/image_processing_auto.d.ts.map +1 -1
package/types/models/auto/processing_auto.d.ts.map +1 -1
package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
package/types/models/florence2/processing_florence2.d.ts.map +1 -1
package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
package/types/models/janus/image_processing_janus.d.ts.map +1 -1
package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
package/types/models/whisper/generation_whisper.d.ts +1 -1
package/types/models/whisper/generation_whisper.d.ts.map +1 -1
package/types/models.d.ts +32 -17
package/types/models.d.ts.map +1 -1
package/types/ops/registry.d.ts +1 -0
package/types/ops/registry.d.ts.map +1 -1
package/types/pipelines.d.ts +2 -2
package/types/pipelines.d.ts.map +1 -1
package/types/tokenizers.d.ts.map +1 -1
package/types/tsconfig.tsbuildinfo +1 -0
package/types/utils/dtypes.d.ts.map +1 -1
package/types/utils/hub.d.ts +1 -1
package/types/utils/hub.d.ts.map +1 -1
package/types/utils/image.d.ts +3 -2
package/types/utils/image.d.ts.map +1 -1
package/types/utils/maths.d.ts +8 -6
package/types/utils/maths.d.ts.map +1 -1
package/types/utils/tensor.d.ts +8 -4
package/types/utils/tensor.d.ts.map +1 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/transformers",
-  "version": "3.2.3",
+  "version": "3.2.4",
   "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
   "main": "./src/transformers.js",
   "types": "./types/transformers.d.ts",
@@ -24,7 +24,7 @@
   "scripts": {
     "format": "prettier --write .",
     "format:check": "prettier --check .",
-    "typegen": "tsc ./src/transformers.js --allowJs --declaration --emitDeclarationOnly --declarationMap --outDir types",
+    "typegen": "tsc --build",
     "dev": "webpack serve --no-client-overlay",
     "build": "webpack && npm run typegen",
     "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose",

package/src/base/feature_extraction_utils.js CHANGED Viewed

@@ -17,23 +17,23 @@ export class FeatureExtractor extends Callable {
     }
     /**
-     * Instantiate one of the processor classes of the library from a pretrained model.
+     * Instantiate one of the feature extractor classes of the library from a pretrained model.
      *
-     * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
-     * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
+     * The feature extractor class to instantiate is selected based on the `feature_extractor_type` property of
+     * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
      *
      * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
-     * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
+     * - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on huggingface.co.
      *   Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
      *   user or organization name, like `dbmdz/bert-base-german-cased`.
-     * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
-     * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
+     * - A path to a *directory* containing feature_extractor files, e.g., `./my_model_directory/`.
+     * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the feature_extractor.
      *
-     * @returns {Promise<FeatureExtractor>} A new instance of the Processor class.
+     * @returns {Promise<FeatureExtractor>} A new instance of the Feature Extractor class.
      */
     static async from_pretrained(pretrained_model_name_or_path, options) {
-        const preprocessorConfig = await getModelJSON(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, true, options);
-        return new this(preprocessorConfig);
+        const config = await getModelJSON(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, true, options);
+        return new this(config);
     }
 }

package/src/base/image_processors_utils.js CHANGED Viewed

@@ -604,14 +604,20 @@ export class ImageProcessor extends Callable {
         this.do_thumbnail = config.do_thumbnail;
         this.size = config.size ?? config.image_size;
         this.do_resize = config.do_resize ?? (this.size !== undefined);
+        // @ts-expect-error TS2339
         this.size_divisibility = config.size_divisibility ?? config.size_divisor;
         this.do_center_crop = config.do_center_crop;
+        // @ts-expect-error TS2339
         this.crop_size = config.crop_size;
+        // @ts-expect-error TS2339
         this.do_convert_rgb = config.do_convert_rgb ?? true;
+        // @ts-expect-error TS2339
         this.do_crop_margin = config.do_crop_margin;
+        // @ts-expect-error TS2339
         this.pad_size = config.pad_size;
+        // @ts-expect-error TS2339
         this.do_pad = config.do_pad;
         if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
@@ -820,6 +826,7 @@ export class ImageProcessor extends Callable {
         // Support both formats for backwards compatibility
         else if (Number.isInteger(size)) {
             shortest_edge = size;
+            // @ts-expect-error TS2339
             longest_edge = this.config.max_size ?? shortest_edge;
         } else if (size !== undefined) {
@@ -888,6 +895,7 @@ export class ImageProcessor extends Callable {
         } else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
             // Custom resize logic for Qwen2-VL models
             const { min_pixels, max_pixels } = size;
+            // @ts-expect-error TS2339
             const factor = this.config.patch_size * this.config.merge_size;
             return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
         } else {
@@ -903,6 +911,7 @@ export class ImageProcessor extends Callable {
     async resize(image) {
         const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
         return await image.resize(newWidth, newHeight, {
+            // @ts-expect-error TS2322
             resample: this.resample,
         });
     }
@@ -953,6 +962,7 @@ export class ImageProcessor extends Callable {
         // Resize the image using thumbnail method.
         if (this.do_thumbnail) {
+            // @ts-expect-error TS2345
             image = await this.thumbnail(image, this.size, this.resample);
         }
@@ -977,6 +987,7 @@ export class ImageProcessor extends Callable {
         // NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
         // occurs with data in the hwc format (height, width, channels),
         // to emulate the behavior of the original Python code (w/ numpy).
+        /** @type {Float32Array} */
         let pixelData = Float32Array.from(image.data);
         let imgDims = [image.height, image.width, image.channels];

package/src/base/processing_utils.js CHANGED Viewed

@@ -28,6 +28,7 @@ import { getModelJSON } from '../utils/hub.js';
 /**
  * @typedef {Object} ProcessorProperties Additional processor-specific properties.
  * @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
+ * @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer
  */
@@ -61,7 +62,7 @@ export class Processor extends Callable {
     }
     /**
-     * @returns {import('../tokenizers.js').PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
+     * @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
      */
     get tokenizer() {
         return this.components.tokenizer;
@@ -74,6 +75,11 @@ export class Processor extends Callable {
         return this.components.feature_extractor;
     }
+    /**
+     * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages
+     * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options
+     * @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>}
+     */
     apply_chat_template(messages, options = {}) {
         if (!this.tokenizer) {
             throw new Error('Unable to apply chat template without a tokenizer.');
@@ -84,6 +90,10 @@ export class Processor extends Callable {
         });
     }
+    /**
+     * @param {Parameters<PreTrainedTokenizer['batch_decode']>} args
+     * @returns {ReturnType<PreTrainedTokenizer['batch_decode']>}
+     */
     batch_decode(...args) {
         if (!this.tokenizer) {
             throw new Error('Unable to decode without a tokenizer.');
@@ -111,8 +121,8 @@ export class Processor extends Callable {
     /**
      * Instantiate one of the processor classes of the library from a pretrained model.
      *
-     * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object
-     * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
+     * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
+     * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
      *
      * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
      * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.

package/src/configs.js CHANGED Viewed

@@ -70,15 +70,19 @@ function getNormalizedConfig(config) {
         case 'florence2':
         case 'llava_onevision':
         case 'idefics3':
+            // @ts-expect-error TS2339
             init_normalized_config = getNormalizedConfig(config.text_config);
             break;
         case 'moondream1':
+            // @ts-expect-error TS2339
             init_normalized_config = getNormalizedConfig(config.phi_config);
             break;
         case 'musicgen':
+            // @ts-expect-error TS2339
             init_normalized_config = getNormalizedConfig(config.decoder);
             break;
         case 'multi_modality':
+            // @ts-expect-error TS2339
             init_normalized_config = getNormalizedConfig(config.language_config);
             break;
@@ -199,6 +203,7 @@ function getNormalizedConfig(config) {
             break;
         case 'vision-encoder-decoder':
+            // @ts-expect-error TS2339
             const decoderConfig = getNormalizedConfig(config.decoder);
             const add_encoder_pkv = 'num_decoder_layers' in decoderConfig;

package/src/env.js CHANGED Viewed

@@ -26,7 +26,7 @@ import fs from 'fs';
 import path from 'path';
 import url from 'url';
-const VERSION = '3.2.3';
+const VERSION = '3.2.4';
 // Check if various APIs are available (depends on environment)
 const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";

package/src/models/auto/feature_extraction_auto.js CHANGED Viewed

@@ -6,22 +6,6 @@ import * as AllFeatureExtractors from '../feature_extractors.js';
 export class AutoFeatureExtractor {
-    /**
-     * Instantiate one of the feature extractor classes of the library from a pretrained model.
-     *
-     * The processor class to instantiate is selected based on the `feature_extractor_type` property of
-     * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
-     *
-     * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
-     * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
-     *   Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-     *   user or organization name, like `dbmdz/bert-base-german-cased`.
-     * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
-     * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
-     *
-     * @returns {Promise<AllFeatureExtractors.ImageProcessor>} A new instance of the Processor class.
-     */
     /** @type {typeof FeatureExtractor.from_pretrained} */
     static async from_pretrained(pretrained_model_name_or_path, options={}) {

package/src/models/auto/processing_auto.js CHANGED Viewed

@@ -40,22 +40,6 @@ import * as AllFeatureExtractors from '../feature_extractors.js';
  */
 export class AutoProcessor {
-    /**
-     * Instantiate one of the processor classes of the library from a pretrained model.
-     *
-     * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
-     * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
-     *
-     * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
-     * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
-     *   Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-     *   user or organization name, like `dbmdz/bert-base-german-cased`.
-     * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
-     * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
-     *
-     * @returns {Promise<Processor>} A new instance of the Processor class.
-     */
     /** @type {typeof Processor.from_pretrained} */
     static async from_pretrained(pretrained_model_name_or_path, options={}) {

package/src/models/convnext/image_processing_convnext.js CHANGED Viewed

@@ -9,6 +9,7 @@ export class ConvNextImageProcessor extends ImageProcessor {
         /**
          * Percentage of the image to crop. Only has an effect if this.size < 384.
          */
+        // @ts-expect-error TS2339
         this.crop_pct = this.config.crop_pct ?? (224 / 256);
     }

package/src/models/efficientnet/image_processing_efficientnet.js CHANGED Viewed

@@ -5,6 +5,7 @@ import {
 export class EfficientNetImageProcessor extends ImageProcessor {
     constructor(config) {
         super(config);
+        // @ts-expect-error TS2339
         this.include_top = this.config.include_top ?? true;
         if (this.include_top) {
             this.image_std = this.image_std.map(x => x * x);

package/src/models/florence2/processing_florence2.js CHANGED Viewed

@@ -10,8 +10,11 @@ export class Florence2Processor extends Processor {
         super(config, components);
         const {
+            // @ts-expect-error TS2339
             tasks_answer_post_processing_type,
+            // @ts-expect-error TS2339
             task_prompts_without_inputs,
+            // @ts-expect-error TS2339
             task_prompts_with_input,
         } = this.image_processor.config;

package/src/models/idefics3/image_processing_idefics3.js CHANGED Viewed

@@ -146,6 +146,8 @@ export class Idefics3ImageProcessor extends ImageProcessor {
                     const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
                     const end_offset = (i + 1) * pixel_attention_mask_stride;
+                    // @ts-expect-error
                     pixel_attention_mask_data.fill(false, start_offset, end_offset);
                 }
             }

package/src/models/janus/image_processing_janus.js CHANGED Viewed

@@ -13,6 +13,7 @@ export class VLMImageProcessor extends ImageProcessor {
             },
             ...config,
         });
+        // @ts-expect-error TS2339
         this.constant_values = this.config.background_color.map(x => x * this.rescale_factor)
     }

package/src/models/mgp_str/processing_mgp_str.js CHANGED Viewed

@@ -119,6 +119,8 @@ export class MgpstrProcessor extends Processor {
      * - bpe_preds: The list of BPE decoded sentences.
      * - wp_preds: The list of wp decoded sentences.
      */
+    // @ts-expect-error The type of this method is not compatible with the one
+    // in the base class. It might be a good idea to fix this.
     batch_decode([char_logits, bpe_logits, wp_logits]) {
         const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
         const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');

package/src/models/paligemma/processing_paligemma.js CHANGED Viewed

@@ -41,6 +41,7 @@ export class PaliGemmaProcessor extends Processor {
         }
         const bos_token = this.tokenizer.bos_token;
+        // @ts-expect-error TS2339
         const image_seq_length = this.image_processor.config.image_seq_length;
         let input_strings;
         if (text.some((t) => t.includes(IMAGE_TOKEN))) {

package/src/models/phi3_v/processing_phi3_v.js CHANGED Viewed

@@ -14,7 +14,7 @@ export class Phi3VProcessor extends Processor {
      *
      * @param {string|string[]} text
      * @param {RawImage|RawImage[]} images
-     * @param  {...any} args
+     * @param  { { padding?: boolean, truncation?: boolean, num_crops?: number } | undefined } options
      * @returns {Promise<any>}
      */
     async _call(text, images = null, {

package/src/models/pyannote/feature_extraction_pyannote.js CHANGED Viewed

@@ -52,6 +52,7 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor {
             let current_speaker = -1;
             for (let i = 0; i < scores.length; ++i) {
+                /** @type {number[]} */
                 const probabilities = softmax(scores[i]);
                 const [score, id] = max(probabilities);
                 const [start, end] = [i, i + 1];

package/src/models/qwen2_vl/processing_qwen2_vl.js CHANGED Viewed

@@ -28,6 +28,7 @@ export class Qwen2VLProcessor extends Processor {
         }
         if (image_grid_thw) {
+            // @ts-expect-error TS2551
             let merge_length = this.image_processor.config.merge_size ** 2;
             let index = 0;

package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js CHANGED Viewed

@@ -133,8 +133,8 @@ export class SeamlessM4TFeatureExtractor extends FeatureExtractor {
                         'int64',
                         new BigInt64Array(numPaddedFrames),
                         [1, numPaddedFrames],
-                    )
-                    padded_attention_mask.data.fill(1n, 0, num_frames);
+                    );
+                    /** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
                 }
             }
         }

package/src/models/whisper/feature_extraction_whisper.js CHANGED Viewed

@@ -44,7 +44,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
         )
         const data = features.data;
-        const maxValue = max(data)[0];
+        const maxValue = max(/** @type {Float32Array} */(data))[0];
         for (let i = 0; i < data.length; ++i) {
             data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;

package/src/models.js CHANGED Viewed

@@ -270,8 +270,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
     } else if (session_options.externalData !== undefined) {
         externalDataPromises = session_options.externalData.map(async (ext) => {
             // if the external data is a string, fetch the file and replace the string with its content
+            // @ts-expect-error TS2339
             if (typeof ext.data === "string") {
+                // @ts-expect-error TS2339
                 const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options);
+                // @ts-expect-error TS2698
                 return { ...ext, data: ext_buffer };
             }
             return ext;
@@ -1519,6 +1522,7 @@ export class PreTrainedModel extends Callable {
                 if (this.config.model_type === 'musicgen') {
                     // Custom logic (TODO: move to Musicgen class)
                     decoder_input_ids = Array.from({
+                        // @ts-expect-error TS2339
                         length: batch_size * this.config.decoder.num_codebooks
                     }, () => [decoder_start_token_id]);
@@ -1848,11 +1852,13 @@ export class PreTrainedModel extends Callable {
     async encode_image({ pixel_values }) {
         // image_inputs === { pixel_values }
         const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
+        // @ts-expect-error TS2339
         if (!this.config.num_image_tokens) {
             console.warn(
                 'The number of image tokens was not set in the model configuration. ' +
                 `Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
             )
+            // @ts-expect-error TS2339
             this.config.num_image_tokens = features.dims[1];
         }
         return features;
@@ -3280,6 +3286,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
         if (generation_config.return_token_timestamps) {
             outputs["token_timestamps"] = this._extract_token_timestamps(
+                // @ts-expect-error TS2345
                 outputs,
                 generation_config.alignment_heads,
                 generation_config.num_frames,
@@ -3315,6 +3322,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
             );
         }
+        // @ts-expect-error TS2339
         let median_filter_width = this.config.median_filter_width;
         if (median_filter_width === undefined) {
             console.warn("Model config has no `median_filter_width`, using default value of 7.")
@@ -3325,6 +3333,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
         const batch = generate_outputs.cross_attentions;
         // Create a list with `decoder_layers` elements, each a tensor of shape
         // (batch size, attention_heads, output length, input length).
+        // @ts-expect-error TS2339
         const cross_attentions = Array.from({ length: this.config.decoder_layers },
             // Concatenate the cross attentions for each layer across sequence length dimension.
             (_, i) => cat(batch.map(x => x[i]), 2)
@@ -3468,6 +3477,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
         attention_mask,
     }) {
+        // @ts-expect-error TS2339
         const image_token_index = this.config.image_token_index;
         const idsList = input_ids.tolist();
@@ -4453,6 +4463,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
                 const image_nums = vision_tokens.filter(x => x == image_token_id).length;
                 const video_nums = vision_tokens.filter(x => x == video_token_id).length;
+                /** @type {number[][]} */
                 let llm_pos_ids_list = [];
                 let st = 0;
                 let remain_images = image_nums;
@@ -4522,6 +4533,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
                 // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
                 // meaning to perform concatenation along dim=1, we can do the following:
                 const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
+                /** @type {number[]} */
                 const llm_positions = new Array(num_items);
                 let index = 0;
                 for (let x = 0; x < 3; ++x) {
@@ -4562,9 +4574,10 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
                     { length: 3 * data.length },
                     (_, i) => data[i % data.length]
                 );
+                /** @type {bigint[]} */
                 const mrope_position_deltas = Array.from(
                     { length: dims[0] },
-                    (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1 + dims[1]
+                    (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
                 );
                 return [
@@ -5135,7 +5148,7 @@ export class DPTModel extends DPTPreTrainedModel { }
  *
  * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
  * ```javascript
- * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
+ * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
  *
  * // Load model and processor
  * const model_id = 'Xenova/dpt-hybrid-midas';
@@ -5144,7 +5157,7 @@ export class DPTModel extends DPTPreTrainedModel { }
  *
  * // Load image from URL
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
- * const image = await RawImage.fromURL(url);
+ * const image = await RawImage.read(url);
  *
  * // Prepare image for the model
  * const inputs = await processor(image);
@@ -5153,10 +5166,15 @@ export class DPTModel extends DPTPreTrainedModel { }
  * const { predicted_depth } = await model(inputs);
  *
  * // Interpolate to original size
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
+     * size: image.size.reverse(),
+     * mode: 'bilinear',
+ * })).squeeze(1);
  *
  * // Visualize the prediction
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+ * const min = prediction.min().item();
+ * const max = prediction.max().item();
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
  * const depth = RawImage.fromTensor(formatted);
  * // RawImage {
  * //   data: Uint8Array(307200) [ 85, 85, 84, ... ],
@@ -5206,11 +5224,7 @@ export class GLPNPreTrainedModel extends PreTrainedModel { }
 export class GLPNModel extends GLPNPreTrainedModel { }
 /**
- * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
- *
- * **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
- * ```javascript
- * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
+ * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
  *
  * // Load model and processor
  * const model_id = 'Xenova/glpn-kitti';
@@ -5219,7 +5233,7 @@ export class GLPNModel extends GLPNPreTrainedModel { }
  *
  * // Load image from URL
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
- * const image = await RawImage.fromURL(url);
+ * const image = await RawImage.read(url);
  *
  * // Prepare image for the model
  * const inputs = await processor(image);
@@ -5228,13 +5242,18 @@ export class GLPNModel extends GLPNPreTrainedModel { }
  * const { predicted_depth } = await model(inputs);
  *
  * // Interpolate to original size
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
+     * size: image.size.reverse(),
+     * mode: 'bilinear',
+ * })).squeeze(1);
  *
  * // Visualize the prediction
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+ * const min = prediction.min().item();
+ * const max = prediction.max().item();
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
  * const depth = RawImage.fromTensor(formatted);
  * // RawImage {
- * //   data: Uint8Array(307200) [ 207, 169, 154, ... ],
+ * //   data: Uint8Array(307200) [ 85, 85, 84, ... ],
  * //   width: 640,
  * //   height: 480,
  * //   channels: 1
@@ -6201,10 +6220,12 @@ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
         const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);
+        // @ts-expect-error TS2339
         const r = encoder_outputs.dims[1] / this.config.reduction_factor;
         const maxlen = Math.floor(r * maxlenratio);
         const minlen = Math.floor(r * minlenratio);
+        // @ts-expect-error TS2339
         const num_mel_bins = this.config.num_mel_bins;
         let spectrogramParts = [];
@@ -6569,11 +6590,13 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
      */
     _apply_and_filter_by_delay_pattern_mask(outputs) {
         const [bs_x_codebooks, seqLength] = outputs.dims;
+        // @ts-expect-error TS2339
         const num_codebooks = this.config.decoder.num_codebooks;
         const upperBound = (seqLength - num_codebooks);
         let newDataSize = 0;
         for (let i = 0; i < outputs.size; ++i) {
+            // @ts-expect-error TS2339
             if (outputs.data[i] === this.config.decoder.pad_token_id) {
                 continue;
             }
@@ -6603,7 +6626,9 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
         let clonedInputIds = structuredClone(input_ids);
         for (let i = 0; i < clonedInputIds.length; ++i) {
             for (let j = 0; j < clonedInputIds[i].length; ++j) {
+                // @ts-expect-error TS2339
                 if ((i % this.config.decoder.num_codebooks) >= j) {
+                    // @ts-expect-error TS2339
                     clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
                 }
             }
@@ -6760,6 +6785,9 @@ export class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
         'past_key_values',
     ];
+    /**
+     * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
+     */
     constructor(...args) {
         super(...args);
@@ -7728,10 +7756,17 @@ export class SequenceClassifierOutput extends ModelOutput {
     /**
      * @param {Object} output The output of the model.
      * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
+     * @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+     * Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
      */
-    constructor({ logits }) {
+    constructor({ logits, ...attentions }) {
         super();
         this.logits = logits;
+        const attentions_list = Object.values(attentions);
+        if (attentions_list.length > 0) {
+            // Only set attentions if they are not empty
+            this.attentions = attentions_list;
+        }
     }
 }

package/src/ops/registry.js CHANGED Viewed

@@ -36,6 +36,16 @@ export class TensorOpRegistry {
         // executionProviders: ['webgpu'],
     };
+    static get nearest_interpolate_4d() {
+        if (!this._nearest_interpolate_4d) {
+            this._nearest_interpolate_4d = wrap(
+                [8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
+                this.session_options,
+                'y',
+            );
+        }
+        return this._nearest_interpolate_4d;
+    }
     static get bilinear_interpolate_4d() {
         if (!this._bilinear_interpolate_4d) {
             this._bilinear_interpolate_4d = wrap(