@huggingface/transformers 3.2.2 → 3.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/README.md +3 -2
  2. package/dist/transformers.cjs +252 -113
  3. package/dist/transformers.cjs.map +1 -1
  4. package/dist/transformers.js +256 -114
  5. package/dist/transformers.js.map +1 -1
  6. package/dist/transformers.min.cjs +1 -1
  7. package/dist/transformers.min.cjs.map +1 -1
  8. package/dist/transformers.min.js +1 -1
  9. package/dist/transformers.min.js.map +1 -1
  10. package/dist/transformers.min.mjs +1 -1
  11. package/dist/transformers.min.mjs.map +1 -1
  12. package/dist/transformers.mjs +256 -114
  13. package/dist/transformers.mjs.map +1 -1
  14. package/package.json +2 -2
  15. package/src/base/feature_extraction_utils.js +9 -9
  16. package/src/base/image_processors_utils.js +11 -0
  17. package/src/base/processing_utils.js +13 -3
  18. package/src/configs.js +5 -0
  19. package/src/env.js +1 -1
  20. package/src/models/auto/feature_extraction_auto.js +0 -16
  21. package/src/models/auto/processing_auto.js +0 -16
  22. package/src/models/convnext/image_processing_convnext.js +1 -0
  23. package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
  24. package/src/models/florence2/processing_florence2.js +3 -0
  25. package/src/models/idefics3/image_processing_idefics3.js +2 -0
  26. package/src/models/janus/image_processing_janus.js +1 -0
  27. package/src/models/mgp_str/processing_mgp_str.js +2 -0
  28. package/src/models/paligemma/processing_paligemma.js +1 -0
  29. package/src/models/phi3_v/processing_phi3_v.js +1 -1
  30. package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
  31. package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
  32. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
  33. package/src/models/whisper/feature_extraction_whisper.js +1 -1
  34. package/src/models.js +93 -36
  35. package/src/ops/registry.js +10 -0
  36. package/src/pipelines.js +34 -7
  37. package/src/tokenizers.js +4 -7
  38. package/src/utils/dtypes.js +2 -0
  39. package/src/utils/hub.js +1 -1
  40. package/src/utils/maths.js +8 -6
  41. package/src/utils/tensor.js +42 -10
  42. package/types/base/feature_extraction_utils.d.ts +7 -7
  43. package/types/base/image_processors_utils.d.ts.map +1 -1
  44. package/types/base/processing_utils.d.ts +17 -19
  45. package/types/base/processing_utils.d.ts.map +1 -1
  46. package/types/configs.d.ts.map +1 -1
  47. package/types/generation/parameters.d.ts +1 -1
  48. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
  49. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  50. package/types/models/auto/processing_auto.d.ts.map +1 -1
  51. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
  52. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
  53. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  54. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
  55. package/types/models/janus/image_processing_janus.d.ts.map +1 -1
  56. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
  57. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
  58. package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
  59. package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
  60. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  61. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  62. package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
  63. package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
  64. package/types/models/whisper/generation_whisper.d.ts +1 -1
  65. package/types/models/whisper/generation_whisper.d.ts.map +1 -1
  66. package/types/models.d.ts +48 -17
  67. package/types/models.d.ts.map +1 -1
  68. package/types/ops/registry.d.ts +1 -0
  69. package/types/ops/registry.d.ts.map +1 -1
  70. package/types/pipelines.d.ts +2 -2
  71. package/types/pipelines.d.ts.map +1 -1
  72. package/types/tokenizers.d.ts.map +1 -1
  73. package/types/tsconfig.tsbuildinfo +1 -0
  74. package/types/utils/dtypes.d.ts.map +1 -1
  75. package/types/utils/hub.d.ts +1 -1
  76. package/types/utils/hub.d.ts.map +1 -1
  77. package/types/utils/image.d.ts +3 -2
  78. package/types/utils/image.d.ts.map +1 -1
  79. package/types/utils/maths.d.ts +8 -6
  80. package/types/utils/maths.d.ts.map +1 -1
  81. package/types/utils/tensor.d.ts +8 -4
  82. package/types/utils/tensor.d.ts.map +1 -1
package/src/models.js CHANGED
@@ -270,8 +270,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
  } else if (session_options.externalData !== undefined) {
  externalDataPromises = session_options.externalData.map(async (ext) => {
  // if the external data is a string, fetch the file and replace the string with its content
+ // @ts-expect-error TS2339
  if (typeof ext.data === "string") {
+ // @ts-expect-error TS2339
  const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options);
+ // @ts-expect-error TS2698
  return { ...ext, data: ext_buffer };
  }
  return ext;
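Note: this release suppresses known config-typing gaps with `@ts-expect-error` rather than widening the types. A minimal sketch of how the directive behaves under `// @ts-check` (the `ExternalDataEntry` shape below is illustrative, not taken from the package):

```javascript
// @ts-check

/** @typedef {{ path: string }} ExternalDataEntry */

/** @param {ExternalDataEntry} ext */
function describe(ext) {
    // `data` is not declared on ExternalDataEntry, so the checker flags the
    // next line with TS2339. The directive suppresses exactly that line, and
    // itself becomes an error ("unused directive") if the line turns clean.
    // @ts-expect-error TS2339
    return typeof ext.data === 'string' ? 'external file' : 'inline buffer';
}

console.log(describe({ path: 'model.onnx_data' })); // 'inline buffer'
```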
@@ -1519,6 +1522,7 @@ export class PreTrainedModel extends Callable {
  if (this.config.model_type === 'musicgen') {
  // Custom logic (TODO: move to Musicgen class)
  decoder_input_ids = Array.from({
+ // @ts-expect-error TS2339
  length: batch_size * this.config.decoder.num_codebooks
  }, () => [decoder_start_token_id]);

@@ -1848,11 +1852,13 @@ export class PreTrainedModel extends Callable {
  async encode_image({ pixel_values }) {
  // image_inputs === { pixel_values }
  const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
+ // @ts-expect-error TS2339
  if (!this.config.num_image_tokens) {
  console.warn(
  'The number of image tokens was not set in the model configuration. ' +
  `Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
  )
+ // @ts-expect-error TS2339
  this.config.num_image_tokens = features.dims[1];
  }
  return features;
@@ -3280,6 +3286,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {

  if (generation_config.return_token_timestamps) {
  outputs["token_timestamps"] = this._extract_token_timestamps(
+ // @ts-expect-error TS2345
  outputs,
  generation_config.alignment_heads,
  generation_config.num_frames,
@@ -3315,6 +3322,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
  );
  }

+ // @ts-expect-error TS2339
  let median_filter_width = this.config.median_filter_width;
  if (median_filter_width === undefined) {
  console.warn("Model config has no `median_filter_width`, using default value of 7.")
@@ -3325,6 +3333,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
  const batch = generate_outputs.cross_attentions;
  // Create a list with `decoder_layers` elements, each a tensor of shape
  // (batch size, attention_heads, output length, input length).
+ // @ts-expect-error TS2339
  const cross_attentions = Array.from({ length: this.config.decoder_layers },
  // Concatenate the cross attentions for each layer across sequence length dimension.
  (_, i) => cat(batch.map(x => x[i]), 2)
@@ -3421,7 +3430,7 @@ export class MoonshinePreTrainedModel extends PreTrainedModel {
  */
  export class MoonshineModel extends MoonshinePreTrainedModel { }

- export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel { }
+ export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel { }
  //////////////////////////////////////////////////


@@ -3468,6 +3477,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
  attention_mask,
  }) {

+ // @ts-expect-error TS2339
  const image_token_index = this.config.image_token_index;

  const idsList = input_ids.tolist();
@@ -3821,9 +3831,9 @@ export class CLIPTextModel extends CLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'text_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'text_model',
  });
  }
  }
@@ -3858,9 +3868,9 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'text_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'text_model',
  });
  }
  }
@@ -3872,9 +3882,9 @@ export class CLIPVisionModel extends CLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'vision_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'vision_model',
  });
  }
  }
@@ -3909,9 +3919,9 @@ export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'vision_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'vision_model',
  });
  }
  }
@@ -3997,9 +4007,9 @@ export class SiglipTextModel extends SiglipPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'text_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'text_model',
  });
  }
  }
@@ -4034,9 +4044,9 @@ export class SiglipVisionModel extends CLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'vision_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'vision_model',
  });
  }
  }
@@ -4093,9 +4103,9 @@ export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'text_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'text_model',
  });
  }
  }
@@ -4104,9 +4114,9 @@ export class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'vision_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'vision_model',
  });
  }
  }
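Note: the reordering in the `from_pretrained` overrides above fixes a subtle default-option bug. With the default listed before `...options`, a caller passing an explicit `model_file_name: undefined` clobbers the default; spreading first and applying a `??` fallback keeps the default in that case while still honoring real overrides. A minimal sketch (names are illustrative):

```javascript
// Old pattern: the spread can overwrite the default with `undefined`.
const oldMerge = (options = {}) => ({ model_file_name: 'text_model', ...options });

// New pattern: the default wins whenever the caller's value is null/undefined.
const newMerge = (options = {}) => ({
    ...options,
    model_file_name: options.model_file_name ?? 'text_model',
});

console.log(oldMerge({ model_file_name: undefined }).model_file_name); // undefined (bug)
console.log(newMerge({ model_file_name: undefined }).model_file_name); // 'text_model'
console.log(newMerge({ model_file_name: 'custom' }).model_file_name);  // 'custom'
```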
@@ -4453,6 +4463,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
  const image_nums = vision_tokens.filter(x => x == image_token_id).length;
  const video_nums = vision_tokens.filter(x => x == video_token_id).length;

+ /** @type {number[][]} */
  let llm_pos_ids_list = [];
  let st = 0;
  let remain_images = image_nums;
@@ -4522,6 +4533,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
  // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
  // meaning to perform concatenation along dim=1, we can do the following:
  const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
+ /** @type {number[]} */
  const llm_positions = new Array(num_items);
  let index = 0;
  for (let x = 0; x < 3; ++x) {
@@ -4562,9 +4574,10 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
  { length: 3 * data.length },
  (_, i) => data[i % data.length]
  );
+ /** @type {bigint[]} */
  const mrope_position_deltas = Array.from(
  { length: dims[0] },
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1 + dims[1]
+ (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
  );

  return [
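Note: the `1n + BigInt(dims[1])` change matters because the position data here holds bigints (see the new `/** @type {bigint[]} */` annotation), so `max(...)[0]` yields a `bigint`, and JavaScript throws rather than coercing when `bigint` and `number` meet in arithmetic. A standalone illustration:

```javascript
const data = new BigInt64Array([3n, 7n, 5n]);
const maxVal = data.reduce((a, b) => (b > a ? b : a)); // 7n (a bigint)

try {
    // Mixing bigint and number in `+` is a TypeError, not a silent coercion.
    const bad = maxVal + 1;
} catch (e) {
    console.log(e instanceof TypeError); // true
}

// Keep the whole expression in bigint arithmetic instead:
const good = maxVal + 1n + BigInt(4);
console.log(good); // 12n
```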
@@ -5135,7 +5148,7 @@ export class DPTModel extends DPTPreTrainedModel { }
  *
  * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
  * ```javascript
- * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
+ * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
  *
  * // Load model and processor
  * const model_id = 'Xenova/dpt-hybrid-midas';
@@ -5144,7 +5157,7 @@ export class DPTModel extends DPTPreTrainedModel { }
  *
  * // Load image from URL
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
- * const image = await RawImage.fromURL(url);
+ * const image = await RawImage.read(url);
  *
  * // Prepare image for the model
  * const inputs = await processor(image);
@@ -5153,10 +5166,15 @@ export class DPTModel extends DPTPreTrainedModel { }
  * const { predicted_depth } = await model(inputs);
  *
  * // Interpolate to original size
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
+ * size: image.size.reverse(),
+ * mode: 'bilinear',
+ * })).squeeze(1);
  *
  * // Visualize the prediction
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+ * const min = prediction.min().item();
+ * const max = prediction.max().item();
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
  * const depth = RawImage.fromTensor(formatted);
  * // RawImage {
  * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
@@ -5206,11 +5224,7 @@ export class GLPNPreTrainedModel extends PreTrainedModel { }
  export class GLPNModel extends GLPNPreTrainedModel { }

  /**
- * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
- *
- * **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
- * ```javascript
- * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
+ * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
  *
  * // Load model and processor
  * const model_id = 'Xenova/glpn-kitti';
@@ -5219,7 +5233,7 @@ export class GLPNModel extends GLPNPreTrainedModel { }
  *
  * // Load image from URL
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
- * const image = await RawImage.fromURL(url);
+ * const image = await RawImage.read(url);
  *
  * // Prepare image for the model
  * const inputs = await processor(image);
@@ -5228,13 +5242,18 @@ export class GLPNModel extends GLPNPreTrainedModel { }
  * const { predicted_depth } = await model(inputs);
  *
  * // Interpolate to original size
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
+ * size: image.size.reverse(),
+ * mode: 'bilinear',
+ * })).squeeze(1);
  *
  * // Visualize the prediction
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+ * const min = prediction.min().item();
+ * const max = prediction.max().item();
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
  * const depth = RawImage.fromTensor(formatted);
  * // RawImage {
- * // data: Uint8Array(307200) [ 207, 169, 154, ... ],
+ * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
  * // width: 640,
  * // height: 480,
  * // channels: 1
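Note: the doc examples above now route through `interpolate_4d`, which operates on rank-4 (batch, channel, height, width) tensors, so a rank-3 depth map must gain and then shed a dimension. A minimal sketch of that shape bookkeeping against the public API (the tiny tensor values are made up for illustration):

```javascript
import { Tensor, interpolate_4d } from '@huggingface/transformers';

// A fake (1, 2, 2) "predicted depth": one image, 2x2 spatial grid.
const predicted_depth = new Tensor('float32', [0, 1, 2, 3], [1, 2, 2]);

// interpolate_4d expects (batch, channels, height, width): insert a
// channel axis, resize spatially, then remove the axis again.
const resized = (await interpolate_4d(predicted_depth.unsqueeze(1), {
    size: [4, 4],    // target [height, width], as in the examples above
    mode: 'bilinear',
})).squeeze(1);

console.log(resized.dims); // [1, 4, 4]
```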
@@ -5389,6 +5408,26 @@ export class Dinov2ForImageClassification extends Dinov2PreTrainedModel {
  }
  //////////////////////////////////////////////////

+ //////////////////////////////////////////////////
+ export class Dinov2WithRegistersPreTrainedModel extends PreTrainedModel { }
+
+ /**
+ * The bare Dinov2WithRegisters Model transformer outputting raw hidden-states without any specific head on top.
+ */
+ export class Dinov2WithRegistersModel extends Dinov2WithRegistersPreTrainedModel { }
+
+ /**
+ * Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet.
+ */
+ export class Dinov2WithRegistersForImageClassification extends Dinov2WithRegistersPreTrainedModel {
+ /**
+ * @param {any} model_inputs
+ */
+ async _call(model_inputs) {
+ return new SequenceClassifierOutput(await super._call(model_inputs));
+ }
+ }
+ //////////////////////////////////////////////////

  //////////////////////////////////////////////////
  export class YolosPreTrainedModel extends PreTrainedModel { }
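Note: with the new classes wired into the auto mappings (see the `dinov2_with_registers` entries further down), such checkpoints become loadable through the usual pipeline path. A hedged sketch — the model id below is a placeholder, not a checkpoint named anywhere in this diff:

```javascript
import { pipeline } from '@huggingface/transformers';

// Hypothetical ONNX checkpoint id; substitute a real dinov2_with_registers
// conversion from the Hugging Face Hub.
const classifier = await pipeline(
    'image-classification',
    'your-org/dinov2-with-registers-imagenet', // placeholder
);

const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
console.log(await classifier(url)); // e.g. [{ label: '...', score: 0.9 }]
```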
@@ -6181,10 +6220,12 @@ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {

  const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);

+ // @ts-expect-error TS2339
  const r = encoder_outputs.dims[1] / this.config.reduction_factor;
  const maxlen = Math.floor(r * maxlenratio);
  const minlen = Math.floor(r * minlenratio);

+ // @ts-expect-error TS2339
  const num_mel_bins = this.config.num_mel_bins;

  let spectrogramParts = [];
@@ -6338,9 +6379,9 @@ export class ClapTextModelWithProjection extends ClapPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'text_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'text_model',
  });
  }
  }
@@ -6375,9 +6416,9 @@ export class ClapAudioModelWithProjection extends ClapPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'audio_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'audio_model',
  });
  }
  }
@@ -6549,11 +6590,13 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
  */
  _apply_and_filter_by_delay_pattern_mask(outputs) {
  const [bs_x_codebooks, seqLength] = outputs.dims;
+ // @ts-expect-error TS2339
  const num_codebooks = this.config.decoder.num_codebooks;
  const upperBound = (seqLength - num_codebooks);

  let newDataSize = 0;
  for (let i = 0; i < outputs.size; ++i) {
+ // @ts-expect-error TS2339
  if (outputs.data[i] === this.config.decoder.pad_token_id) {
  continue;
  }
@@ -6583,7 +6626,9 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
  let clonedInputIds = structuredClone(input_ids);
  for (let i = 0; i < clonedInputIds.length; ++i) {
  for (let j = 0; j < clonedInputIds[i].length; ++j) {
+ // @ts-expect-error TS2339
  if ((i % this.config.decoder.num_codebooks) >= j) {
+ // @ts-expect-error TS2339
  clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
  }
  }
@@ -6740,6 +6785,9 @@ export class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
  'past_key_values',
  ];

+ /**
+ * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
+ */
  constructor(...args) {
  super(...args);

@@ -7018,6 +7066,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
  ['convnext', ['ConvNextModel', ConvNextModel]],
  ['convnextv2', ['ConvNextV2Model', ConvNextV2Model]],
  ['dinov2', ['Dinov2Model', Dinov2Model]],
+ ['dinov2_with_registers', ['Dinov2WithRegistersModel', Dinov2WithRegistersModel]],
  ['resnet', ['ResNetModel', ResNetModel]],
  ['swin', ['SwinModel', SwinModel]],
  ['swin2sr', ['Swin2SRModel', Swin2SRModel]],
@@ -7263,6 +7312,7 @@ const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
  ['convnext', ['ConvNextForImageClassification', ConvNextForImageClassification]],
  ['convnextv2', ['ConvNextV2ForImageClassification', ConvNextV2ForImageClassification]],
  ['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]],
+ ['dinov2_with_registers', ['Dinov2WithRegistersForImageClassification', Dinov2WithRegistersForImageClassification]],
  ['resnet', ['ResNetForImageClassification', ResNetForImageClassification]],
  ['swin', ['SwinForImageClassification', SwinForImageClassification]],
  ['segformer', ['SegformerForImageClassification', SegformerForImageClassification]],
@@ -7706,10 +7756,17 @@ export class SequenceClassifierOutput extends ModelOutput {
  /**
  * @param {Object} output The output of the model.
  * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
+ * @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+ * Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
  */
- constructor({ logits }) {
+ constructor({ logits, ...attentions }) {
  super();
  this.logits = logits;
+ const attentions_list = Object.values(attentions);
+ if (attentions_list.length > 0) {
+ // Only set attentions if they are not empty
+ this.attentions = attentions_list;
+ }
  }
  }

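Note: the constructor change above uses rest destructuring to sweep every non-`logits` session output into one object, so per-layer attention tensors are collected without hard-coding their exported names. The mechanics in isolation (plain objects stand in for Tensor instances, and the key names are illustrative):

```javascript
const modelOutput = {
    logits: { dims: [1, 2] },
    attention_0: { dims: [1, 12, 8, 8] },
    attention_1: { dims: [1, 12, 8, 8] },
};

// `logits` is pulled out; everything else lands in `attentions`.
const { logits, ...attentions } = modelOutput;
const attentions_list = Object.values(attentions);

console.log(Object.keys(attentions)); // ['attention_0', 'attention_1']
console.log(attentions_list.length);  // 2 -> this.attentions gets set
```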
package/src/ops/registry.js CHANGED
@@ -36,6 +36,16 @@ export class TensorOpRegistry {
  // executionProviders: ['webgpu'],
  };

+ static get nearest_interpolate_4d() {
+ if (!this._nearest_interpolate_4d) {
+ this._nearest_interpolate_4d = wrap(
+ [8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
+ this.session_options,
+ 'y',
+ );
+ }
+ return this._nearest_interpolate_4d;
+ }
  static get bilinear_interpolate_4d() {
  if (!this._bilinear_interpolate_4d) {
  this._bilinear_interpolate_4d = wrap(
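Note: the new `nearest_interpolate_4d` getter follows the registry's existing lazy-init pattern — the byte array is a pre-serialized single-op ONNX graph (the bytes spell out `Resize` and `mode: "nearest"`), and the getter builds the session once, caching it for later reads. The caching pattern in isolation (the session builder is simulated):

```javascript
let builds = 0;
function buildSession() {
    ++builds;
    return { run: (inputs) => inputs }; // stand-in for an ONNX session
}

class LazyOpRegistry {
    static get expensive_op() {
        if (!this._expensive_op) {
            // Runs once; subsequent property reads reuse the cached session.
            this._expensive_op = buildSession();
        }
        return this._expensive_op;
    }
}

LazyOpRegistry.expensive_op;
LazyOpRegistry.expensive_op;
console.log(builds); // 1
```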
package/src/pipelines.js CHANGED
@@ -69,7 +69,7 @@ import {
  import {
  Tensor,
  mean_pooling,
- interpolate,
+ interpolate_4d,
  quantize_embeddings,
  topk,
  } from './utils/tensor.js';
@@ -294,6 +294,7 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi

  // TODO: Use softmax tensor function
  const function_to_apply =
+ // @ts-expect-error TS2339
  this.model.config.problem_type === 'multi_label_classification'
  ? batch => batch.sigmoid()
  : batch => new Tensor(
@@ -302,6 +303,7 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi
  batch.dims,
  ); // single_label_classification (default)

+ // @ts-expect-error TS2339
  const id2label = this.model.config.id2label;

  const toReturn = [];
@@ -404,6 +406,7 @@ export class TokenClassificationPipeline extends (/** @type {new (options: TextP
  const outputs = await this.model(model_inputs)

  const logits = outputs.logits;
+ // @ts-expect-error TS2339
  const id2label = this.model.config.id2label;

  const toReturn = [];
@@ -743,11 +746,14 @@ export class Text2TextGenerationPipeline extends (/** @type {new (options: TextP


  // Add global prefix, if present
+ // @ts-expect-error TS2339
  if (this.model.config.prefix) {
+ // @ts-expect-error TS2339
  texts = texts.map(x => this.model.config.prefix + x)
  }

  // Handle task specific params:
+ // @ts-expect-error TS2339
  const task_specific_params = this.model.config.task_specific_params
  if (task_specific_params && task_specific_params[this.task]) {
  // Add prefixes, if present
@@ -1486,6 +1492,7 @@ export class AudioClassificationPipeline extends (/** @type {new (options: Audio
  const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
  const preparedAudios = await prepareAudios(audio, sampling_rate);

+ // @ts-expect-error TS2339
  const id2label = this.model.config.id2label;

  const toReturn = [];
@@ -1796,6 +1803,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
  audio = [/** @type {AudioInput} */ (audio)];
  }

+ // @ts-expect-error TS2339
  const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
  const hop_length = this.processor.feature_extractor.config.hop_length;

@@ -1861,7 +1869,9 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options

  // TODO: Right now we only get top beam
  if (return_timestamps === 'word') {
+ // @ts-expect-error TS2339
  chunk.tokens = data.sequences.tolist()[0];
+ // @ts-expect-error TS2339
  chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
  (/** @type {number} */ x) => round(x, 2)
  );
@@ -1906,7 +1916,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
  const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
  const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });

- const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
+ const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
  toReturn.push({ text });
  }
  return single ? toReturn[0] : toReturn;
@@ -2055,6 +2065,7 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image
  const { pixel_values } = await this.processor(preparedImages);
  const output = await this.model({ pixel_values });

+ // @ts-expect-error TS2339
  const id2label = this.model.config.id2label;

  /** @type {ImageClassificationOutput[]} */
@@ -2169,6 +2180,7 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi
  }
  }

+ // @ts-expect-error TS2339
  const id2label = this.model.config.id2label;

  /** @type {ImageSegmentationPipelineOutput[]} */
@@ -2395,6 +2407,7 @@ export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipe
  const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);

  // Add labels
+ // @ts-expect-error TS2339
  const id2label = this.model.config.id2label;

  // Format output
@@ -2614,6 +2627,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options:
  // Run model
  const output = await this.model.generate({
  inputs: pixel_values,
+ // @ts-expect-error TS2339
  max_length: this.model.config.decoder.max_position_embeddings,
  decoder_input_ids,
  ...generate_kwargs,
@@ -2729,6 +2743,7 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi
  // Generate waveform
  const { waveform } = await this.model(inputs);

+ // @ts-expect-error TS2339
  const sampling_rate = this.model.config.sampling_rate;
  return {
  audio: waveform.data,
@@ -2886,11 +2901,23 @@ export class DepthEstimationPipeline extends (/** @type {new (options: ImagePipe

  const toReturn = [];
  for (let i = 0; i < preparedImages.length; ++i) {
- const prediction = interpolate(predicted_depth[i], preparedImages[i].size.reverse(), 'bilinear', false);
- const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+ const batch = predicted_depth[i];
+ const [height, width] = batch.dims.slice(-2);
+ const [new_width, new_height] = preparedImages[i].size;
+
+ // Interpolate to original size
+ const prediction = (await interpolate_4d(batch.view(1, 1, height, width), {
+ size: [new_height, new_width],
+ mode: 'bilinear',
+ })).view(new_height, new_width);
+
+ const minval = /** @type {number} */(prediction.min().item());
+ const maxval = /** @type {number} */(prediction.max().item());
+ const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
+ const depth = RawImage.fromTensor(formatted);
  toReturn.push({
- predicted_depth: predicted_depth[i],
- depth: RawImage.fromTensor(formatted),
+ predicted_depth: prediction,
+ depth,
  });
  }

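Note: the pipeline now normalizes depth with min-max scaling, `(x - min) / (max - min) * 255`, instead of dividing by the max alone; the old form loses contrast when predictions sit on a non-zero floor. Plain-number arithmetic showing the difference:

```javascript
// Depth values that sit on a high floor (e.g. metric depth in millimetres).
const depths = [1000, 1010, 1020, 1030];

const maxOnly = depths.map(x => Math.round(255 * x / Math.max(...depths)));
console.log(maxOnly); // [248, 250, 253, 255] -- almost no contrast

const lo = Math.min(...depths), hi = Math.max(...depths);
const minMax = depths.map(x => Math.round(255 * (x - lo) / (hi - lo)));
console.log(minMax); // [0, 85, 170, 255] -- full dynamic range
```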
@@ -3368,4 +3395,4 @@ async function loadItems(mapping, model, pretrainedOptions) {
  }

  return result;
- }
+ }
package/src/tokenizers.js CHANGED
@@ -47,10 +47,8 @@ import {
  import { Template } from '@huggingface/jinja';

  import {
- WHISPER_LANGUAGE_MAPPING,
- whisper_language_to_code,
+ WHISPER_LANGUAGE_MAPPING
  } from './models/whisper/common_whisper.js';
- import { GITHUB_ISSUE_URL } from './utils/constants.js';

  /**
  * @typedef {Object} TokenizerProperties Additional tokenizer-specific properties.
@@ -535,7 +533,7 @@ class Unigram extends TokenizerModel {
  * Create a new Unigram tokenizer model.
  * @param {Object} config The configuration object for the Unigram model.
  * @param {number} config.unk_id The ID of the unknown token
- * @param {any[][]} config.vocab A 2D array representing a mapping of tokens to scores.
+ * @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
  * @param {Object} moreConfig Additional configuration object for the Unigram model.
  */
  constructor(config, moreConfig) {
@@ -543,11 +541,10 @@ class Unigram extends TokenizerModel {

  const vocabSize = config.vocab.length;
  this.vocab = new Array(vocabSize);
+ /** @type {number[]} */
  this.scores = new Array(vocabSize);
  for (let i = 0; i < vocabSize; ++i) {
- const piece = config.vocab[i];
- this.vocab[i] = piece[0];
- this.scores[i] = piece[1];
+ [this.vocab[i], this.scores[i]] = config.vocab[i];
  }

  this.unk_token_id = config.unk_id;
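Note: the Unigram loop change relies on destructuring assignment targeting existing array slots, unpacking each `[token, score]` pair into two parallel arrays in one statement. The same idiom in isolation:

```javascript
/** @type {[string, number][]} */
const vocab = [['▁hello', -3.2], ['▁world', -4.1]];

const tokens = new Array(vocab.length);
const scores = new Array(vocab.length);
for (let i = 0; i < vocab.length; ++i) {
    // Assigns vocab[i][0] to tokens[i] and vocab[i][1] to scores[i].
    [tokens[i], scores[i]] = vocab[i];
}

console.log(tokens); // ['▁hello', '▁world']
console.log(scores); // [-3.2, -4.1]
```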
package/src/utils/dtypes.js CHANGED
@@ -1,3 +1,5 @@
+ /// <reference types="@webgpu/types" />
+
  import { apis } from "../env.js";

  import { DEVICE_TYPES } from "./devices.js";
package/src/utils/hub.js CHANGED
@@ -121,7 +121,7 @@ class FileResponse {
  */
  async arrayBuffer() {
  const data = await fs.promises.readFile(this.filePath);
- return data.buffer;
+ return /** @type {ArrayBuffer} */ (data.buffer);
  }

  /**
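Note: the cast is needed because a Node `Buffer`'s `.buffer` property is typed `ArrayBufferLike` (it may in principle be a `SharedArrayBuffer`), while the `Response`-like interface promises a plain `ArrayBuffer`. The underlying buffer can also be larger than the view when the Buffer is pool-allocated; a byte-exact copy sidesteps both issues at the cost of a copy. A sketch of that alternative, not what the package does:

```javascript
import fs from 'fs';

// Copy the Buffer's bytes into a fresh, guaranteed-plain ArrayBuffer of
// exactly the file's length.
async function arrayBufferExact(filePath) {
    const data = await fs.promises.readFile(filePath);
    const ab = new ArrayBuffer(data.byteLength);
    new Uint8Array(ab).set(data);
    return ab;
}
```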
package/src/utils/maths.js CHANGED
@@ -225,8 +225,9 @@ export function magnitude(arr) {

  /**
  * Returns the value and index of the minimum element in an array.
- * @param {number[]|TypedArray} arr array of numbers.
- * @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
+ * @template {number[]|bigint[]|AnyTypedArray} T
+ * @param {T} arr array of numbers.
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
  * @throws {Error} If array is empty.
  */
  export function min(arr) {
@@ -239,14 +240,15 @@ export function min(arr) {
  indexOfMin = i;
  }
  }
- return [min, indexOfMin];
+ return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([min, indexOfMin]);
  }


  /**
  * Returns the value and index of the maximum element in an array.
- * @param {number[]|AnyTypedArray} arr array of numbers.
- * @returns {[number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
+ * @template {number[]|bigint[]|AnyTypedArray} T
+ * @param {T} arr array of numbers.
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
  * @throws {Error} If array is empty.
  */
  export function max(arr) {
@@ -259,7 +261,7 @@ export function max(arr) {
  indexOfMax = i;
  }
  }
- return [Number(max), indexOfMax];
+ return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([max, indexOfMax]);
  }

  function isPowerOfTwo(number) {
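Note: with the `@template` signatures, `min`/`max` now flow the element type through to the result tuple — number inputs yield `[number, number]`, bigint inputs `[bigint, number]` — and `max` no longer forces `Number()` on bigint results, which the Qwen2-VL mrope fix above depends on. The expected behavior, sketched against the same `[value, index]` contract:

```javascript
// Same contract as utils/maths.js max: returns [value, index].
function maxWithIndex(arr) {
    if (arr.length === 0) throw Error('Array must not be empty');
    let best = arr[0], indexOfMax = 0;
    for (let i = 1; i < arr.length; ++i) {
        if (arr[i] > best) { best = arr[i]; indexOfMax = i; }
    }
    return [best, indexOfMax]; // no Number() coercion: bigints stay bigints
}

console.log(maxWithIndex([3, 9, 4]));                       // [9, 1]
console.log(maxWithIndex(new BigInt64Array([3n, 9n, 4n]))); // [9n, 1]
```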