npm - @huggingface/transformers - Versions diffs - 3.2.3 → 3.3.0 - Mend

@huggingface/transformers 3.2.3 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105)

package/README.md +5 -3
package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
package/dist/ort.bundle.min.mjs +2776 -0
package/dist/transformers.cjs +792 -330
package/dist/transformers.cjs.map +1 -1
package/dist/transformers.js +1150 -656
package/dist/transformers.js.map +1 -1
package/dist/transformers.min.cjs +1 -1
package/dist/transformers.min.cjs.map +1 -1
package/dist/transformers.min.js +1 -1
package/dist/transformers.min.js.map +1 -1
package/dist/transformers.min.mjs +1 -1
package/dist/transformers.min.mjs.map +1 -1
package/dist/transformers.mjs +798 -331
package/dist/transformers.mjs.map +1 -1
package/package.json +3 -3
package/src/base/feature_extraction_utils.js +9 -9
package/src/base/image_processors_utils.js +12 -1
package/src/base/processing_utils.js +24 -3
package/src/configs.js +5 -0
package/src/env.js +1 -2
package/src/generation/streamers.js +5 -2
package/src/models/auto/feature_extraction_auto.js +0 -16
package/src/models/auto/processing_auto.js +0 -16
package/src/models/convnext/image_processing_convnext.js +1 -0
package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
package/src/models/florence2/processing_florence2.js +3 -0
package/src/models/grounding_dino/image_processing_grounding_dino.js +29 -0
package/src/models/grounding_dino/processing_grounding_dino.js +101 -0
package/src/models/idefics3/image_processing_idefics3.js +2 -0
package/src/models/image_processors.js +1 -0
package/src/models/janus/image_processing_janus.js +1 -0
package/src/models/mgp_str/processing_mgp_str.js +2 -0
package/src/models/paligemma/processing_paligemma.js +1 -0
package/src/models/phi3_v/processing_phi3_v.js +1 -1
package/src/models/processors.js +3 -2
package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
package/src/models/whisper/feature_extraction_whisper.js +1 -1
package/src/models.js +72 -20
package/src/ops/registry.js +10 -0
package/src/pipelines.js +73 -23
package/src/tokenizers.js +4 -7
package/src/utils/audio.js +113 -1
package/src/utils/core.js +26 -0
package/src/utils/dtypes.js +2 -0
package/src/utils/hub.js +1 -1
package/src/utils/image.js +5 -18
package/src/utils/maths.js +8 -6
package/src/utils/tensor.js +134 -114
package/types/base/feature_extraction_utils.d.ts +7 -7
package/types/base/image_processors_utils.d.ts +7 -0
package/types/base/image_processors_utils.d.ts.map +1 -1
package/types/base/processing_utils.d.ts +25 -19
package/types/base/processing_utils.d.ts.map +1 -1
package/types/configs.d.ts.map +1 -1
package/types/generation/parameters.d.ts +1 -1
package/types/generation/streamers.d.ts +3 -1
package/types/generation/streamers.d.ts.map +1 -1
package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
package/types/models/auto/image_processing_auto.d.ts.map +1 -1
package/types/models/auto/processing_auto.d.ts.map +1 -1
package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
package/types/models/florence2/processing_florence2.d.ts.map +1 -1
package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +20 -0
package/types/models/grounding_dino/image_processing_grounding_dino.d.ts.map +1 -0
package/types/models/grounding_dino/processing_grounding_dino.d.ts +27 -0
package/types/models/grounding_dino/processing_grounding_dino.d.ts.map +1 -0
package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
package/types/models/image_processors.d.ts +1 -0
package/types/models/janus/image_processing_janus.d.ts.map +1 -1
package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
package/types/models/processors.d.ts +3 -2
package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
package/types/models/whisper/generation_whisper.d.ts +1 -1
package/types/models/whisper/generation_whisper.d.ts.map +1 -1
package/types/models.d.ts +40 -17
package/types/models.d.ts.map +1 -1
package/types/ops/registry.d.ts +1 -0
package/types/ops/registry.d.ts.map +1 -1
package/types/pipelines.d.ts +7 -12
package/types/pipelines.d.ts.map +1 -1
package/types/tokenizers.d.ts.map +1 -1
package/types/tsconfig.tsbuildinfo +1 -0
package/types/utils/audio.d.ts +25 -0
package/types/utils/audio.d.ts.map +1 -1
package/types/utils/core.d.ts +6 -0
package/types/utils/core.d.ts.map +1 -1
package/types/utils/dtypes.d.ts.map +1 -1
package/types/utils/hub.d.ts +1 -1
package/types/utils/hub.d.ts.map +1 -1
package/types/utils/image.d.ts +3 -2
package/types/utils/image.d.ts.map +1 -1
package/types/utils/maths.d.ts +8 -6
package/types/utils/maths.d.ts.map +1 -1
package/types/utils/tensor.d.ts +22 -6
package/types/utils/tensor.d.ts.map +1 -1

package/dist/transformers.cjs CHANGED

@@ -4158,23 +4158,23 @@ class FeatureExtractor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Ca
     }
     /**
-     * Instantiate one of the processor classes of the library from a pretrained model.
+     * Instantiate one of the feature extractor classes of the library from a pretrained model.
      *
-     * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
-     * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
+     * The feature extractor class to instantiate is selected based on the `feature_extractor_type` property of
+     * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
      *
      * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
-     * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
+     * - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on huggingface.co.
      *   Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
      *   user or organization name, like `dbmdz/bert-base-german-cased`.
-     * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
-     * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
+     * - A path to a *directory* containing feature_extractor files, e.g., `./my_model_directory/`.
+     * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the feature_extractor.
      *
-     * @returns {Promise<FeatureExtractor>} A new instance of the Processor class.
+     * @returns {Promise<FeatureExtractor>} A new instance of the Feature Extractor class.
      */
     static async from_pretrained(pretrained_model_name_or_path, options) {
-        const preprocessorConfig = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
-        return new this(preprocessorConfig);
+        const config = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
+        return new this(config);
     }
 }
@@ -4207,6 +4207,7 @@ function validate_audio_inputs(audio, feature_extractor) {
 __webpack_require__.r(__webpack_exports__);
 /* harmony export */ __webpack_require__.d(__webpack_exports__, {
 /* harmony export */   ImageProcessor: () => (/* binding */ ImageProcessor),
+/* harmony export */   center_to_corners_format: () => (/* binding */ center_to_corners_format),
 /* harmony export */   post_process_instance_segmentation: () => (/* binding */ post_process_instance_segmentation),
 /* harmony export */   post_process_object_detection: () => (/* binding */ post_process_object_detection),
 /* harmony export */   post_process_panoptic_segmentation: () => (/* binding */ post_process_panoptic_segmentation),
@@ -4825,14 +4826,20 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
         this.do_thumbnail = config.do_thumbnail;
         this.size = config.size ?? config.image_size;
         this.do_resize = config.do_resize ?? (this.size !== undefined);
+        // @ts-expect-error TS2339
         this.size_divisibility = config.size_divisibility ?? config.size_divisor;
         this.do_center_crop = config.do_center_crop;
+        // @ts-expect-error TS2339
         this.crop_size = config.crop_size;
+        // @ts-expect-error TS2339
         this.do_convert_rgb = config.do_convert_rgb ?? true;
+        // @ts-expect-error TS2339
         this.do_crop_margin = config.do_crop_margin;
+        // @ts-expect-error TS2339
         this.pad_size = config.pad_size;
+        // @ts-expect-error TS2339
         this.do_pad = config.do_pad;
         if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
@@ -5041,6 +5048,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
         // Support both formats for backwards compatibility
         else if (Number.isInteger(size)) {
             shortest_edge = size;
+            // @ts-expect-error TS2339
             longest_edge = this.config.max_size ?? shortest_edge;
         } else if (size !== undefined) {
@@ -5109,6 +5117,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
         } else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
             // Custom resize logic for Qwen2-VL models
             const { min_pixels, max_pixels } = size;
+            // @ts-expect-error TS2339
             const factor = this.config.patch_size * this.config.merge_size;
             return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
         } else {
@@ -5124,6 +5133,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
     async resize(image) {
         const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
         return await image.resize(newWidth, newHeight, {
+            // @ts-expect-error TS2322
             resample: this.resample,
         });
     }
@@ -5174,6 +5184,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
         // Resize the image using thumbnail method.
         if (this.do_thumbnail) {
+            // @ts-expect-error TS2345
             image = await this.thumbnail(image, this.size, this.resample);
         }
@@ -5198,6 +5209,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
         // NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
         // occurs with data in the hwc format (height, width, channels),
         // to emulate the behavior of the original Python code (w/ numpy).
+        /** @type {Float32Array} */
         let pixelData = Float32Array.from(image.data);
         let imgDims = [image.height, image.width, image.channels];
@@ -5356,6 +5368,7 @@ __webpack_require__.r(__webpack_exports__);
 /**
  * @typedef {Object} ProcessorProperties Additional processor-specific properties.
  * @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
+ * @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer
  */
@@ -5389,7 +5402,7 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
     }
     /**
-     * @returns {import('../tokenizers.js').PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
+     * @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
      */
     get tokenizer() {
         return this.components.tokenizer;
@@ -5402,6 +5415,11 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
         return this.components.feature_extractor;
     }
+    /**
+     * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages
+     * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options
+     * @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>}
+     */
     apply_chat_template(messages, options = {}) {
         if (!this.tokenizer) {
             throw new Error('Unable to apply chat template without a tokenizer.');
@@ -5412,6 +5430,10 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
         });
     }
+    /**
+     * @param {Parameters<PreTrainedTokenizer['batch_decode']>} args
+     * @returns {ReturnType<PreTrainedTokenizer['batch_decode']>}
+     */
     batch_decode(...args) {
         if (!this.tokenizer) {
             throw new Error('Unable to decode without a tokenizer.');
@@ -5419,6 +5441,17 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
         return this.tokenizer.batch_decode(...args);
     }
+    /**
+     * @param {Parameters<PreTrainedTokenizer['decode']>} args
+     * @returns {ReturnType<PreTrainedTokenizer['decode']>}
+     */
+    decode(...args) {
+        if (!this.tokenizer) {
+            throw new Error('Unable to decode without a tokenizer.');
+        }
+        return this.tokenizer.decode(...args);
+    }
     /**
      * Calls the feature_extractor function with the given input.
@@ -5439,8 +5472,8 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
     /**
      * Instantiate one of the processor classes of the library from a pretrained model.
      *
-     * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object
-     * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
+     * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
+     * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
      *
      * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
      * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
@@ -5560,15 +5593,19 @@ function getNormalizedConfig(config) {
         case 'florence2':
         case 'llava_onevision':
         case 'idefics3':
+            // @ts-expect-error TS2339
             init_normalized_config = getNormalizedConfig(config.text_config);
             break;
         case 'moondream1':
+            // @ts-expect-error TS2339
             init_normalized_config = getNormalizedConfig(config.phi_config);
             break;
         case 'musicgen':
+            // @ts-expect-error TS2339
             init_normalized_config = getNormalizedConfig(config.decoder);
             break;
         case 'multi_modality':
+            // @ts-expect-error TS2339
             init_normalized_config = getNormalizedConfig(config.language_config);
             break;
@@ -5689,6 +5726,7 @@ function getNormalizedConfig(config) {
             break;
         case 'vision-encoder-decoder':
+            // @ts-expect-error TS2339
             const decoderConfig = getNormalizedConfig(config.decoder);
             const add_encoder_pkv = 'num_decoder_layers' in decoderConfig;
@@ -5932,7 +5970,7 @@ __webpack_require__.r(__webpack_exports__);
-const VERSION = '3.2.3';
+const VERSION = '3.3.0';
 // Check if various APIs are available (depends on environment)
 const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
@@ -6068,7 +6106,6 @@ function isEmpty(obj) {
 }
 /***/ }),
 /***/ "./src/generation/configuration_utils.js":
@@ -7683,6 +7720,7 @@ class TextStreamer extends BaseStreamer {
      * @param {import('../tokenizers.js').PreTrainedTokenizer} tokenizer
      * @param {Object} options
      * @param {boolean} [options.skip_prompt=false] Whether to skip the prompt tokens
+     * @param {boolean} [options.skip_special_tokens=true] Whether to skip special tokens when decoding
      * @param {function(string): void} [options.callback_function=null] Function to call when a piece of text is ready to display
      * @param {function(bigint[]): void} [options.token_callback_function=null] Function to call when a new token is generated
      * @param {Object} [options.decode_kwargs={}] Additional keyword arguments to pass to the tokenizer's decode method
@@ -7691,6 +7729,7 @@ class TextStreamer extends BaseStreamer {
         skip_prompt = false,
         callback_function = null,
         token_callback_function = null,
+        skip_special_tokens = true,
         decode_kwargs = {},
         ...kwargs
     } = {}) {
@@ -7699,7 +7738,7 @@ class TextStreamer extends BaseStreamer {
         this.skip_prompt = skip_prompt;
         this.callback_function = callback_function ?? stdout_write;
         this.token_callback_function = token_callback_function;
-        this.decode_kwargs = { ...decode_kwargs, ...kwargs };
+        this.decode_kwargs = { skip_special_tokens, ...decode_kwargs, ...kwargs };
         // variables used in the streaming process
         this.token_cache = [];
@@ -7815,9 +7854,10 @@ class WhisperTextStreamer extends TextStreamer {
     } = {}) {
         super(tokenizer, {
             skip_prompt,
+            skip_special_tokens,
             callback_function,
             token_callback_function,
-            decode_kwargs: { skip_special_tokens, ...decode_kwargs },
+            decode_kwargs,
         });
         this.timestamp_begin = tokenizer.timestamp_begin;
@@ -8071,6 +8111,8 @@ __webpack_require__.r(__webpack_exports__);
 /* harmony export */   GraniteForCausalLM: () => (/* binding */ GraniteForCausalLM),
 /* harmony export */   GraniteModel: () => (/* binding */ GraniteModel),
 /* harmony export */   GranitePreTrainedModel: () => (/* binding */ GranitePreTrainedModel),
+/* harmony export */   GroundingDinoForObjectDetection: () => (/* binding */ GroundingDinoForObjectDetection),
+/* harmony export */   GroundingDinoPreTrainedModel: () => (/* binding */ GroundingDinoPreTrainedModel),
 /* harmony export */   GroupViTModel: () => (/* binding */ GroupViTModel),
 /* harmony export */   GroupViTPreTrainedModel: () => (/* binding */ GroupViTPreTrainedModel),
 /* harmony export */   HieraForImageClassification: () => (/* binding */ HieraForImageClassification),
@@ -8279,6 +8321,8 @@ __webpack_require__.r(__webpack_exports__);
 /* harmony export */   Starcoder2ForCausalLM: () => (/* binding */ Starcoder2ForCausalLM),
 /* harmony export */   Starcoder2Model: () => (/* binding */ Starcoder2Model),
 /* harmony export */   Starcoder2PreTrainedModel: () => (/* binding */ Starcoder2PreTrainedModel),
+/* harmony export */   StyleTextToSpeech2Model: () => (/* binding */ StyleTextToSpeech2Model),
+/* harmony export */   StyleTextToSpeech2PreTrainedModel: () => (/* binding */ StyleTextToSpeech2PreTrainedModel),
 /* harmony export */   Swin2SRForImageSuperResolution: () => (/* binding */ Swin2SRForImageSuperResolution),
 /* harmony export */   Swin2SRModel: () => (/* binding */ Swin2SRModel),
 /* harmony export */   Swin2SRPreTrainedModel: () => (/* binding */ Swin2SRPreTrainedModel),
@@ -8594,8 +8638,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
     } else if (session_options.externalData !== undefined) {
         externalDataPromises = session_options.externalData.map(async (ext) => {
             // if the external data is a string, fetch the file and replace the string with its content
+            // @ts-expect-error TS2339
             if (typeof ext.data === "string") {
+                // @ts-expect-error TS2339
                 const ext_buffer = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_5__.getModelFile)(pretrained_model_name_or_path, ext.data, true, options);
+                // @ts-expect-error TS2698
                 return { ...ext, data: ext_buffer };
             }
             return ext;
@@ -8853,14 +8900,23 @@ async function encoderForward(self, model_inputs) {
         encoderFeeds.inputs_embeds = await self.encode_text({ input_ids: model_inputs.input_ids });
     }
     if (session.inputNames.includes('token_type_ids') && !encoderFeeds.token_type_ids) {
+        if (!encoderFeeds.input_ids) {
+            throw new Error('Both `input_ids` and `token_type_ids` are missing in the model inputs.');
+        }
         // Assign default `token_type_ids` (all zeroes) to the `encoderFeeds` if the model expects it,
         // but they weren't created by the tokenizer.
-        encoderFeeds.token_type_ids = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.Tensor(
-            'int64',
-            new BigInt64Array(encoderFeeds.input_ids.data.length),
-            encoderFeeds.input_ids.dims
-        )
+        encoderFeeds.token_type_ids = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.zeros_like)(encoderFeeds.input_ids);
     }
+    if (session.inputNames.includes('pixel_mask') && !encoderFeeds.pixel_mask) {
+        if (!encoderFeeds.pixel_values) {
+            throw new Error('Both `pixel_values` and `pixel_mask` are missing in the model inputs.');
+        }
+        // Assign default `pixel_mask` (all ones) to the `encoderFeeds` if the model expects it,
+        // but they weren't created by the processor.
+        const dims = encoderFeeds.pixel_values.dims;
+        encoderFeeds.pixel_mask = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.ones)([dims[0], dims[2], dims[3]]);
+    }
     return await sessionRun(session, encoderFeeds);
 }
@@ -9843,6 +9899,7 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
                 if (this.config.model_type === 'musicgen') {
                     // Custom logic (TODO: move to Musicgen class)
                     decoder_input_ids = Array.from({
+                        // @ts-expect-error TS2339
                         length: batch_size * this.config.decoder.num_codebooks
                     }, () => [decoder_start_token_id]);
@@ -10172,11 +10229,13 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
     async encode_image({ pixel_values }) {
         // image_inputs === { pixel_values }
         const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
+        // @ts-expect-error TS2339
         if (!this.config.num_image_tokens) {
             console.warn(
                 'The number of image tokens was not set in the model configuration. ' +
                 `Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
             )
+            // @ts-expect-error TS2339
             this.config.num_image_tokens = features.dims[1];
         }
         return features;
@@ -11604,6 +11663,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
         if (generation_config.return_token_timestamps) {
             outputs["token_timestamps"] = this._extract_token_timestamps(
+                // @ts-expect-error TS2345
                 outputs,
                 generation_config.alignment_heads,
                 generation_config.num_frames,
@@ -11639,6 +11699,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
             );
         }
+        // @ts-expect-error TS2339
         let median_filter_width = this.config.median_filter_width;
         if (median_filter_width === undefined) {
             console.warn("Model config has no `median_filter_width`, using default value of 7.")
@@ -11649,6 +11710,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
         const batch = generate_outputs.cross_attentions;
         // Create a list with `decoder_layers` elements, each a tensor of shape
         // (batch size, attention_heads, output length, input length).
+        // @ts-expect-error TS2339
         const cross_attentions = Array.from({ length: this.config.decoder_layers },
             // Concatenate the cross attentions for each layer across sequence length dimension.
             (_, i) => (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.cat)(batch.map(x => x[i]), 2)
@@ -11792,6 +11854,7 @@ class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
         attention_mask,
     }) {
+        // @ts-expect-error TS2339
         const image_token_index = this.config.image_token_index;
         const idsList = input_ids.tolist();
@@ -12777,6 +12840,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
                 const image_nums = vision_tokens.filter(x => x == image_token_id).length;
                 const video_nums = vision_tokens.filter(x => x == video_token_id).length;
+                /** @type {number[][]} */
                 let llm_pos_ids_list = [];
                 let st = 0;
                 let remain_images = image_nums;
@@ -12846,6 +12910,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
                 // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
                 // meaning to perform concatenation along dim=1, we can do the following:
                 const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
+                /** @type {number[]} */
                 const llm_positions = new Array(num_items);
                 let index = 0;
                 for (let x = 0; x < 3; ++x) {
@@ -12886,9 +12951,10 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
                     { length: 3 * data.length },
                     (_, i) => data[i % data.length]
                 );
+                /** @type {bigint[]} */
                 const mrope_position_deltas = Array.from(
                     { length: dims[0] },
-                    (_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1 + dims[1]
+                    (_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
                 );
                 return [
@@ -13459,7 +13525,7 @@ class DPTModel extends DPTPreTrainedModel { }
  *
  * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
  * ```javascript
- * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
+ * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
  *
  * // Load model and processor
  * const model_id = 'Xenova/dpt-hybrid-midas';
@@ -13468,7 +13534,7 @@ class DPTModel extends DPTPreTrainedModel { }
  *
  * // Load image from URL
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
- * const image = await RawImage.fromURL(url);
+ * const image = await RawImage.read(url);
  *
  * // Prepare image for the model
  * const inputs = await processor(image);
@@ -13477,10 +13543,15 @@ class DPTModel extends DPTPreTrainedModel { }
  * const { predicted_depth } = await model(inputs);
  *
  * // Interpolate to original size
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
+     * size: image.size.reverse(),
+     * mode: 'bilinear',
+ * })).squeeze(1);
  *
  * // Visualize the prediction
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+ * const min = prediction.min().item();
+ * const max = prediction.max().item();
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
  * const depth = RawImage.fromTensor(formatted);
  * // RawImage {
  * //   data: Uint8Array(307200) [ 85, 85, 84, ... ],
@@ -13530,11 +13601,7 @@ class GLPNPreTrainedModel extends PreTrainedModel { }
 class GLPNModel extends GLPNPreTrainedModel { }
 /**
- * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
- *
- * **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
- * ```javascript
- * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
+ * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
  *
  * // Load model and processor
  * const model_id = 'Xenova/glpn-kitti';
@@ -13543,7 +13610,7 @@ class GLPNModel extends GLPNPreTrainedModel { }
  *
  * // Load image from URL
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
- * const image = await RawImage.fromURL(url);
+ * const image = await RawImage.read(url);
  *
  * // Prepare image for the model
  * const inputs = await processor(image);
@@ -13552,13 +13619,18 @@ class GLPNModel extends GLPNPreTrainedModel { }
  * const { predicted_depth } = await model(inputs);
  *
  * // Interpolate to original size
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
+     * size: image.size.reverse(),
+     * mode: 'bilinear',
+ * })).squeeze(1);
  *
  * // Visualize the prediction
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+ * const min = prediction.min().item();
+ * const max = prediction.max().item();
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
  * const depth = RawImage.fromTensor(formatted);
  * // RawImage {
- * //   data: Uint8Array(307200) [ 207, 169, 154, ... ],
+ * //   data: Uint8Array(307200) [ 85, 85, 84, ... ],
  * //   width: 640,
  * //   height: 480,
  * //   channels: 1
@@ -13733,6 +13805,8 @@ class Dinov2WithRegistersForImageClassification extends Dinov2WithRegistersPreTr
     }
 }
 //////////////////////////////////////////////////
+class GroundingDinoPreTrainedModel extends PreTrainedModel { }
+class GroundingDinoForObjectDetection extends GroundingDinoPreTrainedModel { }
 //////////////////////////////////////////////////
 class YolosPreTrainedModel extends PreTrainedModel { }
@@ -14431,6 +14505,9 @@ class WavLMForAudioFrameClassification extends WavLMPreTrainedModel {
     }
 }
+class StyleTextToSpeech2PreTrainedModel extends PreTrainedModel { }
+class StyleTextToSpeech2Model extends StyleTextToSpeech2PreTrainedModel { }
 //////////////////////////////////////////////////
 // SpeechT5 models
 /**
@@ -14525,10 +14602,12 @@ class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
         const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);
+        // @ts-expect-error TS2339
         const r = encoder_outputs.dims[1] / this.config.reduction_factor;
         const maxlen = Math.floor(r * maxlenratio);
         const minlen = Math.floor(r * minlenratio);
+        // @ts-expect-error TS2339
         const num_mel_bins = this.config.num_mel_bins;
         let spectrogramParts = [];
@@ -14893,11 +14972,13 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
      */
     _apply_and_filter_by_delay_pattern_mask(outputs) {
         const [bs_x_codebooks, seqLength] = outputs.dims;
+        // @ts-expect-error TS2339
         const num_codebooks = this.config.decoder.num_codebooks;
         const upperBound = (seqLength - num_codebooks);
         let newDataSize = 0;
         for (let i = 0; i < outputs.size; ++i) {
+            // @ts-expect-error TS2339
             if (outputs.data[i] === this.config.decoder.pad_token_id) {
                 continue;
             }
@@ -14927,7 +15008,9 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
         let clonedInputIds = structuredClone(input_ids);
         for (let i = 0; i < clonedInputIds.length; ++i) {
             for (let j = 0; j < clonedInputIds[i].length; ++j) {
+                // @ts-expect-error TS2339
                 if ((i % this.config.decoder.num_codebooks) >= j) {
+                    // @ts-expect-error TS2339
                     clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
                 }
             }
@@ -15084,6 +15167,9 @@ class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
         'past_key_values',
     ];
+    /**
+     * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
+     */
     constructor(...args) {
         super(...args);
@@ -15385,6 +15471,8 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
     ['maskformer', ['MaskFormerModel', MaskFormerModel]],
     ['mgp-str', ['MgpstrForSceneTextRecognition', MgpstrForSceneTextRecognition]],
+    ['style_text_to_speech_2', ['StyleTextToSpeech2Model', StyleTextToSpeech2Model]],
 ]);
 const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
@@ -15629,6 +15717,7 @@ const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([
 const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([ // key (presumably the config's model type; confirm) -> [class name, class]
     ['owlvit', ['OwlViTForObjectDetection', OwlViTForObjectDetection]],
     ['owlv2', ['Owlv2ForObjectDetection', Owlv2ForObjectDetection]],
+    ['grounding-dino', ['GroundingDinoForObjectDetection', GroundingDinoForObjectDetection]],
 ]);
 const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
@@ -16052,10 +16141,17 @@ class SequenceClassifierOutput extends ModelOutput {
     /**
      * @param {Object} output The output of the model.
      * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
+     * @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+     * Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+     * Note: every property of `output` other than `logits` is collected by rest destructuring, and the
+     * values are stored as an array on `this.attentions`; the property is left unset when there are none.
      */
-    constructor({ logits }) {
+    constructor({ logits, ...attentions }) {
         super();
         this.logits = logits;
+        const attentions_list = Object.values(attentions);
+        if (attentions_list.length > 0) {
+            // Only set attentions if they are not empty
+            this.attentions = attentions_list;
+        }
     }
 }
@@ -16313,22 +16409,6 @@ __webpack_require__.r(__webpack_exports__);
 class AutoFeatureExtractor {
-    /**
-     * Instantiate one of the feature extractor classes of the library from a pretrained model.
-     *
-     * The processor class to instantiate is selected based on the `feature_extractor_type` property of
-     * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
-     *
-     * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
-     * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
-     *   Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-     *   user or organization name, like `dbmdz/bert-base-german-cased`.
-     * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
-     * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
-     *
-     * @returns {Promise<AllFeatureExtractors.ImageProcessor>} A new instance of the Processor class.
-     */
     /** @type {typeof FeatureExtractor.from_pretrained} */
     static async from_pretrained(pretrained_model_name_or_path, options={}) {
@@ -16457,22 +16537,6 @@ __webpack_require__.r(__webpack_exports__);
  */
 class AutoProcessor {
-    /**
-     * Instantiate one of the processor classes of the library from a pretrained model.
-     *
-     * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
-     * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
-     *
-     * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
-     * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
-     *   Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-     *   user or organization name, like `dbmdz/bert-base-german-cased`.
-     * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
-     * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
-     *
-     * @returns {Promise<Processor>} A new instance of the Processor class.
-     */
     /** @type {typeof Processor.from_pretrained} */
     static async from_pretrained(pretrained_model_name_or_path, options={}) {
@@ -16796,6 +16860,7 @@ class ConvNextImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
         /**
          * Percentage of the image to crop. Only has an effect if this.size < 384.
          */
+        // @ts-expect-error TS2339
         this.crop_pct = this.config.crop_pct ?? (224 / 256);
     }
@@ -17003,6 +17068,7 @@ __webpack_require__.r(__webpack_exports__);
 class EfficientNetImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor {
     constructor(config) {
         super(config);
+        // @ts-expect-error TS2339
         this.include_top = this.config.include_top ?? true;
         if (this.include_top) {
             this.image_std = this.image_std.map(x => x * x);
@@ -17086,8 +17152,11 @@ class Florence2Processor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
         super(config, components);
         const {
+            // @ts-expect-error TS2339
             tasks_answer_post_processing_type,
+            // @ts-expect-error TS2339
             task_prompts_without_inputs,
+            // @ts-expect-error TS2339
             task_prompts_with_input,
         } = this.image_processor.config;
@@ -17223,6 +17292,170 @@ __webpack_require__.r(__webpack_exports__);
 class GLPNFeatureExtractor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor { }
+/***/ }),
+/***/ "./src/models/grounding_dino/image_processing_grounding_dino.js":
+/*!**********************************************************************!*\
+  !*** ./src/models/grounding_dino/image_processing_grounding_dino.js ***!
+  \**********************************************************************/
+/***/ ((__unused_webpack___webpack_module__, __webpack_exports__, __webpack_require__) => {
+"use strict";
+__webpack_require__.r(__webpack_exports__);
+/* harmony export */ __webpack_require__.d(__webpack_exports__, {
+/* harmony export */   GroundingDinoImageProcessor: () => (/* binding */ GroundingDinoImageProcessor)
+/* harmony export */ });
+/* harmony import */ var _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../../base/image_processors_utils.js */ "./src/base/image_processors_utils.js");
+/* harmony import */ var _utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ../../utils/tensor.js */ "./src/utils/tensor.js");
+/**
+ * @typedef {object} GroundingDinoFeatureExtractorResultProps
+ * @property {import('../../utils/tensor.js').Tensor} pixel_mask
+ * @typedef {import('../../base/image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult
+ */
+class GroundingDinoImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor {
+    /**
+     * Runs the base `ImageProcessor` preprocessing on the given images and adds a
+     * `pixel_mask` tensor to the result.
+     *
+     * The mask is created with `ones(...)` using dims 0, 2 and 3 of `pixel_values`
+     * (presumably batch, height and width, i.e. one mask value per pixel; confirm against the base class).
+     * @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
+     * @returns {Promise<GroundingDinoFeatureExtractorResult>} The base processor's result with an additional `pixel_mask` entry.
+     */
+    async _call(images) {
+        const result = await super._call(images);
+        const dims = result.pixel_values.dims;
+        const pixel_mask = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.ones)([dims[0], dims[2], dims[3]]); // dim 1 of pixel_values is skipped
+        return { ...result, pixel_mask }; // shallow copy of the base result plus the mask
+    }
+}
+/***/ }),
+/***/ "./src/models/grounding_dino/processing_grounding_dino.js":
+/*!****************************************************************!*\
+  !*** ./src/models/grounding_dino/processing_grounding_dino.js ***!
+  \****************************************************************/
+/***/ ((__unused_webpack___webpack_module__, __webpack_exports__, __webpack_require__) => {
+"use strict";
+__webpack_require__.r(__webpack_exports__);
+/* harmony export */ __webpack_require__.d(__webpack_exports__, {
+/* harmony export */   GroundingDinoProcessor: () => (/* binding */ GroundingDinoProcessor)
+/* harmony export */ });
+/* harmony import */ var _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../../base/processing_utils.js */ "./src/base/processing_utils.js");
+/* harmony import */ var _auto_image_processing_auto_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ../auto/image_processing_auto.js */ "./src/models/auto/image_processing_auto.js");
+/* harmony import */ var _tokenizers_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ../../tokenizers.js */ "./src/tokenizers.js");
+/* harmony import */ var _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ../../base/image_processors_utils.js */ "./src/base/image_processors_utils.js");
+/**
+ * Get token ids of phrases from posmaps and input_ids.
+ *
+ * The first and last positions of `posmaps` are always treated as `false`
+ * (presumably because they hold special tokens; confirm against the tokenizer).
+ * @param {import('../../utils/tensor.js').Tensor} posmaps A boolean tensor of unbatched text-thresholded logits related to the detected bounding boxes of shape `(hidden_size, )`.
+ * @param {import('../../utils/tensor.js').Tensor} input_ids A tensor of token ids of shape `(sequence_length, )`.
+ * @returns {Array} The entries of `input_ids` at every remaining position where `posmaps` is truthy, in order.
+ */
+function get_phrases_from_posmap(posmaps, input_ids) {
+    const left_idx = 0;
+    const right_idx = posmaps.dims.at(-1) - 1; // index of the last position
+    const posmaps_list = posmaps.tolist();
+    posmaps_list.fill(false, 0, left_idx + 1); // clear positions 0..left_idx
+    posmaps_list.fill(false, right_idx); // clear positions right_idx..end
+    const input_ids_list = input_ids.tolist();
+    return posmaps_list
+        .map((val, idx) => val ? idx : null) // keep the index of each truthy entry
+        .filter(idx => idx !== null)
+        .map(i => input_ids_list[i]); // look up the token id at each kept index
+}
+class GroundingDinoProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_0__.Processor {
+    static tokenizer_class = _tokenizers_js__WEBPACK_IMPORTED_MODULE_2__.AutoTokenizer
+    static image_processor_class = _auto_image_processing_auto_js__WEBPACK_IMPORTED_MODULE_1__.AutoImageProcessor
+    /**
+     * @typedef {import('../../utils/image.js').RawImage} RawImage
+     */
+    /**
+     * Runs the image processor on `images` and the tokenizer on `text`, then merges both results
+     * into one object. A falsy `images` or `text` contributes nothing to the result.
+     * @param {RawImage|RawImage[]|RawImage[][]} images The image(s) to preprocess.
+     * @param {string|string[]} text The text prompt(s) to tokenize.
+     * @param {Object} [options] Passed unchanged to both the image processor and the tokenizer.
+     * @returns {Promise<any>} The tokenizer outputs spread first, then the image processor outputs (which win on duplicate keys).
+     */
+    async _call(images, text, options = {}) {
+        const image_inputs = images ? await this.image_processor(images, options) : {};
+        const text_inputs = text ? this.tokenizer(text, options) : {};
+        return {
+            ...text_inputs,
+            ...image_inputs,
+        }
+    }
+    post_process_grounded_object_detection(outputs, input_ids, { // Turns `outputs.logits` / `outputs.pred_boxes` into one { scores, boxes, labels } object per batch item.
+        box_threshold = 0.25, // queries whose best score is <= this value are dropped
+        text_threshold = 0.25, // passed to `prob.gt(...)` to select the token positions that form a label
+        target_sizes = null // optional per-item sizes used to scale boxes; read as target_size[(j + 1) % 2], presumably [height, width]
+    } = {}) {
+        const { logits, pred_boxes } = outputs;
+        const batch_size = logits.dims[0];
+        if (target_sizes !== null && target_sizes.length !== batch_size) {
+            throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        }
+        const num_queries = logits.dims.at(1);
+        const probs = logits.sigmoid(); // (batch_size, num_queries, 256)
+        const scores = probs.max(-1).tolist(); // (batch_size, num_queries)
+        // Convert to [x0, y0, x1, y1] format
+        const boxes = pred_boxes.tolist() // (batch_size, num_queries, 4)
+            .map(batch => batch.map(box => (0,_base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_3__.center_to_corners_format)(box)));
+        const results = [];
+        for (let i = 0; i < batch_size; ++i) {
+            const target_size = target_sizes !== null ? target_sizes[i] : null;
+            // Convert from relative [0, 1] to absolute [0, height] coordinates
+            if (target_size !== null) {
+                boxes[i] = boxes[i].map(box => box.map((x, j) => x * target_size[(j + 1) % 2])); // even coordinates (x) use target_size[1], odd ones (y) use target_size[0]
+            }
+            const batch_scores = scores[i];
+            const final_scores = [];
+            const final_phrases = [];
+            const final_boxes = [];
+            for (let j = 0; j < num_queries; ++j) {
+                const score = batch_scores[j];
+                if (score <= box_threshold) {
+                    continue; // skip low-confidence queries
+                }
+                const box = boxes[i][j];
+                const prob = probs[i][j];
+                final_scores.push(score);
+                final_boxes.push(box);
+                const phrases = get_phrases_from_posmap(prob.gt(text_threshold), input_ids[i]); // token ids selected for this query
+                final_phrases.push(phrases);
+            }
+            results.push({ scores: final_scores, boxes: final_boxes, labels: this.batch_decode(final_phrases) }); // batch_decode is defined outside this class; presumably decodes token ids to text
+        }
+        return results;
+    }
+}
 /***/ }),
 /***/ "./src/models/idefics3/image_processing_idefics3.js":
@@ -17384,6 +17617,8 @@ class Idefics3ImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
                     const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
                     const end_offset = (i + 1) * pixel_attention_mask_stride;
+                    // @ts-expect-error
                     pixel_attention_mask_data.fill(false, start_offset, end_offset);
                 }
             }
@@ -17652,42 +17887,43 @@ __webpack_require__.r(__webpack_exports__);
 /* harmony export */   DonutImageProcessor: () => (/* reexport safe */ _donut_image_processing_donut_js__WEBPACK_IMPORTED_MODULE_7__.DonutImageProcessor),
 /* harmony export */   EfficientNetImageProcessor: () => (/* reexport safe */ _efficientnet_image_processing_efficientnet_js__WEBPACK_IMPORTED_MODULE_9__.EfficientNetImageProcessor),
 /* harmony export */   GLPNFeatureExtractor: () => (/* reexport safe */ _glpn_image_processing_glpn_js__WEBPACK_IMPORTED_MODULE_10__.GLPNFeatureExtractor),
-/* harmony export */   Idefics3ImageProcessor: () => (/* reexport safe */ _idefics3_image_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_11__.Idefics3ImageProcessor),
-/* harmony export */   JinaCLIPImageProcessor: () => (/* reexport safe */ _jina_clip_image_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_13__.JinaCLIPImageProcessor),
-/* harmony export */   LlavaOnevisionImageProcessor: () => (/* reexport safe */ _llava_onevision_image_processing_llava_onevision_js__WEBPACK_IMPORTED_MODULE_14__.LlavaOnevisionImageProcessor),
-/* harmony export */   Mask2FormerImageProcessor: () => (/* reexport safe */ _mask2former_image_processing_mask2former_js__WEBPACK_IMPORTED_MODULE_15__.Mask2FormerImageProcessor),
-/* harmony export */   MaskFormerFeatureExtractor: () => (/* reexport safe */ _maskformer_image_processing_maskformer_js__WEBPACK_IMPORTED_MODULE_16__.MaskFormerFeatureExtractor),
-/* harmony export */   MaskFormerImageProcessor: () => (/* reexport safe */ _maskformer_image_processing_maskformer_js__WEBPACK_IMPORTED_MODULE_16__.MaskFormerImageProcessor),
-/* harmony export */   MobileNetV1FeatureExtractor: () => (/* reexport safe */ _mobilenet_v1_image_processing_mobilenet_v1_js__WEBPACK_IMPORTED_MODULE_17__.MobileNetV1FeatureExtractor),
-/* harmony export */   MobileNetV1ImageProcessor: () => (/* reexport safe */ _mobilenet_v1_image_processing_mobilenet_v1_js__WEBPACK_IMPORTED_MODULE_17__.MobileNetV1ImageProcessor),
-/* harmony export */   MobileNetV2FeatureExtractor: () => (/* reexport safe */ _mobilenet_v2_image_processing_mobilenet_v2_js__WEBPACK_IMPORTED_MODULE_18__.MobileNetV2FeatureExtractor),
-/* harmony export */   MobileNetV2ImageProcessor: () => (/* reexport safe */ _mobilenet_v2_image_processing_mobilenet_v2_js__WEBPACK_IMPORTED_MODULE_18__.MobileNetV2ImageProcessor),
-/* harmony export */   MobileNetV3FeatureExtractor: () => (/* reexport safe */ _mobilenet_v3_image_processing_mobilenet_v3_js__WEBPACK_IMPORTED_MODULE_19__.MobileNetV3FeatureExtractor),
-/* harmony export */   MobileNetV3ImageProcessor: () => (/* reexport safe */ _mobilenet_v3_image_processing_mobilenet_v3_js__WEBPACK_IMPORTED_MODULE_19__.MobileNetV3ImageProcessor),
-/* harmony export */   MobileNetV4FeatureExtractor: () => (/* reexport safe */ _mobilenet_v4_image_processing_mobilenet_v4_js__WEBPACK_IMPORTED_MODULE_20__.MobileNetV4FeatureExtractor),
-/* harmony export */   MobileNetV4ImageProcessor: () => (/* reexport safe */ _mobilenet_v4_image_processing_mobilenet_v4_js__WEBPACK_IMPORTED_MODULE_20__.MobileNetV4ImageProcessor),
-/* harmony export */   MobileViTFeatureExtractor: () => (/* reexport safe */ _mobilevit_image_processing_mobilevit_js__WEBPACK_IMPORTED_MODULE_21__.MobileViTFeatureExtractor),
-/* harmony export */   MobileViTImageProcessor: () => (/* reexport safe */ _mobilevit_image_processing_mobilevit_js__WEBPACK_IMPORTED_MODULE_21__.MobileViTImageProcessor),
-/* harmony export */   NougatImageProcessor: () => (/* reexport safe */ _nougat_image_processing_nougat_js__WEBPACK_IMPORTED_MODULE_22__.NougatImageProcessor),
-/* harmony export */   OwlViTFeatureExtractor: () => (/* reexport safe */ _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_24__.OwlViTFeatureExtractor),
-/* harmony export */   OwlViTImageProcessor: () => (/* reexport safe */ _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_24__.OwlViTImageProcessor),
-/* harmony export */   Owlv2ImageProcessor: () => (/* reexport safe */ _owlv2_image_processing_owlv2_js__WEBPACK_IMPORTED_MODULE_23__.Owlv2ImageProcessor),
-/* harmony export */   Phi3VImageProcessor: () => (/* reexport safe */ _phi3_v_image_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_25__.Phi3VImageProcessor),
-/* harmony export */   PvtImageProcessor: () => (/* reexport safe */ _pvt_image_processing_pvt_js__WEBPACK_IMPORTED_MODULE_26__.PvtImageProcessor),
-/* harmony export */   Qwen2VLImageProcessor: () => (/* reexport safe */ _qwen2_vl_image_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_27__.Qwen2VLImageProcessor),
-/* harmony export */   RTDetrImageProcessor: () => (/* reexport safe */ _rt_detr_image_processing_rt_detr_js__WEBPACK_IMPORTED_MODULE_28__.RTDetrImageProcessor),
-/* harmony export */   SamImageProcessor: () => (/* reexport safe */ _sam_image_processing_sam_js__WEBPACK_IMPORTED_MODULE_29__.SamImageProcessor),
-/* harmony export */   SegformerFeatureExtractor: () => (/* reexport safe */ _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_30__.SegformerFeatureExtractor),
-/* harmony export */   SegformerImageProcessor: () => (/* reexport safe */ _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_30__.SegformerImageProcessor),
-/* harmony export */   SiglipImageProcessor: () => (/* reexport safe */ _siglip_image_processing_siglip_js__WEBPACK_IMPORTED_MODULE_31__.SiglipImageProcessor),
-/* harmony export */   Swin2SRImageProcessor: () => (/* reexport safe */ _swin2sr_image_processing_swin2sr_js__WEBPACK_IMPORTED_MODULE_32__.Swin2SRImageProcessor),
-/* harmony export */   VLMImageProcessor: () => (/* reexport safe */ _janus_image_processing_janus_js__WEBPACK_IMPORTED_MODULE_12__.VLMImageProcessor),
-/* harmony export */   ViTFeatureExtractor: () => (/* reexport safe */ _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_33__.ViTFeatureExtractor),
-/* harmony export */   ViTImageProcessor: () => (/* reexport safe */ _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_33__.ViTImageProcessor),
-/* harmony export */   VitMatteImageProcessor: () => (/* reexport safe */ _vitmatte_image_processing_vitmatte_js__WEBPACK_IMPORTED_MODULE_34__.VitMatteImageProcessor),
-/* harmony export */   VitPoseImageProcessor: () => (/* reexport safe */ _vitpose_image_processing_vitpose_js__WEBPACK_IMPORTED_MODULE_35__.VitPoseImageProcessor),
-/* harmony export */   YolosFeatureExtractor: () => (/* reexport safe */ _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_36__.YolosFeatureExtractor),
-/* harmony export */   YolosImageProcessor: () => (/* reexport safe */ _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_36__.YolosImageProcessor)
+/* harmony export */   GroundingDinoImageProcessor: () => (/* reexport safe */ _grounding_dino_image_processing_grounding_dino_js__WEBPACK_IMPORTED_MODULE_11__.GroundingDinoImageProcessor),
+/* harmony export */   Idefics3ImageProcessor: () => (/* reexport safe */ _idefics3_image_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_12__.Idefics3ImageProcessor),
+/* harmony export */   JinaCLIPImageProcessor: () => (/* reexport safe */ _jina_clip_image_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_14__.JinaCLIPImageProcessor),
+/* harmony export */   LlavaOnevisionImageProcessor: () => (/* reexport safe */ _llava_onevision_image_processing_llava_onevision_js__WEBPACK_IMPORTED_MODULE_15__.LlavaOnevisionImageProcessor),
+/* harmony export */   Mask2FormerImageProcessor: () => (/* reexport safe */ _mask2former_image_processing_mask2former_js__WEBPACK_IMPORTED_MODULE_16__.Mask2FormerImageProcessor),
+/* harmony export */   MaskFormerFeatureExtractor: () => (/* reexport safe */ _maskformer_image_processing_maskformer_js__WEBPACK_IMPORTED_MODULE_17__.MaskFormerFeatureExtractor),
+/* harmony export */   MaskFormerImageProcessor: () => (/* reexport safe */ _maskformer_image_processing_maskformer_js__WEBPACK_IMPORTED_MODULE_17__.MaskFormerImageProcessor),
+/* harmony export */   MobileNetV1FeatureExtractor: () => (/* reexport safe */ _mobilenet_v1_image_processing_mobilenet_v1_js__WEBPACK_IMPORTED_MODULE_18__.MobileNetV1FeatureExtractor),
+/* harmony export */   MobileNetV1ImageProcessor: () => (/* reexport safe */ _mobilenet_v1_image_processing_mobilenet_v1_js__WEBPACK_IMPORTED_MODULE_18__.MobileNetV1ImageProcessor),
+/* harmony export */   MobileNetV2FeatureExtractor: () => (/* reexport safe */ _mobilenet_v2_image_processing_mobilenet_v2_js__WEBPACK_IMPORTED_MODULE_19__.MobileNetV2FeatureExtractor),
+/* harmony export */   MobileNetV2ImageProcessor: () => (/* reexport safe */ _mobilenet_v2_image_processing_mobilenet_v2_js__WEBPACK_IMPORTED_MODULE_19__.MobileNetV2ImageProcessor),
+/* harmony export */   MobileNetV3FeatureExtractor: () => (/* reexport safe */ _mobilenet_v3_image_processing_mobilenet_v3_js__WEBPACK_IMPORTED_MODULE_20__.MobileNetV3FeatureExtractor),
+/* harmony export */   MobileNetV3ImageProcessor: () => (/* reexport safe */ _mobilenet_v3_image_processing_mobilenet_v3_js__WEBPACK_IMPORTED_MODULE_20__.MobileNetV3ImageProcessor),
+/* harmony export */   MobileNetV4FeatureExtractor: () => (/* reexport safe */ _mobilenet_v4_image_processing_mobilenet_v4_js__WEBPACK_IMPORTED_MODULE_21__.MobileNetV4FeatureExtractor),
+/* harmony export */   MobileNetV4ImageProcessor: () => (/* reexport safe */ _mobilenet_v4_image_processing_mobilenet_v4_js__WEBPACK_IMPORTED_MODULE_21__.MobileNetV4ImageProcessor),
+/* harmony export */   MobileViTFeatureExtractor: () => (/* reexport safe */ _mobilevit_image_processing_mobilevit_js__WEBPACK_IMPORTED_MODULE_22__.MobileViTFeatureExtractor),
+/* harmony export */   MobileViTImageProcessor: () => (/* reexport safe */ _mobilevit_image_processing_mobilevit_js__WEBPACK_IMPORTED_MODULE_22__.MobileViTImageProcessor),
+/* harmony export */   NougatImageProcessor: () => (/* reexport safe */ _nougat_image_processing_nougat_js__WEBPACK_IMPORTED_MODULE_23__.NougatImageProcessor),
+/* harmony export */   OwlViTFeatureExtractor: () => (/* reexport safe */ _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_25__.OwlViTFeatureExtractor),
+/* harmony export */   OwlViTImageProcessor: () => (/* reexport safe */ _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_25__.OwlViTImageProcessor),
+/* harmony export */   Owlv2ImageProcessor: () => (/* reexport safe */ _owlv2_image_processing_owlv2_js__WEBPACK_IMPORTED_MODULE_24__.Owlv2ImageProcessor),
+/* harmony export */   Phi3VImageProcessor: () => (/* reexport safe */ _phi3_v_image_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_26__.Phi3VImageProcessor),
+/* harmony export */   PvtImageProcessor: () => (/* reexport safe */ _pvt_image_processing_pvt_js__WEBPACK_IMPORTED_MODULE_27__.PvtImageProcessor),
+/* harmony export */   Qwen2VLImageProcessor: () => (/* reexport safe */ _qwen2_vl_image_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_28__.Qwen2VLImageProcessor),
+/* harmony export */   RTDetrImageProcessor: () => (/* reexport safe */ _rt_detr_image_processing_rt_detr_js__WEBPACK_IMPORTED_MODULE_29__.RTDetrImageProcessor),
+/* harmony export */   SamImageProcessor: () => (/* reexport safe */ _sam_image_processing_sam_js__WEBPACK_IMPORTED_MODULE_30__.SamImageProcessor),
+/* harmony export */   SegformerFeatureExtractor: () => (/* reexport safe */ _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_31__.SegformerFeatureExtractor),
+/* harmony export */   SegformerImageProcessor: () => (/* reexport safe */ _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_31__.SegformerImageProcessor),
+/* harmony export */   SiglipImageProcessor: () => (/* reexport safe */ _siglip_image_processing_siglip_js__WEBPACK_IMPORTED_MODULE_32__.SiglipImageProcessor),
+/* harmony export */   Swin2SRImageProcessor: () => (/* reexport safe */ _swin2sr_image_processing_swin2sr_js__WEBPACK_IMPORTED_MODULE_33__.Swin2SRImageProcessor),
+/* harmony export */   VLMImageProcessor: () => (/* reexport safe */ _janus_image_processing_janus_js__WEBPACK_IMPORTED_MODULE_13__.VLMImageProcessor),
+/* harmony export */   ViTFeatureExtractor: () => (/* reexport safe */ _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_34__.ViTFeatureExtractor),
+/* harmony export */   ViTImageProcessor: () => (/* reexport safe */ _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_34__.ViTImageProcessor),
+/* harmony export */   VitMatteImageProcessor: () => (/* reexport safe */ _vitmatte_image_processing_vitmatte_js__WEBPACK_IMPORTED_MODULE_35__.VitMatteImageProcessor),
+/* harmony export */   VitPoseImageProcessor: () => (/* reexport safe */ _vitpose_image_processing_vitpose_js__WEBPACK_IMPORTED_MODULE_36__.VitPoseImageProcessor),
+/* harmony export */   YolosFeatureExtractor: () => (/* reexport safe */ _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_37__.YolosFeatureExtractor),
+/* harmony export */   YolosImageProcessor: () => (/* reexport safe */ _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_37__.YolosImageProcessor)
 /* harmony export */ });
 /* harmony import */ var _beit_image_processing_beit_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ./beit/image_processing_beit.js */ "./src/models/beit/image_processing_beit.js");
 /* harmony import */ var _bit_image_processing_bit_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./bit/image_processing_bit.js */ "./src/models/bit/image_processing_bit.js");
@@ -17700,32 +17936,34 @@ __webpack_require__.r(__webpack_exports__);
 /* harmony import */ var _dpt_image_processing_dpt_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./dpt/image_processing_dpt.js */ "./src/models/dpt/image_processing_dpt.js");
 /* harmony import */ var _efficientnet_image_processing_efficientnet_js__WEBPACK_IMPORTED_MODULE_9__ = __webpack_require__(/*! ./efficientnet/image_processing_efficientnet.js */ "./src/models/efficientnet/image_processing_efficientnet.js");
 /* harmony import */ var _glpn_image_processing_glpn_js__WEBPACK_IMPORTED_MODULE_10__ = __webpack_require__(/*! ./glpn/image_processing_glpn.js */ "./src/models/glpn/image_processing_glpn.js");
-/* harmony import */ var _idefics3_image_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_11__ = __webpack_require__(/*! ./idefics3/image_processing_idefics3.js */ "./src/models/idefics3/image_processing_idefics3.js");
-/* harmony import */ var _janus_image_processing_janus_js__WEBPACK_IMPORTED_MODULE_12__ = __webpack_require__(/*! ./janus/image_processing_janus.js */ "./src/models/janus/image_processing_janus.js");
-/* harmony import */ var _jina_clip_image_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_13__ = __webpack_require__(/*! ./jina_clip/image_processing_jina_clip.js */ "./src/models/jina_clip/image_processing_jina_clip.js");
-/* harmony import */ var _llava_onevision_image_processing_llava_onevision_js__WEBPACK_IMPORTED_MODULE_14__ = __webpack_require__(/*! ./llava_onevision/image_processing_llava_onevision.js */ "./src/models/llava_onevision/image_processing_llava_onevision.js");
-/* harmony import */ var _mask2former_image_processing_mask2former_js__WEBPACK_IMPORTED_MODULE_15__ = __webpack_require__(/*! ./mask2former/image_processing_mask2former.js */ "./src/models/mask2former/image_processing_mask2former.js");
-/* harmony import */ var _maskformer_image_processing_maskformer_js__WEBPACK_IMPORTED_MODULE_16__ = __webpack_require__(/*! ./maskformer/image_processing_maskformer.js */ "./src/models/maskformer/image_processing_maskformer.js");
-/* harmony import */ var _mobilenet_v1_image_processing_mobilenet_v1_js__WEBPACK_IMPORTED_MODULE_17__ = __webpack_require__(/*! ./mobilenet_v1/image_processing_mobilenet_v1.js */ "./src/models/mobilenet_v1/image_processing_mobilenet_v1.js");
-/* harmony import */ var _mobilenet_v2_image_processing_mobilenet_v2_js__WEBPACK_IMPORTED_MODULE_18__ = __webpack_require__(/*! ./mobilenet_v2/image_processing_mobilenet_v2.js */ "./src/models/mobilenet_v2/image_processing_mobilenet_v2.js");
-/* harmony import */ var _mobilenet_v3_image_processing_mobilenet_v3_js__WEBPACK_IMPORTED_MODULE_19__ = __webpack_require__(/*! ./mobilenet_v3/image_processing_mobilenet_v3.js */ "./src/models/mobilenet_v3/image_processing_mobilenet_v3.js");
-/* harmony import */ var _mobilenet_v4_image_processing_mobilenet_v4_js__WEBPACK_IMPORTED_MODULE_20__ = __webpack_require__(/*! ./mobilenet_v4/image_processing_mobilenet_v4.js */ "./src/models/mobilenet_v4/image_processing_mobilenet_v4.js");
-/* harmony import */ var _mobilevit_image_processing_mobilevit_js__WEBPACK_IMPORTED_MODULE_21__ = __webpack_require__(/*! ./mobilevit/image_processing_mobilevit.js */ "./src/models/mobilevit/image_processing_mobilevit.js");
-/* harmony import */ var _nougat_image_processing_nougat_js__WEBPACK_IMPORTED_MODULE_22__ = __webpack_require__(/*! ./nougat/image_processing_nougat.js */ "./src/models/nougat/image_processing_nougat.js");
-/* harmony import */ var _owlv2_image_processing_owlv2_js__WEBPACK_IMPORTED_MODULE_23__ = __webpack_require__(/*! ./owlv2/image_processing_owlv2.js */ "./src/models/owlv2/image_processing_owlv2.js");
-/* harmony import */ var _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_24__ = __webpack_require__(/*! ./owlvit/image_processing_owlvit.js */ "./src/models/owlvit/image_processing_owlvit.js");
-/* harmony import */ var _phi3_v_image_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_25__ = __webpack_require__(/*! ./phi3_v/image_processing_phi3_v.js */ "./src/models/phi3_v/image_processing_phi3_v.js");
-/* harmony import */ var _pvt_image_processing_pvt_js__WEBPACK_IMPORTED_MODULE_26__ = __webpack_require__(/*! ./pvt/image_processing_pvt.js */ "./src/models/pvt/image_processing_pvt.js");
-/* harmony import */ var _qwen2_vl_image_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_27__ = __webpack_require__(/*! ./qwen2_vl/image_processing_qwen2_vl.js */ "./src/models/qwen2_vl/image_processing_qwen2_vl.js");
-/* harmony import */ var _rt_detr_image_processing_rt_detr_js__WEBPACK_IMPORTED_MODULE_28__ = __webpack_require__(/*! ./rt_detr/image_processing_rt_detr.js */ "./src/models/rt_detr/image_processing_rt_detr.js");
-/* harmony import */ var _sam_image_processing_sam_js__WEBPACK_IMPORTED_MODULE_29__ = __webpack_require__(/*! ./sam/image_processing_sam.js */ "./src/models/sam/image_processing_sam.js");
-/* harmony import */ var _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_30__ = __webpack_require__(/*! ./segformer/image_processing_segformer.js */ "./src/models/segformer/image_processing_segformer.js");
-/* harmony import */ var _siglip_image_processing_siglip_js__WEBPACK_IMPORTED_MODULE_31__ = __webpack_require__(/*! ./siglip/image_processing_siglip.js */ "./src/models/siglip/image_processing_siglip.js");
-/* harmony import */ var _swin2sr_image_processing_swin2sr_js__WEBPACK_IMPORTED_MODULE_32__ = __webpack_require__(/*! ./swin2sr/image_processing_swin2sr.js */ "./src/models/swin2sr/image_processing_swin2sr.js");
-/* harmony import */ var _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_33__ = __webpack_require__(/*! ./vit/image_processing_vit.js */ "./src/models/vit/image_processing_vit.js");
-/* harmony import */ var _vitmatte_image_processing_vitmatte_js__WEBPACK_IMPORTED_MODULE_34__ = __webpack_require__(/*! ./vitmatte/image_processing_vitmatte.js */ "./src/models/vitmatte/image_processing_vitmatte.js");
-/* harmony import */ var _vitpose_image_processing_vitpose_js__WEBPACK_IMPORTED_MODULE_35__ = __webpack_require__(/*! ./vitpose/image_processing_vitpose.js */ "./src/models/vitpose/image_processing_vitpose.js");
-/* harmony import */ var _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_36__ = __webpack_require__(/*! ./yolos/image_processing_yolos.js */ "./src/models/yolos/image_processing_yolos.js");
+/* harmony import */ var _grounding_dino_image_processing_grounding_dino_js__WEBPACK_IMPORTED_MODULE_11__ = __webpack_require__(/*! ./grounding_dino/image_processing_grounding_dino.js */ "./src/models/grounding_dino/image_processing_grounding_dino.js");
+/* harmony import */ var _idefics3_image_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_12__ = __webpack_require__(/*! ./idefics3/image_processing_idefics3.js */ "./src/models/idefics3/image_processing_idefics3.js");
+/* harmony import */ var _janus_image_processing_janus_js__WEBPACK_IMPORTED_MODULE_13__ = __webpack_require__(/*! ./janus/image_processing_janus.js */ "./src/models/janus/image_processing_janus.js");
+/* harmony import */ var _jina_clip_image_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_14__ = __webpack_require__(/*! ./jina_clip/image_processing_jina_clip.js */ "./src/models/jina_clip/image_processing_jina_clip.js");
+/* harmony import */ var _llava_onevision_image_processing_llava_onevision_js__WEBPACK_IMPORTED_MODULE_15__ = __webpack_require__(/*! ./llava_onevision/image_processing_llava_onevision.js */ "./src/models/llava_onevision/image_processing_llava_onevision.js");
+/* harmony import */ var _mask2former_image_processing_mask2former_js__WEBPACK_IMPORTED_MODULE_16__ = __webpack_require__(/*! ./mask2former/image_processing_mask2former.js */ "./src/models/mask2former/image_processing_mask2former.js");
+/* harmony import */ var _maskformer_image_processing_maskformer_js__WEBPACK_IMPORTED_MODULE_17__ = __webpack_require__(/*! ./maskformer/image_processing_maskformer.js */ "./src/models/maskformer/image_processing_maskformer.js");
+/* harmony import */ var _mobilenet_v1_image_processing_mobilenet_v1_js__WEBPACK_IMPORTED_MODULE_18__ = __webpack_require__(/*! ./mobilenet_v1/image_processing_mobilenet_v1.js */ "./src/models/mobilenet_v1/image_processing_mobilenet_v1.js");
+/* harmony import */ var _mobilenet_v2_image_processing_mobilenet_v2_js__WEBPACK_IMPORTED_MODULE_19__ = __webpack_require__(/*! ./mobilenet_v2/image_processing_mobilenet_v2.js */ "./src/models/mobilenet_v2/image_processing_mobilenet_v2.js");
+/* harmony import */ var _mobilenet_v3_image_processing_mobilenet_v3_js__WEBPACK_IMPORTED_MODULE_20__ = __webpack_require__(/*! ./mobilenet_v3/image_processing_mobilenet_v3.js */ "./src/models/mobilenet_v3/image_processing_mobilenet_v3.js");
+/* harmony import */ var _mobilenet_v4_image_processing_mobilenet_v4_js__WEBPACK_IMPORTED_MODULE_21__ = __webpack_require__(/*! ./mobilenet_v4/image_processing_mobilenet_v4.js */ "./src/models/mobilenet_v4/image_processing_mobilenet_v4.js");
+/* harmony import */ var _mobilevit_image_processing_mobilevit_js__WEBPACK_IMPORTED_MODULE_22__ = __webpack_require__(/*! ./mobilevit/image_processing_mobilevit.js */ "./src/models/mobilevit/image_processing_mobilevit.js");
+/* harmony import */ var _nougat_image_processing_nougat_js__WEBPACK_IMPORTED_MODULE_23__ = __webpack_require__(/*! ./nougat/image_processing_nougat.js */ "./src/models/nougat/image_processing_nougat.js");
+/* harmony import */ var _owlv2_image_processing_owlv2_js__WEBPACK_IMPORTED_MODULE_24__ = __webpack_require__(/*! ./owlv2/image_processing_owlv2.js */ "./src/models/owlv2/image_processing_owlv2.js");
+/* harmony import */ var _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_25__ = __webpack_require__(/*! ./owlvit/image_processing_owlvit.js */ "./src/models/owlvit/image_processing_owlvit.js");
+/* harmony import */ var _phi3_v_image_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_26__ = __webpack_require__(/*! ./phi3_v/image_processing_phi3_v.js */ "./src/models/phi3_v/image_processing_phi3_v.js");
+/* harmony import */ var _pvt_image_processing_pvt_js__WEBPACK_IMPORTED_MODULE_27__ = __webpack_require__(/*! ./pvt/image_processing_pvt.js */ "./src/models/pvt/image_processing_pvt.js");
+/* harmony import */ var _qwen2_vl_image_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_28__ = __webpack_require__(/*! ./qwen2_vl/image_processing_qwen2_vl.js */ "./src/models/qwen2_vl/image_processing_qwen2_vl.js");
+/* harmony import */ var _rt_detr_image_processing_rt_detr_js__WEBPACK_IMPORTED_MODULE_29__ = __webpack_require__(/*! ./rt_detr/image_processing_rt_detr.js */ "./src/models/rt_detr/image_processing_rt_detr.js");
+/* harmony import */ var _sam_image_processing_sam_js__WEBPACK_IMPORTED_MODULE_30__ = __webpack_require__(/*! ./sam/image_processing_sam.js */ "./src/models/sam/image_processing_sam.js");
+/* harmony import */ var _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_31__ = __webpack_require__(/*! ./segformer/image_processing_segformer.js */ "./src/models/segformer/image_processing_segformer.js");
+/* harmony import */ var _siglip_image_processing_siglip_js__WEBPACK_IMPORTED_MODULE_32__ = __webpack_require__(/*! ./siglip/image_processing_siglip.js */ "./src/models/siglip/image_processing_siglip.js");
+/* harmony import */ var _swin2sr_image_processing_swin2sr_js__WEBPACK_IMPORTED_MODULE_33__ = __webpack_require__(/*! ./swin2sr/image_processing_swin2sr.js */ "./src/models/swin2sr/image_processing_swin2sr.js");
+/* harmony import */ var _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_34__ = __webpack_require__(/*! ./vit/image_processing_vit.js */ "./src/models/vit/image_processing_vit.js");
+/* harmony import */ var _vitmatte_image_processing_vitmatte_js__WEBPACK_IMPORTED_MODULE_35__ = __webpack_require__(/*! ./vitmatte/image_processing_vitmatte.js */ "./src/models/vitmatte/image_processing_vitmatte.js");
+/* harmony import */ var _vitpose_image_processing_vitpose_js__WEBPACK_IMPORTED_MODULE_36__ = __webpack_require__(/*! ./vitpose/image_processing_vitpose.js */ "./src/models/vitpose/image_processing_vitpose.js");
+/* harmony import */ var _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_37__ = __webpack_require__(/*! ./yolos/image_processing_yolos.js */ "./src/models/yolos/image_processing_yolos.js");
@@ -17793,6 +18031,7 @@ class VLMImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTE
             },
             ...config,
         });
+        // @ts-expect-error TS2339
         this.constant_values = this.config.background_color.map(x => x * this.rescale_factor)
     }
@@ -18241,6 +18480,8 @@ class MgpstrProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE
      * - bpe_preds: The list of BPE decoded sentences.
      * - wp_preds: The list of wp decoded sentences.
      */
+    // @ts-expect-error The type of this method is not compatible with the one
+    // in the base class. It might be a good idea to fix this.
     batch_decode([char_logits, bpe_logits, wp_logits]) {
         const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
         const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
@@ -18634,6 +18875,7 @@ class PaliGemmaProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
         }
         const bos_token = this.tokenizer.bos_token;
+        // @ts-expect-error TS2339
         const image_seq_length = this.image_processor.config.image_seq_length;
         let input_strings;
         if (text.some((t) => t.includes(IMAGE_TOKEN))) {
@@ -18886,7 +19128,7 @@ class Phi3VProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_
      *
      * @param {string|string[]} text
      * @param {RawImage|RawImage[]} images
-     * @param  {...any} args
+     * @param  { { padding?: boolean, truncation?: boolean, num_crops?: number } | undefined } options
      * @returns {Promise<any>}
      */
     async _call(text, images = null, {
@@ -18937,36 +19179,39 @@ class Phi3VProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_
 __webpack_require__.r(__webpack_exports__);
 /* harmony export */ __webpack_require__.d(__webpack_exports__, {
 /* harmony export */   Florence2Processor: () => (/* reexport safe */ _florence2_processing_florence2_js__WEBPACK_IMPORTED_MODULE_0__.Florence2Processor),
-/* harmony export */   Idefics3Processor: () => (/* reexport safe */ _idefics3_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_3__.Idefics3Processor),
-/* harmony export */   JinaCLIPProcessor: () => (/* reexport safe */ _jina_clip_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_5__.JinaCLIPProcessor),
-/* harmony export */   MgpstrProcessor: () => (/* reexport safe */ _mgp_str_processing_mgp_str_js__WEBPACK_IMPORTED_MODULE_1__.MgpstrProcessor),
-/* harmony export */   MoonshineProcessor: () => (/* reexport safe */ _moonshine_processing_moonshine_js__WEBPACK_IMPORTED_MODULE_2__.MoonshineProcessor),
-/* harmony export */   OwlViTProcessor: () => (/* reexport safe */ _owlvit_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_6__.OwlViTProcessor),
-/* harmony export */   PaliGemmaProcessor: () => (/* reexport safe */ _paligemma_processing_paligemma_js__WEBPACK_IMPORTED_MODULE_8__.PaliGemmaProcessor),
-/* harmony export */   Phi3VProcessor: () => (/* reexport safe */ _phi3_v_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_7__.Phi3VProcessor),
-/* harmony export */   PyAnnoteProcessor: () => (/* reexport safe */ _pyannote_processing_pyannote_js__WEBPACK_IMPORTED_MODULE_9__.PyAnnoteProcessor),
-/* harmony export */   Qwen2VLProcessor: () => (/* reexport safe */ _qwen2_vl_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_10__.Qwen2VLProcessor),
-/* harmony export */   SamProcessor: () => (/* reexport safe */ _sam_processing_sam_js__WEBPACK_IMPORTED_MODULE_11__.SamProcessor),
-/* harmony export */   SpeechT5Processor: () => (/* reexport safe */ _speecht5_processing_speecht5_js__WEBPACK_IMPORTED_MODULE_12__.SpeechT5Processor),
-/* harmony export */   VLChatProcessor: () => (/* reexport safe */ _janus_processing_janus_js__WEBPACK_IMPORTED_MODULE_4__.VLChatProcessor),
-/* harmony export */   Wav2Vec2ProcessorWithLM: () => (/* reexport safe */ _wav2vec2_processing_wav2vec2_js__WEBPACK_IMPORTED_MODULE_13__.Wav2Vec2ProcessorWithLM),
-/* harmony export */   WhisperProcessor: () => (/* reexport safe */ _whisper_processing_whisper_js__WEBPACK_IMPORTED_MODULE_14__.WhisperProcessor)
+/* harmony export */   GroundingDinoProcessor: () => (/* reexport safe */ _grounding_dino_processing_grounding_dino_js__WEBPACK_IMPORTED_MODULE_1__.GroundingDinoProcessor),
+/* harmony export */   Idefics3Processor: () => (/* reexport safe */ _idefics3_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_2__.Idefics3Processor),
+/* harmony export */   JinaCLIPProcessor: () => (/* reexport safe */ _jina_clip_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_4__.JinaCLIPProcessor),
+/* harmony export */   MgpstrProcessor: () => (/* reexport safe */ _mgp_str_processing_mgp_str_js__WEBPACK_IMPORTED_MODULE_5__.MgpstrProcessor),
+/* harmony export */   MoonshineProcessor: () => (/* reexport safe */ _moonshine_processing_moonshine_js__WEBPACK_IMPORTED_MODULE_6__.MoonshineProcessor),
+/* harmony export */   OwlViTProcessor: () => (/* reexport safe */ _owlvit_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_7__.OwlViTProcessor),
+/* harmony export */   PaliGemmaProcessor: () => (/* reexport safe */ _paligemma_processing_paligemma_js__WEBPACK_IMPORTED_MODULE_9__.PaliGemmaProcessor),
+/* harmony export */   Phi3VProcessor: () => (/* reexport safe */ _phi3_v_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_8__.Phi3VProcessor),
+/* harmony export */   PyAnnoteProcessor: () => (/* reexport safe */ _pyannote_processing_pyannote_js__WEBPACK_IMPORTED_MODULE_10__.PyAnnoteProcessor),
+/* harmony export */   Qwen2VLProcessor: () => (/* reexport safe */ _qwen2_vl_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_11__.Qwen2VLProcessor),
+/* harmony export */   SamProcessor: () => (/* reexport safe */ _sam_processing_sam_js__WEBPACK_IMPORTED_MODULE_12__.SamProcessor),
+/* harmony export */   SpeechT5Processor: () => (/* reexport safe */ _speecht5_processing_speecht5_js__WEBPACK_IMPORTED_MODULE_13__.SpeechT5Processor),
+/* harmony export */   VLChatProcessor: () => (/* reexport safe */ _janus_processing_janus_js__WEBPACK_IMPORTED_MODULE_3__.VLChatProcessor),
+/* harmony export */   Wav2Vec2ProcessorWithLM: () => (/* reexport safe */ _wav2vec2_processing_wav2vec2_js__WEBPACK_IMPORTED_MODULE_14__.Wav2Vec2ProcessorWithLM),
+/* harmony export */   WhisperProcessor: () => (/* reexport safe */ _whisper_processing_whisper_js__WEBPACK_IMPORTED_MODULE_15__.WhisperProcessor)
 /* harmony export */ });
 /* harmony import */ var _florence2_processing_florence2_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ./florence2/processing_florence2.js */ "./src/models/florence2/processing_florence2.js");
-/* harmony import */ var _mgp_str_processing_mgp_str_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./mgp_str/processing_mgp_str.js */ "./src/models/mgp_str/processing_mgp_str.js");
-/* harmony import */ var _moonshine_processing_moonshine_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ./moonshine/processing_moonshine.js */ "./src/models/moonshine/processing_moonshine.js");
-/* harmony import */ var _idefics3_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ./idefics3/processing_idefics3.js */ "./src/models/idefics3/processing_idefics3.js");
-/* harmony import */ var _janus_processing_janus_js__WEBPACK_IMPORTED_MODULE_4__ = __webpack_require__(/*! ./janus/processing_janus.js */ "./src/models/janus/processing_janus.js");
-/* harmony import */ var _jina_clip_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./jina_clip/processing_jina_clip.js */ "./src/models/jina_clip/processing_jina_clip.js");
-/* harmony import */ var _owlvit_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! ./owlvit/processing_owlvit.js */ "./src/models/owlvit/processing_owlvit.js");
-/* harmony import */ var _phi3_v_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./phi3_v/processing_phi3_v.js */ "./src/models/phi3_v/processing_phi3_v.js");
-/* harmony import */ var _paligemma_processing_paligemma_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./paligemma/processing_paligemma.js */ "./src/models/paligemma/processing_paligemma.js");
-/* harmony import */ var _pyannote_processing_pyannote_js__WEBPACK_IMPORTED_MODULE_9__ = __webpack_require__(/*! ./pyannote/processing_pyannote.js */ "./src/models/pyannote/processing_pyannote.js");
-/* harmony import */ var _qwen2_vl_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_10__ = __webpack_require__(/*! ./qwen2_vl/processing_qwen2_vl.js */ "./src/models/qwen2_vl/processing_qwen2_vl.js");
-/* harmony import */ var _sam_processing_sam_js__WEBPACK_IMPORTED_MODULE_11__ = __webpack_require__(/*! ./sam/processing_sam.js */ "./src/models/sam/processing_sam.js");
-/* harmony import */ var _speecht5_processing_speecht5_js__WEBPACK_IMPORTED_MODULE_12__ = __webpack_require__(/*! ./speecht5/processing_speecht5.js */ "./src/models/speecht5/processing_speecht5.js");
-/* harmony import */ var _wav2vec2_processing_wav2vec2_js__WEBPACK_IMPORTED_MODULE_13__ = __webpack_require__(/*! ./wav2vec2/processing_wav2vec2.js */ "./src/models/wav2vec2/processing_wav2vec2.js");
-/* harmony import */ var _whisper_processing_whisper_js__WEBPACK_IMPORTED_MODULE_14__ = __webpack_require__(/*! ./whisper/processing_whisper.js */ "./src/models/whisper/processing_whisper.js");
+/* harmony import */ var _grounding_dino_processing_grounding_dino_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./grounding_dino/processing_grounding_dino.js */ "./src/models/grounding_dino/processing_grounding_dino.js");
+/* harmony import */ var _idefics3_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ./idefics3/processing_idefics3.js */ "./src/models/idefics3/processing_idefics3.js");
+/* harmony import */ var _janus_processing_janus_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ./janus/processing_janus.js */ "./src/models/janus/processing_janus.js");
+/* harmony import */ var _jina_clip_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_4__ = __webpack_require__(/*! ./jina_clip/processing_jina_clip.js */ "./src/models/jina_clip/processing_jina_clip.js");
+/* harmony import */ var _mgp_str_processing_mgp_str_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./mgp_str/processing_mgp_str.js */ "./src/models/mgp_str/processing_mgp_str.js");
+/* harmony import */ var _moonshine_processing_moonshine_js__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! ./moonshine/processing_moonshine.js */ "./src/models/moonshine/processing_moonshine.js");
+/* harmony import */ var _owlvit_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./owlvit/processing_owlvit.js */ "./src/models/owlvit/processing_owlvit.js");
+/* harmony import */ var _phi3_v_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./phi3_v/processing_phi3_v.js */ "./src/models/phi3_v/processing_phi3_v.js");
+/* harmony import */ var _paligemma_processing_paligemma_js__WEBPACK_IMPORTED_MODULE_9__ = __webpack_require__(/*! ./paligemma/processing_paligemma.js */ "./src/models/paligemma/processing_paligemma.js");
+/* harmony import */ var _pyannote_processing_pyannote_js__WEBPACK_IMPORTED_MODULE_10__ = __webpack_require__(/*! ./pyannote/processing_pyannote.js */ "./src/models/pyannote/processing_pyannote.js");
+/* harmony import */ var _qwen2_vl_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_11__ = __webpack_require__(/*! ./qwen2_vl/processing_qwen2_vl.js */ "./src/models/qwen2_vl/processing_qwen2_vl.js");
+/* harmony import */ var _sam_processing_sam_js__WEBPACK_IMPORTED_MODULE_12__ = __webpack_require__(/*! ./sam/processing_sam.js */ "./src/models/sam/processing_sam.js");
+/* harmony import */ var _speecht5_processing_speecht5_js__WEBPACK_IMPORTED_MODULE_13__ = __webpack_require__(/*! ./speecht5/processing_speecht5.js */ "./src/models/speecht5/processing_speecht5.js");
+/* harmony import */ var _wav2vec2_processing_wav2vec2_js__WEBPACK_IMPORTED_MODULE_14__ = __webpack_require__(/*! ./wav2vec2/processing_wav2vec2.js */ "./src/models/wav2vec2/processing_wav2vec2.js");
+/* harmony import */ var _whisper_processing_whisper_js__WEBPACK_IMPORTED_MODULE_15__ = __webpack_require__(/*! ./whisper/processing_whisper.js */ "./src/models/whisper/processing_whisper.js");
@@ -19073,6 +19318,7 @@ class PyAnnoteFeatureExtractor extends _base_feature_extraction_utils_js__WEBPAC
             let current_speaker = -1;
             for (let i = 0; i < scores.length; ++i) {
+                /** @type {number[]} */
                 const probabilities = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.softmax)(scores[i]);
                 const [score, id] = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.max)(probabilities);
                 const [start, end] = [i, i + 1];
@@ -19260,6 +19506,7 @@ class Qwen2VLProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODUL
         }
         if (image_grid_thw) {
+            // @ts-expect-error TS2551
             let merge_length = this.image_processor.config.merge_size ** 2;
             let index = 0;
@@ -19751,8 +19998,8 @@ class SeamlessM4TFeatureExtractor extends _base_feature_extraction_utils_js__WEB
                         'int64',
                         new BigInt64Array(numPaddedFrames),
                         [1, numPaddedFrames],
-                    )
-                    padded_attention_mask.data.fill(1n, 0, num_frames);
+                    );
+                    /** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
                 }
             }
         }
@@ -20565,7 +20812,7 @@ class WhisperFeatureExtractor extends _base_feature_extraction_utils_js__WEBPACK
         )
         const data = features.data;
-        const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(data)[0];
+        const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(/** @type {Float32Array} */(data))[0];
         for (let i = 0; i < data.length; ++i) {
             data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
@@ -20828,6 +21075,16 @@ class TensorOpRegistry {
         // executionProviders: ['webgpu'],
     };
+    static get nearest_interpolate_4d() { // Lazily-created op: built on first read, then cached on `this` (the class, since the getter is static).
+        if (!this._nearest_interpolate_4d) { // Only construct when no cached value exists yet.
+            this._nearest_interpolate_4d = wrap(
+                [8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21], // Raw bytes; they contain the ASCII strings "Resize", "mode" and "nearest". NOTE(review): presumably a serialized ONNX graph - confirm against wrap().
+                this.session_options, // Options read from the class and forwarded unchanged to wrap().
+                'y', // NOTE(review): presumably the name of the graph output to return - verify against wrap().
+            );
+        }
+        return this._nearest_interpolate_4d; // Every access after the first returns the same cached value.
+    }
     static get bilinear_interpolate_4d() {
         if (!this._bilinear_interpolate_4d) {
             this._bilinear_interpolate_4d = wrap(
@@ -21202,6 +21459,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
         // TODO: Use softmax tensor function
         const function_to_apply =
+            // @ts-expect-error TS2339
             this.model.config.problem_type === 'multi_label_classification'
                 ? batch => batch.sigmoid()
                 : batch => new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor(
@@ -21210,6 +21468,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
                     batch.dims,
                 ); // single_label_classification (default)
+        // @ts-expect-error TS2339
         const id2label = this.model.config.id2label;
         const toReturn = [];
@@ -21312,6 +21571,7 @@ class TokenClassificationPipeline extends (/** @type {new (options: TextPipeline
         const outputs = await this.model(model_inputs)
         const logits = outputs.logits;
+        // @ts-expect-error TS2339
         const id2label = this.model.config.id2label;
         const toReturn = [];
@@ -21651,11 +21911,14 @@ class Text2TextGenerationPipeline extends (/** @type {new (options: TextPipeline
         // Add global prefix, if present
+        // @ts-expect-error TS2339
         if (this.model.config.prefix) {
+            // @ts-expect-error TS2339
             texts = texts.map(x => this.model.config.prefix + x)
         }
         // Handle task specific params:
+        // @ts-expect-error TS2339
         const task_specific_params = this.model.config.task_specific_params
         if (task_specific_params && task_specific_params[this.task]) {
             // Add prefixes, if present
@@ -22394,6 +22657,7 @@ class AudioClassificationPipeline extends (/** @type {new (options: AudioPipelin
         const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
         const preparedAudios = await prepareAudios(audio, sampling_rate);
+        // @ts-expect-error TS2339
         const id2label = this.model.config.id2label;
         const toReturn = [];
@@ -22704,6 +22968,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
             audio = [/** @type {AudioInput} */ (audio)];
         }
+        // @ts-expect-error TS2339
         const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
         const hop_length = this.processor.feature_extractor.config.hop_length;
@@ -22769,7 +23034,9 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
                 // TODO: Right now we only get top beam
                 if (return_timestamps === 'word') {
+                    // @ts-expect-error TS2339
                     chunk.tokens = data.sequences.tolist()[0];
+                    // @ts-expect-error TS2339
                     chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
                         (/** @type {number} */ x) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.round)(x, 2)
                     );
@@ -22814,7 +23081,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
             const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
             const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
-            const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
+            const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
             toReturn.push({ text });
         }
         return single ? toReturn[0] : toReturn;
@@ -22963,6 +23230,7 @@ class ImageClassificationPipeline extends (/** @type {new (options: ImagePipelin
         const { pixel_values } = await this.processor(preparedImages);
         const output = await this.model({ pixel_values });
+        // @ts-expect-error TS2339
         const id2label = this.model.config.id2label;
         /** @type {ImageClassificationOutput[]} */
@@ -23077,6 +23345,7 @@ class ImageSegmentationPipeline extends (/** @type {new (options: ImagePipelineC
             }
         }
+        // @ts-expect-error TS2339
         const id2label = this.model.config.id2label;
         /** @type {ImageSegmentationPipelineOutput[]} */
@@ -23303,6 +23572,7 @@ class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipelineCon
         const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);
         // Add labels
+        // @ts-expect-error TS2339
         const id2label = this.model.config.id2label;
         // Format output
@@ -23447,13 +23717,35 @@ class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: TextImag
             // Run model with both text and pixel inputs
             const output = await this.model({ ...text_inputs, pixel_values });
-            // @ts-ignore
-            const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSize, true)[0];
-            let result = processed.boxes.map((box, i) => ({
-                score: processed.scores[i],
-                label: candidate_labels[processed.classes[i]],
-                box: get_bounding_box(box, !percentage),
-            })).sort((a, b) => b.score - a.score);
+            let result;
+            if('post_process_grounded_object_detection' in this.processor) {
+                // @ts-ignore
+                const processed = this.processor.post_process_grounded_object_detection(
+                    output,
+                    text_inputs.input_ids,
+                    {
+                        // TODO: support separate threshold values
+                        box_threshold: threshold,
+                        text_threshold: threshold,
+                        target_sizes: imageSize,
+                    },
+                )[0];
+                result = processed.boxes.map((box, i) => ({
+                    score: processed.scores[i],
+                    label: processed.labels[i],
+                    box: get_bounding_box(box, !percentage),
+                }))
+            } else {
+                // @ts-ignore
+                const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSize, true)[0];
+                result = processed.boxes.map((box, i) => ({
+                    score: processed.scores[i],
+                    label: candidate_labels[processed.classes[i]],
+                    box: get_bounding_box(box, !percentage),
+                }))
+            }
+            result.sort((a, b) => b.score - a.score);
             if (top_k !== null) {
                 result = result.slice(0, top_k);
             }
@@ -23522,6 +23814,7 @@ class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextIm
         // Run model
         const output = await this.model.generate({
             inputs: pixel_values,
+            // @ts-expect-error TS2339
             max_length: this.model.config.decoder.max_position_embeddings,
             decoder_input_ids,
             ...generate_kwargs,
@@ -23572,7 +23865,7 @@ class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextIm
  * const synthesizer = await pipeline('text-to-speech', 'Xenova/speecht5_tts', { quantized: false });
  * const speaker_embeddings = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin';
  * const out = await synthesizer('Hello, my dog is cute', { speaker_embeddings });
- * // {
+ * // RawAudio {
  * //   audio: Float32Array(26112) [-0.00005657337896991521, 0.00020583874720614403, ...],
  * //   sampling_rate: 16000
  * // }
@@ -23592,7 +23885,7 @@ class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextIm
  * ```javascript
  * const synthesizer = await pipeline('text-to-speech', 'Xenova/mms-tts-fra');
  * const out = await synthesizer('Bonjour');
- * // {
+ * // RawAudio {
  * //   audio: Float32Array(23808) [-0.00037693005288019776, 0.0003325853613205254, ...],
  * //   sampling_rate: 16000
  * // }
@@ -23637,11 +23930,12 @@ class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineC
         // Generate waveform
         const { waveform } = await this.model(inputs);
+        // @ts-expect-error TS2339
         const sampling_rate = this.model.config.sampling_rate;
-        return {
-            audio: waveform.data,
+        return new _utils_audio_js__WEBPACK_IMPORTED_MODULE_7__.RawAudio(
+            waveform.data,
             sampling_rate,
-        }
+        )
     }
     async _call_text_to_spectrogram(text_inputs, { speaker_embeddings }) {
@@ -23681,10 +23975,10 @@ class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineC
         const { waveform } = await this.model.generate_speech(input_ids, speaker_embeddings, { vocoder: this.vocoder });
         const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
-        return {
-            audio: waveform.data,
+        return new _utils_audio_js__WEBPACK_IMPORTED_MODULE_7__.RawAudio(
+            waveform.data,
             sampling_rate,
-        }
+        )
     }
 }
@@ -23794,11 +24088,23 @@ class DepthEstimationPipeline extends (/** @type {new (options: ImagePipelineCon
         const toReturn = [];
         for (let i = 0; i < preparedImages.length; ++i) {
-            const prediction = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate)(predicted_depth[i], preparedImages[i].size.reverse(), 'bilinear', false);
-            const formatted = prediction.mul_(255 / (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.max)(prediction.data)[0]).to('uint8');
+            const batch = predicted_depth[i];
+            const [height, width] = batch.dims.slice(-2);
+            const [new_width, new_height] = preparedImages[i].size;
+            // Interpolate to original size
+            const prediction = (await (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate_4d)(batch.view(1, 1, height, width), {
+                size: [new_height, new_width],
+                mode: 'bilinear',
+            })).view(new_height, new_width);
+            const minval = /** @type {number} */(prediction.min().item());
+            const maxval = /** @type {number} */(prediction.max().item());
+            const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
+            const depth = _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted);
             toReturn.push({
-                predicted_depth: predicted_depth[i],
-                depth: _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted),
+                predicted_depth: prediction,
+                depth,
             });
         }
@@ -24278,6 +24584,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
     return result;
 }
 /***/ }),
 /***/ "./src/tokenizers.js":
@@ -24347,7 +24654,6 @@ __webpack_require__.r(__webpack_exports__);
 /* harmony import */ var _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./utils/data-structures.js */ "./src/utils/data-structures.js");
 /* harmony import */ var _huggingface_jinja__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! @huggingface/jinja */ "./node_modules/@huggingface/jinja/dist/index.js");
 /* harmony import */ var _models_whisper_common_whisper_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./models/whisper/common_whisper.js */ "./src/models/whisper/common_whisper.js");
-/* harmony import */ var _utils_constants_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./utils/constants.js */ "./src/utils/constants.js");
 /**
  * @file Tokenizers are used to prepare textual inputs for a model.
@@ -24384,7 +24690,6 @@ __webpack_require__.r(__webpack_exports__);
 /**
  * @typedef {Object} TokenizerProperties Additional tokenizer-specific properties.
  * @property {boolean} [legacy=false] Whether or not the `legacy` behavior of the tokenizer should be used.
@@ -24868,7 +25173,7 @@ class Unigram extends TokenizerModel {
      * Create a new Unigram tokenizer model.
      * @param {Object} config The configuration object for the Unigram model.
      * @param {number} config.unk_id The ID of the unknown token
-     * @param {any[][]} config.vocab A 2D array representing a mapping of tokens to scores.
+     * @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
      * @param {Object} moreConfig Additional configuration object for the Unigram model.
      */
     constructor(config, moreConfig) {
@@ -24876,11 +25181,10 @@ class Unigram extends TokenizerModel {
         const vocabSize = config.vocab.length;
         this.vocab = new Array(vocabSize);
+        /** @type {number[]} */
         this.scores = new Array(vocabSize);
         for (let i = 0; i < vocabSize; ++i) {
-            const piece = config.vocab[i];
-            this.vocab[i] = piece[0];
-            this.scores[i] = piece[1];
+            [this.vocab[i], this.scores[i]] = config.vocab[i];
         }
         this.unk_token_id = config.unk_id;
@@ -28720,6 +29024,7 @@ class AutoTokenizer {
 "use strict";
 __webpack_require__.r(__webpack_exports__);
 /* harmony export */ __webpack_require__.d(__webpack_exports__, {
+/* harmony export */   RawAudio: () => (/* binding */ RawAudio),
 /* harmony export */   hamming: () => (/* binding */ hamming),
 /* harmony export */   hanning: () => (/* binding */ hanning),
 /* harmony export */   mel_filter_bank: () => (/* binding */ mel_filter_bank),
@@ -28730,7 +29035,9 @@ __webpack_require__.r(__webpack_exports__);
 /* harmony import */ var _hub_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ./hub.js */ "./src/utils/hub.js");
 /* harmony import */ var _maths_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./maths.js */ "./src/utils/maths.js");
 /* harmony import */ var _core_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ./core.js */ "./src/utils/core.js");
-/* harmony import */ var _tensor_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ./tensor.js */ "./src/utils/tensor.js");
+/* harmony import */ var _env_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ../env.js */ "./src/env.js");
+/* harmony import */ var fs__WEBPACK_IMPORTED_MODULE_4__ = __webpack_require__(/*! fs */ "fs");
+/* harmony import */ var _tensor_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./tensor.js */ "./src/utils/tensor.js");
 /**
  * @file Helper module for audio processing.
  *
@@ -28746,6 +29053,8 @@ __webpack_require__.r(__webpack_exports__);
 /**
  * Helper function to read audio from a path/URL.
  * @param {string|URL} url The path/URL to load the audio from.
@@ -29339,10 +29648,10 @@ async function spectrogram(
     //  - mel_filters.shape=(80, 201)
     //  - magnitudes.shape=(3000, 201) => magnitudes.T.shape=(201, 3000)
     //  - mel_spec.shape=(80, 3000)
-    let mel_spec = await (0,_tensor_js__WEBPACK_IMPORTED_MODULE_3__.matmul)(
+    let mel_spec = await (0,_tensor_js__WEBPACK_IMPORTED_MODULE_5__.matmul)(
         // TODO: Make `mel_filters` a Tensor during initialization
-        new _tensor_js__WEBPACK_IMPORTED_MODULE_3__.Tensor('float32', mel_filters.flat(), [num_mel_filters, num_frequency_bins]),
-        new _tensor_js__WEBPACK_IMPORTED_MODULE_3__.Tensor('float32', transposedMagnitudeData, [num_frequency_bins, d1Max]),
+        new _tensor_js__WEBPACK_IMPORTED_MODULE_5__.Tensor('float32', mel_filters.flat(), [num_mel_filters, num_frequency_bins]),
+        new _tensor_js__WEBPACK_IMPORTED_MODULE_5__.Tensor('float32', transposedMagnitudeData, [num_frequency_bins, d1Max]),
     );
     if (transpose) {
         mel_spec = mel_spec.transpose(1, 0);
@@ -29432,6 +29741,116 @@ function window_function(window_length, name, {
     return window;
 }
+/**
+ * Encode audio samples as an in-memory WAV file: a 44-byte RIFF/WAVE header followed by the samples as 32-bit floats.
+ * WAV file specs: https://en.wikipedia.org/wiki/WAV#WAV_File_header
+ * The header declares a single channel of 32-bit samples; every multi-byte field is written little-endian.
+ * Adapted from https://www.npmjs.com/package/audiobuffer-to-wav
+ * @param {Float32Array} samples The audio samples (written as one channel).
+ * @param {number} rate The sample rate written to the header.
+ * @returns {ArrayBuffer} The WAV audio buffer (header plus sample data).
+ */
+function encodeWAV(samples, rate) {
+    let offset = 44; // header size in bytes; sample data is written from here on
+    const buffer = new ArrayBuffer(offset + samples.length * 4); // 4 bytes per sample
+    const view = new DataView(buffer);
+    /* RIFF identifier */
+    writeString(view, 0, "RIFF");
+    /* RIFF chunk length: total size (44 + data) minus the 8 bytes of identifier and length field */
+    view.setUint32(4, 36 + samples.length * 4, true);
+    /* RIFF type */
+    writeString(view, 8, "WAVE");
+    /* format chunk identifier */
+    writeString(view, 12, "fmt ");
+    /* format chunk length */
+    view.setUint32(16, 16, true);
+    /* sample format (raw; format code 3 denotes IEEE float in the WAV spec) */
+    view.setUint16(20, 3, true);
+    /* channel count (mono) */
+    view.setUint16(22, 1, true);
+    /* sample rate */
+    view.setUint32(24, rate, true);
+    /* byte rate (sample rate * block align) */
+    view.setUint32(28, rate * 4, true);
+    /* block align (channel count * bytes per sample) */
+    view.setUint16(32, 4, true);
+    /* bits per sample */
+    view.setUint16(34, 32, true);
+    /* data chunk identifier */
+    writeString(view, 36, "data");
+    /* data chunk length */
+    view.setUint32(40, samples.length * 4, true);
+    for (let i = 0; i < samples.length; ++i, offset += 4) { // offset advances by one 4-byte sample per iteration
+        view.setFloat32(offset, samples[i], true);
+    }
+    return buffer;
+}
+function writeString(view, offset, string) { // Writes `string` into the DataView `view`, one byte per character, starting at byte `offset`.
+    for (let i = 0; i < string.length; ++i) {
+        view.setUint8(offset + i, string.charCodeAt(i)); // setUint8 keeps only the low 8 bits of each UTF-16 code unit
+    }
+}
+class RawAudio { // Audio samples paired with their sampling rate, with helpers to export them as WAV.
+    /**
+     * Create a new `RawAudio` object.
+     * @param {Float32Array} audio Audio data, stored as-is on `this.audio`
+     * @param {number} sampling_rate Sampling rate of the audio data, stored on `this.sampling_rate`
+     */
+    constructor(audio, sampling_rate) {
+        this.audio = audio
+        this.sampling_rate = sampling_rate
+    }
+    /**
+     * Encode the audio as a WAV file held in memory (see `encodeWAV`).
+     * @returns {ArrayBuffer} The WAV file.
+     */
+    toWav() {
+        return encodeWAV(this.audio, this.sampling_rate)
+    }
+    /**
+     * Wrap the WAV encoding of the audio in a blob.
+     * @returns {Blob} A blob with MIME type `audio/wav`.
+     */
+    toBlob() {
+        const wav = this.toWav();
+        const blob = new Blob([wav], { type: 'audio/wav' });
+        return blob;
+    }
+    /**
+     * Save the audio to a wav file: handed to `saveBlob` in a browser, written with `fs.writeFileSync` when a filesystem is available; throws otherwise, and inside a Web Worker.
+     * @param {string} path Destination passed on to `saveBlob` or `fs.writeFileSync`.
+     */
+    async save(path) {
+        let fn; // writer chosen for the current environment, called as fn(path, blob)
+        if (_env_js__WEBPACK_IMPORTED_MODULE_3__.apis.IS_BROWSER_ENV) {
+            if (_env_js__WEBPACK_IMPORTED_MODULE_3__.apis.IS_WEBWORKER_ENV) {
+                throw new Error('Unable to save a file from a Web Worker.')
+            }
+            fn = _core_js__WEBPACK_IMPORTED_MODULE_2__.saveBlob;
+        } else if (_env_js__WEBPACK_IMPORTED_MODULE_3__.apis.IS_FS_AVAILABLE) {
+            fn = async (/** @type {string} */ path, /** @type {Blob} */ blob) => {
+                let buffer = await blob.arrayBuffer(); // Blob -> ArrayBuffer so it can be wrapped in a Buffer below
+                fs__WEBPACK_IMPORTED_MODULE_4__.writeFileSync(path, Buffer.from(buffer));
+            }
+        } else {
+            throw new Error('Unable to save because filesystem is disabled in this environment.')
+        }
+        await fn(path, this.toBlob()) // encode once, then hand the blob to the chosen writer
+    }
+}
 /***/ }),
@@ -29487,7 +29906,8 @@ __webpack_require__.r(__webpack_exports__);
 /* harmony export */   pick: () => (/* binding */ pick),
 /* harmony export */   pop: () => (/* binding */ pop),
 /* harmony export */   product: () => (/* binding */ product),
-/* harmony export */   reverseDictionary: () => (/* binding */ reverseDictionary)
+/* harmony export */   reverseDictionary: () => (/* binding */ reverseDictionary),
+/* harmony export */   saveBlob: () => (/* binding */ saveBlob)
 /* harmony export */ });
 /**
@@ -29680,6 +30100,32 @@ function calculateReflectOffset(i, w) {
     return Math.abs((i + w) % (2 * w) - w);
 }
+/**
+ * Save a blob from a web page by pointing a temporary anchor element at it and clicking that anchor.
+ * @param {string} path The filename assigned to the anchor's `download` attribute
+ * @param {Blob} blob The blob to save
+ */
+function saveBlob(path, blob){
+    // Create an object URL that refers to the blob (despite the variable name, this is not a data URL)
+    const dataURL = URL.createObjectURL(blob);
+    // Create an anchor element with the object URL as the href attribute
+    const downloadLink = document.createElement('a');
+    downloadLink.href = dataURL;
+    // Set the download attribute to specify the desired filename for the downloaded file
+    downloadLink.download = path;
+    // Trigger the download
+    downloadLink.click();
+    // Clean up: detach the anchor (it is never appended to the document here, so this is effectively a no-op)
+    downloadLink.remove();
+    // Revoke the Object URL to free up memory
+    URL.revokeObjectURL(dataURL);
+}
 /**
  *
  * @param {Object} o
@@ -30243,6 +30689,8 @@ __webpack_require__.r(__webpack_exports__);
 /* harmony export */ });
 /* harmony import */ var _env_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../env.js */ "./src/env.js");
 /* harmony import */ var _devices_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./devices.js */ "./src/utils/devices.js");
+/// <reference types="@webgpu/types" />
@@ -30498,7 +30946,7 @@ class FileResponse {
      */
     async arrayBuffer() {
         const data = await fs__WEBPACK_IMPORTED_MODULE_0__.promises.readFile(this.filePath);
-        return data.buffer;
+        return /** @type {ArrayBuffer} */ (data.buffer);
     }
     /**
@@ -31860,23 +32308,9 @@ class RawImage {
             // Convert image to Blob
             const blob = await this.toBlob(mime);
-            // Convert the canvas content to a data URL
-            const dataURL = URL.createObjectURL(blob);
+            (0,_core_js__WEBPACK_IMPORTED_MODULE_0__.saveBlob)(path, blob)
-            // Create an anchor element with the data URL as the href attribute
-            const downloadLink = document.createElement('a');
-            downloadLink.href = dataURL;
-            // Set the download attribute to specify the desired filename for the downloaded image
-            downloadLink.download = path;
-            // Trigger the download
-            downloadLink.click();
-            // Clean up: remove the anchor element from the DOM
-            downloadLink.remove();
-        } else if (!_env_js__WEBPACK_IMPORTED_MODULE_2__.env.useFS) {
+        } else if (!_env_js__WEBPACK_IMPORTED_MODULE_2__.apis.IS_FS_AVAILABLE) {
             throw new Error('Unable to save the image because filesystem is disabled in this environment.')
         } else {
@@ -31906,6 +32340,7 @@ class RawImage {
 const load_image = RawImage.read.bind(RawImage);
 /***/ }),
 /***/ "./src/utils/maths.js":
@@ -32159,8 +32594,9 @@ function magnitude(arr) {
 /**
  * Returns the value and index of the minimum element in an array.
- * @param {number[]|TypedArray} arr array of numbers.
- * @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
+ * @template {number[]|bigint[]|AnyTypedArray} T
+ * @param {T} arr array of numbers.
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
  * @throws {Error} If array is empty.
  */
 function min(arr) {
@@ -32173,14 +32609,15 @@ function min(arr) {
             indexOfMin = i;
         }
     }
-    return [min, indexOfMin];
+    return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([min, indexOfMin]);
 }
 /**
  * Returns the value and index of the maximum element in an array.
- * @param {number[]|AnyTypedArray} arr array of numbers.
- * @returns {[number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
+ * @template {number[]|bigint[]|AnyTypedArray} T
+ * @param {T} arr array of numbers.
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
  * @throws {Error} If array is empty.
  */
 function max(arr) {
@@ -32193,7 +32630,7 @@ function max(arr) {
             indexOfMax = i;
         }
     }
-    return [Number(max), indexOfMax];
+    return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([max, indexOfMax]);
 }
 function isPowerOfTwo(number) {
@@ -33491,8 +33928,6 @@ class Tensor {
         return this.permute(...dims);
     }
-    // TODO add .max() and .min() methods
     /**
      * Returns the sum of each row of the input tensor in the given dimension dim.
      *
@@ -33521,55 +33956,22 @@ class Tensor {
         }
         const this_data = this.data;
+        const fn = (a, b) => a + (b ** p);
         if (dim === null) {
             // @ts-ignore
-            let val = this_data.reduce((a, b) => a + (b ** p), 0) ** (1 / p);
+            const val = this_data.reduce(fn, 0) ** (1 / p);
             return new Tensor(this.type, [val], []);
         }
-        // Negative indexing
-        dim = safeIndex(dim, this.dims.length);
-        // Calculate the shape of the resulting array after summation
-        const resultDims = this.dims.slice(); // Copy the original dimensions
-        resultDims[dim] = 1; // Remove the specified axis
-        // Create a new array to store the accumulated values
-        // @ts-ignore
-        const result = new this_data.constructor(this_data.length / this.dims[dim]);
-        // Iterate over the data array
-        for (let i = 0; i < this_data.length; ++i) {
-            // Calculate the index in the resulting array
-            let resultIndex = 0;
-            for (let j = this.dims.length - 1, num = i, resultMultiplier = 1; j >= 0; --j) {
-                const size = this.dims[j];
-                if (j !== dim) {
-                    const index = num % size;
-                    resultIndex += index * resultMultiplier;
-                    resultMultiplier *= resultDims[j];
-                }
-                num = Math.floor(num / size);
-            }
-            // Accumulate the value at the current index
-            result[resultIndex] += (this_data[i]) ** p;
-        }
+        const [type, result, resultDims] = reduce_helper(fn, this, dim, keepdim);
         if (p !== 1) {
             for (let i = 0; i < result.length; ++i) {
                 result[i] = result[i] ** (1 / p);
             }
         }
-        if (!keepdim) {
-            resultDims.splice(dim, 1);
-        }
-        return new Tensor(this.type, result, resultDims);
+        return new Tensor(type, result, resultDims);
     }
     /**
@@ -33632,7 +34034,7 @@ class Tensor {
      * NOTE: The returned tensor shares the storage with the input tensor, so changing the contents of one will change the contents of the other.
      * If you would like a copy, use `tensor.clone()` before squeezing.
      *
-     * @param {number} [dim=null] If given, the input will be squeezed only in the specified dimensions.
+     * @param {number|number[]} [dim=null] If given, the input will be squeezed only in the specified dimensions.
      * @returns {Tensor} The squeezed tensor
      */
     squeeze(dim = null) {
@@ -33742,6 +34144,34 @@ class Tensor {
         return this.clone().neg_();
     }
+    /**
+     * Computes input > val element-wise.
+     * @param {number} val The value to compare with.
+     * @returns {Tensor} A boolean tensor, with the same dims as the input, that is `true` where input is greater than `val` and `false` elsewhere.
+     */
+    gt(val) {
+        const mask = new Uint8Array(this.data.length); // one byte per element: 1 = true, 0 = false
+        const this_data = this.data;
+        for (let i = 0; i < this_data.length; ++i) {
+            mask[i] = this_data[i] > val ? 1 : 0;
+        }
+        return new Tensor('bool', mask, this.dims);
+    }
+    /**
+     * Computes input < val element-wise.
+     * @param {number} val The value to compare with.
+     * @returns {Tensor} A boolean tensor, with the same dims as the input, that is `true` where input is less than `val` and `false` elsewhere.
+     */
+    lt(val) {
+        const mask = new Uint8Array(this.data.length); // one byte per element: 1 = true, 0 = false
+        const this_data = this.data;
+        for (let i = 0; i < this_data.length; ++i) {
+            mask[i] = this_data[i] < val ? 1 : 0;
+        }
+        return new Tensor('bool', mask, this.dims);
+    }
     /**
      * In-place version of @see {@link Tensor.clamp}
      */
@@ -33786,6 +34216,41 @@ class Tensor {
         return mean(this, dim, keepdim);
     }
+    min(dim = null, keepdim = false) { // Minimum over every element (dim === null) or along dimension `dim`; returns a new Tensor of the same type.
+        if (dim === null) {
+            // No dimension given: reduce over all elements and return a scalar (empty dims) tensor.
+            const val = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[0];
+            return new Tensor(this.type, [val], [/* scalar */]);
+        }
+        const [type, result, resultDims] = reduce_helper((a, b) => Math.min(a, b), this, dim, keepdim, Infinity); // NOTE(review): reduce_helper fills a typed array of the input's own kind with Infinity, which integer arrays coerce (0 for Int/Uint, TypeError for BigInt), and Math.min rejects BigInt - this path looks float-only, confirm.
+        return new Tensor(type, result, resultDims);
+    }
+    max(dim = null, keepdim = false) { // Maximum over every element (dim === null) or along dimension `dim`; returns a new Tensor of the same type.
+        if (dim === null) {
+            // No dimension given: reduce over all elements and return a scalar (empty dims) tensor.
+            const val = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[0];
+            return new Tensor(this.type, [val], [/* scalar */]);
+        }
+        const [type, result, resultDims] = reduce_helper((a, b) => Math.max(a, b), this, dim, keepdim, -Infinity); // NOTE(review): reduce_helper fills a typed array of the input's own kind with -Infinity, which integer arrays coerce (0 for Int/Uint, TypeError for BigInt), and Math.max rejects BigInt - this path looks float-only, confirm.
+        return new Tensor(type, result, resultDims);
+    }
+    argmin(dim = null, keepdim = false) { // Index of the smallest element of `this.data`, as a scalar int64 tensor; `keepdim` is accepted but unused.
+        if (dim !== null) {
+            throw new Error("`dim !== null` not yet implemented.");
+        }
+        const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[1]; // second entry of the helper's result, used here as the index into the flat data
+        return new Tensor('int64', [BigInt(index)], []);
+    }
+    argmax(dim = null, keepdim = false) { // Index of the largest element of `this.data`, as a scalar int64 tensor; `keepdim` is accepted but unused.
+        if (dim !== null) {
+            throw new Error("`dim !== null` not yet implemented.");
+        }
+        const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[1]; // second entry of the helper's result, used here as the index into the flat data
+        return new Tensor('int64', [BigInt(index)], []);
+    }
     /**
      * Performs Tensor dtype conversion.
      * @param {DataType} type The desired data type.
@@ -33919,7 +34384,7 @@ function interpolate(input, [out_height, out_width], mode = 'bilinear', align_co
  * @param {Tensor} input the input tensor
  * @param {Object} options the options for the interpolation
  * @param {[number, number]|[number, number, number]|[number, number, number, number]} [options.size=null] output spatial size.
- * @param {"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
+ * @param {"nearest"|"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
  * @returns {Promise<Tensor>} The interpolated tensor.
  */
 async function interpolate_4d(input, {
@@ -33949,7 +34414,9 @@ async function interpolate_4d(input, {
     }
     let op;
-    if (mode === 'bilinear') {
+    if (mode === 'nearest') {
+        op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.nearest_interpolate_4d;
+    } else if (mode === 'bilinear') {
         op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bilinear_interpolate_4d;
     } else if (mode === 'bicubic') {
         op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bicubic_interpolate_4d;
@@ -33990,13 +34457,13 @@ async function rfft(x, a) {
  * Returns the k largest elements of the given input tensor.
  * Inspired by https://pytorch.org/docs/stable/generated/torch.topk.html
  * @param {Tensor} x the input tensor
- * @param {number} k the k in "top-k"
+ * @param {number} [k] the k in "top-k"
  * @returns {Promise<[Tensor, Tensor]>} the output tuple of (Tensor, LongTensor) of top-k elements and their indices.
  */
 async function topk(x, k) {
     const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.top_k;
-    if (k === null) {
+    if (k == null) {
         k = x.dims.at(-1);
     } else {
         k = Math.min(k, x.dims.at(-1));
@@ -34025,10 +34492,10 @@ const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length])
 async function slice(data, starts, ends, axes, steps) {
     const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.slice;
     return await op({
-        x: data,
-        s: arrayToIndexTensor(starts),
-        e: arrayToIndexTensor(ends),
-        a: arrayToIndexTensor(axes),
+        x: data,
+        s: arrayToIndexTensor(starts),
+        e: arrayToIndexTensor(ends),
+        a: arrayToIndexTensor(axes),
         t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
     });
 }
@@ -34263,35 +34730,19 @@ function stack(tensors, dim = 0) {
 /**
- * Calculates the standard deviation and mean over the dimensions specified by dim. dim can be a single dimension or `null` to reduce over all dimensions.
- * @param {Tensor} input the input tenso
- * @param {number|null} dim the dimension to reduce. If None, all dimensions are reduced.
- * @param {number} correction difference between the sample size and sample degrees of freedom. Defaults to Bessel's correction, correction=1.
+ * @param {(previousValue: any, currentValue: any, currentIndex?: number, resultIndex?: number) => any} callbackfn
+ * @param {Tensor} input the input tensor.
+ * @param {number|null} dim the dimension to reduce.
  * @param {boolean} keepdim whether the output tensor has dim retained or not.
- * @returns {Tensor[]} A tuple of (std, mean) tensors.
+ * @returns {[DataType, any, number[]]} The reduced tensor data.
  */
-function std_mean(input, dim = null, correction = 1, keepdim = false) {
-    const inputData = /** @type {Float32Array} */(input.data);
+function reduce_helper(callbackfn, input, dim = null, keepdim = false, initialValue = null) {
+    const inputData = input.data;
     const inputDims = input.dims;
-    if (dim === null) {
-        // None to reduce over all dimensions.
-        const sum = inputData.reduce((a, b) => a + b, 0);
-        const mean = sum / inputData.length;
-        const std = Math.sqrt(inputData.reduce((a, b) => a + (b - mean) ** 2, 0) / (inputData.length - correction));
-        const meanTensor = new Tensor(input.type, [mean], [/* scalar */]);
-        const stdTensor = new Tensor(input.type, [std], [/* scalar */]);
-        return [stdTensor, meanTensor];
-    }
     // Negative indexing
     dim = safeIndex(dim, inputDims.length);
-    const meanTensor = mean(input, dim, keepdim);
-    const meanTensorData = meanTensor.data;
     // Calculate the shape of the resulting array after summation
     const resultDims = inputDims.slice(); // Copy the original dimensions
     resultDims[dim] = 1; // Remove the specified axis
@@ -34299,6 +34750,9 @@ function std_mean(input, dim = null, correction = 1, keepdim = false) {
     // Create a new array to store the accumulated values
     // @ts-ignore
     const result = new inputData.constructor(inputData.length / inputDims[dim]);
+    if (initialValue !== null) {
+        result.fill(initialValue);
+    }
     // Iterate over the data array
     for (let i = 0; i < inputData.length; ++i) {
@@ -34317,23 +34771,55 @@ function std_mean(input, dim = null, correction = 1, keepdim = false) {
         }
         // Accumulate the value at the current index
-        result[resultIndex] += (inputData[i] - meanTensorData[resultIndex]) ** 2;
+        result[resultIndex] = callbackfn(result[resultIndex], inputData[i], i, resultIndex);
     }
-    for (let i = 0; i < result.length; ++i) {
-        result[i] = Math.sqrt(result[i] / (inputDims[dim] - correction));
+    if (!keepdim) resultDims.splice(dim, 1);
+    return [input.type, result, resultDims];
+}
+/**
+ * Calculates the standard deviation and mean over the dimensions specified by dim. dim can be a single dimension or `null` to reduce over all dimensions.
+ * @param {Tensor} input the input tensor.
+ * @param {number|null} dim the dimension to reduce. If `null`, all dimensions are reduced.
+ * @param {number} correction difference between the sample size and sample degrees of freedom. Defaults to Bessel's correction, correction=1.
+ * @param {boolean} keepdim whether the output tensor has dim retained or not.
+ * @returns {Tensor[]} A tuple of (std, mean) tensors.
+ */
+function std_mean(input, dim = null, correction = 1, keepdim = false) {
+    const inputData = /** @type {Float32Array} */(input.data);
+    const inputDims = input.dims;
+    if (dim === null) {
+        // None to reduce over all dimensions.
+        const sum = inputData.reduce((a, b) => a + b, 0);
+        const mean = sum / inputData.length;
+        const std = Math.sqrt(inputData.reduce((a, b) => a + (b - mean) ** 2, 0) / (inputData.length - correction));
+        const meanTensor = new Tensor(input.type, [mean], [/* scalar */]);
+        const stdTensor = new Tensor(input.type, [std], [/* scalar */]);
+        return [stdTensor, meanTensor];
     }
+    dim = safeIndex(dim, inputDims.length);
+    const meanTensor = mean(input, dim, keepdim);
+    const meanTensorData = meanTensor.data;
-    if (!keepdim) {
-        resultDims.splice(dim, 1);
+    // Compute squared sum
+    const [type, result, resultDims] = reduce_helper((a, b, i, j) => a + (b - meanTensorData[j]) ** 2, input, dim, keepdim);
+    // Square root of the squared sum
+    for (let i = 0; i < result.length; ++i) {
+        result[i] = Math.sqrt(result[i] / (inputDims[dim] - correction));
     }
-    const stdTensor = new Tensor(input.type, result, resultDims);
+    const stdTensor = new Tensor(type, result, resultDims);
     return [stdTensor, meanTensor];
 }
 /**
  * Returns the mean value of each row of the input tensor in the given dimension dim.
  * @param {Tensor} input the input tensor.
@@ -34342,58 +34828,27 @@ function std_mean(input, dim = null, correction = 1, keepdim = false) {
  * @returns {Tensor} A new tensor with means taken along the specified dimension.
  */
 function mean(input, dim = null, keepdim = false) {
+    const inputDims = input.dims;
     const inputData = /** @type {Float32Array} */(input.data);
     if (dim === null) {
         // None to reduce over all dimensions.
-        // @ts-ignore
         const val = inputData.reduce((a, b) => a + b, 0);
         return new Tensor(input.type, [val / inputData.length], [/* scalar */]);
     }
-    const inputDims = input.dims;
-    // Negative indexing
     dim = safeIndex(dim, inputDims.length);
-    // Calculate the shape of the resulting array after summation
-    const resultDims = inputDims.slice(); // Copy the original dimensions
-    resultDims[dim] = 1; // Remove the specified axis
-    // Create a new array to store the accumulated values
-    // @ts-ignore
-    const result = new inputData.constructor(inputData.length / inputDims[dim]);
-    // Iterate over the data array
-    for (let i = 0; i < inputData.length; ++i) {
-        // Calculate the index in the resulting array
-        let resultIndex = 0;
-        for (let j = inputDims.length - 1, num = i, resultMultiplier = 1; j >= 0; --j) {
-            const size = inputDims[j];
-            if (j !== dim) {
-                const index = num % size;
-                resultIndex += index * resultMultiplier;
-                resultMultiplier *= resultDims[j];
-            }
-            num = Math.floor(num / size);
-        }
-        // Accumulate the value at the current index
-        result[resultIndex] += inputData[i];
-    }
+    // Compute sum
+    const [type, result, resultDims] = reduce_helper((a, b) => a + b, input, dim, keepdim);
+    // Divide by number of elements in the dimension
     if (inputDims[dim] !== 1) {
         for (let i = 0; i < result.length; ++i) {
-            result[i] = result[i] / inputDims[dim];
+            result[i] /= inputDims[dim];
         }
     }
-    if (!keepdim) {
-        resultDims.splice(dim, 1);
-    }
-    return new Tensor(input.type, result, resultDims);
+    return new Tensor(type, result, resultDims);
 }
@@ -34893,6 +35348,10 @@ __webpack_require__.r(__webpack_exports__);
 /* harmony export */   GraniteModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GraniteModel),
 /* harmony export */   GranitePreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GranitePreTrainedModel),
 /* harmony export */   Grok1Tokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.Grok1Tokenizer),
+/* harmony export */   GroundingDinoForObjectDetection: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GroundingDinoForObjectDetection),
+/* harmony export */   GroundingDinoImageProcessor: () => (/* reexport safe */ _models_image_processors_js__WEBPACK_IMPORTED_MODULE_13__.GroundingDinoImageProcessor),
+/* harmony export */   GroundingDinoPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GroundingDinoPreTrainedModel),
+/* harmony export */   GroundingDinoProcessor: () => (/* reexport safe */ _models_processors_js__WEBPACK_IMPORTED_MODULE_16__.GroundingDinoProcessor),
 /* harmony export */   GroupViTModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GroupViTModel),
 /* harmony export */   GroupViTPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GroupViTPreTrainedModel),
 /* harmony export */   HerbertTokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.HerbertTokenizer),
@@ -35122,6 +35581,7 @@ __webpack_require__.r(__webpack_exports__);
 /* harmony export */   RTDetrModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.RTDetrModel),
 /* harmony export */   RTDetrObjectDetectionOutput: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.RTDetrObjectDetectionOutput),
 /* harmony export */   RTDetrPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.RTDetrPreTrainedModel),
+/* harmony export */   RawAudio: () => (/* reexport safe */ _utils_audio_js__WEBPACK_IMPORTED_MODULE_5__.RawAudio),
 /* harmony export */   RawImage: () => (/* reexport safe */ _utils_image_js__WEBPACK_IMPORTED_MODULE_6__.RawImage),
 /* harmony export */   RepetitionPenaltyLogitsProcessor: () => (/* reexport safe */ _generation_logits_process_js__WEBPACK_IMPORTED_MODULE_20__.RepetitionPenaltyLogitsProcessor),
 /* harmony export */   ResNetForImageClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ResNetForImageClassification),
@@ -35187,6 +35647,8 @@ __webpack_require__.r(__webpack_exports__);
 /* harmony export */   Starcoder2PreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.Starcoder2PreTrainedModel),
 /* harmony export */   StoppingCriteria: () => (/* reexport safe */ _generation_stopping_criteria_js__WEBPACK_IMPORTED_MODULE_19__.StoppingCriteria),
 /* harmony export */   StoppingCriteriaList: () => (/* reexport safe */ _generation_stopping_criteria_js__WEBPACK_IMPORTED_MODULE_19__.StoppingCriteriaList),
+/* harmony export */   StyleTextToSpeech2Model: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.StyleTextToSpeech2Model),
+/* harmony export */   StyleTextToSpeech2PreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.StyleTextToSpeech2PreTrainedModel),
 /* harmony export */   SummarizationPipeline: () => (/* reexport safe */ _pipelines_js__WEBPACK_IMPORTED_MODULE_1__.SummarizationPipeline),
 /* harmony export */   SuppressTokensAtBeginLogitsProcessor: () => (/* reexport safe */ _generation_logits_process_js__WEBPACK_IMPORTED_MODULE_20__.SuppressTokensAtBeginLogitsProcessor),
 /* harmony export */   Swin2SRForImageSuperResolution: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.Swin2SRForImageSuperResolution),