@huggingface/transformers 3.2.3 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105):
  1. package/README.md +5 -3
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/ort.bundle.min.mjs +2776 -0
  4. package/dist/transformers.cjs +792 -330
  5. package/dist/transformers.cjs.map +1 -1
  6. package/dist/transformers.js +1150 -656
  7. package/dist/transformers.js.map +1 -1
  8. package/dist/transformers.min.cjs +1 -1
  9. package/dist/transformers.min.cjs.map +1 -1
  10. package/dist/transformers.min.js +1 -1
  11. package/dist/transformers.min.js.map +1 -1
  12. package/dist/transformers.min.mjs +1 -1
  13. package/dist/transformers.min.mjs.map +1 -1
  14. package/dist/transformers.mjs +798 -331
  15. package/dist/transformers.mjs.map +1 -1
  16. package/package.json +3 -3
  17. package/src/base/feature_extraction_utils.js +9 -9
  18. package/src/base/image_processors_utils.js +12 -1
  19. package/src/base/processing_utils.js +24 -3
  20. package/src/configs.js +5 -0
  21. package/src/env.js +1 -2
  22. package/src/generation/streamers.js +5 -2
  23. package/src/models/auto/feature_extraction_auto.js +0 -16
  24. package/src/models/auto/processing_auto.js +0 -16
  25. package/src/models/convnext/image_processing_convnext.js +1 -0
  26. package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
  27. package/src/models/florence2/processing_florence2.js +3 -0
  28. package/src/models/grounding_dino/image_processing_grounding_dino.js +29 -0
  29. package/src/models/grounding_dino/processing_grounding_dino.js +101 -0
  30. package/src/models/idefics3/image_processing_idefics3.js +2 -0
  31. package/src/models/image_processors.js +1 -0
  32. package/src/models/janus/image_processing_janus.js +1 -0
  33. package/src/models/mgp_str/processing_mgp_str.js +2 -0
  34. package/src/models/paligemma/processing_paligemma.js +1 -0
  35. package/src/models/phi3_v/processing_phi3_v.js +1 -1
  36. package/src/models/processors.js +3 -2
  37. package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
  38. package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
  39. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
  40. package/src/models/whisper/feature_extraction_whisper.js +1 -1
  41. package/src/models.js +72 -20
  42. package/src/ops/registry.js +10 -0
  43. package/src/pipelines.js +73 -23
  44. package/src/tokenizers.js +4 -7
  45. package/src/utils/audio.js +113 -1
  46. package/src/utils/core.js +26 -0
  47. package/src/utils/dtypes.js +2 -0
  48. package/src/utils/hub.js +1 -1
  49. package/src/utils/image.js +5 -18
  50. package/src/utils/maths.js +8 -6
  51. package/src/utils/tensor.js +134 -114
  52. package/types/base/feature_extraction_utils.d.ts +7 -7
  53. package/types/base/image_processors_utils.d.ts +7 -0
  54. package/types/base/image_processors_utils.d.ts.map +1 -1
  55. package/types/base/processing_utils.d.ts +25 -19
  56. package/types/base/processing_utils.d.ts.map +1 -1
  57. package/types/configs.d.ts.map +1 -1
  58. package/types/generation/parameters.d.ts +1 -1
  59. package/types/generation/streamers.d.ts +3 -1
  60. package/types/generation/streamers.d.ts.map +1 -1
  61. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
  62. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  63. package/types/models/auto/processing_auto.d.ts.map +1 -1
  64. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
  65. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
  66. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  67. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +20 -0
  68. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts.map +1 -0
  69. package/types/models/grounding_dino/processing_grounding_dino.d.ts +27 -0
  70. package/types/models/grounding_dino/processing_grounding_dino.d.ts.map +1 -0
  71. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
  72. package/types/models/image_processors.d.ts +1 -0
  73. package/types/models/janus/image_processing_janus.d.ts.map +1 -1
  74. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
  75. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
  76. package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
  77. package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
  78. package/types/models/processors.d.ts +3 -2
  79. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  80. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  81. package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
  82. package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
  83. package/types/models/whisper/generation_whisper.d.ts +1 -1
  84. package/types/models/whisper/generation_whisper.d.ts.map +1 -1
  85. package/types/models.d.ts +40 -17
  86. package/types/models.d.ts.map +1 -1
  87. package/types/ops/registry.d.ts +1 -0
  88. package/types/ops/registry.d.ts.map +1 -1
  89. package/types/pipelines.d.ts +7 -12
  90. package/types/pipelines.d.ts.map +1 -1
  91. package/types/tokenizers.d.ts.map +1 -1
  92. package/types/tsconfig.tsbuildinfo +1 -0
  93. package/types/utils/audio.d.ts +25 -0
  94. package/types/utils/audio.d.ts.map +1 -1
  95. package/types/utils/core.d.ts +6 -0
  96. package/types/utils/core.d.ts.map +1 -1
  97. package/types/utils/dtypes.d.ts.map +1 -1
  98. package/types/utils/hub.d.ts +1 -1
  99. package/types/utils/hub.d.ts.map +1 -1
  100. package/types/utils/image.d.ts +3 -2
  101. package/types/utils/image.d.ts.map +1 -1
  102. package/types/utils/maths.d.ts +8 -6
  103. package/types/utils/maths.d.ts.map +1 -1
  104. package/types/utils/tensor.d.ts +22 -6
  105. package/types/utils/tensor.d.ts.map +1 -1
@@ -4158,23 +4158,23 @@ class FeatureExtractor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Ca
4158
4158
  }
4159
4159
 
4160
4160
  /**
4161
- * Instantiate one of the processor classes of the library from a pretrained model.
4161
+ * Instantiate one of the feature extractor classes of the library from a pretrained model.
4162
4162
  *
4163
- * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
4164
- * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
4163
+ * The feature extractor class to instantiate is selected based on the `feature_extractor_type` property of
4164
+ * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
4165
4165
  *
4166
4166
  * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
4167
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
4167
+ * - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on huggingface.co.
4168
4168
  * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
4169
4169
  * user or organization name, like `dbmdz/bert-base-german-cased`.
4170
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
4171
- * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
4170
+ * - A path to a *directory* containing feature_extractor files, e.g., `./my_model_directory/`.
4171
+ * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the feature_extractor.
4172
4172
  *
4173
- * @returns {Promise<FeatureExtractor>} A new instance of the Processor class.
4173
+ * @returns {Promise<FeatureExtractor>} A new instance of the Feature Extractor class.
4174
4174
  */
4175
4175
  static async from_pretrained(pretrained_model_name_or_path, options) {
4176
- const preprocessorConfig = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
4177
- return new this(preprocessorConfig);
4176
+ const config = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
4177
+ return new this(config);
4178
4178
  }
4179
4179
  }
4180
4180
 
@@ -4207,6 +4207,7 @@ function validate_audio_inputs(audio, feature_extractor) {
4207
4207
  __webpack_require__.r(__webpack_exports__);
4208
4208
  /* harmony export */ __webpack_require__.d(__webpack_exports__, {
4209
4209
  /* harmony export */ ImageProcessor: () => (/* binding */ ImageProcessor),
4210
+ /* harmony export */ center_to_corners_format: () => (/* binding */ center_to_corners_format),
4210
4211
  /* harmony export */ post_process_instance_segmentation: () => (/* binding */ post_process_instance_segmentation),
4211
4212
  /* harmony export */ post_process_object_detection: () => (/* binding */ post_process_object_detection),
4212
4213
  /* harmony export */ post_process_panoptic_segmentation: () => (/* binding */ post_process_panoptic_segmentation),
@@ -4825,14 +4826,20 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
4825
4826
  this.do_thumbnail = config.do_thumbnail;
4826
4827
  this.size = config.size ?? config.image_size;
4827
4828
  this.do_resize = config.do_resize ?? (this.size !== undefined);
4829
+ // @ts-expect-error TS2339
4828
4830
  this.size_divisibility = config.size_divisibility ?? config.size_divisor;
4829
4831
 
4830
4832
  this.do_center_crop = config.do_center_crop;
4833
+ // @ts-expect-error TS2339
4831
4834
  this.crop_size = config.crop_size;
4835
+ // @ts-expect-error TS2339
4832
4836
  this.do_convert_rgb = config.do_convert_rgb ?? true;
4837
+ // @ts-expect-error TS2339
4833
4838
  this.do_crop_margin = config.do_crop_margin;
4834
4839
 
4840
+ // @ts-expect-error TS2339
4835
4841
  this.pad_size = config.pad_size;
4842
+ // @ts-expect-error TS2339
4836
4843
  this.do_pad = config.do_pad;
4837
4844
 
4838
4845
  if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
@@ -5041,6 +5048,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5041
5048
  // Support both formats for backwards compatibility
5042
5049
  else if (Number.isInteger(size)) {
5043
5050
  shortest_edge = size;
5051
+ // @ts-expect-error TS2339
5044
5052
  longest_edge = this.config.max_size ?? shortest_edge;
5045
5053
 
5046
5054
  } else if (size !== undefined) {
@@ -5109,6 +5117,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5109
5117
  } else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
5110
5118
  // Custom resize logic for Qwen2-VL models
5111
5119
  const { min_pixels, max_pixels } = size;
5120
+ // @ts-expect-error TS2339
5112
5121
  const factor = this.config.patch_size * this.config.merge_size;
5113
5122
  return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
5114
5123
  } else {
@@ -5124,6 +5133,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5124
5133
  async resize(image) {
5125
5134
  const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
5126
5135
  return await image.resize(newWidth, newHeight, {
5136
+ // @ts-expect-error TS2322
5127
5137
  resample: this.resample,
5128
5138
  });
5129
5139
  }
@@ -5174,6 +5184,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5174
5184
 
5175
5185
  // Resize the image using thumbnail method.
5176
5186
  if (this.do_thumbnail) {
5187
+ // @ts-expect-error TS2345
5177
5188
  image = await this.thumbnail(image, this.size, this.resample);
5178
5189
  }
5179
5190
 
@@ -5198,6 +5209,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5198
5209
  // NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
5199
5210
  // occurs with data in the hwc format (height, width, channels),
5200
5211
  // to emulate the behavior of the original Python code (w/ numpy).
5212
+ /** @type {Float32Array} */
5201
5213
  let pixelData = Float32Array.from(image.data);
5202
5214
  let imgDims = [image.height, image.width, image.channels];
5203
5215
 
@@ -5356,6 +5368,7 @@ __webpack_require__.r(__webpack_exports__);
5356
5368
  /**
5357
5369
  * @typedef {Object} ProcessorProperties Additional processor-specific properties.
5358
5370
  * @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
5371
+ * @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer
5359
5372
  */
5360
5373
 
5361
5374
 
@@ -5389,7 +5402,7 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5389
5402
  }
5390
5403
 
5391
5404
  /**
5392
- * @returns {import('../tokenizers.js').PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
5405
+ * @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
5393
5406
  */
5394
5407
  get tokenizer() {
5395
5408
  return this.components.tokenizer;
@@ -5402,6 +5415,11 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5402
5415
  return this.components.feature_extractor;
5403
5416
  }
5404
5417
 
5418
+ /**
5419
+ * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages
5420
+ * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options
5421
+ * @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>}
5422
+ */
5405
5423
  apply_chat_template(messages, options = {}) {
5406
5424
  if (!this.tokenizer) {
5407
5425
  throw new Error('Unable to apply chat template without a tokenizer.');
@@ -5412,6 +5430,10 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5412
5430
  });
5413
5431
  }
5414
5432
 
5433
+ /**
5434
+ * @param {Parameters<PreTrainedTokenizer['batch_decode']>} args
5435
+ * @returns {ReturnType<PreTrainedTokenizer['batch_decode']>}
5436
+ */
5415
5437
  batch_decode(...args) {
5416
5438
  if (!this.tokenizer) {
5417
5439
  throw new Error('Unable to decode without a tokenizer.');
@@ -5419,6 +5441,17 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5419
5441
  return this.tokenizer.batch_decode(...args);
5420
5442
  }
5421
5443
 
5444
+ /**
5445
+ * @param {Parameters<PreTrainedTokenizer['decode']>} args
5446
+ * @returns {ReturnType<PreTrainedTokenizer['decode']>}
5447
+ */
5448
+ decode(...args) {
5449
+ if (!this.tokenizer) {
5450
+ throw new Error('Unable to decode without a tokenizer.');
5451
+ }
5452
+ return this.tokenizer.decode(...args);
5453
+ }
5454
+
5422
5455
 
5423
5456
  /**
5424
5457
  * Calls the feature_extractor function with the given input.
@@ -5439,8 +5472,8 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5439
5472
  /**
5440
5473
  * Instantiate one of the processor classes of the library from a pretrained model.
5441
5474
  *
5442
- * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object
5443
- * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
5475
+ * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
5476
+ * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
5444
5477
  *
5445
5478
  * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
5446
5479
  * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
@@ -5560,15 +5593,19 @@ function getNormalizedConfig(config) {
5560
5593
  case 'florence2':
5561
5594
  case 'llava_onevision':
5562
5595
  case 'idefics3':
5596
+ // @ts-expect-error TS2339
5563
5597
  init_normalized_config = getNormalizedConfig(config.text_config);
5564
5598
  break;
5565
5599
  case 'moondream1':
5600
+ // @ts-expect-error TS2339
5566
5601
  init_normalized_config = getNormalizedConfig(config.phi_config);
5567
5602
  break;
5568
5603
  case 'musicgen':
5604
+ // @ts-expect-error TS2339
5569
5605
  init_normalized_config = getNormalizedConfig(config.decoder);
5570
5606
  break;
5571
5607
  case 'multi_modality':
5608
+ // @ts-expect-error TS2339
5572
5609
  init_normalized_config = getNormalizedConfig(config.language_config);
5573
5610
  break;
5574
5611
 
@@ -5689,6 +5726,7 @@ function getNormalizedConfig(config) {
5689
5726
  break;
5690
5727
 
5691
5728
  case 'vision-encoder-decoder':
5729
+ // @ts-expect-error TS2339
5692
5730
  const decoderConfig = getNormalizedConfig(config.decoder);
5693
5731
 
5694
5732
  const add_encoder_pkv = 'num_decoder_layers' in decoderConfig;
@@ -5932,7 +5970,7 @@ __webpack_require__.r(__webpack_exports__);
5932
5970
 
5933
5971
 
5934
5972
 
5935
- const VERSION = '3.2.3';
5973
+ const VERSION = '3.3.0';
5936
5974
 
5937
5975
  // Check if various APIs are available (depends on environment)
5938
5976
  const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
@@ -6068,7 +6106,6 @@ function isEmpty(obj) {
6068
6106
  }
6069
6107
 
6070
6108
 
6071
-
6072
6109
  /***/ }),
6073
6110
 
6074
6111
  /***/ "./src/generation/configuration_utils.js":
@@ -7683,6 +7720,7 @@ class TextStreamer extends BaseStreamer {
7683
7720
  * @param {import('../tokenizers.js').PreTrainedTokenizer} tokenizer
7684
7721
  * @param {Object} options
7685
7722
  * @param {boolean} [options.skip_prompt=false] Whether to skip the prompt tokens
7723
+ * @param {boolean} [options.skip_special_tokens=true] Whether to skip special tokens when decoding
7686
7724
  * @param {function(string): void} [options.callback_function=null] Function to call when a piece of text is ready to display
7687
7725
  * @param {function(bigint[]): void} [options.token_callback_function=null] Function to call when a new token is generated
7688
7726
  * @param {Object} [options.decode_kwargs={}] Additional keyword arguments to pass to the tokenizer's decode method
@@ -7691,6 +7729,7 @@ class TextStreamer extends BaseStreamer {
7691
7729
  skip_prompt = false,
7692
7730
  callback_function = null,
7693
7731
  token_callback_function = null,
7732
+ skip_special_tokens = true,
7694
7733
  decode_kwargs = {},
7695
7734
  ...kwargs
7696
7735
  } = {}) {
@@ -7699,7 +7738,7 @@ class TextStreamer extends BaseStreamer {
7699
7738
  this.skip_prompt = skip_prompt;
7700
7739
  this.callback_function = callback_function ?? stdout_write;
7701
7740
  this.token_callback_function = token_callback_function;
7702
- this.decode_kwargs = { ...decode_kwargs, ...kwargs };
7741
+ this.decode_kwargs = { skip_special_tokens, ...decode_kwargs, ...kwargs };
7703
7742
 
7704
7743
  // variables used in the streaming process
7705
7744
  this.token_cache = [];
@@ -7815,9 +7854,10 @@ class WhisperTextStreamer extends TextStreamer {
7815
7854
  } = {}) {
7816
7855
  super(tokenizer, {
7817
7856
  skip_prompt,
7857
+ skip_special_tokens,
7818
7858
  callback_function,
7819
7859
  token_callback_function,
7820
- decode_kwargs: { skip_special_tokens, ...decode_kwargs },
7860
+ decode_kwargs,
7821
7861
  });
7822
7862
  this.timestamp_begin = tokenizer.timestamp_begin;
7823
7863
 
@@ -8071,6 +8111,8 @@ __webpack_require__.r(__webpack_exports__);
8071
8111
  /* harmony export */ GraniteForCausalLM: () => (/* binding */ GraniteForCausalLM),
8072
8112
  /* harmony export */ GraniteModel: () => (/* binding */ GraniteModel),
8073
8113
  /* harmony export */ GranitePreTrainedModel: () => (/* binding */ GranitePreTrainedModel),
8114
+ /* harmony export */ GroundingDinoForObjectDetection: () => (/* binding */ GroundingDinoForObjectDetection),
8115
+ /* harmony export */ GroundingDinoPreTrainedModel: () => (/* binding */ GroundingDinoPreTrainedModel),
8074
8116
  /* harmony export */ GroupViTModel: () => (/* binding */ GroupViTModel),
8075
8117
  /* harmony export */ GroupViTPreTrainedModel: () => (/* binding */ GroupViTPreTrainedModel),
8076
8118
  /* harmony export */ HieraForImageClassification: () => (/* binding */ HieraForImageClassification),
@@ -8279,6 +8321,8 @@ __webpack_require__.r(__webpack_exports__);
8279
8321
  /* harmony export */ Starcoder2ForCausalLM: () => (/* binding */ Starcoder2ForCausalLM),
8280
8322
  /* harmony export */ Starcoder2Model: () => (/* binding */ Starcoder2Model),
8281
8323
  /* harmony export */ Starcoder2PreTrainedModel: () => (/* binding */ Starcoder2PreTrainedModel),
8324
+ /* harmony export */ StyleTextToSpeech2Model: () => (/* binding */ StyleTextToSpeech2Model),
8325
+ /* harmony export */ StyleTextToSpeech2PreTrainedModel: () => (/* binding */ StyleTextToSpeech2PreTrainedModel),
8282
8326
  /* harmony export */ Swin2SRForImageSuperResolution: () => (/* binding */ Swin2SRForImageSuperResolution),
8283
8327
  /* harmony export */ Swin2SRModel: () => (/* binding */ Swin2SRModel),
8284
8328
  /* harmony export */ Swin2SRPreTrainedModel: () => (/* binding */ Swin2SRPreTrainedModel),
@@ -8594,8 +8638,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
8594
8638
  } else if (session_options.externalData !== undefined) {
8595
8639
  externalDataPromises = session_options.externalData.map(async (ext) => {
8596
8640
  // if the external data is a string, fetch the file and replace the string with its content
8641
+ // @ts-expect-error TS2339
8597
8642
  if (typeof ext.data === "string") {
8643
+ // @ts-expect-error TS2339
8598
8644
  const ext_buffer = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_5__.getModelFile)(pretrained_model_name_or_path, ext.data, true, options);
8645
+ // @ts-expect-error TS2698
8599
8646
  return { ...ext, data: ext_buffer };
8600
8647
  }
8601
8648
  return ext;
@@ -8853,14 +8900,23 @@ async function encoderForward(self, model_inputs) {
8853
8900
  encoderFeeds.inputs_embeds = await self.encode_text({ input_ids: model_inputs.input_ids });
8854
8901
  }
8855
8902
  if (session.inputNames.includes('token_type_ids') && !encoderFeeds.token_type_ids) {
8903
+ if (!encoderFeeds.input_ids) {
8904
+ throw new Error('Both `input_ids` and `token_type_ids` are missing in the model inputs.');
8905
+ }
8856
8906
  // Assign default `token_type_ids` (all zeroes) to the `encoderFeeds` if the model expects it,
8857
8907
  // but they weren't created by the tokenizer.
8858
- encoderFeeds.token_type_ids = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.Tensor(
8859
- 'int64',
8860
- new BigInt64Array(encoderFeeds.input_ids.data.length),
8861
- encoderFeeds.input_ids.dims
8862
- )
8908
+ encoderFeeds.token_type_ids = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.zeros_like)(encoderFeeds.input_ids);
8863
8909
  }
8910
+ if (session.inputNames.includes('pixel_mask') && !encoderFeeds.pixel_mask) {
8911
+ if (!encoderFeeds.pixel_values) {
8912
+ throw new Error('Both `pixel_values` and `pixel_mask` are missing in the model inputs.');
8913
+ }
8914
+ // Assign default `pixel_mask` (all ones) to the `encoderFeeds` if the model expects it,
8915
+ // but they weren't created by the processor.
8916
+ const dims = encoderFeeds.pixel_values.dims;
8917
+ encoderFeeds.pixel_mask = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.ones)([dims[0], dims[2], dims[3]]);
8918
+ }
8919
+
8864
8920
  return await sessionRun(session, encoderFeeds);
8865
8921
  }
8866
8922
 
@@ -9843,6 +9899,7 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
9843
9899
  if (this.config.model_type === 'musicgen') {
9844
9900
  // Custom logic (TODO: move to Musicgen class)
9845
9901
  decoder_input_ids = Array.from({
9902
+ // @ts-expect-error TS2339
9846
9903
  length: batch_size * this.config.decoder.num_codebooks
9847
9904
  }, () => [decoder_start_token_id]);
9848
9905
 
@@ -10172,11 +10229,13 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
10172
10229
  async encode_image({ pixel_values }) {
10173
10230
  // image_inputs === { pixel_values }
10174
10231
  const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
10232
+ // @ts-expect-error TS2339
10175
10233
  if (!this.config.num_image_tokens) {
10176
10234
  console.warn(
10177
10235
  'The number of image tokens was not set in the model configuration. ' +
10178
10236
  `Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
10179
10237
  )
10238
+ // @ts-expect-error TS2339
10180
10239
  this.config.num_image_tokens = features.dims[1];
10181
10240
  }
10182
10241
  return features;
@@ -11604,6 +11663,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
11604
11663
 
11605
11664
  if (generation_config.return_token_timestamps) {
11606
11665
  outputs["token_timestamps"] = this._extract_token_timestamps(
11666
+ // @ts-expect-error TS2345
11607
11667
  outputs,
11608
11668
  generation_config.alignment_heads,
11609
11669
  generation_config.num_frames,
@@ -11639,6 +11699,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
11639
11699
  );
11640
11700
  }
11641
11701
 
11702
+ // @ts-expect-error TS2339
11642
11703
  let median_filter_width = this.config.median_filter_width;
11643
11704
  if (median_filter_width === undefined) {
11644
11705
  console.warn("Model config has no `median_filter_width`, using default value of 7.")
@@ -11649,6 +11710,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
11649
11710
  const batch = generate_outputs.cross_attentions;
11650
11711
  // Create a list with `decoder_layers` elements, each a tensor of shape
11651
11712
  // (batch size, attention_heads, output length, input length).
11713
+ // @ts-expect-error TS2339
11652
11714
  const cross_attentions = Array.from({ length: this.config.decoder_layers },
11653
11715
  // Concatenate the cross attentions for each layer across sequence length dimension.
11654
11716
  (_, i) => (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.cat)(batch.map(x => x[i]), 2)
@@ -11792,6 +11854,7 @@ class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
11792
11854
  attention_mask,
11793
11855
  }) {
11794
11856
 
11857
+ // @ts-expect-error TS2339
11795
11858
  const image_token_index = this.config.image_token_index;
11796
11859
 
11797
11860
  const idsList = input_ids.tolist();
@@ -12777,6 +12840,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
12777
12840
  const image_nums = vision_tokens.filter(x => x == image_token_id).length;
12778
12841
  const video_nums = vision_tokens.filter(x => x == video_token_id).length;
12779
12842
 
12843
+ /** @type {number[][]} */
12780
12844
  let llm_pos_ids_list = [];
12781
12845
  let st = 0;
12782
12846
  let remain_images = image_nums;
@@ -12846,6 +12910,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
12846
12910
  // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
12847
12911
  // meaning to perform concatenation along dim=1, we can do the following:
12848
12912
  const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
12913
+ /** @type {number[]} */
12849
12914
  const llm_positions = new Array(num_items);
12850
12915
  let index = 0;
12851
12916
  for (let x = 0; x < 3; ++x) {
@@ -12886,9 +12951,10 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
12886
12951
  { length: 3 * data.length },
12887
12952
  (_, i) => data[i % data.length]
12888
12953
  );
12954
+ /** @type {bigint[]} */
12889
12955
  const mrope_position_deltas = Array.from(
12890
12956
  { length: dims[0] },
12891
- (_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1 + dims[1]
12957
+ (_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
12892
12958
  );
12893
12959
 
12894
12960
  return [
@@ -13459,7 +13525,7 @@ class DPTModel extends DPTPreTrainedModel { }
13459
13525
  *
13460
13526
  * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
13461
13527
  * ```javascript
13462
- * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
13528
+ * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
13463
13529
  *
13464
13530
  * // Load model and processor
13465
13531
  * const model_id = 'Xenova/dpt-hybrid-midas';
@@ -13468,7 +13534,7 @@ class DPTModel extends DPTPreTrainedModel { }
13468
13534
  *
13469
13535
  * // Load image from URL
13470
13536
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
13471
- * const image = await RawImage.fromURL(url);
13537
+ * const image = await RawImage.read(url);
13472
13538
  *
13473
13539
  * // Prepare image for the model
13474
13540
  * const inputs = await processor(image);
@@ -13477,10 +13543,15 @@ class DPTModel extends DPTPreTrainedModel { }
13477
13543
  * const { predicted_depth } = await model(inputs);
13478
13544
  *
13479
13545
  * // Interpolate to original size
13480
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
13546
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
13547
+ * size: image.size.reverse(),
13548
+ * mode: 'bilinear',
13549
+ * })).squeeze(1);
13481
13550
  *
13482
13551
  * // Visualize the prediction
13483
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
13552
+ * const min = prediction.min().item();
13553
+ * const max = prediction.max().item();
13554
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
13484
13555
  * const depth = RawImage.fromTensor(formatted);
13485
13556
  * // RawImage {
13486
13557
  * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
@@ -13530,11 +13601,7 @@ class GLPNPreTrainedModel extends PreTrainedModel { }
13530
13601
  class GLPNModel extends GLPNPreTrainedModel { }
13531
13602
 
13532
13603
  /**
13533
- * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
13534
- *
13535
- * **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
13536
- * ```javascript
13537
- * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
13604
+ * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
13538
13605
  *
13539
13606
  * // Load model and processor
13540
13607
  * const model_id = 'Xenova/glpn-kitti';
@@ -13543,7 +13610,7 @@ class GLPNModel extends GLPNPreTrainedModel { }
13543
13610
  *
13544
13611
  * // Load image from URL
13545
13612
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
13546
- * const image = await RawImage.fromURL(url);
13613
+ * const image = await RawImage.read(url);
13547
13614
  *
13548
13615
  * // Prepare image for the model
13549
13616
  * const inputs = await processor(image);
@@ -13552,13 +13619,18 @@ class GLPNModel extends GLPNPreTrainedModel { }
13552
13619
  * const { predicted_depth } = await model(inputs);
13553
13620
  *
13554
13621
  * // Interpolate to original size
13555
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
13622
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
13623
+ * size: image.size.reverse(),
13624
+ * mode: 'bilinear',
13625
+ * })).squeeze(1);
13556
13626
  *
13557
13627
  * // Visualize the prediction
13558
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
13628
+ * const min = prediction.min().item();
13629
+ * const max = prediction.max().item();
13630
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
13559
13631
  * const depth = RawImage.fromTensor(formatted);
13560
13632
  * // RawImage {
13561
- * // data: Uint8Array(307200) [ 207, 169, 154, ... ],
13633
+ * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
13562
13634
  * // width: 640,
13563
13635
  * // height: 480,
13564
13636
  * // channels: 1
@@ -13733,6 +13805,8 @@ class Dinov2WithRegistersForImageClassification extends Dinov2WithRegistersPreTr
13733
13805
  }
13734
13806
  }
13735
13807
  //////////////////////////////////////////////////
13808
+ class GroundingDinoPreTrainedModel extends PreTrainedModel { }
13809
+ class GroundingDinoForObjectDetection extends GroundingDinoPreTrainedModel { }
13736
13810
 
13737
13811
  //////////////////////////////////////////////////
13738
13812
  class YolosPreTrainedModel extends PreTrainedModel { }
@@ -14431,6 +14505,9 @@ class WavLMForAudioFrameClassification extends WavLMPreTrainedModel {
14431
14505
  }
14432
14506
  }
14433
14507
 
14508
+ class StyleTextToSpeech2PreTrainedModel extends PreTrainedModel { }
14509
+ class StyleTextToSpeech2Model extends StyleTextToSpeech2PreTrainedModel { }
14510
+
14434
14511
  //////////////////////////////////////////////////
14435
14512
  // SpeechT5 models
14436
14513
  /**
@@ -14525,10 +14602,12 @@ class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
14525
14602
 
14526
14603
  const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);
14527
14604
 
14605
+ // @ts-expect-error TS2339
14528
14606
  const r = encoder_outputs.dims[1] / this.config.reduction_factor;
14529
14607
  const maxlen = Math.floor(r * maxlenratio);
14530
14608
  const minlen = Math.floor(r * minlenratio);
14531
14609
 
14610
+ // @ts-expect-error TS2339
14532
14611
  const num_mel_bins = this.config.num_mel_bins;
14533
14612
 
14534
14613
  let spectrogramParts = [];
@@ -14893,11 +14972,13 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
14893
14972
  */
14894
14973
  _apply_and_filter_by_delay_pattern_mask(outputs) {
14895
14974
  const [bs_x_codebooks, seqLength] = outputs.dims;
14975
+ // @ts-expect-error TS2339
14896
14976
  const num_codebooks = this.config.decoder.num_codebooks;
14897
14977
  const upperBound = (seqLength - num_codebooks);
14898
14978
 
14899
14979
  let newDataSize = 0;
14900
14980
  for (let i = 0; i < outputs.size; ++i) {
14981
+ // @ts-expect-error TS2339
14901
14982
  if (outputs.data[i] === this.config.decoder.pad_token_id) {
14902
14983
  continue;
14903
14984
  }
@@ -14927,7 +15008,9 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
14927
15008
  let clonedInputIds = structuredClone(input_ids);
14928
15009
  for (let i = 0; i < clonedInputIds.length; ++i) {
14929
15010
  for (let j = 0; j < clonedInputIds[i].length; ++j) {
15011
+ // @ts-expect-error TS2339
14930
15012
  if ((i % this.config.decoder.num_codebooks) >= j) {
15013
+ // @ts-expect-error TS2339
14931
15014
  clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
14932
15015
  }
14933
15016
  }
@@ -15084,6 +15167,9 @@ class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
15084
15167
  'past_key_values',
15085
15168
  ];
15086
15169
 
15170
+ /**
15171
+ * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
15172
+ */
15087
15173
  constructor(...args) {
15088
15174
  super(...args);
15089
15175
 
@@ -15385,6 +15471,8 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
15385
15471
 
15386
15472
  ['maskformer', ['MaskFormerModel', MaskFormerModel]],
15387
15473
  ['mgp-str', ['MgpstrForSceneTextRecognition', MgpstrForSceneTextRecognition]],
15474
+
15475
+ ['style_text_to_speech_2', ['StyleTextToSpeech2Model', StyleTextToSpeech2Model]],
15388
15476
  ]);
15389
15477
 
15390
15478
  const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
@@ -15629,6 +15717,7 @@ const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([
15629
15717
  const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([
15630
15718
  ['owlvit', ['OwlViTForObjectDetection', OwlViTForObjectDetection]],
15631
15719
  ['owlv2', ['Owlv2ForObjectDetection', Owlv2ForObjectDetection]],
15720
+ ['grounding-dino', ['GroundingDinoForObjectDetection', GroundingDinoForObjectDetection]],
15632
15721
  ]);
15633
15722
 
15634
15723
  const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
@@ -16052,10 +16141,17 @@ class SequenceClassifierOutput extends ModelOutput {
16052
16141
  /**
16053
16142
  * @param {Object} output The output of the model.
16054
16143
  * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
16144
+ * @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
16145
+ * Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
16055
16146
  */
16056
- constructor({ logits }) {
16147
+ constructor({ logits, ...attentions }) {
16057
16148
  super();
16058
16149
  this.logits = logits;
16150
+ const attentions_list = Object.values(attentions);
16151
+ if (attentions_list.length > 0) {
16152
+ // Only set attentions if they are not empty
16153
+ this.attentions = attentions_list;
16154
+ }
16059
16155
  }
16060
16156
  }
16061
16157
 
@@ -16313,22 +16409,6 @@ __webpack_require__.r(__webpack_exports__);
16313
16409
 
16314
16410
  class AutoFeatureExtractor {
16315
16411
 
16316
- /**
16317
- * Instantiate one of the feature extractor classes of the library from a pretrained model.
16318
- *
16319
- * The processor class to instantiate is selected based on the `feature_extractor_type` property of
16320
- * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
16321
- *
16322
- * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
16323
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
16324
- * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
16325
- * user or organization name, like `dbmdz/bert-base-german-cased`.
16326
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
16327
- * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
16328
- *
16329
- * @returns {Promise<AllFeatureExtractors.ImageProcessor>} A new instance of the Processor class.
16330
- */
16331
-
16332
16412
  /** @type {typeof FeatureExtractor.from_pretrained} */
16333
16413
  static async from_pretrained(pretrained_model_name_or_path, options={}) {
16334
16414
 
@@ -16457,22 +16537,6 @@ __webpack_require__.r(__webpack_exports__);
16457
16537
  */
16458
16538
  class AutoProcessor {
16459
16539
 
16460
- /**
16461
- * Instantiate one of the processor classes of the library from a pretrained model.
16462
- *
16463
- * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
16464
- * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
16465
- *
16466
- * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
16467
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
16468
- * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
16469
- * user or organization name, like `dbmdz/bert-base-german-cased`.
16470
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
16471
- * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
16472
- *
16473
- * @returns {Promise<Processor>} A new instance of the Processor class.
16474
- */
16475
-
16476
16540
  /** @type {typeof Processor.from_pretrained} */
16477
16541
  static async from_pretrained(pretrained_model_name_or_path, options={}) {
16478
16542
 
@@ -16796,6 +16860,7 @@ class ConvNextImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
16796
16860
  /**
16797
16861
  * Percentage of the image to crop. Only has an effect if this.size < 384.
16798
16862
  */
16863
+ // @ts-expect-error TS2339
16799
16864
  this.crop_pct = this.config.crop_pct ?? (224 / 256);
16800
16865
  }
16801
16866
 
@@ -17003,6 +17068,7 @@ __webpack_require__.r(__webpack_exports__);
17003
17068
  class EfficientNetImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor {
17004
17069
  constructor(config) {
17005
17070
  super(config);
17071
+ // @ts-expect-error TS2339
17006
17072
  this.include_top = this.config.include_top ?? true;
17007
17073
  if (this.include_top) {
17008
17074
  this.image_std = this.image_std.map(x => x * x);
@@ -17086,8 +17152,11 @@ class Florence2Processor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
17086
17152
  super(config, components);
17087
17153
 
17088
17154
  const {
17155
+ // @ts-expect-error TS2339
17089
17156
  tasks_answer_post_processing_type,
17157
+ // @ts-expect-error TS2339
17090
17158
  task_prompts_without_inputs,
17159
+ // @ts-expect-error TS2339
17091
17160
  task_prompts_with_input,
17092
17161
  } = this.image_processor.config;
17093
17162
 
@@ -17223,6 +17292,170 @@ __webpack_require__.r(__webpack_exports__);
17223
17292
  class GLPNFeatureExtractor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor { }
17224
17293
 
17225
17294
 
17295
+ /***/ }),
17296
+
17297
+ /***/ "./src/models/grounding_dino/image_processing_grounding_dino.js":
17298
+ /*!**********************************************************************!*\
17299
+ !*** ./src/models/grounding_dino/image_processing_grounding_dino.js ***!
17300
+ \**********************************************************************/
17301
+ /***/ ((__unused_webpack___webpack_module__, __webpack_exports__, __webpack_require__) => {
17302
+
17303
+ "use strict";
17304
+ __webpack_require__.r(__webpack_exports__);
17305
+ /* harmony export */ __webpack_require__.d(__webpack_exports__, {
17306
+ /* harmony export */ GroundingDinoImageProcessor: () => (/* binding */ GroundingDinoImageProcessor)
17307
+ /* harmony export */ });
17308
+ /* harmony import */ var _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../../base/image_processors_utils.js */ "./src/base/image_processors_utils.js");
17309
+ /* harmony import */ var _utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ../../utils/tensor.js */ "./src/utils/tensor.js");
17310
+
17311
+
17312
+
17313
+
17314
+
17315
+ /**
17316
+ * @typedef {object} GroundingDinoFeatureExtractorResultProps
17317
+ * @property {import('../../utils/tensor.js').Tensor} pixel_mask
17318
+ * @typedef {import('../../base/image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult
17319
+ */
17320
+
17321
class GroundingDinoImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor {
    /**
     * Runs the base image processor on `images` and adds a `pixel_mask` tensor
     * filled with ones to its result.
     *
     * The mask is built from dimensions 0, 2 and 3 of `pixel_values`
     * (presumably batch, height and width of a `(batch, channels, height, width)`
     * tensor; confirm against the base `ImageProcessor`).
     *
     * @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
     * @returns {Promise<GroundingDinoFeatureExtractorResult>} Everything returned by the base processor, plus `pixel_mask`.
     */
    async _call(images) {
        const result = await super._call(images);

        // Skip dimension 1: the mask has one entry per position of dims 2 and 3, not per dim-1 entry.
        const [dim0, , dim2, dim3] = result.pixel_values.dims;
        const pixel_mask = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.ones)([dim0, dim2, dim3]);

        return { ...result, pixel_mask };
    }
}
17337
+
17338
+
17339
+ /***/ }),
17340
+
17341
+ /***/ "./src/models/grounding_dino/processing_grounding_dino.js":
17342
+ /*!****************************************************************!*\
17343
+ !*** ./src/models/grounding_dino/processing_grounding_dino.js ***!
17344
+ \****************************************************************/
17345
+ /***/ ((__unused_webpack___webpack_module__, __webpack_exports__, __webpack_require__) => {
17346
+
17347
+ "use strict";
17348
+ __webpack_require__.r(__webpack_exports__);
17349
+ /* harmony export */ __webpack_require__.d(__webpack_exports__, {
17350
+ /* harmony export */ GroundingDinoProcessor: () => (/* binding */ GroundingDinoProcessor)
17351
+ /* harmony export */ });
17352
+ /* harmony import */ var _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../../base/processing_utils.js */ "./src/base/processing_utils.js");
17353
+ /* harmony import */ var _auto_image_processing_auto_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ../auto/image_processing_auto.js */ "./src/models/auto/image_processing_auto.js");
17354
+ /* harmony import */ var _tokenizers_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ../../tokenizers.js */ "./src/tokenizers.js");
17355
+ /* harmony import */ var _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ../../base/image_processors_utils.js */ "./src/base/image_processors_utils.js");
17356
+
17357
+
17358
+
17359
+
17360
+
17361
/**
 * Get token ids of phrases from posmaps and input_ids.
 *
 * The first position and every position from the last index of `posmaps` onwards
 * are always ignored (presumably the special tokens that wrap the text; confirm
 * against the tokenizer). For each remaining position whose posmap value is truthy,
 * the token id at the same position is collected.
 *
 * @param {import('../../utils/tensor.js').Tensor} posmaps A boolean tensor of unbatched text-thresholded logits related to the detected bounding boxes of shape `(hidden_size, )`.
 * @param {import('../../utils/tensor.js').Tensor} input_ids A tensor of token ids of shape `(sequence_length, )`.
 * @returns {Array} The selected token ids, in order, exactly as produced by `input_ids.tolist()`.
 */
function get_phrases_from_posmap(posmaps, input_ids) {
    const posmaps_list = posmaps.tolist();
    const input_ids_list = input_ids.tolist();

    // Positions 0 and >= last_idx are never selected, so only scan the interior.
    const last_idx = posmaps.dims.at(-1) - 1;

    const token_ids = [];
    for (let idx = 1; idx < last_idx; ++idx) {
        if (posmaps_list[idx]) {
            token_ids.push(input_ids_list[idx]);
        }
    }
    return token_ids;
}
17381
+
17382
class GroundingDinoProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_0__.Processor {
    static tokenizer_class = _tokenizers_js__WEBPACK_IMPORTED_MODULE_2__.AutoTokenizer;
    static image_processor_class = _auto_image_processing_auto_js__WEBPACK_IMPORTED_MODULE_1__.AutoImageProcessor;

    /**
     * @typedef {import('../../utils/image.js').RawImage} RawImage
     */
    /**
     * Runs the image processor on `images` and the tokenizer on `text`, then merges
     * both results into a single object. A falsy `images` or `text` skips the
     * corresponding step. Image-processor keys are spread last, so they take
     * precedence if both results share a key.
     *
     * @param {RawImage|RawImage[]|RawImage[][]} images
     * @param {string|string[]} text
     * @param {Object} [options] Passed unchanged to both the image processor and the tokenizer.
     * @returns {Promise<any>}
     */
    async _call(images, text, options = {}) {

        const image_inputs = images ? await this.image_processor(images, options) : {};
        const text_inputs = text ? this.tokenizer(text, options) : {};

        return {
            ...text_inputs,
            ...image_inputs,
        };
    }

    /**
     * Converts raw model outputs into per-image detections.
     *
     * For every batch entry, each query whose best (sigmoid) score is strictly greater
     * than `box_threshold` is kept. Its box is converted from center format to corner
     * format and, when `target_sizes` is given, scaled to that size. Its phrase is built
     * from the token ids whose probability is greater than `text_threshold`, then decoded
     * with `this.batch_decode`.
     *
     * @param {Object} outputs Model outputs; `logits` and `pred_boxes` are read from it.
     * @param {any} input_ids Token ids, indexed per batch entry (`input_ids[i]`).
     * @param {Object} [options]
     * @param {number} [options.box_threshold=0.25] Minimum (exclusive) score for a query to be kept.
     * @param {number} [options.text_threshold=0.25] Minimum (exclusive) token probability for a token to be part of a phrase.
     * @param {Array|null} [options.target_sizes=null] One size per batch entry, presumably `[height, width]` (confirm against callers).
     * @returns {Array<{scores: number[], boxes: number[][], labels: any}>} One result per batch entry.
     * @throws {Error} If `target_sizes` is given but its length differs from the batch size.
     */
    post_process_grounded_object_detection(outputs, input_ids, {
        box_threshold = 0.25,
        text_threshold = 0.25,
        target_sizes = null
    } = {}) {
        const { logits, pred_boxes } = outputs;
        const batch_size = logits.dims[0];

        if (target_sizes !== null && target_sizes.length !== batch_size) {
            throw new Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");
        }
        const num_queries = logits.dims.at(1);

        const probs = logits.sigmoid(); // (batch_size, num_queries, 256)
        const scores = probs.max(-1).tolist(); // (batch_size, num_queries)

        // Convert to [x0, y0, x1, y1] format
        const boxes = pred_boxes.tolist() // (batch_size, num_queries, 4)
            .map(batch => batch.map(box => (0,_base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_3__.center_to_corners_format)(box)));

        const results = [];
        for (let i = 0; i < batch_size; ++i) {
            const target_size = target_sizes !== null ? target_sizes[i] : null;

            // Convert from relative [0, 1] to absolute coordinates.
            // Even positions (x0, x1) are scaled by target_size[1], odd positions (y0, y1) by target_size[0].
            if (target_size !== null) {
                boxes[i] = boxes[i].map(box => box.map((x, j) => x * target_size[(j + 1) % 2]));
            }

            const batch_scores = scores[i];
            const batch_boxes = boxes[i];
            const final_scores = [];
            const final_phrases = [];
            const final_boxes = [];
            for (let j = 0; j < num_queries; ++j) {
                const score = batch_scores[j];
                if (score <= box_threshold) {
                    continue;
                }
                const box = batch_boxes[j];
                const prob = probs[i][j];

                final_scores.push(score);
                final_boxes.push(box);

                const phrases = get_phrases_from_posmap(prob.gt(text_threshold), input_ids[i]);
                final_phrases.push(phrases);
            }
            results.push({ scores: final_scores, boxes: final_boxes, labels: this.batch_decode(final_phrases) });
        }
        return results;
    }
}
17457
+
17458
+
17226
17459
  /***/ }),
17227
17460
 
17228
17461
  /***/ "./src/models/idefics3/image_processing_idefics3.js":
@@ -17384,6 +17617,8 @@ class Idefics3ImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
17384
17617
 
17385
17618
  const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
17386
17619
  const end_offset = (i + 1) * pixel_attention_mask_stride;
17620
+
17621
+ // @ts-expect-error
17387
17622
  pixel_attention_mask_data.fill(false, start_offset, end_offset);
17388
17623
  }
17389
17624
  }
@@ -17652,42 +17887,43 @@ __webpack_require__.r(__webpack_exports__);
17652
17887
  /* harmony export */ DonutImageProcessor: () => (/* reexport safe */ _donut_image_processing_donut_js__WEBPACK_IMPORTED_MODULE_7__.DonutImageProcessor),
17653
17888
  /* harmony export */ EfficientNetImageProcessor: () => (/* reexport safe */ _efficientnet_image_processing_efficientnet_js__WEBPACK_IMPORTED_MODULE_9__.EfficientNetImageProcessor),
17654
17889
  /* harmony export */ GLPNFeatureExtractor: () => (/* reexport safe */ _glpn_image_processing_glpn_js__WEBPACK_IMPORTED_MODULE_10__.GLPNFeatureExtractor),
17655
- /* harmony export */ Idefics3ImageProcessor: () => (/* reexport safe */ _idefics3_image_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_11__.Idefics3ImageProcessor),
17656
- /* harmony export */ JinaCLIPImageProcessor: () => (/* reexport safe */ _jina_clip_image_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_13__.JinaCLIPImageProcessor),
17657
- /* harmony export */ LlavaOnevisionImageProcessor: () => (/* reexport safe */ _llava_onevision_image_processing_llava_onevision_js__WEBPACK_IMPORTED_MODULE_14__.LlavaOnevisionImageProcessor),
17658
- /* harmony export */ Mask2FormerImageProcessor: () => (/* reexport safe */ _mask2former_image_processing_mask2former_js__WEBPACK_IMPORTED_MODULE_15__.Mask2FormerImageProcessor),
17659
- /* harmony export */ MaskFormerFeatureExtractor: () => (/* reexport safe */ _maskformer_image_processing_maskformer_js__WEBPACK_IMPORTED_MODULE_16__.MaskFormerFeatureExtractor),
17660
- /* harmony export */ MaskFormerImageProcessor: () => (/* reexport safe */ _maskformer_image_processing_maskformer_js__WEBPACK_IMPORTED_MODULE_16__.MaskFormerImageProcessor),
17661
- /* harmony export */ MobileNetV1FeatureExtractor: () => (/* reexport safe */ _mobilenet_v1_image_processing_mobilenet_v1_js__WEBPACK_IMPORTED_MODULE_17__.MobileNetV1FeatureExtractor),
17662
- /* harmony export */ MobileNetV1ImageProcessor: () => (/* reexport safe */ _mobilenet_v1_image_processing_mobilenet_v1_js__WEBPACK_IMPORTED_MODULE_17__.MobileNetV1ImageProcessor),
17663
- /* harmony export */ MobileNetV2FeatureExtractor: () => (/* reexport safe */ _mobilenet_v2_image_processing_mobilenet_v2_js__WEBPACK_IMPORTED_MODULE_18__.MobileNetV2FeatureExtractor),
17664
- /* harmony export */ MobileNetV2ImageProcessor: () => (/* reexport safe */ _mobilenet_v2_image_processing_mobilenet_v2_js__WEBPACK_IMPORTED_MODULE_18__.MobileNetV2ImageProcessor),
17665
- /* harmony export */ MobileNetV3FeatureExtractor: () => (/* reexport safe */ _mobilenet_v3_image_processing_mobilenet_v3_js__WEBPACK_IMPORTED_MODULE_19__.MobileNetV3FeatureExtractor),
17666
- /* harmony export */ MobileNetV3ImageProcessor: () => (/* reexport safe */ _mobilenet_v3_image_processing_mobilenet_v3_js__WEBPACK_IMPORTED_MODULE_19__.MobileNetV3ImageProcessor),
17667
- /* harmony export */ MobileNetV4FeatureExtractor: () => (/* reexport safe */ _mobilenet_v4_image_processing_mobilenet_v4_js__WEBPACK_IMPORTED_MODULE_20__.MobileNetV4FeatureExtractor),
17668
- /* harmony export */ MobileNetV4ImageProcessor: () => (/* reexport safe */ _mobilenet_v4_image_processing_mobilenet_v4_js__WEBPACK_IMPORTED_MODULE_20__.MobileNetV4ImageProcessor),
17669
- /* harmony export */ MobileViTFeatureExtractor: () => (/* reexport safe */ _mobilevit_image_processing_mobilevit_js__WEBPACK_IMPORTED_MODULE_21__.MobileViTFeatureExtractor),
17670
- /* harmony export */ MobileViTImageProcessor: () => (/* reexport safe */ _mobilevit_image_processing_mobilevit_js__WEBPACK_IMPORTED_MODULE_21__.MobileViTImageProcessor),
17671
- /* harmony export */ NougatImageProcessor: () => (/* reexport safe */ _nougat_image_processing_nougat_js__WEBPACK_IMPORTED_MODULE_22__.NougatImageProcessor),
17672
- /* harmony export */ OwlViTFeatureExtractor: () => (/* reexport safe */ _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_24__.OwlViTFeatureExtractor),
17673
- /* harmony export */ OwlViTImageProcessor: () => (/* reexport safe */ _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_24__.OwlViTImageProcessor),
17674
- /* harmony export */ Owlv2ImageProcessor: () => (/* reexport safe */ _owlv2_image_processing_owlv2_js__WEBPACK_IMPORTED_MODULE_23__.Owlv2ImageProcessor),
17675
- /* harmony export */ Phi3VImageProcessor: () => (/* reexport safe */ _phi3_v_image_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_25__.Phi3VImageProcessor),
17676
- /* harmony export */ PvtImageProcessor: () => (/* reexport safe */ _pvt_image_processing_pvt_js__WEBPACK_IMPORTED_MODULE_26__.PvtImageProcessor),
17677
- /* harmony export */ Qwen2VLImageProcessor: () => (/* reexport safe */ _qwen2_vl_image_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_27__.Qwen2VLImageProcessor),
17678
- /* harmony export */ RTDetrImageProcessor: () => (/* reexport safe */ _rt_detr_image_processing_rt_detr_js__WEBPACK_IMPORTED_MODULE_28__.RTDetrImageProcessor),
17679
- /* harmony export */ SamImageProcessor: () => (/* reexport safe */ _sam_image_processing_sam_js__WEBPACK_IMPORTED_MODULE_29__.SamImageProcessor),
17680
- /* harmony export */ SegformerFeatureExtractor: () => (/* reexport safe */ _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_30__.SegformerFeatureExtractor),
17681
- /* harmony export */ SegformerImageProcessor: () => (/* reexport safe */ _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_30__.SegformerImageProcessor),
17682
- /* harmony export */ SiglipImageProcessor: () => (/* reexport safe */ _siglip_image_processing_siglip_js__WEBPACK_IMPORTED_MODULE_31__.SiglipImageProcessor),
17683
- /* harmony export */ Swin2SRImageProcessor: () => (/* reexport safe */ _swin2sr_image_processing_swin2sr_js__WEBPACK_IMPORTED_MODULE_32__.Swin2SRImageProcessor),
17684
- /* harmony export */ VLMImageProcessor: () => (/* reexport safe */ _janus_image_processing_janus_js__WEBPACK_IMPORTED_MODULE_12__.VLMImageProcessor),
17685
- /* harmony export */ ViTFeatureExtractor: () => (/* reexport safe */ _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_33__.ViTFeatureExtractor),
17686
- /* harmony export */ ViTImageProcessor: () => (/* reexport safe */ _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_33__.ViTImageProcessor),
17687
- /* harmony export */ VitMatteImageProcessor: () => (/* reexport safe */ _vitmatte_image_processing_vitmatte_js__WEBPACK_IMPORTED_MODULE_34__.VitMatteImageProcessor),
17688
- /* harmony export */ VitPoseImageProcessor: () => (/* reexport safe */ _vitpose_image_processing_vitpose_js__WEBPACK_IMPORTED_MODULE_35__.VitPoseImageProcessor),
17689
- /* harmony export */ YolosFeatureExtractor: () => (/* reexport safe */ _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_36__.YolosFeatureExtractor),
17690
- /* harmony export */ YolosImageProcessor: () => (/* reexport safe */ _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_36__.YolosImageProcessor)
17890
+ /* harmony export */ GroundingDinoImageProcessor: () => (/* reexport safe */ _grounding_dino_image_processing_grounding_dino_js__WEBPACK_IMPORTED_MODULE_11__.GroundingDinoImageProcessor),
17891
+ /* harmony export */ Idefics3ImageProcessor: () => (/* reexport safe */ _idefics3_image_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_12__.Idefics3ImageProcessor),
17892
+ /* harmony export */ JinaCLIPImageProcessor: () => (/* reexport safe */ _jina_clip_image_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_14__.JinaCLIPImageProcessor),
17893
+ /* harmony export */ LlavaOnevisionImageProcessor: () => (/* reexport safe */ _llava_onevision_image_processing_llava_onevision_js__WEBPACK_IMPORTED_MODULE_15__.LlavaOnevisionImageProcessor),
17894
+ /* harmony export */ Mask2FormerImageProcessor: () => (/* reexport safe */ _mask2former_image_processing_mask2former_js__WEBPACK_IMPORTED_MODULE_16__.Mask2FormerImageProcessor),
17895
+ /* harmony export */ MaskFormerFeatureExtractor: () => (/* reexport safe */ _maskformer_image_processing_maskformer_js__WEBPACK_IMPORTED_MODULE_17__.MaskFormerFeatureExtractor),
17896
+ /* harmony export */ MaskFormerImageProcessor: () => (/* reexport safe */ _maskformer_image_processing_maskformer_js__WEBPACK_IMPORTED_MODULE_17__.MaskFormerImageProcessor),
17897
+ /* harmony export */ MobileNetV1FeatureExtractor: () => (/* reexport safe */ _mobilenet_v1_image_processing_mobilenet_v1_js__WEBPACK_IMPORTED_MODULE_18__.MobileNetV1FeatureExtractor),
17898
+ /* harmony export */ MobileNetV1ImageProcessor: () => (/* reexport safe */ _mobilenet_v1_image_processing_mobilenet_v1_js__WEBPACK_IMPORTED_MODULE_18__.MobileNetV1ImageProcessor),
17899
+ /* harmony export */ MobileNetV2FeatureExtractor: () => (/* reexport safe */ _mobilenet_v2_image_processing_mobilenet_v2_js__WEBPACK_IMPORTED_MODULE_19__.MobileNetV2FeatureExtractor),
17900
+ /* harmony export */ MobileNetV2ImageProcessor: () => (/* reexport safe */ _mobilenet_v2_image_processing_mobilenet_v2_js__WEBPACK_IMPORTED_MODULE_19__.MobileNetV2ImageProcessor),
17901
+ /* harmony export */ MobileNetV3FeatureExtractor: () => (/* reexport safe */ _mobilenet_v3_image_processing_mobilenet_v3_js__WEBPACK_IMPORTED_MODULE_20__.MobileNetV3FeatureExtractor),
17902
+ /* harmony export */ MobileNetV3ImageProcessor: () => (/* reexport safe */ _mobilenet_v3_image_processing_mobilenet_v3_js__WEBPACK_IMPORTED_MODULE_20__.MobileNetV3ImageProcessor),
17903
+ /* harmony export */ MobileNetV4FeatureExtractor: () => (/* reexport safe */ _mobilenet_v4_image_processing_mobilenet_v4_js__WEBPACK_IMPORTED_MODULE_21__.MobileNetV4FeatureExtractor),
17904
+ /* harmony export */ MobileNetV4ImageProcessor: () => (/* reexport safe */ _mobilenet_v4_image_processing_mobilenet_v4_js__WEBPACK_IMPORTED_MODULE_21__.MobileNetV4ImageProcessor),
17905
+ /* harmony export */ MobileViTFeatureExtractor: () => (/* reexport safe */ _mobilevit_image_processing_mobilevit_js__WEBPACK_IMPORTED_MODULE_22__.MobileViTFeatureExtractor),
17906
+ /* harmony export */ MobileViTImageProcessor: () => (/* reexport safe */ _mobilevit_image_processing_mobilevit_js__WEBPACK_IMPORTED_MODULE_22__.MobileViTImageProcessor),
17907
+ /* harmony export */ NougatImageProcessor: () => (/* reexport safe */ _nougat_image_processing_nougat_js__WEBPACK_IMPORTED_MODULE_23__.NougatImageProcessor),
17908
+ /* harmony export */ OwlViTFeatureExtractor: () => (/* reexport safe */ _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_25__.OwlViTFeatureExtractor),
17909
+ /* harmony export */ OwlViTImageProcessor: () => (/* reexport safe */ _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_25__.OwlViTImageProcessor),
17910
+ /* harmony export */ Owlv2ImageProcessor: () => (/* reexport safe */ _owlv2_image_processing_owlv2_js__WEBPACK_IMPORTED_MODULE_24__.Owlv2ImageProcessor),
17911
+ /* harmony export */ Phi3VImageProcessor: () => (/* reexport safe */ _phi3_v_image_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_26__.Phi3VImageProcessor),
17912
+ /* harmony export */ PvtImageProcessor: () => (/* reexport safe */ _pvt_image_processing_pvt_js__WEBPACK_IMPORTED_MODULE_27__.PvtImageProcessor),
17913
+ /* harmony export */ Qwen2VLImageProcessor: () => (/* reexport safe */ _qwen2_vl_image_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_28__.Qwen2VLImageProcessor),
17914
+ /* harmony export */ RTDetrImageProcessor: () => (/* reexport safe */ _rt_detr_image_processing_rt_detr_js__WEBPACK_IMPORTED_MODULE_29__.RTDetrImageProcessor),
17915
+ /* harmony export */ SamImageProcessor: () => (/* reexport safe */ _sam_image_processing_sam_js__WEBPACK_IMPORTED_MODULE_30__.SamImageProcessor),
17916
+ /* harmony export */ SegformerFeatureExtractor: () => (/* reexport safe */ _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_31__.SegformerFeatureExtractor),
17917
+ /* harmony export */ SegformerImageProcessor: () => (/* reexport safe */ _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_31__.SegformerImageProcessor),
17918
+ /* harmony export */ SiglipImageProcessor: () => (/* reexport safe */ _siglip_image_processing_siglip_js__WEBPACK_IMPORTED_MODULE_32__.SiglipImageProcessor),
17919
+ /* harmony export */ Swin2SRImageProcessor: () => (/* reexport safe */ _swin2sr_image_processing_swin2sr_js__WEBPACK_IMPORTED_MODULE_33__.Swin2SRImageProcessor),
17920
+ /* harmony export */ VLMImageProcessor: () => (/* reexport safe */ _janus_image_processing_janus_js__WEBPACK_IMPORTED_MODULE_13__.VLMImageProcessor),
17921
+ /* harmony export */ ViTFeatureExtractor: () => (/* reexport safe */ _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_34__.ViTFeatureExtractor),
17922
+ /* harmony export */ ViTImageProcessor: () => (/* reexport safe */ _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_34__.ViTImageProcessor),
17923
+ /* harmony export */ VitMatteImageProcessor: () => (/* reexport safe */ _vitmatte_image_processing_vitmatte_js__WEBPACK_IMPORTED_MODULE_35__.VitMatteImageProcessor),
17924
+ /* harmony export */ VitPoseImageProcessor: () => (/* reexport safe */ _vitpose_image_processing_vitpose_js__WEBPACK_IMPORTED_MODULE_36__.VitPoseImageProcessor),
17925
+ /* harmony export */ YolosFeatureExtractor: () => (/* reexport safe */ _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_37__.YolosFeatureExtractor),
17926
+ /* harmony export */ YolosImageProcessor: () => (/* reexport safe */ _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_37__.YolosImageProcessor)
17691
17927
  /* harmony export */ });
17692
17928
  /* harmony import */ var _beit_image_processing_beit_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ./beit/image_processing_beit.js */ "./src/models/beit/image_processing_beit.js");
17693
17929
  /* harmony import */ var _bit_image_processing_bit_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./bit/image_processing_bit.js */ "./src/models/bit/image_processing_bit.js");
@@ -17700,32 +17936,34 @@ __webpack_require__.r(__webpack_exports__);
17700
17936
  /* harmony import */ var _dpt_image_processing_dpt_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./dpt/image_processing_dpt.js */ "./src/models/dpt/image_processing_dpt.js");
17701
17937
  /* harmony import */ var _efficientnet_image_processing_efficientnet_js__WEBPACK_IMPORTED_MODULE_9__ = __webpack_require__(/*! ./efficientnet/image_processing_efficientnet.js */ "./src/models/efficientnet/image_processing_efficientnet.js");
17702
17938
  /* harmony import */ var _glpn_image_processing_glpn_js__WEBPACK_IMPORTED_MODULE_10__ = __webpack_require__(/*! ./glpn/image_processing_glpn.js */ "./src/models/glpn/image_processing_glpn.js");
17703
- /* harmony import */ var _idefics3_image_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_11__ = __webpack_require__(/*! ./idefics3/image_processing_idefics3.js */ "./src/models/idefics3/image_processing_idefics3.js");
17704
- /* harmony import */ var _janus_image_processing_janus_js__WEBPACK_IMPORTED_MODULE_12__ = __webpack_require__(/*! ./janus/image_processing_janus.js */ "./src/models/janus/image_processing_janus.js");
17705
- /* harmony import */ var _jina_clip_image_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_13__ = __webpack_require__(/*! ./jina_clip/image_processing_jina_clip.js */ "./src/models/jina_clip/image_processing_jina_clip.js");
17706
- /* harmony import */ var _llava_onevision_image_processing_llava_onevision_js__WEBPACK_IMPORTED_MODULE_14__ = __webpack_require__(/*! ./llava_onevision/image_processing_llava_onevision.js */ "./src/models/llava_onevision/image_processing_llava_onevision.js");
17707
- /* harmony import */ var _mask2former_image_processing_mask2former_js__WEBPACK_IMPORTED_MODULE_15__ = __webpack_require__(/*! ./mask2former/image_processing_mask2former.js */ "./src/models/mask2former/image_processing_mask2former.js");
17708
- /* harmony import */ var _maskformer_image_processing_maskformer_js__WEBPACK_IMPORTED_MODULE_16__ = __webpack_require__(/*! ./maskformer/image_processing_maskformer.js */ "./src/models/maskformer/image_processing_maskformer.js");
17709
- /* harmony import */ var _mobilenet_v1_image_processing_mobilenet_v1_js__WEBPACK_IMPORTED_MODULE_17__ = __webpack_require__(/*! ./mobilenet_v1/image_processing_mobilenet_v1.js */ "./src/models/mobilenet_v1/image_processing_mobilenet_v1.js");
17710
- /* harmony import */ var _mobilenet_v2_image_processing_mobilenet_v2_js__WEBPACK_IMPORTED_MODULE_18__ = __webpack_require__(/*! ./mobilenet_v2/image_processing_mobilenet_v2.js */ "./src/models/mobilenet_v2/image_processing_mobilenet_v2.js");
17711
- /* harmony import */ var _mobilenet_v3_image_processing_mobilenet_v3_js__WEBPACK_IMPORTED_MODULE_19__ = __webpack_require__(/*! ./mobilenet_v3/image_processing_mobilenet_v3.js */ "./src/models/mobilenet_v3/image_processing_mobilenet_v3.js");
17712
- /* harmony import */ var _mobilenet_v4_image_processing_mobilenet_v4_js__WEBPACK_IMPORTED_MODULE_20__ = __webpack_require__(/*! ./mobilenet_v4/image_processing_mobilenet_v4.js */ "./src/models/mobilenet_v4/image_processing_mobilenet_v4.js");
17713
- /* harmony import */ var _mobilevit_image_processing_mobilevit_js__WEBPACK_IMPORTED_MODULE_21__ = __webpack_require__(/*! ./mobilevit/image_processing_mobilevit.js */ "./src/models/mobilevit/image_processing_mobilevit.js");
17714
- /* harmony import */ var _nougat_image_processing_nougat_js__WEBPACK_IMPORTED_MODULE_22__ = __webpack_require__(/*! ./nougat/image_processing_nougat.js */ "./src/models/nougat/image_processing_nougat.js");
17715
- /* harmony import */ var _owlv2_image_processing_owlv2_js__WEBPACK_IMPORTED_MODULE_23__ = __webpack_require__(/*! ./owlv2/image_processing_owlv2.js */ "./src/models/owlv2/image_processing_owlv2.js");
17716
- /* harmony import */ var _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_24__ = __webpack_require__(/*! ./owlvit/image_processing_owlvit.js */ "./src/models/owlvit/image_processing_owlvit.js");
17717
- /* harmony import */ var _phi3_v_image_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_25__ = __webpack_require__(/*! ./phi3_v/image_processing_phi3_v.js */ "./src/models/phi3_v/image_processing_phi3_v.js");
17718
- /* harmony import */ var _pvt_image_processing_pvt_js__WEBPACK_IMPORTED_MODULE_26__ = __webpack_require__(/*! ./pvt/image_processing_pvt.js */ "./src/models/pvt/image_processing_pvt.js");
17719
- /* harmony import */ var _qwen2_vl_image_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_27__ = __webpack_require__(/*! ./qwen2_vl/image_processing_qwen2_vl.js */ "./src/models/qwen2_vl/image_processing_qwen2_vl.js");
17720
- /* harmony import */ var _rt_detr_image_processing_rt_detr_js__WEBPACK_IMPORTED_MODULE_28__ = __webpack_require__(/*! ./rt_detr/image_processing_rt_detr.js */ "./src/models/rt_detr/image_processing_rt_detr.js");
17721
- /* harmony import */ var _sam_image_processing_sam_js__WEBPACK_IMPORTED_MODULE_29__ = __webpack_require__(/*! ./sam/image_processing_sam.js */ "./src/models/sam/image_processing_sam.js");
17722
- /* harmony import */ var _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_30__ = __webpack_require__(/*! ./segformer/image_processing_segformer.js */ "./src/models/segformer/image_processing_segformer.js");
17723
- /* harmony import */ var _siglip_image_processing_siglip_js__WEBPACK_IMPORTED_MODULE_31__ = __webpack_require__(/*! ./siglip/image_processing_siglip.js */ "./src/models/siglip/image_processing_siglip.js");
17724
- /* harmony import */ var _swin2sr_image_processing_swin2sr_js__WEBPACK_IMPORTED_MODULE_32__ = __webpack_require__(/*! ./swin2sr/image_processing_swin2sr.js */ "./src/models/swin2sr/image_processing_swin2sr.js");
17725
- /* harmony import */ var _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_33__ = __webpack_require__(/*! ./vit/image_processing_vit.js */ "./src/models/vit/image_processing_vit.js");
17726
- /* harmony import */ var _vitmatte_image_processing_vitmatte_js__WEBPACK_IMPORTED_MODULE_34__ = __webpack_require__(/*! ./vitmatte/image_processing_vitmatte.js */ "./src/models/vitmatte/image_processing_vitmatte.js");
17727
- /* harmony import */ var _vitpose_image_processing_vitpose_js__WEBPACK_IMPORTED_MODULE_35__ = __webpack_require__(/*! ./vitpose/image_processing_vitpose.js */ "./src/models/vitpose/image_processing_vitpose.js");
17728
- /* harmony import */ var _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_36__ = __webpack_require__(/*! ./yolos/image_processing_yolos.js */ "./src/models/yolos/image_processing_yolos.js");
17939
+ /* harmony import */ var _grounding_dino_image_processing_grounding_dino_js__WEBPACK_IMPORTED_MODULE_11__ = __webpack_require__(/*! ./grounding_dino/image_processing_grounding_dino.js */ "./src/models/grounding_dino/image_processing_grounding_dino.js");
17940
+ /* harmony import */ var _idefics3_image_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_12__ = __webpack_require__(/*! ./idefics3/image_processing_idefics3.js */ "./src/models/idefics3/image_processing_idefics3.js");
17941
+ /* harmony import */ var _janus_image_processing_janus_js__WEBPACK_IMPORTED_MODULE_13__ = __webpack_require__(/*! ./janus/image_processing_janus.js */ "./src/models/janus/image_processing_janus.js");
17942
+ /* harmony import */ var _jina_clip_image_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_14__ = __webpack_require__(/*! ./jina_clip/image_processing_jina_clip.js */ "./src/models/jina_clip/image_processing_jina_clip.js");
17943
+ /* harmony import */ var _llava_onevision_image_processing_llava_onevision_js__WEBPACK_IMPORTED_MODULE_15__ = __webpack_require__(/*! ./llava_onevision/image_processing_llava_onevision.js */ "./src/models/llava_onevision/image_processing_llava_onevision.js");
17944
+ /* harmony import */ var _mask2former_image_processing_mask2former_js__WEBPACK_IMPORTED_MODULE_16__ = __webpack_require__(/*! ./mask2former/image_processing_mask2former.js */ "./src/models/mask2former/image_processing_mask2former.js");
17945
+ /* harmony import */ var _maskformer_image_processing_maskformer_js__WEBPACK_IMPORTED_MODULE_17__ = __webpack_require__(/*! ./maskformer/image_processing_maskformer.js */ "./src/models/maskformer/image_processing_maskformer.js");
17946
+ /* harmony import */ var _mobilenet_v1_image_processing_mobilenet_v1_js__WEBPACK_IMPORTED_MODULE_18__ = __webpack_require__(/*! ./mobilenet_v1/image_processing_mobilenet_v1.js */ "./src/models/mobilenet_v1/image_processing_mobilenet_v1.js");
17947
+ /* harmony import */ var _mobilenet_v2_image_processing_mobilenet_v2_js__WEBPACK_IMPORTED_MODULE_19__ = __webpack_require__(/*! ./mobilenet_v2/image_processing_mobilenet_v2.js */ "./src/models/mobilenet_v2/image_processing_mobilenet_v2.js");
17948
+ /* harmony import */ var _mobilenet_v3_image_processing_mobilenet_v3_js__WEBPACK_IMPORTED_MODULE_20__ = __webpack_require__(/*! ./mobilenet_v3/image_processing_mobilenet_v3.js */ "./src/models/mobilenet_v3/image_processing_mobilenet_v3.js");
17949
+ /* harmony import */ var _mobilenet_v4_image_processing_mobilenet_v4_js__WEBPACK_IMPORTED_MODULE_21__ = __webpack_require__(/*! ./mobilenet_v4/image_processing_mobilenet_v4.js */ "./src/models/mobilenet_v4/image_processing_mobilenet_v4.js");
17950
+ /* harmony import */ var _mobilevit_image_processing_mobilevit_js__WEBPACK_IMPORTED_MODULE_22__ = __webpack_require__(/*! ./mobilevit/image_processing_mobilevit.js */ "./src/models/mobilevit/image_processing_mobilevit.js");
17951
+ /* harmony import */ var _nougat_image_processing_nougat_js__WEBPACK_IMPORTED_MODULE_23__ = __webpack_require__(/*! ./nougat/image_processing_nougat.js */ "./src/models/nougat/image_processing_nougat.js");
17952
+ /* harmony import */ var _owlv2_image_processing_owlv2_js__WEBPACK_IMPORTED_MODULE_24__ = __webpack_require__(/*! ./owlv2/image_processing_owlv2.js */ "./src/models/owlv2/image_processing_owlv2.js");
17953
+ /* harmony import */ var _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_25__ = __webpack_require__(/*! ./owlvit/image_processing_owlvit.js */ "./src/models/owlvit/image_processing_owlvit.js");
17954
+ /* harmony import */ var _phi3_v_image_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_26__ = __webpack_require__(/*! ./phi3_v/image_processing_phi3_v.js */ "./src/models/phi3_v/image_processing_phi3_v.js");
17955
+ /* harmony import */ var _pvt_image_processing_pvt_js__WEBPACK_IMPORTED_MODULE_27__ = __webpack_require__(/*! ./pvt/image_processing_pvt.js */ "./src/models/pvt/image_processing_pvt.js");
17956
+ /* harmony import */ var _qwen2_vl_image_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_28__ = __webpack_require__(/*! ./qwen2_vl/image_processing_qwen2_vl.js */ "./src/models/qwen2_vl/image_processing_qwen2_vl.js");
17957
+ /* harmony import */ var _rt_detr_image_processing_rt_detr_js__WEBPACK_IMPORTED_MODULE_29__ = __webpack_require__(/*! ./rt_detr/image_processing_rt_detr.js */ "./src/models/rt_detr/image_processing_rt_detr.js");
17958
+ /* harmony import */ var _sam_image_processing_sam_js__WEBPACK_IMPORTED_MODULE_30__ = __webpack_require__(/*! ./sam/image_processing_sam.js */ "./src/models/sam/image_processing_sam.js");
17959
+ /* harmony import */ var _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_31__ = __webpack_require__(/*! ./segformer/image_processing_segformer.js */ "./src/models/segformer/image_processing_segformer.js");
17960
+ /* harmony import */ var _siglip_image_processing_siglip_js__WEBPACK_IMPORTED_MODULE_32__ = __webpack_require__(/*! ./siglip/image_processing_siglip.js */ "./src/models/siglip/image_processing_siglip.js");
17961
+ /* harmony import */ var _swin2sr_image_processing_swin2sr_js__WEBPACK_IMPORTED_MODULE_33__ = __webpack_require__(/*! ./swin2sr/image_processing_swin2sr.js */ "./src/models/swin2sr/image_processing_swin2sr.js");
17962
+ /* harmony import */ var _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_34__ = __webpack_require__(/*! ./vit/image_processing_vit.js */ "./src/models/vit/image_processing_vit.js");
17963
+ /* harmony import */ var _vitmatte_image_processing_vitmatte_js__WEBPACK_IMPORTED_MODULE_35__ = __webpack_require__(/*! ./vitmatte/image_processing_vitmatte.js */ "./src/models/vitmatte/image_processing_vitmatte.js");
17964
+ /* harmony import */ var _vitpose_image_processing_vitpose_js__WEBPACK_IMPORTED_MODULE_36__ = __webpack_require__(/*! ./vitpose/image_processing_vitpose.js */ "./src/models/vitpose/image_processing_vitpose.js");
17965
+ /* harmony import */ var _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_37__ = __webpack_require__(/*! ./yolos/image_processing_yolos.js */ "./src/models/yolos/image_processing_yolos.js");
17966
+
17729
17967
 
17730
17968
 
17731
17969
 
@@ -17793,6 +18031,7 @@ class VLMImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTE
17793
18031
  },
17794
18032
  ...config,
17795
18033
  });
18034
+ // @ts-expect-error TS2339
17796
18035
  this.constant_values = this.config.background_color.map(x => x * this.rescale_factor)
17797
18036
  }
17798
18037
 
@@ -18241,6 +18480,8 @@ class MgpstrProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE
18241
18480
  * - bpe_preds: The list of BPE decoded sentences.
18242
18481
  * - wp_preds: The list of wp decoded sentences.
18243
18482
  */
18483
+ // @ts-expect-error The type of this method is not compatible with the one
18484
+ // in the base class. It might be a good idea to fix this.
18244
18485
  batch_decode([char_logits, bpe_logits, wp_logits]) {
18245
18486
  const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
18246
18487
  const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
@@ -18634,6 +18875,7 @@ class PaliGemmaProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
18634
18875
  }
18635
18876
 
18636
18877
  const bos_token = this.tokenizer.bos_token;
18878
+ // @ts-expect-error TS2339
18637
18879
  const image_seq_length = this.image_processor.config.image_seq_length;
18638
18880
  let input_strings;
18639
18881
  if (text.some((t) => t.includes(IMAGE_TOKEN))) {
@@ -18886,7 +19128,7 @@ class Phi3VProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_
18886
19128
  *
18887
19129
  * @param {string|string[]} text
18888
19130
  * @param {RawImage|RawImage[]} images
18889
- * @param {...any} args
19131
+ * @param { { padding?: boolean, truncation?: boolean, num_crops?: number } | undefined } options
18890
19132
  * @returns {Promise<any>}
18891
19133
  */
18892
19134
  async _call(text, images = null, {
@@ -18937,36 +19179,39 @@ class Phi3VProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_
18937
19179
  __webpack_require__.r(__webpack_exports__);
18938
19180
  /* harmony export */ __webpack_require__.d(__webpack_exports__, {
18939
19181
  /* harmony export */ Florence2Processor: () => (/* reexport safe */ _florence2_processing_florence2_js__WEBPACK_IMPORTED_MODULE_0__.Florence2Processor),
18940
- /* harmony export */ Idefics3Processor: () => (/* reexport safe */ _idefics3_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_3__.Idefics3Processor),
18941
- /* harmony export */ JinaCLIPProcessor: () => (/* reexport safe */ _jina_clip_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_5__.JinaCLIPProcessor),
18942
- /* harmony export */ MgpstrProcessor: () => (/* reexport safe */ _mgp_str_processing_mgp_str_js__WEBPACK_IMPORTED_MODULE_1__.MgpstrProcessor),
18943
- /* harmony export */ MoonshineProcessor: () => (/* reexport safe */ _moonshine_processing_moonshine_js__WEBPACK_IMPORTED_MODULE_2__.MoonshineProcessor),
18944
- /* harmony export */ OwlViTProcessor: () => (/* reexport safe */ _owlvit_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_6__.OwlViTProcessor),
18945
- /* harmony export */ PaliGemmaProcessor: () => (/* reexport safe */ _paligemma_processing_paligemma_js__WEBPACK_IMPORTED_MODULE_8__.PaliGemmaProcessor),
18946
- /* harmony export */ Phi3VProcessor: () => (/* reexport safe */ _phi3_v_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_7__.Phi3VProcessor),
18947
- /* harmony export */ PyAnnoteProcessor: () => (/* reexport safe */ _pyannote_processing_pyannote_js__WEBPACK_IMPORTED_MODULE_9__.PyAnnoteProcessor),
18948
- /* harmony export */ Qwen2VLProcessor: () => (/* reexport safe */ _qwen2_vl_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_10__.Qwen2VLProcessor),
18949
- /* harmony export */ SamProcessor: () => (/* reexport safe */ _sam_processing_sam_js__WEBPACK_IMPORTED_MODULE_11__.SamProcessor),
18950
- /* harmony export */ SpeechT5Processor: () => (/* reexport safe */ _speecht5_processing_speecht5_js__WEBPACK_IMPORTED_MODULE_12__.SpeechT5Processor),
18951
- /* harmony export */ VLChatProcessor: () => (/* reexport safe */ _janus_processing_janus_js__WEBPACK_IMPORTED_MODULE_4__.VLChatProcessor),
18952
- /* harmony export */ Wav2Vec2ProcessorWithLM: () => (/* reexport safe */ _wav2vec2_processing_wav2vec2_js__WEBPACK_IMPORTED_MODULE_13__.Wav2Vec2ProcessorWithLM),
18953
- /* harmony export */ WhisperProcessor: () => (/* reexport safe */ _whisper_processing_whisper_js__WEBPACK_IMPORTED_MODULE_14__.WhisperProcessor)
19182
+ /* harmony export */ GroundingDinoProcessor: () => (/* reexport safe */ _grounding_dino_processing_grounding_dino_js__WEBPACK_IMPORTED_MODULE_1__.GroundingDinoProcessor),
19183
+ /* harmony export */ Idefics3Processor: () => (/* reexport safe */ _idefics3_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_2__.Idefics3Processor),
19184
+ /* harmony export */ JinaCLIPProcessor: () => (/* reexport safe */ _jina_clip_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_4__.JinaCLIPProcessor),
19185
+ /* harmony export */ MgpstrProcessor: () => (/* reexport safe */ _mgp_str_processing_mgp_str_js__WEBPACK_IMPORTED_MODULE_5__.MgpstrProcessor),
19186
+ /* harmony export */ MoonshineProcessor: () => (/* reexport safe */ _moonshine_processing_moonshine_js__WEBPACK_IMPORTED_MODULE_6__.MoonshineProcessor),
19187
+ /* harmony export */ OwlViTProcessor: () => (/* reexport safe */ _owlvit_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_7__.OwlViTProcessor),
19188
+ /* harmony export */ PaliGemmaProcessor: () => (/* reexport safe */ _paligemma_processing_paligemma_js__WEBPACK_IMPORTED_MODULE_9__.PaliGemmaProcessor),
19189
+ /* harmony export */ Phi3VProcessor: () => (/* reexport safe */ _phi3_v_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_8__.Phi3VProcessor),
19190
+ /* harmony export */ PyAnnoteProcessor: () => (/* reexport safe */ _pyannote_processing_pyannote_js__WEBPACK_IMPORTED_MODULE_10__.PyAnnoteProcessor),
19191
+ /* harmony export */ Qwen2VLProcessor: () => (/* reexport safe */ _qwen2_vl_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_11__.Qwen2VLProcessor),
19192
+ /* harmony export */ SamProcessor: () => (/* reexport safe */ _sam_processing_sam_js__WEBPACK_IMPORTED_MODULE_12__.SamProcessor),
19193
+ /* harmony export */ SpeechT5Processor: () => (/* reexport safe */ _speecht5_processing_speecht5_js__WEBPACK_IMPORTED_MODULE_13__.SpeechT5Processor),
19194
+ /* harmony export */ VLChatProcessor: () => (/* reexport safe */ _janus_processing_janus_js__WEBPACK_IMPORTED_MODULE_3__.VLChatProcessor),
19195
+ /* harmony export */ Wav2Vec2ProcessorWithLM: () => (/* reexport safe */ _wav2vec2_processing_wav2vec2_js__WEBPACK_IMPORTED_MODULE_14__.Wav2Vec2ProcessorWithLM),
19196
+ /* harmony export */ WhisperProcessor: () => (/* reexport safe */ _whisper_processing_whisper_js__WEBPACK_IMPORTED_MODULE_15__.WhisperProcessor)
18954
19197
  /* harmony export */ });
18955
19198
  /* harmony import */ var _florence2_processing_florence2_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ./florence2/processing_florence2.js */ "./src/models/florence2/processing_florence2.js");
18956
- /* harmony import */ var _mgp_str_processing_mgp_str_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./mgp_str/processing_mgp_str.js */ "./src/models/mgp_str/processing_mgp_str.js");
18957
- /* harmony import */ var _moonshine_processing_moonshine_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ./moonshine/processing_moonshine.js */ "./src/models/moonshine/processing_moonshine.js");
18958
- /* harmony import */ var _idefics3_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ./idefics3/processing_idefics3.js */ "./src/models/idefics3/processing_idefics3.js");
18959
- /* harmony import */ var _janus_processing_janus_js__WEBPACK_IMPORTED_MODULE_4__ = __webpack_require__(/*! ./janus/processing_janus.js */ "./src/models/janus/processing_janus.js");
18960
- /* harmony import */ var _jina_clip_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./jina_clip/processing_jina_clip.js */ "./src/models/jina_clip/processing_jina_clip.js");
18961
- /* harmony import */ var _owlvit_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! ./owlvit/processing_owlvit.js */ "./src/models/owlvit/processing_owlvit.js");
18962
- /* harmony import */ var _phi3_v_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./phi3_v/processing_phi3_v.js */ "./src/models/phi3_v/processing_phi3_v.js");
18963
- /* harmony import */ var _paligemma_processing_paligemma_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./paligemma/processing_paligemma.js */ "./src/models/paligemma/processing_paligemma.js");
18964
- /* harmony import */ var _pyannote_processing_pyannote_js__WEBPACK_IMPORTED_MODULE_9__ = __webpack_require__(/*! ./pyannote/processing_pyannote.js */ "./src/models/pyannote/processing_pyannote.js");
18965
- /* harmony import */ var _qwen2_vl_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_10__ = __webpack_require__(/*! ./qwen2_vl/processing_qwen2_vl.js */ "./src/models/qwen2_vl/processing_qwen2_vl.js");
18966
- /* harmony import */ var _sam_processing_sam_js__WEBPACK_IMPORTED_MODULE_11__ = __webpack_require__(/*! ./sam/processing_sam.js */ "./src/models/sam/processing_sam.js");
18967
- /* harmony import */ var _speecht5_processing_speecht5_js__WEBPACK_IMPORTED_MODULE_12__ = __webpack_require__(/*! ./speecht5/processing_speecht5.js */ "./src/models/speecht5/processing_speecht5.js");
18968
- /* harmony import */ var _wav2vec2_processing_wav2vec2_js__WEBPACK_IMPORTED_MODULE_13__ = __webpack_require__(/*! ./wav2vec2/processing_wav2vec2.js */ "./src/models/wav2vec2/processing_wav2vec2.js");
18969
- /* harmony import */ var _whisper_processing_whisper_js__WEBPACK_IMPORTED_MODULE_14__ = __webpack_require__(/*! ./whisper/processing_whisper.js */ "./src/models/whisper/processing_whisper.js");
19199
+ /* harmony import */ var _grounding_dino_processing_grounding_dino_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./grounding_dino/processing_grounding_dino.js */ "./src/models/grounding_dino/processing_grounding_dino.js");
19200
+ /* harmony import */ var _idefics3_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ./idefics3/processing_idefics3.js */ "./src/models/idefics3/processing_idefics3.js");
19201
+ /* harmony import */ var _janus_processing_janus_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ./janus/processing_janus.js */ "./src/models/janus/processing_janus.js");
19202
+ /* harmony import */ var _jina_clip_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_4__ = __webpack_require__(/*! ./jina_clip/processing_jina_clip.js */ "./src/models/jina_clip/processing_jina_clip.js");
19203
+ /* harmony import */ var _mgp_str_processing_mgp_str_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./mgp_str/processing_mgp_str.js */ "./src/models/mgp_str/processing_mgp_str.js");
19204
+ /* harmony import */ var _moonshine_processing_moonshine_js__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! ./moonshine/processing_moonshine.js */ "./src/models/moonshine/processing_moonshine.js");
19205
+ /* harmony import */ var _owlvit_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./owlvit/processing_owlvit.js */ "./src/models/owlvit/processing_owlvit.js");
19206
+ /* harmony import */ var _phi3_v_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./phi3_v/processing_phi3_v.js */ "./src/models/phi3_v/processing_phi3_v.js");
19207
+ /* harmony import */ var _paligemma_processing_paligemma_js__WEBPACK_IMPORTED_MODULE_9__ = __webpack_require__(/*! ./paligemma/processing_paligemma.js */ "./src/models/paligemma/processing_paligemma.js");
19208
+ /* harmony import */ var _pyannote_processing_pyannote_js__WEBPACK_IMPORTED_MODULE_10__ = __webpack_require__(/*! ./pyannote/processing_pyannote.js */ "./src/models/pyannote/processing_pyannote.js");
19209
+ /* harmony import */ var _qwen2_vl_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_11__ = __webpack_require__(/*! ./qwen2_vl/processing_qwen2_vl.js */ "./src/models/qwen2_vl/processing_qwen2_vl.js");
19210
+ /* harmony import */ var _sam_processing_sam_js__WEBPACK_IMPORTED_MODULE_12__ = __webpack_require__(/*! ./sam/processing_sam.js */ "./src/models/sam/processing_sam.js");
19211
+ /* harmony import */ var _speecht5_processing_speecht5_js__WEBPACK_IMPORTED_MODULE_13__ = __webpack_require__(/*! ./speecht5/processing_speecht5.js */ "./src/models/speecht5/processing_speecht5.js");
19212
+ /* harmony import */ var _wav2vec2_processing_wav2vec2_js__WEBPACK_IMPORTED_MODULE_14__ = __webpack_require__(/*! ./wav2vec2/processing_wav2vec2.js */ "./src/models/wav2vec2/processing_wav2vec2.js");
19213
+ /* harmony import */ var _whisper_processing_whisper_js__WEBPACK_IMPORTED_MODULE_15__ = __webpack_require__(/*! ./whisper/processing_whisper.js */ "./src/models/whisper/processing_whisper.js");
19214
+
18970
19215
 
18971
19216
 
18972
19217
 
@@ -19073,6 +19318,7 @@ class PyAnnoteFeatureExtractor extends _base_feature_extraction_utils_js__WEBPAC
19073
19318
 
19074
19319
  let current_speaker = -1;
19075
19320
  for (let i = 0; i < scores.length; ++i) {
19321
+ /** @type {number[]} */
19076
19322
  const probabilities = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.softmax)(scores[i]);
19077
19323
  const [score, id] = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.max)(probabilities);
19078
19324
  const [start, end] = [i, i + 1];
@@ -19260,6 +19506,7 @@ class Qwen2VLProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODUL
19260
19506
  }
19261
19507
 
19262
19508
  if (image_grid_thw) {
19509
+ // @ts-expect-error TS2551
19263
19510
  let merge_length = this.image_processor.config.merge_size ** 2;
19264
19511
  let index = 0;
19265
19512
 
@@ -19751,8 +19998,8 @@ class SeamlessM4TFeatureExtractor extends _base_feature_extraction_utils_js__WEB
19751
19998
  'int64',
19752
19999
  new BigInt64Array(numPaddedFrames),
19753
20000
  [1, numPaddedFrames],
19754
- )
19755
- padded_attention_mask.data.fill(1n, 0, num_frames);
20001
+ );
20002
+ /** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
19756
20003
  }
19757
20004
  }
19758
20005
  }
@@ -20565,7 +20812,7 @@ class WhisperFeatureExtractor extends _base_feature_extraction_utils_js__WEBPACK
20565
20812
  )
20566
20813
 
20567
20814
  const data = features.data;
20568
- const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(data)[0];
20815
+ const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(/** @type {Float32Array} */(data))[0];
20569
20816
 
20570
20817
  for (let i = 0; i < data.length; ++i) {
20571
20818
  data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
@@ -20828,6 +21075,16 @@ class TensorOpRegistry {
20828
21075
  // executionProviders: ['webgpu'],
20829
21076
  };
20830
21077
 
21078
+ static get nearest_interpolate_4d() {
21079
+ if (!this._nearest_interpolate_4d) {
21080
+ this._nearest_interpolate_4d = wrap(
21081
+ [8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
21082
+ this.session_options,
21083
+ 'y',
21084
+ );
21085
+ }
21086
+ return this._nearest_interpolate_4d;
21087
+ }
20831
21088
  static get bilinear_interpolate_4d() {
20832
21089
  if (!this._bilinear_interpolate_4d) {
20833
21090
  this._bilinear_interpolate_4d = wrap(
@@ -21202,6 +21459,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
21202
21459
 
21203
21460
  // TODO: Use softmax tensor function
21204
21461
  const function_to_apply =
21462
+ // @ts-expect-error TS2339
21205
21463
  this.model.config.problem_type === 'multi_label_classification'
21206
21464
  ? batch => batch.sigmoid()
21207
21465
  : batch => new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor(
@@ -21210,6 +21468,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
21210
21468
  batch.dims,
21211
21469
  ); // single_label_classification (default)
21212
21470
 
21471
+ // @ts-expect-error TS2339
21213
21472
  const id2label = this.model.config.id2label;
21214
21473
 
21215
21474
  const toReturn = [];
@@ -21312,6 +21571,7 @@ class TokenClassificationPipeline extends (/** @type {new (options: TextPipeline
21312
21571
  const outputs = await this.model(model_inputs)
21313
21572
 
21314
21573
  const logits = outputs.logits;
21574
+ // @ts-expect-error TS2339
21315
21575
  const id2label = this.model.config.id2label;
21316
21576
 
21317
21577
  const toReturn = [];
@@ -21651,11 +21911,14 @@ class Text2TextGenerationPipeline extends (/** @type {new (options: TextPipeline
21651
21911
 
21652
21912
 
21653
21913
  // Add global prefix, if present
21914
+ // @ts-expect-error TS2339
21654
21915
  if (this.model.config.prefix) {
21916
+ // @ts-expect-error TS2339
21655
21917
  texts = texts.map(x => this.model.config.prefix + x)
21656
21918
  }
21657
21919
 
21658
21920
  // Handle task specific params:
21921
+ // @ts-expect-error TS2339
21659
21922
  const task_specific_params = this.model.config.task_specific_params
21660
21923
  if (task_specific_params && task_specific_params[this.task]) {
21661
21924
  // Add prefixes, if present
@@ -22394,6 +22657,7 @@ class AudioClassificationPipeline extends (/** @type {new (options: AudioPipelin
22394
22657
  const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
22395
22658
  const preparedAudios = await prepareAudios(audio, sampling_rate);
22396
22659
 
22660
+ // @ts-expect-error TS2339
22397
22661
  const id2label = this.model.config.id2label;
22398
22662
 
22399
22663
  const toReturn = [];
@@ -22704,6 +22968,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
22704
22968
  audio = [/** @type {AudioInput} */ (audio)];
22705
22969
  }
22706
22970
 
22971
+ // @ts-expect-error TS2339
22707
22972
  const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
22708
22973
  const hop_length = this.processor.feature_extractor.config.hop_length;
22709
22974
 
@@ -22769,7 +23034,9 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
22769
23034
 
22770
23035
  // TODO: Right now we only get top beam
22771
23036
  if (return_timestamps === 'word') {
23037
+ // @ts-expect-error TS2339
22772
23038
  chunk.tokens = data.sequences.tolist()[0];
23039
+ // @ts-expect-error TS2339
22773
23040
  chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
22774
23041
  (/** @type {number} */ x) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.round)(x, 2)
22775
23042
  );
@@ -22814,7 +23081,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
22814
23081
  const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
22815
23082
  const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
22816
23083
 
22817
- const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
23084
+ const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
22818
23085
  toReturn.push({ text });
22819
23086
  }
22820
23087
  return single ? toReturn[0] : toReturn;
@@ -22963,6 +23230,7 @@ class ImageClassificationPipeline extends (/** @type {new (options: ImagePipelin
22963
23230
  const { pixel_values } = await this.processor(preparedImages);
22964
23231
  const output = await this.model({ pixel_values });
22965
23232
 
23233
+ // @ts-expect-error TS2339
22966
23234
  const id2label = this.model.config.id2label;
22967
23235
 
22968
23236
  /** @type {ImageClassificationOutput[]} */
@@ -23077,6 +23345,7 @@ class ImageSegmentationPipeline extends (/** @type {new (options: ImagePipelineC
23077
23345
  }
23078
23346
  }
23079
23347
 
23348
+ // @ts-expect-error TS2339
23080
23349
  const id2label = this.model.config.id2label;
23081
23350
 
23082
23351
  /** @type {ImageSegmentationPipelineOutput[]} */
@@ -23303,6 +23572,7 @@ class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipelineCon
23303
23572
  const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);
23304
23573
 
23305
23574
  // Add labels
23575
+ // @ts-expect-error TS2339
23306
23576
  const id2label = this.model.config.id2label;
23307
23577
 
23308
23578
  // Format output
@@ -23447,13 +23717,35 @@ class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: TextImag
23447
23717
  // Run model with both text and pixel inputs
23448
23718
  const output = await this.model({ ...text_inputs, pixel_values });
23449
23719
 
23450
- // @ts-ignore
23451
- const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSize, true)[0];
23452
- let result = processed.boxes.map((box, i) => ({
23453
- score: processed.scores[i],
23454
- label: candidate_labels[processed.classes[i]],
23455
- box: get_bounding_box(box, !percentage),
23456
- })).sort((a, b) => b.score - a.score);
23720
+ let result;
23721
+ if('post_process_grounded_object_detection' in this.processor) {
23722
+ // @ts-ignore
23723
+ const processed = this.processor.post_process_grounded_object_detection(
23724
+ output,
23725
+ text_inputs.input_ids,
23726
+ {
23727
+ // TODO: support separate threshold values
23728
+ box_threshold: threshold,
23729
+ text_threshold: threshold,
23730
+ target_sizes: imageSize,
23731
+ },
23732
+ )[0];
23733
+ result = processed.boxes.map((box, i) => ({
23734
+ score: processed.scores[i],
23735
+ label: processed.labels[i],
23736
+ box: get_bounding_box(box, !percentage),
23737
+ }))
23738
+ } else {
23739
+ // @ts-ignore
23740
+ const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSize, true)[0];
23741
+ result = processed.boxes.map((box, i) => ({
23742
+ score: processed.scores[i],
23743
+ label: candidate_labels[processed.classes[i]],
23744
+ box: get_bounding_box(box, !percentage),
23745
+ }))
23746
+ }
23747
+ result.sort((a, b) => b.score - a.score);
23748
+
23457
23749
  if (top_k !== null) {
23458
23750
  result = result.slice(0, top_k);
23459
23751
  }
@@ -23522,6 +23814,7 @@ class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextIm
23522
23814
  // Run model
23523
23815
  const output = await this.model.generate({
23524
23816
  inputs: pixel_values,
23817
+ // @ts-expect-error TS2339
23525
23818
  max_length: this.model.config.decoder.max_position_embeddings,
23526
23819
  decoder_input_ids,
23527
23820
  ...generate_kwargs,
@@ -23572,7 +23865,7 @@ class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextIm
23572
23865
  * const synthesizer = await pipeline('text-to-speech', 'Xenova/speecht5_tts', { quantized: false });
23573
23866
  * const speaker_embeddings = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin';
23574
23867
  * const out = await synthesizer('Hello, my dog is cute', { speaker_embeddings });
23575
- * // {
23868
+ * // RawAudio {
23576
23869
  * // audio: Float32Array(26112) [-0.00005657337896991521, 0.00020583874720614403, ...],
23577
23870
  * // sampling_rate: 16000
23578
23871
  * // }
@@ -23592,7 +23885,7 @@ class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextIm
23592
23885
  * ```javascript
23593
23886
  * const synthesizer = await pipeline('text-to-speech', 'Xenova/mms-tts-fra');
23594
23887
  * const out = await synthesizer('Bonjour');
23595
- * // {
23888
+ * // RawAudio {
23596
23889
  * // audio: Float32Array(23808) [-0.00037693005288019776, 0.0003325853613205254, ...],
23597
23890
  * // sampling_rate: 16000
23598
23891
  * // }
@@ -23637,11 +23930,12 @@ class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineC
23637
23930
  // Generate waveform
23638
23931
  const { waveform } = await this.model(inputs);
23639
23932
 
23933
+ // @ts-expect-error TS2339
23640
23934
  const sampling_rate = this.model.config.sampling_rate;
23641
- return {
23642
- audio: waveform.data,
23935
+ return new _utils_audio_js__WEBPACK_IMPORTED_MODULE_7__.RawAudio(
23936
+ waveform.data,
23643
23937
  sampling_rate,
23644
- }
23938
+ )
23645
23939
  }
23646
23940
 
23647
23941
  async _call_text_to_spectrogram(text_inputs, { speaker_embeddings }) {
@@ -23681,10 +23975,10 @@ class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineC
23681
23975
  const { waveform } = await this.model.generate_speech(input_ids, speaker_embeddings, { vocoder: this.vocoder });
23682
23976
 
23683
23977
  const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
23684
- return {
23685
- audio: waveform.data,
23978
+ return new _utils_audio_js__WEBPACK_IMPORTED_MODULE_7__.RawAudio(
23979
+ waveform.data,
23686
23980
  sampling_rate,
23687
- }
23981
+ )
23688
23982
  }
23689
23983
  }
23690
23984
 
@@ -23794,11 +24088,23 @@ class DepthEstimationPipeline extends (/** @type {new (options: ImagePipelineCon
23794
24088
 
23795
24089
  const toReturn = [];
23796
24090
  for (let i = 0; i < preparedImages.length; ++i) {
23797
- const prediction = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate)(predicted_depth[i], preparedImages[i].size.reverse(), 'bilinear', false);
23798
- const formatted = prediction.mul_(255 / (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.max)(prediction.data)[0]).to('uint8');
24091
+ const batch = predicted_depth[i];
24092
+ const [height, width] = batch.dims.slice(-2);
24093
+ const [new_width, new_height] = preparedImages[i].size;
24094
+
24095
+ // Interpolate to original size
24096
+ const prediction = (await (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate_4d)(batch.view(1, 1, height, width), {
24097
+ size: [new_height, new_width],
24098
+ mode: 'bilinear',
24099
+ })).view(new_height, new_width);
24100
+
24101
+ const minval = /** @type {number} */(prediction.min().item());
24102
+ const maxval = /** @type {number} */(prediction.max().item());
24103
+ const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
24104
+ const depth = _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted);
23799
24105
  toReturn.push({
23800
- predicted_depth: predicted_depth[i],
23801
- depth: _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted),
24106
+ predicted_depth: prediction,
24107
+ depth,
23802
24108
  });
23803
24109
  }
23804
24110
 
@@ -24278,6 +24584,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
24278
24584
  return result;
24279
24585
  }
24280
24586
 
24587
+
24281
24588
  /***/ }),
24282
24589
 
24283
24590
  /***/ "./src/tokenizers.js":
@@ -24347,7 +24654,6 @@ __webpack_require__.r(__webpack_exports__);
24347
24654
  /* harmony import */ var _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./utils/data-structures.js */ "./src/utils/data-structures.js");
24348
24655
  /* harmony import */ var _huggingface_jinja__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! @huggingface/jinja */ "./node_modules/@huggingface/jinja/dist/index.js");
24349
24656
  /* harmony import */ var _models_whisper_common_whisper_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./models/whisper/common_whisper.js */ "./src/models/whisper/common_whisper.js");
24350
- /* harmony import */ var _utils_constants_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./utils/constants.js */ "./src/utils/constants.js");
24351
24657
 
24352
24658
  /**
24353
24659
  * @file Tokenizers are used to prepare textual inputs for a model.
@@ -24384,7 +24690,6 @@ __webpack_require__.r(__webpack_exports__);
24384
24690
 
24385
24691
 
24386
24692
 
24387
-
24388
24693
  /**
24389
24694
  * @typedef {Object} TokenizerProperties Additional tokenizer-specific properties.
24390
24695
  * @property {boolean} [legacy=false] Whether or not the `legacy` behavior of the tokenizer should be used.
@@ -24868,7 +25173,7 @@ class Unigram extends TokenizerModel {
24868
25173
  * Create a new Unigram tokenizer model.
24869
25174
  * @param {Object} config The configuration object for the Unigram model.
24870
25175
  * @param {number} config.unk_id The ID of the unknown token
24871
- * @param {any[][]} config.vocab A 2D array representing a mapping of tokens to scores.
25176
+ * @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
24872
25177
  * @param {Object} moreConfig Additional configuration object for the Unigram model.
24873
25178
  */
24874
25179
  constructor(config, moreConfig) {
@@ -24876,11 +25181,10 @@ class Unigram extends TokenizerModel {
24876
25181
 
24877
25182
  const vocabSize = config.vocab.length;
24878
25183
  this.vocab = new Array(vocabSize);
25184
+ /** @type {number[]} */
24879
25185
  this.scores = new Array(vocabSize);
24880
25186
  for (let i = 0; i < vocabSize; ++i) {
24881
- const piece = config.vocab[i];
24882
- this.vocab[i] = piece[0];
24883
- this.scores[i] = piece[1];
25187
+ [this.vocab[i], this.scores[i]] = config.vocab[i];
24884
25188
  }
24885
25189
 
24886
25190
  this.unk_token_id = config.unk_id;
@@ -28720,6 +29024,7 @@ class AutoTokenizer {
28720
29024
  "use strict";
28721
29025
  __webpack_require__.r(__webpack_exports__);
28722
29026
  /* harmony export */ __webpack_require__.d(__webpack_exports__, {
29027
+ /* harmony export */ RawAudio: () => (/* binding */ RawAudio),
28723
29028
  /* harmony export */ hamming: () => (/* binding */ hamming),
28724
29029
  /* harmony export */ hanning: () => (/* binding */ hanning),
28725
29030
  /* harmony export */ mel_filter_bank: () => (/* binding */ mel_filter_bank),
@@ -28730,7 +29035,9 @@ __webpack_require__.r(__webpack_exports__);
28730
29035
  /* harmony import */ var _hub_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ./hub.js */ "./src/utils/hub.js");
28731
29036
  /* harmony import */ var _maths_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./maths.js */ "./src/utils/maths.js");
28732
29037
  /* harmony import */ var _core_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ./core.js */ "./src/utils/core.js");
28733
- /* harmony import */ var _tensor_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ./tensor.js */ "./src/utils/tensor.js");
29038
+ /* harmony import */ var _env_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ../env.js */ "./src/env.js");
29039
+ /* harmony import */ var fs__WEBPACK_IMPORTED_MODULE_4__ = __webpack_require__(/*! fs */ "fs");
29040
+ /* harmony import */ var _tensor_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./tensor.js */ "./src/utils/tensor.js");
28734
29041
  /**
28735
29042
  * @file Helper module for audio processing.
28736
29043
  *
@@ -28746,6 +29053,8 @@ __webpack_require__.r(__webpack_exports__);
28746
29053
 
28747
29054
 
28748
29055
 
29056
+
29057
+
28749
29058
  /**
28750
29059
  * Helper function to read audio from a path/URL.
28751
29060
  * @param {string|URL} url The path/URL to load the audio from.
@@ -29339,10 +29648,10 @@ async function spectrogram(
29339
29648
  // - mel_filters.shape=(80, 201)
29340
29649
  // - magnitudes.shape=(3000, 201) => magnitudes.T.shape=(201, 3000)
29341
29650
  // - mel_spec.shape=(80, 3000)
29342
- let mel_spec = await (0,_tensor_js__WEBPACK_IMPORTED_MODULE_3__.matmul)(
29651
+ let mel_spec = await (0,_tensor_js__WEBPACK_IMPORTED_MODULE_5__.matmul)(
29343
29652
  // TODO: Make `mel_filters` a Tensor during initialization
29344
- new _tensor_js__WEBPACK_IMPORTED_MODULE_3__.Tensor('float32', mel_filters.flat(), [num_mel_filters, num_frequency_bins]),
29345
- new _tensor_js__WEBPACK_IMPORTED_MODULE_3__.Tensor('float32', transposedMagnitudeData, [num_frequency_bins, d1Max]),
29653
+ new _tensor_js__WEBPACK_IMPORTED_MODULE_5__.Tensor('float32', mel_filters.flat(), [num_mel_filters, num_frequency_bins]),
29654
+ new _tensor_js__WEBPACK_IMPORTED_MODULE_5__.Tensor('float32', transposedMagnitudeData, [num_frequency_bins, d1Max]),
29346
29655
  );
29347
29656
  if (transpose) {
29348
29657
  mel_spec = mel_spec.transpose(1, 0);
@@ -29432,6 +29741,116 @@ function window_function(window_length, name, {
29432
29741
  return window;
29433
29742
  }
29434
29743
 
29744
/**
 * Encode single-channel audio samples as a 32-bit IEEE-float WAV file.
 * WAV file specs : https://en.wikipedia.org/wiki/WAV#WAV_File_header
 *
 * Adapted from https://www.npmjs.com/package/audiobuffer-to-wav
 * @param {Float32Array} samples The audio samples.
 * @param {number} rate The sample rate.
 * @returns {ArrayBuffer} The WAV audio buffer (44-byte header followed by the little-endian sample data).
 */
function encodeWAV(samples, rate) {
    const HEADER_SIZE = 44;         // RIFF header + "fmt " chunk + "data" chunk header
    const FORMAT_IEEE_FLOAT = 3;    // WAVE format tag for floating-point samples (1 would be integer PCM)
    const NUM_CHANNELS = 1;
    const BYTES_PER_SAMPLE = 4;     // each sample is written with setFloat32

    const blockAlign = NUM_CHANNELS * BYTES_PER_SAMPLE;
    const dataSize = samples.length * blockAlign;

    const buffer = new ArrayBuffer(HEADER_SIZE + dataSize);
    const view = new DataView(buffer);

    /* RIFF identifier */
    writeString(view, 0, "RIFF");
    /* RIFF chunk length: everything after the first 8 bytes */
    view.setUint32(4, HEADER_SIZE - 8 + dataSize, true);
    /* RIFF type */
    writeString(view, 8, "WAVE");
    /* format chunk identifier */
    writeString(view, 12, "fmt ");
    /* format chunk length */
    view.setUint32(16, 16, true);
    /* sample format (IEEE float) */
    view.setUint16(20, FORMAT_IEEE_FLOAT, true);
    /* channel count */
    view.setUint16(22, NUM_CHANNELS, true);
    /* sample rate */
    view.setUint32(24, rate, true);
    /* byte rate (sample rate * block align) */
    view.setUint32(28, rate * blockAlign, true);
    /* block align (channel count * bytes per sample) */
    view.setUint16(32, blockAlign, true);
    /* bits per sample */
    view.setUint16(34, BYTES_PER_SAMPLE * 8, true);
    /* data chunk identifier */
    writeString(view, 36, "data");
    /* data chunk length */
    view.setUint32(40, dataSize, true);

    // Sample data starts right after the header.
    for (let i = 0; i < samples.length; ++i) {
        view.setFloat32(HEADER_SIZE + i * BYTES_PER_SAMPLE, samples[i], true);
    }

    return buffer;
}
29791
+
29792
/**
 * Write a string into a DataView, one byte per UTF-16 code unit.
 * Intended for short single-byte tags such as the WAV chunk identifiers.
 * @param {DataView} view The view to write into.
 * @param {number} offset Byte offset of the first character.
 * @param {string} string The text to write.
 * @throws {RangeError} If a code unit does not fit in one byte (it would otherwise be silently truncated).
 */
function writeString(view, offset, string) {
    for (let i = 0; i < string.length; ++i) {
        const code = string.charCodeAt(i);
        if (code > 0xFF) {
            throw new RangeError(`Cannot write character at index ${i} (code ${code}) as a single byte.`);
        }
        view.setUint8(offset + i, code);
    }
}
29797
+
29798
+
29799
/**
 * Container pairing raw audio samples with their sampling rate, with helpers
 * to export the audio as WAV data.
 */
class RawAudio {

    /**
     * Create a new `RawAudio` object.
     * @param {Float32Array} audio Audio data
     * @param {number} sampling_rate Sampling rate of the audio data
     */
    constructor(audio, sampling_rate) {
        this.audio = audio;
        this.sampling_rate = sampling_rate;
    }

    /**
     * Convert the audio to a wav file buffer.
     * @returns {ArrayBuffer} The WAV file.
     */
    toWav() {
        return encodeWAV(this.audio, this.sampling_rate);
    }

    /**
     * Convert the audio to a blob.
     * @returns {Blob} A blob of type `audio/wav` wrapping the WAV file.
     */
    toBlob() {
        return new Blob([this.toWav()], { type: 'audio/wav' });
    }

    /**
     * Save the audio to a wav file.
     * In a browser this goes through `saveBlob`; otherwise, when the filesystem
     * is available, the WAV bytes are written synchronously to `path`.
     * @param {string} path
     * @returns {Promise<void>}
     * @throws {Error} If called from a Web Worker or if no save mechanism is available.
     */
    async save(path) {
        const { apis } = _env_js__WEBPACK_IMPORTED_MODULE_3__;

        if (apis.IS_BROWSER_ENV) {
            if (apis.IS_WEBWORKER_ENV) {
                throw new Error('Unable to save a file from a Web Worker.');
            }
            await (0,_core_js__WEBPACK_IMPORTED_MODULE_2__.saveBlob)(path, this.toBlob());
            return;
        }

        if (apis.IS_FS_AVAILABLE) {
            // Write the WAV bytes directly: wrapping them in a Blob and reading
            // the Blob back would only produce the same bytes again.
            fs__WEBPACK_IMPORTED_MODULE_4__.writeFileSync(path, Buffer.from(this.toWav()));
            return;
        }

        throw new Error('Unable to save because filesystem is disabled in this environment.');
    }
}
29853
+
29435
29854
 
29436
29855
  /***/ }),
29437
29856
 
@@ -29487,7 +29906,8 @@ __webpack_require__.r(__webpack_exports__);
29487
29906
  /* harmony export */ pick: () => (/* binding */ pick),
29488
29907
  /* harmony export */ pop: () => (/* binding */ pop),
29489
29908
  /* harmony export */ product: () => (/* binding */ product),
29490
- /* harmony export */ reverseDictionary: () => (/* binding */ reverseDictionary)
29909
+ /* harmony export */ reverseDictionary: () => (/* binding */ reverseDictionary),
29910
+ /* harmony export */ saveBlob: () => (/* binding */ saveBlob)
29491
29911
  /* harmony export */ });
29492
29912
 
29493
29913
  /**
@@ -29680,6 +30100,32 @@ function calculateReflectOffset(i, w) {
29680
30100
  return Math.abs((i + w) % (2 * w) - w);
29681
30101
  }
29682
30102
 
30103
/**
 * Save blob file on the web by triggering a download through a temporary anchor element.
 * @param {string} path The filename to suggest for the download
 * @param {Blob} blob The blob to save
 */
function saveBlob(path, blob){
    // Create a temporary object URL that points at the blob's contents
    const objectURL = URL.createObjectURL(blob);

    try {
        // Create an anchor element (never attached to the document) pointing at the object URL
        const downloadLink = document.createElement('a');
        downloadLink.href = objectURL;

        // Set the download attribute to specify the desired filename for the downloaded file
        downloadLink.download = path;

        // Trigger the download
        downloadLink.click();
    } finally {
        // Revoke the Object URL to free up memory, even if triggering the download failed
        URL.revokeObjectURL(objectURL);
    }
}
30128
+
29683
30129
  /**
29684
30130
  *
29685
30131
  * @param {Object} o
@@ -30243,6 +30689,8 @@ __webpack_require__.r(__webpack_exports__);
30243
30689
  /* harmony export */ });
30244
30690
  /* harmony import */ var _env_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../env.js */ "./src/env.js");
30245
30691
  /* harmony import */ var _devices_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./devices.js */ "./src/utils/devices.js");
30692
+ /// <reference types="@webgpu/types" />
30693
+
30246
30694
 
30247
30695
 
30248
30696
 
@@ -30498,7 +30946,7 @@ class FileResponse {
30498
30946
  */
30499
30947
  async arrayBuffer() {
30500
30948
  const data = await fs__WEBPACK_IMPORTED_MODULE_0__.promises.readFile(this.filePath);
30501
- return data.buffer;
30949
+ return /** @type {ArrayBuffer} */ (data.buffer);
30502
30950
  }
30503
30951
 
30504
30952
  /**
@@ -31860,23 +32308,9 @@ class RawImage {
31860
32308
  // Convert image to Blob
31861
32309
  const blob = await this.toBlob(mime);
31862
32310
 
31863
- // Convert the canvas content to a data URL
31864
- const dataURL = URL.createObjectURL(blob);
32311
+ (0,_core_js__WEBPACK_IMPORTED_MODULE_0__.saveBlob)(path, blob)
31865
32312
 
31866
- // Create an anchor element with the data URL as the href attribute
31867
- const downloadLink = document.createElement('a');
31868
- downloadLink.href = dataURL;
31869
-
31870
- // Set the download attribute to specify the desired filename for the downloaded image
31871
- downloadLink.download = path;
31872
-
31873
- // Trigger the download
31874
- downloadLink.click();
31875
-
31876
- // Clean up: remove the anchor element from the DOM
31877
- downloadLink.remove();
31878
-
31879
- } else if (!_env_js__WEBPACK_IMPORTED_MODULE_2__.env.useFS) {
32313
+ } else if (!_env_js__WEBPACK_IMPORTED_MODULE_2__.apis.IS_FS_AVAILABLE) {
31880
32314
  throw new Error('Unable to save the image because filesystem is disabled in this environment.')
31881
32315
 
31882
32316
  } else {
@@ -31906,6 +32340,7 @@ class RawImage {
31906
32340
  const load_image = RawImage.read.bind(RawImage);
31907
32341
 
31908
32342
 
32343
+
31909
32344
  /***/ }),
31910
32345
 
31911
32346
  /***/ "./src/utils/maths.js":
@@ -32159,8 +32594,9 @@ function magnitude(arr) {
32159
32594
 
32160
32595
  /**
32161
32596
  * Returns the value and index of the minimum element in an array.
32162
- * @param {number[]|TypedArray} arr array of numbers.
32163
- * @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
32597
+ * @template {number[]|bigint[]|AnyTypedArray} T
32598
+ * @param {T} arr array of numbers.
32599
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
32164
32600
  * @throws {Error} If array is empty.
32165
32601
  */
32166
32602
  function min(arr) {
@@ -32173,14 +32609,15 @@ function min(arr) {
32173
32609
  indexOfMin = i;
32174
32610
  }
32175
32611
  }
32176
- return [min, indexOfMin];
32612
+ return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([min, indexOfMin]);
32177
32613
  }
32178
32614
 
32179
32615
 
32180
32616
  /**
32181
32617
  * Returns the value and index of the maximum element in an array.
32182
- * @param {number[]|AnyTypedArray} arr array of numbers.
32183
- * @returns {[number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
32618
+ * @template {number[]|bigint[]|AnyTypedArray} T
32619
+ * @param {T} arr array of numbers.
32620
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
32184
32621
  * @throws {Error} If array is empty.
32185
32622
  */
32186
32623
  function max(arr) {
@@ -32193,7 +32630,7 @@ function max(arr) {
32193
32630
  indexOfMax = i;
32194
32631
  }
32195
32632
  }
32196
- return [Number(max), indexOfMax];
32633
+ return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([max, indexOfMax]);
32197
32634
  }
32198
32635
 
32199
32636
  function isPowerOfTwo(number) {
@@ -33491,8 +33928,6 @@ class Tensor {
33491
33928
  return this.permute(...dims);
33492
33929
  }
33493
33930
 
33494
- // TODO add .max() and .min() methods
33495
-
33496
33931
  /**
33497
33932
  * Returns the sum of each row of the input tensor in the given dimension dim.
33498
33933
  *
@@ -33521,55 +33956,22 @@ class Tensor {
33521
33956
  }
33522
33957
 
33523
33958
  const this_data = this.data;
33959
+ const fn = (a, b) => a + (b ** p);
33524
33960
 
33525
33961
  if (dim === null) {
33526
33962
  // @ts-ignore
33527
- let val = this_data.reduce((a, b) => a + (b ** p), 0) ** (1 / p);
33963
+ const val = this_data.reduce(fn, 0) ** (1 / p);
33528
33964
  return new Tensor(this.type, [val], []);
33529
33965
  }
33530
33966
 
33531
- // Negative indexing
33532
- dim = safeIndex(dim, this.dims.length);
33533
-
33534
- // Calculate the shape of the resulting array after summation
33535
- const resultDims = this.dims.slice(); // Copy the original dimensions
33536
- resultDims[dim] = 1; // Remove the specified axis
33537
-
33538
- // Create a new array to store the accumulated values
33539
- // @ts-ignore
33540
- const result = new this_data.constructor(this_data.length / this.dims[dim]);
33541
-
33542
- // Iterate over the data array
33543
- for (let i = 0; i < this_data.length; ++i) {
33544
-
33545
- // Calculate the index in the resulting array
33546
- let resultIndex = 0;
33547
-
33548
- for (let j = this.dims.length - 1, num = i, resultMultiplier = 1; j >= 0; --j) {
33549
- const size = this.dims[j];
33550
- if (j !== dim) {
33551
- const index = num % size;
33552
- resultIndex += index * resultMultiplier;
33553
- resultMultiplier *= resultDims[j];
33554
- }
33555
- num = Math.floor(num / size);
33556
- }
33557
-
33558
- // Accumulate the value at the current index
33559
- result[resultIndex] += (this_data[i]) ** p;
33560
- }
33967
+ const [type, result, resultDims] = reduce_helper(fn, this, dim, keepdim);
33561
33968
 
33562
33969
  if (p !== 1) {
33563
33970
  for (let i = 0; i < result.length; ++i) {
33564
33971
  result[i] = result[i] ** (1 / p);
33565
33972
  }
33566
33973
  }
33567
-
33568
- if (!keepdim) {
33569
- resultDims.splice(dim, 1);
33570
- }
33571
-
33572
- return new Tensor(this.type, result, resultDims);
33974
+ return new Tensor(type, result, resultDims);
33573
33975
  }
33574
33976
 
33575
33977
  /**
@@ -33632,7 +34034,7 @@ class Tensor {
33632
34034
  * NOTE: The returned tensor shares the storage with the input tensor, so changing the contents of one will change the contents of the other.
33633
34035
  * If you would like a copy, use `tensor.clone()` before squeezing.
33634
34036
  *
33635
- * @param {number} [dim=null] If given, the input will be squeezed only in the specified dimensions.
34037
+ * @param {number|number[]} [dim=null] If given, the input will be squeezed only in the specified dimensions.
33636
34038
  * @returns {Tensor} The squeezed tensor
33637
34039
  */
33638
34040
  squeeze(dim = null) {
@@ -33742,6 +34144,34 @@ class Tensor {
33742
34144
  return this.clone().neg_();
33743
34145
  }
33744
34146
 
34147
+ /**
34148
+ * Computes input > val element-wise.
34149
+ * @param {number} val The value to compare with.
34150
+ * @returns {Tensor} A boolean tensor that is `true` where input is greater than other and `false` elsewhere.
34151
+ */
34152
+ gt(val) {
34153
+ const mask = new Uint8Array(this.data.length);
34154
+ const this_data = this.data;
34155
+ for (let i = 0; i < this_data.length; ++i) {
34156
+ mask[i] = this_data[i] > val ? 1 : 0;
34157
+ }
34158
+ return new Tensor('bool', mask, this.dims);
34159
+ }
34160
+
34161
+ /**
34162
+ * Computes input < val element-wise.
34163
+ * @param {number} val The value to compare with.
34164
+ * @returns {Tensor} A boolean tensor that is `true` where input is less than other and `false` elsewhere.
34165
+ */
34166
+ lt(val) {
34167
+ const mask = new Uint8Array(this.data.length);
34168
+ const this_data = this.data;
34169
+ for (let i = 0; i < this_data.length; ++i) {
34170
+ mask[i] = this_data[i] < val ? 1 : 0;
34171
+ }
34172
+ return new Tensor('bool', mask, this.dims);
34173
+ }
34174
+
33745
34175
  /**
33746
34176
  * In-place version of @see {@link Tensor.clamp}
33747
34177
  */
@@ -33786,6 +34216,41 @@ class Tensor {
33786
34216
  return mean(this, dim, keepdim);
33787
34217
  }
33788
34218
 
34219
+ min(dim = null, keepdim = false) {
34220
+ if (dim === null) {
34221
+ // None to reduce over all dimensions.
34222
+ const val = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[0];
34223
+ return new Tensor(this.type, [val], [/* scalar */]);
34224
+ }
34225
+ const [type, result, resultDims] = reduce_helper((a, b) => Math.min(a, b), this, dim, keepdim, Infinity);
34226
+ return new Tensor(type, result, resultDims);
34227
+ }
34228
+
34229
+ max(dim = null, keepdim = false) {
34230
+ if (dim === null) {
34231
+ // None to reduce over all dimensions.
34232
+ const val = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[0];
34233
+ return new Tensor(this.type, [val], [/* scalar */]);
34234
+ }
34235
+ const [type, result, resultDims] = reduce_helper((a, b) => Math.max(a, b), this, dim, keepdim, -Infinity);
34236
+ return new Tensor(type, result, resultDims);
34237
+ }
34238
+
34239
+ argmin(dim = null, keepdim = false) {
34240
+ if (dim !== null) {
34241
+ throw new Error("`dim !== null` not yet implemented.");
34242
+ }
34243
+ const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[1];
34244
+ return new Tensor('int64', [BigInt(index)], []);
34245
+ }
34246
+ argmax(dim = null, keepdim = false) {
34247
+ if (dim !== null) {
34248
+ throw new Error("`dim !== null` not yet implemented.");
34249
+ }
34250
+ const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[1];
34251
+ return new Tensor('int64', [BigInt(index)], []);
34252
+ }
34253
+
33789
34254
  /**
33790
34255
  * Performs Tensor dtype conversion.
33791
34256
  * @param {DataType} type The desired data type.
@@ -33919,7 +34384,7 @@ function interpolate(input, [out_height, out_width], mode = 'bilinear', align_co
33919
34384
  * @param {Tensor} input the input tensor
33920
34385
  * @param {Object} options the options for the interpolation
33921
34386
  * @param {[number, number]|[number, number, number]|[number, number, number, number]} [options.size=null] output spatial size.
33922
- * @param {"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
34387
+ * @param {"nearest"|"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
33923
34388
  * @returns {Promise<Tensor>} The interpolated tensor.
33924
34389
  */
33925
34390
  async function interpolate_4d(input, {
@@ -33949,7 +34414,9 @@ async function interpolate_4d(input, {
33949
34414
  }
33950
34415
 
33951
34416
  let op;
33952
- if (mode === 'bilinear') {
34417
+ if (mode === 'nearest') {
34418
+ op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.nearest_interpolate_4d;
34419
+ } else if (mode === 'bilinear') {
33953
34420
  op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bilinear_interpolate_4d;
33954
34421
  } else if (mode === 'bicubic') {
33955
34422
  op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bicubic_interpolate_4d;
@@ -33990,13 +34457,13 @@ async function rfft(x, a) {
33990
34457
  * Returns the k largest elements of the given input tensor.
33991
34458
  * Inspired by https://pytorch.org/docs/stable/generated/torch.topk.html
33992
34459
  * @param {Tensor} x the input tensor
33993
- * @param {number} k the k in "top-k"
34460
+ * @param {number} [k] the k in "top-k"
33994
34461
  * @returns {Promise<[Tensor, Tensor]>} the output tuple of (Tensor, LongTensor) of top-k elements and their indices.
33995
34462
  */
33996
34463
  async function topk(x, k) {
33997
34464
  const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.top_k;
33998
34465
 
33999
- if (k === null) {
34466
+ if (k == null) {
34000
34467
  k = x.dims.at(-1);
34001
34468
  } else {
34002
34469
  k = Math.min(k, x.dims.at(-1));
@@ -34025,10 +34492,10 @@ const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length])
34025
34492
  async function slice(data, starts, ends, axes, steps) {
34026
34493
  const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.slice;
34027
34494
  return await op({
34028
- x: data,
34029
- s: arrayToIndexTensor(starts),
34030
- e: arrayToIndexTensor(ends),
34031
- a: arrayToIndexTensor(axes),
34495
+ x: data,
34496
+ s: arrayToIndexTensor(starts),
34497
+ e: arrayToIndexTensor(ends),
34498
+ a: arrayToIndexTensor(axes),
34032
34499
  t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
34033
34500
  });
34034
34501
  }
@@ -34263,35 +34730,19 @@ function stack(tensors, dim = 0) {
34263
34730
 
34264
34731
 
34265
34732
  /**
34266
- * Calculates the standard deviation and mean over the dimensions specified by dim. dim can be a single dimension or `null` to reduce over all dimensions.
34267
- * @param {Tensor} input the input tenso
34268
- * @param {number|null} dim the dimension to reduce. If None, all dimensions are reduced.
34269
- * @param {number} correction difference between the sample size and sample degrees of freedom. Defaults to Bessel's correction, correction=1.
34733
+ * @param {(previousValue: any, currentValue: any, currentIndex?: number, resultIndex?: number) => any} callbackfn
34734
+ * @param {Tensor} input the input tensor.
34735
+ * @param {number|null} dim the dimension to reduce.
34270
34736
  * @param {boolean} keepdim whether the output tensor has dim retained or not.
34271
- * @returns {Tensor[]} A tuple of (std, mean) tensors.
34737
+ * @returns {[DataType, any, number[]]} The reduced tensor data.
34272
34738
  */
34273
- function std_mean(input, dim = null, correction = 1, keepdim = false) {
34274
- const inputData = /** @type {Float32Array} */(input.data);
34739
+ function reduce_helper(callbackfn, input, dim = null, keepdim = false, initialValue = null) {
34740
+ const inputData = input.data;
34275
34741
  const inputDims = input.dims;
34276
34742
 
34277
- if (dim === null) {
34278
- // None to reduce over all dimensions.
34279
- const sum = inputData.reduce((a, b) => a + b, 0);
34280
- const mean = sum / inputData.length;
34281
- const std = Math.sqrt(inputData.reduce((a, b) => a + (b - mean) ** 2, 0) / (inputData.length - correction));
34282
-
34283
- const meanTensor = new Tensor(input.type, [mean], [/* scalar */]);
34284
- const stdTensor = new Tensor(input.type, [std], [/* scalar */]);
34285
-
34286
- return [stdTensor, meanTensor];
34287
- }
34288
-
34289
34743
  // Negative indexing
34290
34744
  dim = safeIndex(dim, inputDims.length);
34291
34745
 
34292
- const meanTensor = mean(input, dim, keepdim);
34293
- const meanTensorData = meanTensor.data;
34294
-
34295
34746
  // Calculate the shape of the resulting array after summation
34296
34747
  const resultDims = inputDims.slice(); // Copy the original dimensions
34297
34748
  resultDims[dim] = 1; // Remove the specified axis
@@ -34299,6 +34750,9 @@ function std_mean(input, dim = null, correction = 1, keepdim = false) {
34299
34750
  // Create a new array to store the accumulated values
34300
34751
  // @ts-ignore
34301
34752
  const result = new inputData.constructor(inputData.length / inputDims[dim]);
34753
+ if (initialValue !== null) {
34754
+ result.fill(initialValue);
34755
+ }
34302
34756
 
34303
34757
  // Iterate over the data array
34304
34758
  for (let i = 0; i < inputData.length; ++i) {
@@ -34317,23 +34771,55 @@ function std_mean(input, dim = null, correction = 1, keepdim = false) {
34317
34771
  }
34318
34772
 
34319
34773
  // Accumulate the value at the current index
34320
- result[resultIndex] += (inputData[i] - meanTensorData[resultIndex]) ** 2;
34774
+ result[resultIndex] = callbackfn(result[resultIndex], inputData[i], i, resultIndex);
34321
34775
  }
34322
34776
 
34323
- for (let i = 0; i < result.length; ++i) {
34324
- result[i] = Math.sqrt(result[i] / (inputDims[dim] - correction));
34777
+ if (!keepdim) resultDims.splice(dim, 1);
34778
+
34779
+ return [input.type, result, resultDims];
34780
+ }
34781
+
34782
+
34783
+ /**
34784
+ * Calculates the standard deviation and mean over the dimensions specified by dim. dim can be a single dimension or `null` to reduce over all dimensions.
34785
+ * @param {Tensor} input the input tensor.
34786
+ * @param {number|null} dim the dimension to reduce. If None, all dimensions are reduced.
34787
+ * @param {number} correction difference between the sample size and sample degrees of freedom. Defaults to Bessel's correction, correction=1.
34788
+ * @param {boolean} keepdim whether the output tensor has dim retained or not.
34789
+ * @returns {Tensor[]} A tuple of (std, mean) tensors.
34790
+ */
34791
+ function std_mean(input, dim = null, correction = 1, keepdim = false) {
34792
+ const inputData = /** @type {Float32Array} */(input.data);
34793
+ const inputDims = input.dims;
34794
+
34795
+ if (dim === null) {
34796
+ // None to reduce over all dimensions.
34797
+ const sum = inputData.reduce((a, b) => a + b, 0);
34798
+ const mean = sum / inputData.length;
34799
+ const std = Math.sqrt(inputData.reduce((a, b) => a + (b - mean) ** 2, 0) / (inputData.length - correction));
34800
+
34801
+ const meanTensor = new Tensor(input.type, [mean], [/* scalar */]);
34802
+ const stdTensor = new Tensor(input.type, [std], [/* scalar */]);
34803
+
34804
+ return [stdTensor, meanTensor];
34325
34805
  }
34806
+ dim = safeIndex(dim, inputDims.length);
34807
+ const meanTensor = mean(input, dim, keepdim);
34808
+ const meanTensorData = meanTensor.data;
34326
34809
 
34327
- if (!keepdim) {
34328
- resultDims.splice(dim, 1);
34810
+ // Compute squared sum
34811
+ const [type, result, resultDims] = reduce_helper((a, b, i, j) => a + (b - meanTensorData[j]) ** 2, input, dim, keepdim);
34812
+
34813
+ // Square root of the squared sum
34814
+ for (let i = 0; i < result.length; ++i) {
34815
+ result[i] = Math.sqrt(result[i] / (inputDims[dim] - correction));
34329
34816
  }
34330
34817
 
34331
- const stdTensor = new Tensor(input.type, result, resultDims);
34818
+ const stdTensor = new Tensor(type, result, resultDims);
34332
34819
 
34333
34820
  return [stdTensor, meanTensor];
34334
34821
  }
34335
34822
 
34336
-
34337
34823
  /**
34338
34824
  * Returns the mean value of each row of the input tensor in the given dimension dim.
34339
34825
  * @param {Tensor} input the input tensor.
@@ -34342,58 +34828,27 @@ function std_mean(input, dim = null, correction = 1, keepdim = false) {
34342
34828
  * @returns {Tensor} A new tensor with means taken along the specified dimension.
34343
34829
  */
34344
34830
  function mean(input, dim = null, keepdim = false) {
34831
+ const inputDims = input.dims;
34345
34832
  const inputData = /** @type {Float32Array} */(input.data);
34346
34833
 
34347
34834
  if (dim === null) {
34348
34835
  // None to reduce over all dimensions.
34349
- // @ts-ignore
34350
34836
  const val = inputData.reduce((a, b) => a + b, 0);
34351
34837
  return new Tensor(input.type, [val / inputData.length], [/* scalar */]);
34352
34838
  }
34353
- const inputDims = input.dims;
34354
-
34355
- // Negative indexing
34356
34839
  dim = safeIndex(dim, inputDims.length);
34357
34840
 
34358
- // Calculate the shape of the resulting array after summation
34359
- const resultDims = inputDims.slice(); // Copy the original dimensions
34360
- resultDims[dim] = 1; // Remove the specified axis
34361
-
34362
- // Create a new array to store the accumulated values
34363
- // @ts-ignore
34364
- const result = new inputData.constructor(inputData.length / inputDims[dim]);
34365
-
34366
- // Iterate over the data array
34367
- for (let i = 0; i < inputData.length; ++i) {
34368
-
34369
- // Calculate the index in the resulting array
34370
- let resultIndex = 0;
34371
-
34372
- for (let j = inputDims.length - 1, num = i, resultMultiplier = 1; j >= 0; --j) {
34373
- const size = inputDims[j];
34374
- if (j !== dim) {
34375
- const index = num % size;
34376
- resultIndex += index * resultMultiplier;
34377
- resultMultiplier *= resultDims[j];
34378
- }
34379
- num = Math.floor(num / size);
34380
- }
34381
-
34382
- // Accumulate the value at the current index
34383
- result[resultIndex] += inputData[i];
34384
- }
34841
+ // Compute sum
34842
+ const [type, result, resultDims] = reduce_helper((a, b) => a + b, input, dim, keepdim);
34385
34843
 
34844
+ // Divide by number of elements in the dimension
34386
34845
  if (inputDims[dim] !== 1) {
34387
34846
  for (let i = 0; i < result.length; ++i) {
34388
- result[i] = result[i] / inputDims[dim];
34847
+ result[i] /= inputDims[dim];
34389
34848
  }
34390
34849
  }
34391
34850
 
34392
- if (!keepdim) {
34393
- resultDims.splice(dim, 1);
34394
- }
34395
-
34396
- return new Tensor(input.type, result, resultDims);
34851
+ return new Tensor(type, result, resultDims);
34397
34852
  }
34398
34853
 
34399
34854
 
@@ -34893,6 +35348,10 @@ __webpack_require__.r(__webpack_exports__);
34893
35348
  /* harmony export */ GraniteModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GraniteModel),
34894
35349
  /* harmony export */ GranitePreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GranitePreTrainedModel),
34895
35350
  /* harmony export */ Grok1Tokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.Grok1Tokenizer),
35351
+ /* harmony export */ GroundingDinoForObjectDetection: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GroundingDinoForObjectDetection),
35352
+ /* harmony export */ GroundingDinoImageProcessor: () => (/* reexport safe */ _models_image_processors_js__WEBPACK_IMPORTED_MODULE_13__.GroundingDinoImageProcessor),
35353
+ /* harmony export */ GroundingDinoPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GroundingDinoPreTrainedModel),
35354
+ /* harmony export */ GroundingDinoProcessor: () => (/* reexport safe */ _models_processors_js__WEBPACK_IMPORTED_MODULE_16__.GroundingDinoProcessor),
34896
35355
  /* harmony export */ GroupViTModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GroupViTModel),
34897
35356
  /* harmony export */ GroupViTPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.GroupViTPreTrainedModel),
34898
35357
  /* harmony export */ HerbertTokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.HerbertTokenizer),
@@ -35122,6 +35581,7 @@ __webpack_require__.r(__webpack_exports__);
35122
35581
  /* harmony export */ RTDetrModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.RTDetrModel),
35123
35582
  /* harmony export */ RTDetrObjectDetectionOutput: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.RTDetrObjectDetectionOutput),
35124
35583
  /* harmony export */ RTDetrPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.RTDetrPreTrainedModel),
35584
+ /* harmony export */ RawAudio: () => (/* reexport safe */ _utils_audio_js__WEBPACK_IMPORTED_MODULE_5__.RawAudio),
35125
35585
  /* harmony export */ RawImage: () => (/* reexport safe */ _utils_image_js__WEBPACK_IMPORTED_MODULE_6__.RawImage),
35126
35586
  /* harmony export */ RepetitionPenaltyLogitsProcessor: () => (/* reexport safe */ _generation_logits_process_js__WEBPACK_IMPORTED_MODULE_20__.RepetitionPenaltyLogitsProcessor),
35127
35587
  /* harmony export */ ResNetForImageClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ResNetForImageClassification),
@@ -35187,6 +35647,8 @@ __webpack_require__.r(__webpack_exports__);
35187
35647
  /* harmony export */ Starcoder2PreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.Starcoder2PreTrainedModel),
35188
35648
  /* harmony export */ StoppingCriteria: () => (/* reexport safe */ _generation_stopping_criteria_js__WEBPACK_IMPORTED_MODULE_19__.StoppingCriteria),
35189
35649
  /* harmony export */ StoppingCriteriaList: () => (/* reexport safe */ _generation_stopping_criteria_js__WEBPACK_IMPORTED_MODULE_19__.StoppingCriteriaList),
35650
+ /* harmony export */ StyleTextToSpeech2Model: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.StyleTextToSpeech2Model),
35651
+ /* harmony export */ StyleTextToSpeech2PreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.StyleTextToSpeech2PreTrainedModel),
35190
35652
  /* harmony export */ SummarizationPipeline: () => (/* reexport safe */ _pipelines_js__WEBPACK_IMPORTED_MODULE_1__.SummarizationPipeline),
35191
35653
  /* harmony export */ SuppressTokensAtBeginLogitsProcessor: () => (/* reexport safe */ _generation_logits_process_js__WEBPACK_IMPORTED_MODULE_20__.SuppressTokensAtBeginLogitsProcessor),
35192
35654
  /* harmony export */ Swin2SRForImageSuperResolution: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.Swin2SRForImageSuperResolution),