@huggingface/transformers 3.2.3 → 3.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/README.md +2 -2
  2. package/dist/transformers.cjs +203 -92
  3. package/dist/transformers.cjs.map +1 -1
  4. package/dist/transformers.js +203 -92
  5. package/dist/transformers.js.map +1 -1
  6. package/dist/transformers.min.cjs +1 -1
  7. package/dist/transformers.min.cjs.map +1 -1
  8. package/dist/transformers.min.js +1 -1
  9. package/dist/transformers.min.js.map +1 -1
  10. package/dist/transformers.min.mjs +1 -1
  11. package/dist/transformers.min.mjs.map +1 -1
  12. package/dist/transformers.mjs +203 -92
  13. package/dist/transformers.mjs.map +1 -1
  14. package/package.json +2 -2
  15. package/src/base/feature_extraction_utils.js +9 -9
  16. package/src/base/image_processors_utils.js +11 -0
  17. package/src/base/processing_utils.js +13 -3
  18. package/src/configs.js +5 -0
  19. package/src/env.js +1 -1
  20. package/src/models/auto/feature_extraction_auto.js +0 -16
  21. package/src/models/auto/processing_auto.js +0 -16
  22. package/src/models/convnext/image_processing_convnext.js +1 -0
  23. package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
  24. package/src/models/florence2/processing_florence2.js +3 -0
  25. package/src/models/idefics3/image_processing_idefics3.js +2 -0
  26. package/src/models/janus/image_processing_janus.js +1 -0
  27. package/src/models/mgp_str/processing_mgp_str.js +2 -0
  28. package/src/models/paligemma/processing_paligemma.js +1 -0
  29. package/src/models/phi3_v/processing_phi3_v.js +1 -1
  30. package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
  31. package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
  32. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
  33. package/src/models/whisper/feature_extraction_whisper.js +1 -1
  34. package/src/models.js +50 -15
  35. package/src/ops/registry.js +10 -0
  36. package/src/pipelines.js +34 -7
  37. package/src/tokenizers.js +4 -7
  38. package/src/utils/dtypes.js +2 -0
  39. package/src/utils/hub.js +1 -1
  40. package/src/utils/maths.js +8 -6
  41. package/src/utils/tensor.js +42 -10
  42. package/types/base/feature_extraction_utils.d.ts +7 -7
  43. package/types/base/image_processors_utils.d.ts.map +1 -1
  44. package/types/base/processing_utils.d.ts +17 -19
  45. package/types/base/processing_utils.d.ts.map +1 -1
  46. package/types/configs.d.ts.map +1 -1
  47. package/types/generation/parameters.d.ts +1 -1
  48. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
  49. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  50. package/types/models/auto/processing_auto.d.ts.map +1 -1
  51. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
  52. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
  53. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  54. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
  55. package/types/models/janus/image_processing_janus.d.ts.map +1 -1
  56. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
  57. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
  58. package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
  59. package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
  60. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  61. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  62. package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
  63. package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
  64. package/types/models/whisper/generation_whisper.d.ts +1 -1
  65. package/types/models/whisper/generation_whisper.d.ts.map +1 -1
  66. package/types/models.d.ts +32 -17
  67. package/types/models.d.ts.map +1 -1
  68. package/types/ops/registry.d.ts +1 -0
  69. package/types/ops/registry.d.ts.map +1 -1
  70. package/types/pipelines.d.ts +2 -2
  71. package/types/pipelines.d.ts.map +1 -1
  72. package/types/tokenizers.d.ts.map +1 -1
  73. package/types/tsconfig.tsbuildinfo +1 -0
  74. package/types/utils/dtypes.d.ts.map +1 -1
  75. package/types/utils/hub.d.ts +1 -1
  76. package/types/utils/hub.d.ts.map +1 -1
  77. package/types/utils/image.d.ts +3 -2
  78. package/types/utils/image.d.ts.map +1 -1
  79. package/types/utils/maths.d.ts +8 -6
  80. package/types/utils/maths.d.ts.map +1 -1
  81. package/types/utils/tensor.d.ts +8 -4
  82. package/types/utils/tensor.d.ts.map +1 -1
@@ -4132,23 +4132,23 @@ class FeatureExtractor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Ca
4132
4132
  }
4133
4133
 
4134
4134
  /**
4135
- * Instantiate one of the processor classes of the library from a pretrained model.
4135
+ * Instantiate one of the feature extractor classes of the library from a pretrained model.
4136
4136
  *
4137
- * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
4138
- * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
4137
+ * The feature extractor class to instantiate is selected based on the `feature_extractor_type` property of
4138
+ * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
4139
4139
  *
4140
4140
  * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
4141
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
4141
+ * - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on huggingface.co.
4142
4142
  * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
4143
4143
  * user or organization name, like `dbmdz/bert-base-german-cased`.
4144
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
4145
- * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
4144
+ * - A path to a *directory* containing feature_extractor files, e.g., `./my_model_directory/`.
4145
+ * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the feature_extractor.
4146
4146
  *
4147
- * @returns {Promise<FeatureExtractor>} A new instance of the Processor class.
4147
+ * @returns {Promise<FeatureExtractor>} A new instance of the Feature Extractor class.
4148
4148
  */
4149
4149
  static async from_pretrained(pretrained_model_name_or_path, options) {
4150
- const preprocessorConfig = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
4151
- return new this(preprocessorConfig);
4150
+ const config = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
4151
+ return new this(config);
4152
4152
  }
4153
4153
  }
4154
4154
 
@@ -4798,14 +4798,20 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
4798
4798
  this.do_thumbnail = config.do_thumbnail;
4799
4799
  this.size = config.size ?? config.image_size;
4800
4800
  this.do_resize = config.do_resize ?? (this.size !== undefined);
4801
+ // @ts-expect-error TS2339
4801
4802
  this.size_divisibility = config.size_divisibility ?? config.size_divisor;
4802
4803
 
4803
4804
  this.do_center_crop = config.do_center_crop;
4805
+ // @ts-expect-error TS2339
4804
4806
  this.crop_size = config.crop_size;
4807
+ // @ts-expect-error TS2339
4805
4808
  this.do_convert_rgb = config.do_convert_rgb ?? true;
4809
+ // @ts-expect-error TS2339
4806
4810
  this.do_crop_margin = config.do_crop_margin;
4807
4811
 
4812
+ // @ts-expect-error TS2339
4808
4813
  this.pad_size = config.pad_size;
4814
+ // @ts-expect-error TS2339
4809
4815
  this.do_pad = config.do_pad;
4810
4816
 
4811
4817
  if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
@@ -5014,6 +5020,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5014
5020
  // Support both formats for backwards compatibility
5015
5021
  else if (Number.isInteger(size)) {
5016
5022
  shortest_edge = size;
5023
+ // @ts-expect-error TS2339
5017
5024
  longest_edge = this.config.max_size ?? shortest_edge;
5018
5025
 
5019
5026
  } else if (size !== undefined) {
@@ -5082,6 +5089,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5082
5089
  } else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
5083
5090
  // Custom resize logic for Qwen2-VL models
5084
5091
  const { min_pixels, max_pixels } = size;
5092
+ // @ts-expect-error TS2339
5085
5093
  const factor = this.config.patch_size * this.config.merge_size;
5086
5094
  return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
5087
5095
  } else {
@@ -5097,6 +5105,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5097
5105
  async resize(image) {
5098
5106
  const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
5099
5107
  return await image.resize(newWidth, newHeight, {
5108
+ // @ts-expect-error TS2322
5100
5109
  resample: this.resample,
5101
5110
  });
5102
5111
  }
@@ -5147,6 +5156,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5147
5156
 
5148
5157
  // Resize the image using thumbnail method.
5149
5158
  if (this.do_thumbnail) {
5159
+ // @ts-expect-error TS2345
5150
5160
  image = await this.thumbnail(image, this.size, this.resample);
5151
5161
  }
5152
5162
 
@@ -5171,6 +5181,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5171
5181
  // NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
5172
5182
  // occurs with data in the hwc format (height, width, channels),
5173
5183
  // to emulate the behavior of the original Python code (w/ numpy).
5184
+ /** @type {Float32Array} */
5174
5185
  let pixelData = Float32Array.from(image.data);
5175
5186
  let imgDims = [image.height, image.width, image.channels];
5176
5187
 
@@ -5328,6 +5339,7 @@ __webpack_require__.r(__webpack_exports__);
5328
5339
  /**
5329
5340
  * @typedef {Object} ProcessorProperties Additional processor-specific properties.
5330
5341
  * @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
5342
+ * @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer
5331
5343
  */
5332
5344
 
5333
5345
 
@@ -5361,7 +5373,7 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5361
5373
  }
5362
5374
 
5363
5375
  /**
5364
- * @returns {import('../tokenizers.js').PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
5376
+ * @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
5365
5377
  */
5366
5378
  get tokenizer() {
5367
5379
  return this.components.tokenizer;
@@ -5374,6 +5386,11 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5374
5386
  return this.components.feature_extractor;
5375
5387
  }
5376
5388
 
5389
+ /**
5390
+ * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages
5391
+ * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options
5392
+ * @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>}
5393
+ */
5377
5394
  apply_chat_template(messages, options = {}) {
5378
5395
  if (!this.tokenizer) {
5379
5396
  throw new Error('Unable to apply chat template without a tokenizer.');
@@ -5384,6 +5401,10 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5384
5401
  });
5385
5402
  }
5386
5403
 
5404
+ /**
5405
+ * @param {Parameters<PreTrainedTokenizer['batch_decode']>} args
5406
+ * @returns {ReturnType<PreTrainedTokenizer['batch_decode']>}
5407
+ */
5387
5408
  batch_decode(...args) {
5388
5409
  if (!this.tokenizer) {
5389
5410
  throw new Error('Unable to decode without a tokenizer.');
@@ -5411,8 +5432,8 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5411
5432
  /**
5412
5433
  * Instantiate one of the processor classes of the library from a pretrained model.
5413
5434
  *
5414
- * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object
5415
- * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
5435
+ * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
5436
+ * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
5416
5437
  *
5417
5438
  * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
5418
5439
  * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
@@ -5531,15 +5552,19 @@ function getNormalizedConfig(config) {
5531
5552
  case 'florence2':
5532
5553
  case 'llava_onevision':
5533
5554
  case 'idefics3':
5555
+ // @ts-expect-error TS2339
5534
5556
  init_normalized_config = getNormalizedConfig(config.text_config);
5535
5557
  break;
5536
5558
  case 'moondream1':
5559
+ // @ts-expect-error TS2339
5537
5560
  init_normalized_config = getNormalizedConfig(config.phi_config);
5538
5561
  break;
5539
5562
  case 'musicgen':
5563
+ // @ts-expect-error TS2339
5540
5564
  init_normalized_config = getNormalizedConfig(config.decoder);
5541
5565
  break;
5542
5566
  case 'multi_modality':
5567
+ // @ts-expect-error TS2339
5543
5568
  init_normalized_config = getNormalizedConfig(config.language_config);
5544
5569
  break;
5545
5570
 
@@ -5660,6 +5685,7 @@ function getNormalizedConfig(config) {
5660
5685
  break;
5661
5686
 
5662
5687
  case 'vision-encoder-decoder':
5688
+ // @ts-expect-error TS2339
5663
5689
  const decoderConfig = getNormalizedConfig(config.decoder);
5664
5690
 
5665
5691
  const add_encoder_pkv = 'num_decoder_layers' in decoderConfig;
@@ -5902,7 +5928,7 @@ __webpack_require__.r(__webpack_exports__);
5902
5928
 
5903
5929
 
5904
5930
 
5905
- const VERSION = '3.2.3';
5931
+ const VERSION = '3.2.4';
5906
5932
 
5907
5933
  // Check if various APIs are available (depends on environment)
5908
5934
  const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
@@ -8558,8 +8584,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
8558
8584
  } else if (session_options.externalData !== undefined) {
8559
8585
  externalDataPromises = session_options.externalData.map(async (ext) => {
8560
8586
  // if the external data is a string, fetch the file and replace the string with its content
8587
+ // @ts-expect-error TS2339
8561
8588
  if (typeof ext.data === "string") {
8589
+ // @ts-expect-error TS2339
8562
8590
  const ext_buffer = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_5__.getModelFile)(pretrained_model_name_or_path, ext.data, true, options);
8591
+ // @ts-expect-error TS2698
8563
8592
  return { ...ext, data: ext_buffer };
8564
8593
  }
8565
8594
  return ext;
@@ -9807,6 +9836,7 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
9807
9836
  if (this.config.model_type === 'musicgen') {
9808
9837
  // Custom logic (TODO: move to Musicgen class)
9809
9838
  decoder_input_ids = Array.from({
9839
+ // @ts-expect-error TS2339
9810
9840
  length: batch_size * this.config.decoder.num_codebooks
9811
9841
  }, () => [decoder_start_token_id]);
9812
9842
 
@@ -10136,11 +10166,13 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
10136
10166
  async encode_image({ pixel_values }) {
10137
10167
  // image_inputs === { pixel_values }
10138
10168
  const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
10169
+ // @ts-expect-error TS2339
10139
10170
  if (!this.config.num_image_tokens) {
10140
10171
  console.warn(
10141
10172
  'The number of image tokens was not set in the model configuration. ' +
10142
10173
  `Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
10143
10174
  )
10175
+ // @ts-expect-error TS2339
10144
10176
  this.config.num_image_tokens = features.dims[1];
10145
10177
  }
10146
10178
  return features;
@@ -11568,6 +11600,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
11568
11600
 
11569
11601
  if (generation_config.return_token_timestamps) {
11570
11602
  outputs["token_timestamps"] = this._extract_token_timestamps(
11603
+ // @ts-expect-error TS2345
11571
11604
  outputs,
11572
11605
  generation_config.alignment_heads,
11573
11606
  generation_config.num_frames,
@@ -11603,6 +11636,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
11603
11636
  );
11604
11637
  }
11605
11638
 
11639
+ // @ts-expect-error TS2339
11606
11640
  let median_filter_width = this.config.median_filter_width;
11607
11641
  if (median_filter_width === undefined) {
11608
11642
  console.warn("Model config has no `median_filter_width`, using default value of 7.")
@@ -11613,6 +11647,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
11613
11647
  const batch = generate_outputs.cross_attentions;
11614
11648
  // Create a list with `decoder_layers` elements, each a tensor of shape
11615
11649
  // (batch size, attention_heads, output length, input length).
11650
+ // @ts-expect-error TS2339
11616
11651
  const cross_attentions = Array.from({ length: this.config.decoder_layers },
11617
11652
  // Concatenate the cross attentions for each layer across sequence length dimension.
11618
11653
  (_, i) => (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.cat)(batch.map(x => x[i]), 2)
@@ -11756,6 +11791,7 @@ class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
11756
11791
  attention_mask,
11757
11792
  }) {
11758
11793
 
11794
+ // @ts-expect-error TS2339
11759
11795
  const image_token_index = this.config.image_token_index;
11760
11796
 
11761
11797
  const idsList = input_ids.tolist();
@@ -12741,6 +12777,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
12741
12777
  const image_nums = vision_tokens.filter(x => x == image_token_id).length;
12742
12778
  const video_nums = vision_tokens.filter(x => x == video_token_id).length;
12743
12779
 
12780
+ /** @type {number[][]} */
12744
12781
  let llm_pos_ids_list = [];
12745
12782
  let st = 0;
12746
12783
  let remain_images = image_nums;
@@ -12810,6 +12847,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
12810
12847
  // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
12811
12848
  // meaning to perform concatenation along dim=1, we can do the following:
12812
12849
  const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
12850
+ /** @type {number[]} */
12813
12851
  const llm_positions = new Array(num_items);
12814
12852
  let index = 0;
12815
12853
  for (let x = 0; x < 3; ++x) {
@@ -12850,9 +12888,10 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
12850
12888
  { length: 3 * data.length },
12851
12889
  (_, i) => data[i % data.length]
12852
12890
  );
12891
+ /** @type {bigint[]} */
12853
12892
  const mrope_position_deltas = Array.from(
12854
12893
  { length: dims[0] },
12855
- (_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1 + dims[1]
12894
+ (_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
12856
12895
  );
12857
12896
 
12858
12897
  return [
@@ -13423,7 +13462,7 @@ class DPTModel extends DPTPreTrainedModel { }
13423
13462
  *
13424
13463
  * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
13425
13464
  * ```javascript
13426
- * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
13465
+ * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
13427
13466
  *
13428
13467
  * // Load model and processor
13429
13468
  * const model_id = 'Xenova/dpt-hybrid-midas';
@@ -13432,7 +13471,7 @@ class DPTModel extends DPTPreTrainedModel { }
13432
13471
  *
13433
13472
  * // Load image from URL
13434
13473
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
13435
- * const image = await RawImage.fromURL(url);
13474
+ * const image = await RawImage.read(url);
13436
13475
  *
13437
13476
  * // Prepare image for the model
13438
13477
  * const inputs = await processor(image);
@@ -13441,10 +13480,15 @@ class DPTModel extends DPTPreTrainedModel { }
13441
13480
  * const { predicted_depth } = await model(inputs);
13442
13481
  *
13443
13482
  * // Interpolate to original size
13444
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
13483
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
13484
+ * size: image.size.reverse(),
13485
+ * mode: 'bilinear',
13486
+ * })).squeeze(1);
13445
13487
  *
13446
13488
  * // Visualize the prediction
13447
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
13489
+ * const min = prediction.min().item();
13490
+ * const max = prediction.max().item();
13491
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
13448
13492
  * const depth = RawImage.fromTensor(formatted);
13449
13493
  * // RawImage {
13450
13494
  * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
@@ -13494,11 +13538,7 @@ class GLPNPreTrainedModel extends PreTrainedModel { }
13494
13538
  class GLPNModel extends GLPNPreTrainedModel { }
13495
13539
 
13496
13540
  /**
13497
- * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
13498
- *
13499
- * **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
13500
- * ```javascript
13501
- * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
13541
+ * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
13502
13542
  *
13503
13543
  * // Load model and processor
13504
13544
  * const model_id = 'Xenova/glpn-kitti';
@@ -13507,7 +13547,7 @@ class GLPNModel extends GLPNPreTrainedModel { }
13507
13547
  *
13508
13548
  * // Load image from URL
13509
13549
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
13510
- * const image = await RawImage.fromURL(url);
13550
+ * const image = await RawImage.read(url);
13511
13551
  *
13512
13552
  * // Prepare image for the model
13513
13553
  * const inputs = await processor(image);
@@ -13516,13 +13556,18 @@ class GLPNModel extends GLPNPreTrainedModel { }
13516
13556
  * const { predicted_depth } = await model(inputs);
13517
13557
  *
13518
13558
  * // Interpolate to original size
13519
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
13559
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
13560
+ * size: image.size.reverse(),
13561
+ * mode: 'bilinear',
13562
+ * })).squeeze(1);
13520
13563
  *
13521
13564
  * // Visualize the prediction
13522
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
13565
+ * const min = prediction.min().item();
13566
+ * const max = prediction.max().item();
13567
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
13523
13568
  * const depth = RawImage.fromTensor(formatted);
13524
13569
  * // RawImage {
13525
- * // data: Uint8Array(307200) [ 207, 169, 154, ... ],
13570
+ * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
13526
13571
  * // width: 640,
13527
13572
  * // height: 480,
13528
13573
  * // channels: 1
@@ -14489,10 +14534,12 @@ class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
14489
14534
 
14490
14535
  const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);
14491
14536
 
14537
+ // @ts-expect-error TS2339
14492
14538
  const r = encoder_outputs.dims[1] / this.config.reduction_factor;
14493
14539
  const maxlen = Math.floor(r * maxlenratio);
14494
14540
  const minlen = Math.floor(r * minlenratio);
14495
14541
 
14542
+ // @ts-expect-error TS2339
14496
14543
  const num_mel_bins = this.config.num_mel_bins;
14497
14544
 
14498
14545
  let spectrogramParts = [];
@@ -14857,11 +14904,13 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
14857
14904
  */
14858
14905
  _apply_and_filter_by_delay_pattern_mask(outputs) {
14859
14906
  const [bs_x_codebooks, seqLength] = outputs.dims;
14907
+ // @ts-expect-error TS2339
14860
14908
  const num_codebooks = this.config.decoder.num_codebooks;
14861
14909
  const upperBound = (seqLength - num_codebooks);
14862
14910
 
14863
14911
  let newDataSize = 0;
14864
14912
  for (let i = 0; i < outputs.size; ++i) {
14913
+ // @ts-expect-error TS2339
14865
14914
  if (outputs.data[i] === this.config.decoder.pad_token_id) {
14866
14915
  continue;
14867
14916
  }
@@ -14891,7 +14940,9 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
14891
14940
  let clonedInputIds = structuredClone(input_ids);
14892
14941
  for (let i = 0; i < clonedInputIds.length; ++i) {
14893
14942
  for (let j = 0; j < clonedInputIds[i].length; ++j) {
14943
+ // @ts-expect-error TS2339
14894
14944
  if ((i % this.config.decoder.num_codebooks) >= j) {
14945
+ // @ts-expect-error TS2339
14895
14946
  clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
14896
14947
  }
14897
14948
  }
@@ -15048,6 +15099,9 @@ class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
15048
15099
  'past_key_values',
15049
15100
  ];
15050
15101
 
15102
+ /**
15103
+ * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
15104
+ */
15051
15105
  constructor(...args) {
15052
15106
  super(...args);
15053
15107
 
@@ -16016,10 +16070,17 @@ class SequenceClassifierOutput extends ModelOutput {
16016
16070
  /**
16017
16071
  * @param {Object} output The output of the model.
16018
16072
  * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
16073
+ * @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
16074
+ * Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
16019
16075
  */
16020
- constructor({ logits }) {
16076
+ constructor({ logits, ...attentions }) {
16021
16077
  super();
16022
16078
  this.logits = logits;
16079
+ const attentions_list = Object.values(attentions);
16080
+ if (attentions_list.length > 0) {
16081
+ // Only set attentions if they are not empty
16082
+ this.attentions = attentions_list;
16083
+ }
16023
16084
  }
16024
16085
  }
16025
16086
 
@@ -16275,22 +16336,6 @@ __webpack_require__.r(__webpack_exports__);
16275
16336
 
16276
16337
  class AutoFeatureExtractor {
16277
16338
 
16278
- /**
16279
- * Instantiate one of the feature extractor classes of the library from a pretrained model.
16280
- *
16281
- * The processor class to instantiate is selected based on the `feature_extractor_type` property of
16282
- * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
16283
- *
16284
- * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
16285
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
16286
- * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
16287
- * user or organization name, like `dbmdz/bert-base-german-cased`.
16288
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
16289
- * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
16290
- *
16291
- * @returns {Promise<AllFeatureExtractors.ImageProcessor>} A new instance of the Processor class.
16292
- */
16293
-
16294
16339
  /** @type {typeof FeatureExtractor.from_pretrained} */
16295
16340
  static async from_pretrained(pretrained_model_name_or_path, options={}) {
16296
16341
 
@@ -16417,22 +16462,6 @@ __webpack_require__.r(__webpack_exports__);
16417
16462
  */
16418
16463
  class AutoProcessor {
16419
16464
 
16420
- /**
16421
- * Instantiate one of the processor classes of the library from a pretrained model.
16422
- *
16423
- * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
16424
- * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
16425
- *
16426
- * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
16427
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
16428
- * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
16429
- * user or organization name, like `dbmdz/bert-base-german-cased`.
16430
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
16431
- * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
16432
- *
16433
- * @returns {Promise<Processor>} A new instance of the Processor class.
16434
- */
16435
-
16436
16465
  /** @type {typeof Processor.from_pretrained} */
16437
16466
  static async from_pretrained(pretrained_model_name_or_path, options={}) {
16438
16467
 
@@ -16750,6 +16779,7 @@ class ConvNextImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
16750
16779
  /**
16751
16780
  * Percentage of the image to crop. Only has an effect if this.size < 384.
16752
16781
  */
16782
+ // @ts-expect-error TS2339
16753
16783
  this.crop_pct = this.config.crop_pct ?? (224 / 256);
16754
16784
  }
16755
16785
 
@@ -16952,6 +16982,7 @@ __webpack_require__.r(__webpack_exports__);
16952
16982
  class EfficientNetImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor {
16953
16983
  constructor(config) {
16954
16984
  super(config);
16985
+ // @ts-expect-error TS2339
16955
16986
  this.include_top = this.config.include_top ?? true;
16956
16987
  if (this.include_top) {
16957
16988
  this.image_std = this.image_std.map(x => x * x);
@@ -17033,8 +17064,11 @@ class Florence2Processor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
17033
17064
  super(config, components);
17034
17065
 
17035
17066
  const {
17067
+ // @ts-expect-error TS2339
17036
17068
  tasks_answer_post_processing_type,
17069
+ // @ts-expect-error TS2339
17037
17070
  task_prompts_without_inputs,
17071
+ // @ts-expect-error TS2339
17038
17072
  task_prompts_with_input,
17039
17073
  } = this.image_processor.config;
17040
17074
 
@@ -17329,6 +17363,8 @@ class Idefics3ImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
17329
17363
 
17330
17364
  const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
17331
17365
  const end_offset = (i + 1) * pixel_attention_mask_stride;
17366
+
17367
+ // @ts-expect-error
17332
17368
  pixel_attention_mask_data.fill(false, start_offset, end_offset);
17333
17369
  }
17334
17370
  }
@@ -17735,6 +17771,7 @@ class VLMImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTE
17735
17771
  },
17736
17772
  ...config,
17737
17773
  });
17774
+ // @ts-expect-error TS2339
17738
17775
  this.constant_values = this.config.background_color.map(x => x * this.rescale_factor)
17739
17776
  }
17740
17777
 
@@ -18176,6 +18213,8 @@ class MgpstrProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE
18176
18213
  * - bpe_preds: The list of BPE decoded sentences.
18177
18214
  * - wp_preds: The list of wp decoded sentences.
18178
18215
  */
18216
+ // @ts-expect-error The type of this method is not compatible with the one
18217
+ // in the base class. It might be a good idea to fix this.
18179
18218
  batch_decode([char_logits, bpe_logits, wp_logits]) {
18180
18219
  const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
18181
18220
  const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
@@ -18557,6 +18596,7 @@ class PaliGemmaProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
18557
18596
  }
18558
18597
 
18559
18598
  const bos_token = this.tokenizer.bos_token;
18599
+ // @ts-expect-error TS2339
18560
18600
  const image_seq_length = this.image_processor.config.image_seq_length;
18561
18601
  let input_strings;
18562
18602
  if (text.some((t) => t.includes(IMAGE_TOKEN))) {
@@ -18807,7 +18847,7 @@ class Phi3VProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_
18807
18847
  *
18808
18848
  * @param {string|string[]} text
18809
18849
  * @param {RawImage|RawImage[]} images
18810
- * @param {...any} args
18850
+ * @param { { padding?: boolean, truncation?: boolean, num_crops?: number } | undefined } options
18811
18851
  * @returns {Promise<any>}
18812
18852
  */
18813
18853
  async _call(text, images = null, {
@@ -18991,6 +19031,7 @@ class PyAnnoteFeatureExtractor extends _base_feature_extraction_utils_js__WEBPAC
18991
19031
 
18992
19032
  let current_speaker = -1;
18993
19033
  for (let i = 0; i < scores.length; ++i) {
19034
+ /** @type {number[]} */
18994
19035
  const probabilities = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.softmax)(scores[i]);
18995
19036
  const [score, id] = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.max)(probabilities);
18996
19037
  const [start, end] = [i, i + 1];
@@ -19175,6 +19216,7 @@ class Qwen2VLProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODUL
19175
19216
  }
19176
19217
 
19177
19218
  if (image_grid_thw) {
19219
+ // @ts-expect-error TS2551
19178
19220
  let merge_length = this.image_processor.config.merge_size ** 2;
19179
19221
  let index = 0;
19180
19222
 
@@ -19662,8 +19704,8 @@ class SeamlessM4TFeatureExtractor extends _base_feature_extraction_utils_js__WEB
19662
19704
  'int64',
19663
19705
  new BigInt64Array(numPaddedFrames),
19664
19706
  [1, numPaddedFrames],
19665
- )
19666
- padded_attention_mask.data.fill(1n, 0, num_frames);
19707
+ );
19708
+ /** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
19667
19709
  }
19668
19710
  }
19669
19711
  }
@@ -20463,7 +20505,7 @@ class WhisperFeatureExtractor extends _base_feature_extraction_utils_js__WEBPACK
20463
20505
  )
20464
20506
 
20465
20507
  const data = features.data;
20466
- const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(data)[0];
20508
+ const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(/** @type {Float32Array} */(data))[0];
20467
20509
 
20468
20510
  for (let i = 0; i < data.length; ++i) {
20469
20511
  data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
@@ -20722,6 +20764,16 @@ class TensorOpRegistry {
20722
20764
  // executionProviders: ['webgpu'],
20723
20765
  };
20724
20766
 
20767
+ static get nearest_interpolate_4d() {
20768
+ if (!this._nearest_interpolate_4d) {
20769
+ this._nearest_interpolate_4d = wrap(
20770
+ [8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
20771
+ this.session_options,
20772
+ 'y',
20773
+ );
20774
+ }
20775
+ return this._nearest_interpolate_4d;
20776
+ }
20725
20777
  static get bilinear_interpolate_4d() {
20726
20778
  if (!this._bilinear_interpolate_4d) {
20727
20779
  this._bilinear_interpolate_4d = wrap(
@@ -21095,6 +21147,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
21095
21147
 
21096
21148
  // TODO: Use softmax tensor function
21097
21149
  const function_to_apply =
21150
+ // @ts-expect-error TS2339
21098
21151
  this.model.config.problem_type === 'multi_label_classification'
21099
21152
  ? batch => batch.sigmoid()
21100
21153
  : batch => new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor(
@@ -21103,6 +21156,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
21103
21156
  batch.dims,
21104
21157
  ); // single_label_classification (default)
21105
21158
 
21159
+ // @ts-expect-error TS2339
21106
21160
  const id2label = this.model.config.id2label;
21107
21161
 
21108
21162
  const toReturn = [];
@@ -21205,6 +21259,7 @@ class TokenClassificationPipeline extends (/** @type {new (options: TextPipeline
21205
21259
  const outputs = await this.model(model_inputs)
21206
21260
 
21207
21261
  const logits = outputs.logits;
21262
+ // @ts-expect-error TS2339
21208
21263
  const id2label = this.model.config.id2label;
21209
21264
 
21210
21265
  const toReturn = [];
@@ -21544,11 +21599,14 @@ class Text2TextGenerationPipeline extends (/** @type {new (options: TextPipeline
21544
21599
 
21545
21600
 
21546
21601
  // Add global prefix, if present
21602
+ // @ts-expect-error TS2339
21547
21603
  if (this.model.config.prefix) {
21604
+ // @ts-expect-error TS2339
21548
21605
  texts = texts.map(x => this.model.config.prefix + x)
21549
21606
  }
21550
21607
 
21551
21608
  // Handle task specific params:
21609
+ // @ts-expect-error TS2339
21552
21610
  const task_specific_params = this.model.config.task_specific_params
21553
21611
  if (task_specific_params && task_specific_params[this.task]) {
21554
21612
  // Add prefixes, if present
@@ -22287,6 +22345,7 @@ class AudioClassificationPipeline extends (/** @type {new (options: AudioPipelin
22287
22345
  const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
22288
22346
  const preparedAudios = await prepareAudios(audio, sampling_rate);
22289
22347
 
22348
+ // @ts-expect-error TS2339
22290
22349
  const id2label = this.model.config.id2label;
22291
22350
 
22292
22351
  const toReturn = [];
@@ -22597,6 +22656,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
22597
22656
  audio = [/** @type {AudioInput} */ (audio)];
22598
22657
  }
22599
22658
 
22659
+ // @ts-expect-error TS2339
22600
22660
  const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
22601
22661
  const hop_length = this.processor.feature_extractor.config.hop_length;
22602
22662
 
@@ -22662,7 +22722,9 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
22662
22722
 
22663
22723
  // TODO: Right now we only get top beam
22664
22724
  if (return_timestamps === 'word') {
22725
+ // @ts-expect-error TS2339
22665
22726
  chunk.tokens = data.sequences.tolist()[0];
22727
+ // @ts-expect-error TS2339
22666
22728
  chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
22667
22729
  (/** @type {number} */ x) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.round)(x, 2)
22668
22730
  );
@@ -22707,7 +22769,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
22707
22769
  const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
22708
22770
  const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
22709
22771
 
22710
- const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
22772
+ const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
22711
22773
  toReturn.push({ text });
22712
22774
  }
22713
22775
  return single ? toReturn[0] : toReturn;
@@ -22856,6 +22918,7 @@ class ImageClassificationPipeline extends (/** @type {new (options: ImagePipelin
22856
22918
  const { pixel_values } = await this.processor(preparedImages);
22857
22919
  const output = await this.model({ pixel_values });
22858
22920
 
22921
+ // @ts-expect-error TS2339
22859
22922
  const id2label = this.model.config.id2label;
22860
22923
 
22861
22924
  /** @type {ImageClassificationOutput[]} */
@@ -22970,6 +23033,7 @@ class ImageSegmentationPipeline extends (/** @type {new (options: ImagePipelineC
22970
23033
  }
22971
23034
  }
22972
23035
 
23036
+ // @ts-expect-error TS2339
22973
23037
  const id2label = this.model.config.id2label;
22974
23038
 
22975
23039
  /** @type {ImageSegmentationPipelineOutput[]} */
@@ -23196,6 +23260,7 @@ class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipelineCon
23196
23260
  const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);
23197
23261
 
23198
23262
  // Add labels
23263
+ // @ts-expect-error TS2339
23199
23264
  const id2label = this.model.config.id2label;
23200
23265
 
23201
23266
  // Format output
@@ -23415,6 +23480,7 @@ class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextIm
23415
23480
  // Run model
23416
23481
  const output = await this.model.generate({
23417
23482
  inputs: pixel_values,
23483
+ // @ts-expect-error TS2339
23418
23484
  max_length: this.model.config.decoder.max_position_embeddings,
23419
23485
  decoder_input_ids,
23420
23486
  ...generate_kwargs,
@@ -23530,6 +23596,7 @@ class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineC
23530
23596
  // Generate waveform
23531
23597
  const { waveform } = await this.model(inputs);
23532
23598
 
23599
+ // @ts-expect-error TS2339
23533
23600
  const sampling_rate = this.model.config.sampling_rate;
23534
23601
  return {
23535
23602
  audio: waveform.data,
@@ -23687,11 +23754,23 @@ class DepthEstimationPipeline extends (/** @type {new (options: ImagePipelineCon
23687
23754
 
23688
23755
  const toReturn = [];
23689
23756
  for (let i = 0; i < preparedImages.length; ++i) {
23690
- const prediction = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate)(predicted_depth[i], preparedImages[i].size.reverse(), 'bilinear', false);
23691
- const formatted = prediction.mul_(255 / (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.max)(prediction.data)[0]).to('uint8');
23757
+ const batch = predicted_depth[i];
23758
+ const [height, width] = batch.dims.slice(-2);
23759
+ const [new_width, new_height] = preparedImages[i].size;
23760
+
23761
+ // Interpolate to original size
23762
+ const prediction = (await (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate_4d)(batch.view(1, 1, height, width), {
23763
+ size: [new_height, new_width],
23764
+ mode: 'bilinear',
23765
+ })).view(new_height, new_width);
23766
+
23767
+ const minval = /** @type {number} */(prediction.min().item());
23768
+ const maxval = /** @type {number} */(prediction.max().item());
23769
+ const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
23770
+ const depth = _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted);
23692
23771
  toReturn.push({
23693
- predicted_depth: predicted_depth[i],
23694
- depth: _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted),
23772
+ predicted_depth: prediction,
23773
+ depth,
23695
23774
  });
23696
23775
  }
23697
23776
 
@@ -24171,6 +24250,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
24171
24250
  return result;
24172
24251
  }
24173
24252
 
24253
+
24174
24254
  /***/ }),
24175
24255
 
24176
24256
  /***/ "./src/tokenizers.js":
@@ -24239,7 +24319,6 @@ __webpack_require__.r(__webpack_exports__);
24239
24319
  /* harmony import */ var _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./utils/data-structures.js */ "./src/utils/data-structures.js");
24240
24320
  /* harmony import */ var _huggingface_jinja__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! @huggingface/jinja */ "./node_modules/@huggingface/jinja/dist/index.js");
24241
24321
  /* harmony import */ var _models_whisper_common_whisper_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./models/whisper/common_whisper.js */ "./src/models/whisper/common_whisper.js");
24242
- /* harmony import */ var _utils_constants_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./utils/constants.js */ "./src/utils/constants.js");
24243
24322
 
24244
24323
  /**
24245
24324
  * @file Tokenizers are used to prepare textual inputs for a model.
@@ -24276,7 +24355,6 @@ __webpack_require__.r(__webpack_exports__);
24276
24355
 
24277
24356
 
24278
24357
 
24279
-
24280
24358
  /**
24281
24359
  * @typedef {Object} TokenizerProperties Additional tokenizer-specific properties.
24282
24360
  * @property {boolean} [legacy=false] Whether or not the `legacy` behavior of the tokenizer should be used.
@@ -24760,7 +24838,7 @@ class Unigram extends TokenizerModel {
24760
24838
  * Create a new Unigram tokenizer model.
24761
24839
  * @param {Object} config The configuration object for the Unigram model.
24762
24840
  * @param {number} config.unk_id The ID of the unknown token
24763
- * @param {any[][]} config.vocab A 2D array representing a mapping of tokens to scores.
24841
+ * @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
24764
24842
  * @param {Object} moreConfig Additional configuration object for the Unigram model.
24765
24843
  */
24766
24844
  constructor(config, moreConfig) {
@@ -24768,11 +24846,10 @@ class Unigram extends TokenizerModel {
24768
24846
 
24769
24847
  const vocabSize = config.vocab.length;
24770
24848
  this.vocab = new Array(vocabSize);
24849
+ /** @type {number[]} */
24771
24850
  this.scores = new Array(vocabSize);
24772
24851
  for (let i = 0; i < vocabSize; ++i) {
24773
- const piece = config.vocab[i];
24774
- this.vocab[i] = piece[0];
24775
- this.scores[i] = piece[1];
24852
+ [this.vocab[i], this.scores[i]] = config.vocab[i];
24776
24853
  }
24777
24854
 
24778
24855
  this.unk_token_id = config.unk_id;
@@ -30129,6 +30206,8 @@ __webpack_require__.r(__webpack_exports__);
30129
30206
  /* harmony export */ });
30130
30207
  /* harmony import */ var _env_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../env.js */ "./src/env.js");
30131
30208
  /* harmony import */ var _devices_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./devices.js */ "./src/utils/devices.js");
30209
+ /// <reference types="@webgpu/types" />
30210
+
30132
30211
 
30133
30212
 
30134
30213
 
@@ -30382,7 +30461,7 @@ class FileResponse {
30382
30461
  */
30383
30462
  async arrayBuffer() {
30384
30463
  const data = await fs__WEBPACK_IMPORTED_MODULE_0__["default"].promises.readFile(this.filePath);
30385
- return data.buffer;
30464
+ return /** @type {ArrayBuffer} */ (data.buffer);
30386
30465
  }
30387
30466
 
30388
30467
  /**
@@ -32041,8 +32120,9 @@ function magnitude(arr) {
32041
32120
 
32042
32121
  /**
32043
32122
  * Returns the value and index of the minimum element in an array.
32044
- * @param {number[]|TypedArray} arr array of numbers.
32045
- * @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
32123
+ * @template {number[]|bigint[]|AnyTypedArray} T
32124
+ * @param {T} arr array of numbers.
32125
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
32046
32126
  * @throws {Error} If array is empty.
32047
32127
  */
32048
32128
  function min(arr) {
@@ -32055,14 +32135,15 @@ function min(arr) {
32055
32135
  indexOfMin = i;
32056
32136
  }
32057
32137
  }
32058
- return [min, indexOfMin];
32138
+ return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([min, indexOfMin]);
32059
32139
  }
32060
32140
 
32061
32141
 
32062
32142
  /**
32063
32143
  * Returns the value and index of the maximum element in an array.
32064
- * @param {number[]|AnyTypedArray} arr array of numbers.
32065
- * @returns {[number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
32144
+ * @template {number[]|bigint[]|AnyTypedArray} T
32145
+ * @param {T} arr array of numbers.
32146
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
32066
32147
  * @throws {Error} If array is empty.
32067
32148
  */
32068
32149
  function max(arr) {
@@ -32075,7 +32156,7 @@ function max(arr) {
32075
32156
  indexOfMax = i;
32076
32157
  }
32077
32158
  }
32078
- return [Number(max), indexOfMax];
32159
+ return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([max, indexOfMax]);
32079
32160
  }
32080
32161
 
32081
32162
  function isPowerOfTwo(number) {
@@ -33372,8 +33453,6 @@ class Tensor {
33372
33453
  return this.permute(...dims);
33373
33454
  }
33374
33455
 
33375
- // TODO add .max() and .min() methods
33376
-
33377
33456
  /**
33378
33457
  * Returns the sum of each row of the input tensor in the given dimension dim.
33379
33458
  *
@@ -33667,6 +33746,36 @@ class Tensor {
33667
33746
  return mean(this, dim, keepdim);
33668
33747
  }
33669
33748
 
33749
+ min(dim = null, keepdim = false) {
33750
+ if (dim !== null) {
33751
+ throw new Error("`dim !== null` not yet implemented.");
33752
+ }
33753
+ const value = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[0];
33754
+ return new Tensor(this.type, [value], []);
33755
+ }
33756
+ max(dim = null, keepdim = false) {
33757
+ if (dim !== null) {
33758
+ throw new Error("`dim !== null` not yet implemented.");
33759
+ }
33760
+ const value = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[0];
33761
+ return new Tensor(this.type, [value], []);
33762
+ }
33763
+
33764
+ argmin(dim = null, keepdim = false) {
33765
+ if (dim !== null) {
33766
+ throw new Error("`dim !== null` not yet implemented.");
33767
+ }
33768
+ const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[1];
33769
+ return new Tensor('int64', [BigInt(index)], []);
33770
+ }
33771
+ argmax(dim = null, keepdim = false) {
33772
+ if (dim !== null) {
33773
+ throw new Error("`dim !== null` not yet implemented.");
33774
+ }
33775
+ const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[1];
33776
+ return new Tensor('int64', [BigInt(index)], []);
33777
+ }
33778
+
33670
33779
  /**
33671
33780
  * Performs Tensor dtype conversion.
33672
33781
  * @param {DataType} type The desired data type.
@@ -33800,7 +33909,7 @@ function interpolate(input, [out_height, out_width], mode = 'bilinear', align_co
33800
33909
  * @param {Tensor} input the input tensor
33801
33910
  * @param {Object} options the options for the interpolation
33802
33911
  * @param {[number, number]|[number, number, number]|[number, number, number, number]} [options.size=null] output spatial size.
33803
- * @param {"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
33912
+ * @param {"nearest"|"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
33804
33913
  * @returns {Promise<Tensor>} The interpolated tensor.
33805
33914
  */
33806
33915
  async function interpolate_4d(input, {
@@ -33830,7 +33939,9 @@ async function interpolate_4d(input, {
33830
33939
  }
33831
33940
 
33832
33941
  let op;
33833
- if (mode === 'bilinear') {
33942
+ if (mode === 'nearest') {
33943
+ op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.nearest_interpolate_4d;
33944
+ } else if (mode === 'bilinear') {
33834
33945
  op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bilinear_interpolate_4d;
33835
33946
  } else if (mode === 'bicubic') {
33836
33947
  op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bicubic_interpolate_4d;
@@ -33871,13 +33982,13 @@ async function rfft(x, a) {
33871
33982
  * Returns the k largest elements of the given input tensor.
33872
33983
  * Inspired by https://pytorch.org/docs/stable/generated/torch.topk.html
33873
33984
  * @param {Tensor} x the input tensor
33874
- * @param {number} k the k in "top-k"
33985
+ * @param {number} [k] the k in "top-k"
33875
33986
  * @returns {Promise<[Tensor, Tensor]>} the output tuple of (Tensor, LongTensor) of top-k elements and their indices.
33876
33987
  */
33877
33988
  async function topk(x, k) {
33878
33989
  const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.top_k;
33879
33990
 
33880
- if (k === null) {
33991
+ if (k == null) {
33881
33992
  k = x.dims.at(-1);
33882
33993
  } else {
33883
33994
  k = Math.min(k, x.dims.at(-1));
@@ -33906,10 +34017,10 @@ const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length])
33906
34017
  async function slice(data, starts, ends, axes, steps) {
33907
34018
  const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.slice;
33908
34019
  return await op({
33909
- x: data,
33910
- s: arrayToIndexTensor(starts),
33911
- e: arrayToIndexTensor(ends),
33912
- a: arrayToIndexTensor(axes),
34020
+ x: data,
34021
+ s: arrayToIndexTensor(starts),
34022
+ e: arrayToIndexTensor(ends),
34023
+ a: arrayToIndexTensor(axes),
33913
34024
  t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
33914
34025
  });
33915
34026
  }