@huggingface/transformers 3.2.3 → 3.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82):
  1. package/README.md +2 -2
  2. package/dist/transformers.cjs +203 -92
  3. package/dist/transformers.cjs.map +1 -1
  4. package/dist/transformers.js +203 -92
  5. package/dist/transformers.js.map +1 -1
  6. package/dist/transformers.min.cjs +1 -1
  7. package/dist/transformers.min.cjs.map +1 -1
  8. package/dist/transformers.min.js +1 -1
  9. package/dist/transformers.min.js.map +1 -1
  10. package/dist/transformers.min.mjs +1 -1
  11. package/dist/transformers.min.mjs.map +1 -1
  12. package/dist/transformers.mjs +203 -92
  13. package/dist/transformers.mjs.map +1 -1
  14. package/package.json +2 -2
  15. package/src/base/feature_extraction_utils.js +9 -9
  16. package/src/base/image_processors_utils.js +11 -0
  17. package/src/base/processing_utils.js +13 -3
  18. package/src/configs.js +5 -0
  19. package/src/env.js +1 -1
  20. package/src/models/auto/feature_extraction_auto.js +0 -16
  21. package/src/models/auto/processing_auto.js +0 -16
  22. package/src/models/convnext/image_processing_convnext.js +1 -0
  23. package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
  24. package/src/models/florence2/processing_florence2.js +3 -0
  25. package/src/models/idefics3/image_processing_idefics3.js +2 -0
  26. package/src/models/janus/image_processing_janus.js +1 -0
  27. package/src/models/mgp_str/processing_mgp_str.js +2 -0
  28. package/src/models/paligemma/processing_paligemma.js +1 -0
  29. package/src/models/phi3_v/processing_phi3_v.js +1 -1
  30. package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
  31. package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
  32. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
  33. package/src/models/whisper/feature_extraction_whisper.js +1 -1
  34. package/src/models.js +50 -15
  35. package/src/ops/registry.js +10 -0
  36. package/src/pipelines.js +34 -7
  37. package/src/tokenizers.js +4 -7
  38. package/src/utils/dtypes.js +2 -0
  39. package/src/utils/hub.js +1 -1
  40. package/src/utils/maths.js +8 -6
  41. package/src/utils/tensor.js +42 -10
  42. package/types/base/feature_extraction_utils.d.ts +7 -7
  43. package/types/base/image_processors_utils.d.ts.map +1 -1
  44. package/types/base/processing_utils.d.ts +17 -19
  45. package/types/base/processing_utils.d.ts.map +1 -1
  46. package/types/configs.d.ts.map +1 -1
  47. package/types/generation/parameters.d.ts +1 -1
  48. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
  49. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  50. package/types/models/auto/processing_auto.d.ts.map +1 -1
  51. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
  52. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
  53. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  54. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
  55. package/types/models/janus/image_processing_janus.d.ts.map +1 -1
  56. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
  57. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
  58. package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
  59. package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
  60. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  61. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  62. package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
  63. package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
  64. package/types/models/whisper/generation_whisper.d.ts +1 -1
  65. package/types/models/whisper/generation_whisper.d.ts.map +1 -1
  66. package/types/models.d.ts +32 -17
  67. package/types/models.d.ts.map +1 -1
  68. package/types/ops/registry.d.ts +1 -0
  69. package/types/ops/registry.d.ts.map +1 -1
  70. package/types/pipelines.d.ts +2 -2
  71. package/types/pipelines.d.ts.map +1 -1
  72. package/types/tokenizers.d.ts.map +1 -1
  73. package/types/tsconfig.tsbuildinfo +1 -0
  74. package/types/utils/dtypes.d.ts.map +1 -1
  75. package/types/utils/hub.d.ts +1 -1
  76. package/types/utils/hub.d.ts.map +1 -1
  77. package/types/utils/image.d.ts +3 -2
  78. package/types/utils/image.d.ts.map +1 -1
  79. package/types/utils/maths.d.ts +8 -6
  80. package/types/utils/maths.d.ts.map +1 -1
  81. package/types/utils/tensor.d.ts +8 -4
  82. package/types/utils/tensor.d.ts.map +1 -1
@@ -4158,23 +4158,23 @@ class FeatureExtractor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Ca
4158
4158
  }
4159
4159
 
4160
4160
  /**
4161
- * Instantiate one of the processor classes of the library from a pretrained model.
4161
+ * Instantiate one of the feature extractor classes of the library from a pretrained model.
4162
4162
  *
4163
- * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
4164
- * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
4163
+ * The feature extractor class to instantiate is selected based on the `feature_extractor_type` property of
4164
+ * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
4165
4165
  *
4166
4166
  * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
4167
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
4167
+ * - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on huggingface.co.
4168
4168
  * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
4169
4169
  * user or organization name, like `dbmdz/bert-base-german-cased`.
4170
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
4171
- * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
4170
+ * - A path to a *directory* containing feature_extractor files, e.g., `./my_model_directory/`.
4171
+ * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the feature_extractor.
4172
4172
  *
4173
- * @returns {Promise<FeatureExtractor>} A new instance of the Processor class.
4173
+ * @returns {Promise<FeatureExtractor>} A new instance of the Feature Extractor class.
4174
4174
  */
4175
4175
  static async from_pretrained(pretrained_model_name_or_path, options) {
4176
- const preprocessorConfig = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
4177
- return new this(preprocessorConfig);
4176
+ const config = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
4177
+ return new this(config);
4178
4178
  }
4179
4179
  }
4180
4180
 
@@ -4825,14 +4825,20 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
4825
4825
  this.do_thumbnail = config.do_thumbnail;
4826
4826
  this.size = config.size ?? config.image_size;
4827
4827
  this.do_resize = config.do_resize ?? (this.size !== undefined);
4828
+ // @ts-expect-error TS2339
4828
4829
  this.size_divisibility = config.size_divisibility ?? config.size_divisor;
4829
4830
 
4830
4831
  this.do_center_crop = config.do_center_crop;
4832
+ // @ts-expect-error TS2339
4831
4833
  this.crop_size = config.crop_size;
4834
+ // @ts-expect-error TS2339
4832
4835
  this.do_convert_rgb = config.do_convert_rgb ?? true;
4836
+ // @ts-expect-error TS2339
4833
4837
  this.do_crop_margin = config.do_crop_margin;
4834
4838
 
4839
+ // @ts-expect-error TS2339
4835
4840
  this.pad_size = config.pad_size;
4841
+ // @ts-expect-error TS2339
4836
4842
  this.do_pad = config.do_pad;
4837
4843
 
4838
4844
  if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
@@ -5041,6 +5047,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5041
5047
  // Support both formats for backwards compatibility
5042
5048
  else if (Number.isInteger(size)) {
5043
5049
  shortest_edge = size;
5050
+ // @ts-expect-error TS2339
5044
5051
  longest_edge = this.config.max_size ?? shortest_edge;
5045
5052
 
5046
5053
  } else if (size !== undefined) {
@@ -5109,6 +5116,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5109
5116
  } else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
5110
5117
  // Custom resize logic for Qwen2-VL models
5111
5118
  const { min_pixels, max_pixels } = size;
5119
+ // @ts-expect-error TS2339
5112
5120
  const factor = this.config.patch_size * this.config.merge_size;
5113
5121
  return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
5114
5122
  } else {
@@ -5124,6 +5132,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5124
5132
  async resize(image) {
5125
5133
  const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
5126
5134
  return await image.resize(newWidth, newHeight, {
5135
+ // @ts-expect-error TS2322
5127
5136
  resample: this.resample,
5128
5137
  });
5129
5138
  }
@@ -5174,6 +5183,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5174
5183
 
5175
5184
  // Resize the image using thumbnail method.
5176
5185
  if (this.do_thumbnail) {
5186
+ // @ts-expect-error TS2345
5177
5187
  image = await this.thumbnail(image, this.size, this.resample);
5178
5188
  }
5179
5189
 
@@ -5198,6 +5208,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
5198
5208
  // NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
5199
5209
  // occurs with data in the hwc format (height, width, channels),
5200
5210
  // to emulate the behavior of the original Python code (w/ numpy).
5211
+ /** @type {Float32Array} */
5201
5212
  let pixelData = Float32Array.from(image.data);
5202
5213
  let imgDims = [image.height, image.width, image.channels];
5203
5214
 
@@ -5356,6 +5367,7 @@ __webpack_require__.r(__webpack_exports__);
5356
5367
  /**
5357
5368
  * @typedef {Object} ProcessorProperties Additional processor-specific properties.
5358
5369
  * @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
5370
+ * @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer
5359
5371
  */
5360
5372
 
5361
5373
 
@@ -5389,7 +5401,7 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5389
5401
  }
5390
5402
 
5391
5403
  /**
5392
- * @returns {import('../tokenizers.js').PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
5404
+ * @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
5393
5405
  */
5394
5406
  get tokenizer() {
5395
5407
  return this.components.tokenizer;
@@ -5402,6 +5414,11 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5402
5414
  return this.components.feature_extractor;
5403
5415
  }
5404
5416
 
5417
+ /**
5418
+ * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages
5419
+ * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options
5420
+ * @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>}
5421
+ */
5405
5422
  apply_chat_template(messages, options = {}) {
5406
5423
  if (!this.tokenizer) {
5407
5424
  throw new Error('Unable to apply chat template without a tokenizer.');
@@ -5412,6 +5429,10 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5412
5429
  });
5413
5430
  }
5414
5431
 
5432
+ /**
5433
+ * @param {Parameters<PreTrainedTokenizer['batch_decode']>} args
5434
+ * @returns {ReturnType<PreTrainedTokenizer['batch_decode']>}
5435
+ */
5415
5436
  batch_decode(...args) {
5416
5437
  if (!this.tokenizer) {
5417
5438
  throw new Error('Unable to decode without a tokenizer.');
@@ -5439,8 +5460,8 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
5439
5460
  /**
5440
5461
  * Instantiate one of the processor classes of the library from a pretrained model.
5441
5462
  *
5442
- * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object
5443
- * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
5463
+ * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
5464
+ * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
5444
5465
  *
5445
5466
  * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
5446
5467
  * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
@@ -5560,15 +5581,19 @@ function getNormalizedConfig(config) {
5560
5581
  case 'florence2':
5561
5582
  case 'llava_onevision':
5562
5583
  case 'idefics3':
5584
+ // @ts-expect-error TS2339
5563
5585
  init_normalized_config = getNormalizedConfig(config.text_config);
5564
5586
  break;
5565
5587
  case 'moondream1':
5588
+ // @ts-expect-error TS2339
5566
5589
  init_normalized_config = getNormalizedConfig(config.phi_config);
5567
5590
  break;
5568
5591
  case 'musicgen':
5592
+ // @ts-expect-error TS2339
5569
5593
  init_normalized_config = getNormalizedConfig(config.decoder);
5570
5594
  break;
5571
5595
  case 'multi_modality':
5596
+ // @ts-expect-error TS2339
5572
5597
  init_normalized_config = getNormalizedConfig(config.language_config);
5573
5598
  break;
5574
5599
 
@@ -5689,6 +5714,7 @@ function getNormalizedConfig(config) {
5689
5714
  break;
5690
5715
 
5691
5716
  case 'vision-encoder-decoder':
5717
+ // @ts-expect-error TS2339
5692
5718
  const decoderConfig = getNormalizedConfig(config.decoder);
5693
5719
 
5694
5720
  const add_encoder_pkv = 'num_decoder_layers' in decoderConfig;
@@ -5932,7 +5958,7 @@ __webpack_require__.r(__webpack_exports__);
5932
5958
 
5933
5959
 
5934
5960
 
5935
- const VERSION = '3.2.3';
5961
+ const VERSION = '3.2.4';
5936
5962
 
5937
5963
  // Check if various APIs are available (depends on environment)
5938
5964
  const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
@@ -8594,8 +8620,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
8594
8620
  } else if (session_options.externalData !== undefined) {
8595
8621
  externalDataPromises = session_options.externalData.map(async (ext) => {
8596
8622
  // if the external data is a string, fetch the file and replace the string with its content
8623
+ // @ts-expect-error TS2339
8597
8624
  if (typeof ext.data === "string") {
8625
+ // @ts-expect-error TS2339
8598
8626
  const ext_buffer = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_5__.getModelFile)(pretrained_model_name_or_path, ext.data, true, options);
8627
+ // @ts-expect-error TS2698
8599
8628
  return { ...ext, data: ext_buffer };
8600
8629
  }
8601
8630
  return ext;
@@ -9843,6 +9872,7 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
9843
9872
  if (this.config.model_type === 'musicgen') {
9844
9873
  // Custom logic (TODO: move to Musicgen class)
9845
9874
  decoder_input_ids = Array.from({
9875
+ // @ts-expect-error TS2339
9846
9876
  length: batch_size * this.config.decoder.num_codebooks
9847
9877
  }, () => [decoder_start_token_id]);
9848
9878
 
@@ -10172,11 +10202,13 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
10172
10202
  async encode_image({ pixel_values }) {
10173
10203
  // image_inputs === { pixel_values }
10174
10204
  const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
10205
+ // @ts-expect-error TS2339
10175
10206
  if (!this.config.num_image_tokens) {
10176
10207
  console.warn(
10177
10208
  'The number of image tokens was not set in the model configuration. ' +
10178
10209
  `Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
10179
10210
  )
10211
+ // @ts-expect-error TS2339
10180
10212
  this.config.num_image_tokens = features.dims[1];
10181
10213
  }
10182
10214
  return features;
@@ -11604,6 +11636,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
11604
11636
 
11605
11637
  if (generation_config.return_token_timestamps) {
11606
11638
  outputs["token_timestamps"] = this._extract_token_timestamps(
11639
+ // @ts-expect-error TS2345
11607
11640
  outputs,
11608
11641
  generation_config.alignment_heads,
11609
11642
  generation_config.num_frames,
@@ -11639,6 +11672,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
11639
11672
  );
11640
11673
  }
11641
11674
 
11675
+ // @ts-expect-error TS2339
11642
11676
  let median_filter_width = this.config.median_filter_width;
11643
11677
  if (median_filter_width === undefined) {
11644
11678
  console.warn("Model config has no `median_filter_width`, using default value of 7.")
@@ -11649,6 +11683,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
11649
11683
  const batch = generate_outputs.cross_attentions;
11650
11684
  // Create a list with `decoder_layers` elements, each a tensor of shape
11651
11685
  // (batch size, attention_heads, output length, input length).
11686
+ // @ts-expect-error TS2339
11652
11687
  const cross_attentions = Array.from({ length: this.config.decoder_layers },
11653
11688
  // Concatenate the cross attentions for each layer across sequence length dimension.
11654
11689
  (_, i) => (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.cat)(batch.map(x => x[i]), 2)
@@ -11792,6 +11827,7 @@ class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
11792
11827
  attention_mask,
11793
11828
  }) {
11794
11829
 
11830
+ // @ts-expect-error TS2339
11795
11831
  const image_token_index = this.config.image_token_index;
11796
11832
 
11797
11833
  const idsList = input_ids.tolist();
@@ -12777,6 +12813,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
12777
12813
  const image_nums = vision_tokens.filter(x => x == image_token_id).length;
12778
12814
  const video_nums = vision_tokens.filter(x => x == video_token_id).length;
12779
12815
 
12816
+ /** @type {number[][]} */
12780
12817
  let llm_pos_ids_list = [];
12781
12818
  let st = 0;
12782
12819
  let remain_images = image_nums;
@@ -12846,6 +12883,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
12846
12883
  // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
12847
12884
  // meaning to perform concatenation along dim=1, we can do the following:
12848
12885
  const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
12886
+ /** @type {number[]} */
12849
12887
  const llm_positions = new Array(num_items);
12850
12888
  let index = 0;
12851
12889
  for (let x = 0; x < 3; ++x) {
@@ -12886,9 +12924,10 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
12886
12924
  { length: 3 * data.length },
12887
12925
  (_, i) => data[i % data.length]
12888
12926
  );
12927
+ /** @type {bigint[]} */
12889
12928
  const mrope_position_deltas = Array.from(
12890
12929
  { length: dims[0] },
12891
- (_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1 + dims[1]
12930
+ (_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
12892
12931
  );
12893
12932
 
12894
12933
  return [
@@ -13459,7 +13498,7 @@ class DPTModel extends DPTPreTrainedModel { }
13459
13498
  *
13460
13499
  * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
13461
13500
  * ```javascript
13462
- * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
13501
+ * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
13463
13502
  *
13464
13503
  * // Load model and processor
13465
13504
  * const model_id = 'Xenova/dpt-hybrid-midas';
@@ -13468,7 +13507,7 @@ class DPTModel extends DPTPreTrainedModel { }
13468
13507
  *
13469
13508
  * // Load image from URL
13470
13509
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
13471
- * const image = await RawImage.fromURL(url);
13510
+ * const image = await RawImage.read(url);
13472
13511
  *
13473
13512
  * // Prepare image for the model
13474
13513
  * const inputs = await processor(image);
@@ -13477,10 +13516,15 @@ class DPTModel extends DPTPreTrainedModel { }
13477
13516
  * const { predicted_depth } = await model(inputs);
13478
13517
  *
13479
13518
  * // Interpolate to original size
13480
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
13519
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
13520
+ * size: image.size.reverse(),
13521
+ * mode: 'bilinear',
13522
+ * })).squeeze(1);
13481
13523
  *
13482
13524
  * // Visualize the prediction
13483
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
13525
+ * const min = prediction.min().item();
13526
+ * const max = prediction.max().item();
13527
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
13484
13528
  * const depth = RawImage.fromTensor(formatted);
13485
13529
  * // RawImage {
13486
13530
  * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
@@ -13530,11 +13574,7 @@ class GLPNPreTrainedModel extends PreTrainedModel { }
13530
13574
  class GLPNModel extends GLPNPreTrainedModel { }
13531
13575
 
13532
13576
  /**
13533
- * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
13534
- *
13535
- * **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
13536
- * ```javascript
13537
- * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
13577
+ * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
13538
13578
  *
13539
13579
  * // Load model and processor
13540
13580
  * const model_id = 'Xenova/glpn-kitti';
@@ -13543,7 +13583,7 @@ class GLPNModel extends GLPNPreTrainedModel { }
13543
13583
  *
13544
13584
  * // Load image from URL
13545
13585
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
13546
- * const image = await RawImage.fromURL(url);
13586
+ * const image = await RawImage.read(url);
13547
13587
  *
13548
13588
  * // Prepare image for the model
13549
13589
  * const inputs = await processor(image);
@@ -13552,13 +13592,18 @@ class GLPNModel extends GLPNPreTrainedModel { }
13552
13592
  * const { predicted_depth } = await model(inputs);
13553
13593
  *
13554
13594
  * // Interpolate to original size
13555
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
13595
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
13596
+ * size: image.size.reverse(),
13597
+ * mode: 'bilinear',
13598
+ * })).squeeze(1);
13556
13599
  *
13557
13600
  * // Visualize the prediction
13558
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
13601
+ * const min = prediction.min().item();
13602
+ * const max = prediction.max().item();
13603
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
13559
13604
  * const depth = RawImage.fromTensor(formatted);
13560
13605
  * // RawImage {
13561
- * // data: Uint8Array(307200) [ 207, 169, 154, ... ],
13606
+ * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
13562
13607
  * // width: 640,
13563
13608
  * // height: 480,
13564
13609
  * // channels: 1
@@ -14525,10 +14570,12 @@ class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
14525
14570
 
14526
14571
  const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);
14527
14572
 
14573
+ // @ts-expect-error TS2339
14528
14574
  const r = encoder_outputs.dims[1] / this.config.reduction_factor;
14529
14575
  const maxlen = Math.floor(r * maxlenratio);
14530
14576
  const minlen = Math.floor(r * minlenratio);
14531
14577
 
14578
+ // @ts-expect-error TS2339
14532
14579
  const num_mel_bins = this.config.num_mel_bins;
14533
14580
 
14534
14581
  let spectrogramParts = [];
@@ -14893,11 +14940,13 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
14893
14940
  */
14894
14941
  _apply_and_filter_by_delay_pattern_mask(outputs) {
14895
14942
  const [bs_x_codebooks, seqLength] = outputs.dims;
14943
+ // @ts-expect-error TS2339
14896
14944
  const num_codebooks = this.config.decoder.num_codebooks;
14897
14945
  const upperBound = (seqLength - num_codebooks);
14898
14946
 
14899
14947
  let newDataSize = 0;
14900
14948
  for (let i = 0; i < outputs.size; ++i) {
14949
+ // @ts-expect-error TS2339
14901
14950
  if (outputs.data[i] === this.config.decoder.pad_token_id) {
14902
14951
  continue;
14903
14952
  }
@@ -14927,7 +14976,9 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
14927
14976
  let clonedInputIds = structuredClone(input_ids);
14928
14977
  for (let i = 0; i < clonedInputIds.length; ++i) {
14929
14978
  for (let j = 0; j < clonedInputIds[i].length; ++j) {
14979
+ // @ts-expect-error TS2339
14930
14980
  if ((i % this.config.decoder.num_codebooks) >= j) {
14981
+ // @ts-expect-error TS2339
14931
14982
  clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
14932
14983
  }
14933
14984
  }
@@ -15084,6 +15135,9 @@ class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
15084
15135
  'past_key_values',
15085
15136
  ];
15086
15137
 
15138
+ /**
15139
+ * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
15140
+ */
15087
15141
  constructor(...args) {
15088
15142
  super(...args);
15089
15143
 
@@ -16052,10 +16106,17 @@ class SequenceClassifierOutput extends ModelOutput {
16052
16106
  /**
16053
16107
  * @param {Object} output The output of the model.
16054
16108
  * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
16109
+ * @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
16110
+ * Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
16055
16111
  */
16056
- constructor({ logits }) {
16112
+ constructor({ logits, ...attentions }) {
16057
16113
  super();
16058
16114
  this.logits = logits;
16115
+ const attentions_list = Object.values(attentions);
16116
+ if (attentions_list.length > 0) {
16117
+ // Only set attentions if they are not empty
16118
+ this.attentions = attentions_list;
16119
+ }
16059
16120
  }
16060
16121
  }
16061
16122
 
@@ -16313,22 +16374,6 @@ __webpack_require__.r(__webpack_exports__);
16313
16374
 
16314
16375
  class AutoFeatureExtractor {
16315
16376
 
16316
- /**
16317
- * Instantiate one of the feature extractor classes of the library from a pretrained model.
16318
- *
16319
- * The processor class to instantiate is selected based on the `feature_extractor_type` property of
16320
- * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
16321
- *
16322
- * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
16323
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
16324
- * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
16325
- * user or organization name, like `dbmdz/bert-base-german-cased`.
16326
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
16327
- * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
16328
- *
16329
- * @returns {Promise<AllFeatureExtractors.ImageProcessor>} A new instance of the Processor class.
16330
- */
16331
-
16332
16377
  /** @type {typeof FeatureExtractor.from_pretrained} */
16333
16378
  static async from_pretrained(pretrained_model_name_or_path, options={}) {
16334
16379
 
@@ -16457,22 +16502,6 @@ __webpack_require__.r(__webpack_exports__);
16457
16502
  */
16458
16503
  class AutoProcessor {
16459
16504
 
16460
- /**
16461
- * Instantiate one of the processor classes of the library from a pretrained model.
16462
- *
16463
- * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
16464
- * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
16465
- *
16466
- * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
16467
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
16468
- * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
16469
- * user or organization name, like `dbmdz/bert-base-german-cased`.
16470
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
16471
- * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
16472
- *
16473
- * @returns {Promise<Processor>} A new instance of the Processor class.
16474
- */
16475
-
16476
16505
  /** @type {typeof Processor.from_pretrained} */
16477
16506
  static async from_pretrained(pretrained_model_name_or_path, options={}) {
16478
16507
 
@@ -16796,6 +16825,7 @@ class ConvNextImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
16796
16825
  /**
16797
16826
  * Percentage of the image to crop. Only has an effect if this.size < 384.
16798
16827
  */
16828
+ // @ts-expect-error TS2339
16799
16829
  this.crop_pct = this.config.crop_pct ?? (224 / 256);
16800
16830
  }
16801
16831
 
@@ -17003,6 +17033,7 @@ __webpack_require__.r(__webpack_exports__);
17003
17033
  class EfficientNetImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor {
17004
17034
  constructor(config) {
17005
17035
  super(config);
17036
+ // @ts-expect-error TS2339
17006
17037
  this.include_top = this.config.include_top ?? true;
17007
17038
  if (this.include_top) {
17008
17039
  this.image_std = this.image_std.map(x => x * x);
@@ -17086,8 +17117,11 @@ class Florence2Processor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
17086
17117
  super(config, components);
17087
17118
 
17088
17119
  const {
17120
+ // @ts-expect-error TS2339
17089
17121
  tasks_answer_post_processing_type,
17122
+ // @ts-expect-error TS2339
17090
17123
  task_prompts_without_inputs,
17124
+ // @ts-expect-error TS2339
17091
17125
  task_prompts_with_input,
17092
17126
  } = this.image_processor.config;
17093
17127
 
@@ -17384,6 +17418,8 @@ class Idefics3ImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
17384
17418
 
17385
17419
  const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
17386
17420
  const end_offset = (i + 1) * pixel_attention_mask_stride;
17421
+
17422
+ // @ts-expect-error
17387
17423
  pixel_attention_mask_data.fill(false, start_offset, end_offset);
17388
17424
  }
17389
17425
  }
@@ -17793,6 +17829,7 @@ class VLMImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTE
17793
17829
  },
17794
17830
  ...config,
17795
17831
  });
17832
+ // @ts-expect-error TS2339
17796
17833
  this.constant_values = this.config.background_color.map(x => x * this.rescale_factor)
17797
17834
  }
17798
17835
 
@@ -18241,6 +18278,8 @@ class MgpstrProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE
18241
18278
  * - bpe_preds: The list of BPE decoded sentences.
18242
18279
  * - wp_preds: The list of wp decoded sentences.
18243
18280
  */
18281
+ // @ts-expect-error The type of this method is not compatible with the one
18282
+ // in the base class. It might be a good idea to fix this.
18244
18283
  batch_decode([char_logits, bpe_logits, wp_logits]) {
18245
18284
  const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
18246
18285
  const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
@@ -18634,6 +18673,7 @@ class PaliGemmaProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
18634
18673
  }
18635
18674
 
18636
18675
  const bos_token = this.tokenizer.bos_token;
18676
+ // @ts-expect-error TS2339
18637
18677
  const image_seq_length = this.image_processor.config.image_seq_length;
18638
18678
  let input_strings;
18639
18679
  if (text.some((t) => t.includes(IMAGE_TOKEN))) {
@@ -18886,7 +18926,7 @@ class Phi3VProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_
18886
18926
  *
18887
18927
  * @param {string|string[]} text
18888
18928
  * @param {RawImage|RawImage[]} images
18889
- * @param {...any} args
18929
+ * @param { { padding?: boolean, truncation?: boolean, num_crops?: number } | undefined } options
18890
18930
  * @returns {Promise<any>}
18891
18931
  */
18892
18932
  async _call(text, images = null, {
@@ -19073,6 +19113,7 @@ class PyAnnoteFeatureExtractor extends _base_feature_extraction_utils_js__WEBPAC
19073
19113
 
19074
19114
  let current_speaker = -1;
19075
19115
  for (let i = 0; i < scores.length; ++i) {
19116
+ /** @type {number[]} */
19076
19117
  const probabilities = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.softmax)(scores[i]);
19077
19118
  const [score, id] = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.max)(probabilities);
19078
19119
  const [start, end] = [i, i + 1];
@@ -19260,6 +19301,7 @@ class Qwen2VLProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODUL
19260
19301
  }
19261
19302
 
19262
19303
  if (image_grid_thw) {
19304
+ // @ts-expect-error TS2551
19263
19305
  let merge_length = this.image_processor.config.merge_size ** 2;
19264
19306
  let index = 0;
19265
19307
 
@@ -19751,8 +19793,8 @@ class SeamlessM4TFeatureExtractor extends _base_feature_extraction_utils_js__WEB
19751
19793
  'int64',
19752
19794
  new BigInt64Array(numPaddedFrames),
19753
19795
  [1, numPaddedFrames],
19754
- )
19755
- padded_attention_mask.data.fill(1n, 0, num_frames);
19796
+ );
19797
+ /** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
19756
19798
  }
19757
19799
  }
19758
19800
  }
@@ -20565,7 +20607,7 @@ class WhisperFeatureExtractor extends _base_feature_extraction_utils_js__WEBPACK
20565
20607
  )
20566
20608
 
20567
20609
  const data = features.data;
20568
- const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(data)[0];
20610
+ const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(/** @type {Float32Array} */(data))[0];
20569
20611
 
20570
20612
  for (let i = 0; i < data.length; ++i) {
20571
20613
  data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
@@ -20828,6 +20870,16 @@ class TensorOpRegistry {
20828
20870
  // executionProviders: ['webgpu'],
20829
20871
  };
20830
20872
 
20873
+ static get nearest_interpolate_4d() {
20874
+ if (!this._nearest_interpolate_4d) {
20875
+ this._nearest_interpolate_4d = wrap(
20876
+ [8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
20877
+ this.session_options,
20878
+ 'y',
20879
+ );
20880
+ }
20881
+ return this._nearest_interpolate_4d;
20882
+ }
20831
20883
  static get bilinear_interpolate_4d() {
20832
20884
  if (!this._bilinear_interpolate_4d) {
20833
20885
  this._bilinear_interpolate_4d = wrap(
@@ -21202,6 +21254,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
21202
21254
 
21203
21255
  // TODO: Use softmax tensor function
21204
21256
  const function_to_apply =
21257
+ // @ts-expect-error TS2339
21205
21258
  this.model.config.problem_type === 'multi_label_classification'
21206
21259
  ? batch => batch.sigmoid()
21207
21260
  : batch => new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor(
@@ -21210,6 +21263,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
21210
21263
  batch.dims,
21211
21264
  ); // single_label_classification (default)
21212
21265
 
21266
+ // @ts-expect-error TS2339
21213
21267
  const id2label = this.model.config.id2label;
21214
21268
 
21215
21269
  const toReturn = [];
@@ -21312,6 +21366,7 @@ class TokenClassificationPipeline extends (/** @type {new (options: TextPipeline
21312
21366
  const outputs = await this.model(model_inputs)
21313
21367
 
21314
21368
  const logits = outputs.logits;
21369
+ // @ts-expect-error TS2339
21315
21370
  const id2label = this.model.config.id2label;
21316
21371
 
21317
21372
  const toReturn = [];
@@ -21651,11 +21706,14 @@ class Text2TextGenerationPipeline extends (/** @type {new (options: TextPipeline
21651
21706
 
21652
21707
 
21653
21708
  // Add global prefix, if present
21709
+ // @ts-expect-error TS2339
21654
21710
  if (this.model.config.prefix) {
21711
+ // @ts-expect-error TS2339
21655
21712
  texts = texts.map(x => this.model.config.prefix + x)
21656
21713
  }
21657
21714
 
21658
21715
  // Handle task specific params:
21716
+ // @ts-expect-error TS2339
21659
21717
  const task_specific_params = this.model.config.task_specific_params
21660
21718
  if (task_specific_params && task_specific_params[this.task]) {
21661
21719
  // Add prefixes, if present
@@ -22394,6 +22452,7 @@ class AudioClassificationPipeline extends (/** @type {new (options: AudioPipelin
22394
22452
  const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
22395
22453
  const preparedAudios = await prepareAudios(audio, sampling_rate);
22396
22454
 
22455
+ // @ts-expect-error TS2339
22397
22456
  const id2label = this.model.config.id2label;
22398
22457
 
22399
22458
  const toReturn = [];
@@ -22704,6 +22763,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
22704
22763
  audio = [/** @type {AudioInput} */ (audio)];
22705
22764
  }
22706
22765
 
22766
+ // @ts-expect-error TS2339
22707
22767
  const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
22708
22768
  const hop_length = this.processor.feature_extractor.config.hop_length;
22709
22769
 
@@ -22769,7 +22829,9 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
22769
22829
 
22770
22830
  // TODO: Right now we only get top beam
22771
22831
  if (return_timestamps === 'word') {
22832
+ // @ts-expect-error TS2339
22772
22833
  chunk.tokens = data.sequences.tolist()[0];
22834
+ // @ts-expect-error TS2339
22773
22835
  chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
22774
22836
  (/** @type {number} */ x) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.round)(x, 2)
22775
22837
  );
@@ -22814,7 +22876,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
22814
22876
  const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
22815
22877
  const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
22816
22878
 
22817
- const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
22879
+ const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
22818
22880
  toReturn.push({ text });
22819
22881
  }
22820
22882
  return single ? toReturn[0] : toReturn;
@@ -22963,6 +23025,7 @@ class ImageClassificationPipeline extends (/** @type {new (options: ImagePipelin
22963
23025
  const { pixel_values } = await this.processor(preparedImages);
22964
23026
  const output = await this.model({ pixel_values });
22965
23027
 
23028
+ // @ts-expect-error TS2339
22966
23029
  const id2label = this.model.config.id2label;
22967
23030
 
22968
23031
  /** @type {ImageClassificationOutput[]} */
@@ -23077,6 +23140,7 @@ class ImageSegmentationPipeline extends (/** @type {new (options: ImagePipelineC
23077
23140
  }
23078
23141
  }
23079
23142
 
23143
+ // @ts-expect-error TS2339
23080
23144
  const id2label = this.model.config.id2label;
23081
23145
 
23082
23146
  /** @type {ImageSegmentationPipelineOutput[]} */
@@ -23303,6 +23367,7 @@ class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipelineCon
23303
23367
  const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);
23304
23368
 
23305
23369
  // Add labels
23370
+ // @ts-expect-error TS2339
23306
23371
  const id2label = this.model.config.id2label;
23307
23372
 
23308
23373
  // Format output
@@ -23522,6 +23587,7 @@ class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextIm
23522
23587
  // Run model
23523
23588
  const output = await this.model.generate({
23524
23589
  inputs: pixel_values,
23590
+ // @ts-expect-error TS2339
23525
23591
  max_length: this.model.config.decoder.max_position_embeddings,
23526
23592
  decoder_input_ids,
23527
23593
  ...generate_kwargs,
@@ -23637,6 +23703,7 @@ class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineC
23637
23703
  // Generate waveform
23638
23704
  const { waveform } = await this.model(inputs);
23639
23705
 
23706
+ // @ts-expect-error TS2339
23640
23707
  const sampling_rate = this.model.config.sampling_rate;
23641
23708
  return {
23642
23709
  audio: waveform.data,
@@ -23794,11 +23861,23 @@ class DepthEstimationPipeline extends (/** @type {new (options: ImagePipelineCon
23794
23861
 
23795
23862
  const toReturn = [];
23796
23863
  for (let i = 0; i < preparedImages.length; ++i) {
23797
- const prediction = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate)(predicted_depth[i], preparedImages[i].size.reverse(), 'bilinear', false);
23798
- const formatted = prediction.mul_(255 / (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.max)(prediction.data)[0]).to('uint8');
23864
+ const batch = predicted_depth[i];
23865
+ const [height, width] = batch.dims.slice(-2);
23866
+ const [new_width, new_height] = preparedImages[i].size;
23867
+
23868
+ // Interpolate to original size
23869
+ const prediction = (await (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate_4d)(batch.view(1, 1, height, width), {
23870
+ size: [new_height, new_width],
23871
+ mode: 'bilinear',
23872
+ })).view(new_height, new_width);
23873
+
23874
+ const minval = /** @type {number} */(prediction.min().item());
23875
+ const maxval = /** @type {number} */(prediction.max().item());
23876
+ const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
23877
+ const depth = _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted);
23799
23878
  toReturn.push({
23800
- predicted_depth: predicted_depth[i],
23801
- depth: _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted),
23879
+ predicted_depth: prediction,
23880
+ depth,
23802
23881
  });
23803
23882
  }
23804
23883
 
@@ -24278,6 +24357,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
24278
24357
  return result;
24279
24358
  }
24280
24359
 
24360
+
24281
24361
  /***/ }),
24282
24362
 
24283
24363
  /***/ "./src/tokenizers.js":
@@ -24347,7 +24427,6 @@ __webpack_require__.r(__webpack_exports__);
24347
24427
  /* harmony import */ var _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./utils/data-structures.js */ "./src/utils/data-structures.js");
24348
24428
  /* harmony import */ var _huggingface_jinja__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! @huggingface/jinja */ "./node_modules/@huggingface/jinja/dist/index.js");
24349
24429
  /* harmony import */ var _models_whisper_common_whisper_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./models/whisper/common_whisper.js */ "./src/models/whisper/common_whisper.js");
24350
- /* harmony import */ var _utils_constants_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./utils/constants.js */ "./src/utils/constants.js");
24351
24430
 
24352
24431
  /**
24353
24432
  * @file Tokenizers are used to prepare textual inputs for a model.
@@ -24384,7 +24463,6 @@ __webpack_require__.r(__webpack_exports__);
24384
24463
 
24385
24464
 
24386
24465
 
24387
-
24388
24466
  /**
24389
24467
  * @typedef {Object} TokenizerProperties Additional tokenizer-specific properties.
24390
24468
  * @property {boolean} [legacy=false] Whether or not the `legacy` behavior of the tokenizer should be used.
@@ -24868,7 +24946,7 @@ class Unigram extends TokenizerModel {
24868
24946
  * Create a new Unigram tokenizer model.
24869
24947
  * @param {Object} config The configuration object for the Unigram model.
24870
24948
  * @param {number} config.unk_id The ID of the unknown token
24871
- * @param {any[][]} config.vocab A 2D array representing a mapping of tokens to scores.
24949
+ * @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
24872
24950
  * @param {Object} moreConfig Additional configuration object for the Unigram model.
24873
24951
  */
24874
24952
  constructor(config, moreConfig) {
@@ -24876,11 +24954,10 @@ class Unigram extends TokenizerModel {
24876
24954
 
24877
24955
  const vocabSize = config.vocab.length;
24878
24956
  this.vocab = new Array(vocabSize);
24957
+ /** @type {number[]} */
24879
24958
  this.scores = new Array(vocabSize);
24880
24959
  for (let i = 0; i < vocabSize; ++i) {
24881
- const piece = config.vocab[i];
24882
- this.vocab[i] = piece[0];
24883
- this.scores[i] = piece[1];
24960
+ [this.vocab[i], this.scores[i]] = config.vocab[i];
24884
24961
  }
24885
24962
 
24886
24963
  this.unk_token_id = config.unk_id;
@@ -30243,6 +30320,8 @@ __webpack_require__.r(__webpack_exports__);
30243
30320
  /* harmony export */ });
30244
30321
  /* harmony import */ var _env_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../env.js */ "./src/env.js");
30245
30322
  /* harmony import */ var _devices_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./devices.js */ "./src/utils/devices.js");
30323
+ /// <reference types="@webgpu/types" />
30324
+
30246
30325
 
30247
30326
 
30248
30327
 
@@ -30498,7 +30577,7 @@ class FileResponse {
30498
30577
  */
30499
30578
  async arrayBuffer() {
30500
30579
  const data = await fs__WEBPACK_IMPORTED_MODULE_0__.promises.readFile(this.filePath);
30501
- return data.buffer;
30580
+ return /** @type {ArrayBuffer} */ (data.buffer);
30502
30581
  }
30503
30582
 
30504
30583
  /**
@@ -32159,8 +32238,9 @@ function magnitude(arr) {
32159
32238
 
32160
32239
  /**
32161
32240
  * Returns the value and index of the minimum element in an array.
32162
- * @param {number[]|TypedArray} arr array of numbers.
32163
- * @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
32241
+ * @template {number[]|bigint[]|AnyTypedArray} T
32242
+ * @param {T} arr array of numbers.
32243
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
32164
32244
  * @throws {Error} If array is empty.
32165
32245
  */
32166
32246
  function min(arr) {
@@ -32173,14 +32253,15 @@ function min(arr) {
32173
32253
  indexOfMin = i;
32174
32254
  }
32175
32255
  }
32176
- return [min, indexOfMin];
32256
+ return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([min, indexOfMin]);
32177
32257
  }
32178
32258
 
32179
32259
 
32180
32260
  /**
32181
32261
  * Returns the value and index of the maximum element in an array.
32182
- * @param {number[]|AnyTypedArray} arr array of numbers.
32183
- * @returns {[number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
32262
+ * @template {number[]|bigint[]|AnyTypedArray} T
32263
+ * @param {T} arr array of numbers.
32264
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
32184
32265
  * @throws {Error} If array is empty.
32185
32266
  */
32186
32267
  function max(arr) {
@@ -32193,7 +32274,7 @@ function max(arr) {
32193
32274
  indexOfMax = i;
32194
32275
  }
32195
32276
  }
32196
- return [Number(max), indexOfMax];
32277
+ return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([max, indexOfMax]);
32197
32278
  }
32198
32279
 
32199
32280
  function isPowerOfTwo(number) {
@@ -33491,8 +33572,6 @@ class Tensor {
33491
33572
  return this.permute(...dims);
33492
33573
  }
33493
33574
 
33494
- // TODO add .max() and .min() methods
33495
-
33496
33575
  /**
33497
33576
  * Returns the sum of each row of the input tensor in the given dimension dim.
33498
33577
  *
@@ -33786,6 +33865,36 @@ class Tensor {
33786
33865
  return mean(this, dim, keepdim);
33787
33866
  }
33788
33867
 
33868
+ min(dim = null, keepdim = false) {
33869
+ if (dim !== null) {
33870
+ throw new Error("`dim !== null` not yet implemented.");
33871
+ }
33872
+ const value = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[0];
33873
+ return new Tensor(this.type, [value], []);
33874
+ }
33875
+ max(dim = null, keepdim = false) {
33876
+ if (dim !== null) {
33877
+ throw new Error("`dim !== null` not yet implemented.");
33878
+ }
33879
+ const value = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[0];
33880
+ return new Tensor(this.type, [value], []);
33881
+ }
33882
+
33883
+ argmin(dim = null, keepdim = false) {
33884
+ if (dim !== null) {
33885
+ throw new Error("`dim !== null` not yet implemented.");
33886
+ }
33887
+ const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[1];
33888
+ return new Tensor('int64', [BigInt(index)], []);
33889
+ }
33890
+ argmax(dim = null, keepdim = false) {
33891
+ if (dim !== null) {
33892
+ throw new Error("`dim !== null` not yet implemented.");
33893
+ }
33894
+ const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[1];
33895
+ return new Tensor('int64', [BigInt(index)], []);
33896
+ }
33897
+
33789
33898
  /**
33790
33899
  * Performs Tensor dtype conversion.
33791
33900
  * @param {DataType} type The desired data type.
@@ -33919,7 +34028,7 @@ function interpolate(input, [out_height, out_width], mode = 'bilinear', align_co
33919
34028
  * @param {Tensor} input the input tensor
33920
34029
  * @param {Object} options the options for the interpolation
33921
34030
  * @param {[number, number]|[number, number, number]|[number, number, number, number]} [options.size=null] output spatial size.
33922
- * @param {"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
34031
+ * @param {"nearest"|"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
33923
34032
  * @returns {Promise<Tensor>} The interpolated tensor.
33924
34033
  */
33925
34034
  async function interpolate_4d(input, {
@@ -33949,7 +34058,9 @@ async function interpolate_4d(input, {
33949
34058
  }
33950
34059
 
33951
34060
  let op;
33952
- if (mode === 'bilinear') {
34061
+ if (mode === 'nearest') {
34062
+ op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.nearest_interpolate_4d;
34063
+ } else if (mode === 'bilinear') {
33953
34064
  op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bilinear_interpolate_4d;
33954
34065
  } else if (mode === 'bicubic') {
33955
34066
  op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bicubic_interpolate_4d;
@@ -33990,13 +34101,13 @@ async function rfft(x, a) {
33990
34101
  * Returns the k largest elements of the given input tensor.
33991
34102
  * Inspired by https://pytorch.org/docs/stable/generated/torch.topk.html
33992
34103
  * @param {Tensor} x the input tensor
33993
- * @param {number} k the k in "top-k"
34104
+ * @param {number} [k] the k in "top-k"
33994
34105
  * @returns {Promise<[Tensor, Tensor]>} the output tuple of (Tensor, LongTensor) of top-k elements and their indices.
33995
34106
  */
33996
34107
  async function topk(x, k) {
33997
34108
  const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.top_k;
33998
34109
 
33999
- if (k === null) {
34110
+ if (k == null) {
34000
34111
  k = x.dims.at(-1);
34001
34112
  } else {
34002
34113
  k = Math.min(k, x.dims.at(-1));
@@ -34025,10 +34136,10 @@ const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length])
34025
34136
  async function slice(data, starts, ends, axes, steps) {
34026
34137
  const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.slice;
34027
34138
  return await op({
34028
- x: data,
34029
- s: arrayToIndexTensor(starts),
34030
- e: arrayToIndexTensor(ends),
34031
- a: arrayToIndexTensor(axes),
34139
+ x: data,
34140
+ s: arrayToIndexTensor(starts),
34141
+ e: arrayToIndexTensor(ends),
34142
+ a: arrayToIndexTensor(axes),
34032
34143
  t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
34033
34144
  });
34034
34145
  }