@huggingface/transformers 3.2.3 → 3.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/README.md +2 -2
  2. package/dist/transformers.cjs +203 -92
  3. package/dist/transformers.cjs.map +1 -1
  4. package/dist/transformers.js +203 -92
  5. package/dist/transformers.js.map +1 -1
  6. package/dist/transformers.min.cjs +1 -1
  7. package/dist/transformers.min.cjs.map +1 -1
  8. package/dist/transformers.min.js +1 -1
  9. package/dist/transformers.min.js.map +1 -1
  10. package/dist/transformers.min.mjs +1 -1
  11. package/dist/transformers.min.mjs.map +1 -1
  12. package/dist/transformers.mjs +203 -92
  13. package/dist/transformers.mjs.map +1 -1
  14. package/package.json +2 -2
  15. package/src/base/feature_extraction_utils.js +9 -9
  16. package/src/base/image_processors_utils.js +11 -0
  17. package/src/base/processing_utils.js +13 -3
  18. package/src/configs.js +5 -0
  19. package/src/env.js +1 -1
  20. package/src/models/auto/feature_extraction_auto.js +0 -16
  21. package/src/models/auto/processing_auto.js +0 -16
  22. package/src/models/convnext/image_processing_convnext.js +1 -0
  23. package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
  24. package/src/models/florence2/processing_florence2.js +3 -0
  25. package/src/models/idefics3/image_processing_idefics3.js +2 -0
  26. package/src/models/janus/image_processing_janus.js +1 -0
  27. package/src/models/mgp_str/processing_mgp_str.js +2 -0
  28. package/src/models/paligemma/processing_paligemma.js +1 -0
  29. package/src/models/phi3_v/processing_phi3_v.js +1 -1
  30. package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
  31. package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
  32. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
  33. package/src/models/whisper/feature_extraction_whisper.js +1 -1
  34. package/src/models.js +50 -15
  35. package/src/ops/registry.js +10 -0
  36. package/src/pipelines.js +34 -7
  37. package/src/tokenizers.js +4 -7
  38. package/src/utils/dtypes.js +2 -0
  39. package/src/utils/hub.js +1 -1
  40. package/src/utils/maths.js +8 -6
  41. package/src/utils/tensor.js +42 -10
  42. package/types/base/feature_extraction_utils.d.ts +7 -7
  43. package/types/base/image_processors_utils.d.ts.map +1 -1
  44. package/types/base/processing_utils.d.ts +17 -19
  45. package/types/base/processing_utils.d.ts.map +1 -1
  46. package/types/configs.d.ts.map +1 -1
  47. package/types/generation/parameters.d.ts +1 -1
  48. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
  49. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  50. package/types/models/auto/processing_auto.d.ts.map +1 -1
  51. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
  52. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
  53. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  54. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
  55. package/types/models/janus/image_processing_janus.d.ts.map +1 -1
  56. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
  57. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
  58. package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
  59. package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
  60. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  61. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  62. package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
  63. package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
  64. package/types/models/whisper/generation_whisper.d.ts +1 -1
  65. package/types/models/whisper/generation_whisper.d.ts.map +1 -1
  66. package/types/models.d.ts +32 -17
  67. package/types/models.d.ts.map +1 -1
  68. package/types/ops/registry.d.ts +1 -0
  69. package/types/ops/registry.d.ts.map +1 -1
  70. package/types/pipelines.d.ts +2 -2
  71. package/types/pipelines.d.ts.map +1 -1
  72. package/types/tokenizers.d.ts.map +1 -1
  73. package/types/tsconfig.tsbuildinfo +1 -0
  74. package/types/utils/dtypes.d.ts.map +1 -1
  75. package/types/utils/hub.d.ts +1 -1
  76. package/types/utils/hub.d.ts.map +1 -1
  77. package/types/utils/image.d.ts +3 -2
  78. package/types/utils/image.d.ts.map +1 -1
  79. package/types/utils/maths.d.ts +8 -6
  80. package/types/utils/maths.d.ts.map +1 -1
  81. package/types/utils/tensor.d.ts +8 -4
  82. package/types/utils/tensor.d.ts.map +1 -1
@@ -6927,23 +6927,23 @@ class FeatureExtractor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Ca
6927
6927
  }
6928
6928
 
6929
6929
  /**
6930
- * Instantiate one of the processor classes of the library from a pretrained model.
6930
+ * Instantiate one of the feature extractor classes of the library from a pretrained model.
6931
6931
  *
6932
- * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
6933
- * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
6932
+ * The feature extractor class to instantiate is selected based on the `feature_extractor_type` property of
6933
+ * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
6934
6934
  *
6935
6935
  * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
6936
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
6936
+ * - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on huggingface.co.
6937
6937
  * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
6938
6938
  * user or organization name, like `dbmdz/bert-base-german-cased`.
6939
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
6940
- * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
6939
+ * - A path to a *directory* containing feature_extractor files, e.g., `./my_model_directory/`.
6940
+ * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the feature_extractor.
6941
6941
  *
6942
- * @returns {Promise<FeatureExtractor>} A new instance of the Processor class.
6942
+ * @returns {Promise<FeatureExtractor>} A new instance of the Feature Extractor class.
6943
6943
  */
6944
6944
  static async from_pretrained(pretrained_model_name_or_path, options) {
6945
- const preprocessorConfig = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
6946
- return new this(preprocessorConfig);
6945
+ const config = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
6946
+ return new this(config);
6947
6947
  }
6948
6948
  }
6949
6949
 
@@ -7593,14 +7593,20 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
7593
7593
  this.do_thumbnail = config.do_thumbnail;
7594
7594
  this.size = config.size ?? config.image_size;
7595
7595
  this.do_resize = config.do_resize ?? (this.size !== undefined);
7596
+ // @ts-expect-error TS2339
7596
7597
  this.size_divisibility = config.size_divisibility ?? config.size_divisor;
7597
7598
 
7598
7599
  this.do_center_crop = config.do_center_crop;
7600
+ // @ts-expect-error TS2339
7599
7601
  this.crop_size = config.crop_size;
7602
+ // @ts-expect-error TS2339
7600
7603
  this.do_convert_rgb = config.do_convert_rgb ?? true;
7604
+ // @ts-expect-error TS2339
7601
7605
  this.do_crop_margin = config.do_crop_margin;
7602
7606
 
7607
+ // @ts-expect-error TS2339
7603
7608
  this.pad_size = config.pad_size;
7609
+ // @ts-expect-error TS2339
7604
7610
  this.do_pad = config.do_pad;
7605
7611
 
7606
7612
  if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
@@ -7809,6 +7815,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
7809
7815
  // Support both formats for backwards compatibility
7810
7816
  else if (Number.isInteger(size)) {
7811
7817
  shortest_edge = size;
7818
+ // @ts-expect-error TS2339
7812
7819
  longest_edge = this.config.max_size ?? shortest_edge;
7813
7820
 
7814
7821
  } else if (size !== undefined) {
@@ -7877,6 +7884,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
7877
7884
  } else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
7878
7885
  // Custom resize logic for Qwen2-VL models
7879
7886
  const { min_pixels, max_pixels } = size;
7887
+ // @ts-expect-error TS2339
7880
7888
  const factor = this.config.patch_size * this.config.merge_size;
7881
7889
  return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
7882
7890
  } else {
@@ -7892,6 +7900,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
7892
7900
  async resize(image) {
7893
7901
  const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
7894
7902
  return await image.resize(newWidth, newHeight, {
7903
+ // @ts-expect-error TS2322
7895
7904
  resample: this.resample,
7896
7905
  });
7897
7906
  }
@@ -7942,6 +7951,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
7942
7951
 
7943
7952
  // Resize the image using thumbnail method.
7944
7953
  if (this.do_thumbnail) {
7954
+ // @ts-expect-error TS2345
7945
7955
  image = await this.thumbnail(image, this.size, this.resample);
7946
7956
  }
7947
7957
 
@@ -7966,6 +7976,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
7966
7976
  // NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
7967
7977
  // occurs with data in the hwc format (height, width, channels),
7968
7978
  // to emulate the behavior of the original Python code (w/ numpy).
7979
+ /** @type {Float32Array} */
7969
7980
  let pixelData = Float32Array.from(image.data);
7970
7981
  let imgDims = [image.height, image.width, image.channels];
7971
7982
 
@@ -8123,6 +8134,7 @@ __webpack_require__.r(__webpack_exports__);
8123
8134
  /**
8124
8135
  * @typedef {Object} ProcessorProperties Additional processor-specific properties.
8125
8136
  * @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
8137
+ * @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer
8126
8138
  */
8127
8139
 
8128
8140
 
@@ -8156,7 +8168,7 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
8156
8168
  }
8157
8169
 
8158
8170
  /**
8159
- * @returns {import('../tokenizers.js').PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
8171
+ * @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
8160
8172
  */
8161
8173
  get tokenizer() {
8162
8174
  return this.components.tokenizer;
@@ -8169,6 +8181,11 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
8169
8181
  return this.components.feature_extractor;
8170
8182
  }
8171
8183
 
8184
+ /**
8185
+ * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages
8186
+ * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options
8187
+ * @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>}
8188
+ */
8172
8189
  apply_chat_template(messages, options = {}) {
8173
8190
  if (!this.tokenizer) {
8174
8191
  throw new Error('Unable to apply chat template without a tokenizer.');
@@ -8179,6 +8196,10 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
8179
8196
  });
8180
8197
  }
8181
8198
 
8199
+ /**
8200
+ * @param {Parameters<PreTrainedTokenizer['batch_decode']>} args
8201
+ * @returns {ReturnType<PreTrainedTokenizer['batch_decode']>}
8202
+ */
8182
8203
  batch_decode(...args) {
8183
8204
  if (!this.tokenizer) {
8184
8205
  throw new Error('Unable to decode without a tokenizer.');
@@ -8206,8 +8227,8 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
8206
8227
  /**
8207
8228
  * Instantiate one of the processor classes of the library from a pretrained model.
8208
8229
  *
8209
- * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object
8210
- * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
8230
+ * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
8231
+ * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
8211
8232
  *
8212
8233
  * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
8213
8234
  * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
@@ -8326,15 +8347,19 @@ function getNormalizedConfig(config) {
8326
8347
  case 'florence2':
8327
8348
  case 'llava_onevision':
8328
8349
  case 'idefics3':
8350
+ // @ts-expect-error TS2339
8329
8351
  init_normalized_config = getNormalizedConfig(config.text_config);
8330
8352
  break;
8331
8353
  case 'moondream1':
8354
+ // @ts-expect-error TS2339
8332
8355
  init_normalized_config = getNormalizedConfig(config.phi_config);
8333
8356
  break;
8334
8357
  case 'musicgen':
8358
+ // @ts-expect-error TS2339
8335
8359
  init_normalized_config = getNormalizedConfig(config.decoder);
8336
8360
  break;
8337
8361
  case 'multi_modality':
8362
+ // @ts-expect-error TS2339
8338
8363
  init_normalized_config = getNormalizedConfig(config.language_config);
8339
8364
  break;
8340
8365
 
@@ -8455,6 +8480,7 @@ function getNormalizedConfig(config) {
8455
8480
  break;
8456
8481
 
8457
8482
  case 'vision-encoder-decoder':
8483
+ // @ts-expect-error TS2339
8458
8484
  const decoderConfig = getNormalizedConfig(config.decoder);
8459
8485
 
8460
8486
  const add_encoder_pkv = 'num_decoder_layers' in decoderConfig;
@@ -8697,7 +8723,7 @@ __webpack_require__.r(__webpack_exports__);
8697
8723
 
8698
8724
 
8699
8725
 
8700
- const VERSION = '3.2.3';
8726
+ const VERSION = '3.2.4';
8701
8727
 
8702
8728
  // Check if various APIs are available (depends on environment)
8703
8729
  const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
@@ -11353,8 +11379,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
11353
11379
  } else if (session_options.externalData !== undefined) {
11354
11380
  externalDataPromises = session_options.externalData.map(async (ext) => {
11355
11381
  // if the external data is a string, fetch the file and replace the string with its content
11382
+ // @ts-expect-error TS2339
11356
11383
  if (typeof ext.data === "string") {
11384
+ // @ts-expect-error TS2339
11357
11385
  const ext_buffer = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_5__.getModelFile)(pretrained_model_name_or_path, ext.data, true, options);
11386
+ // @ts-expect-error TS2698
11358
11387
  return { ...ext, data: ext_buffer };
11359
11388
  }
11360
11389
  return ext;
@@ -12602,6 +12631,7 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
12602
12631
  if (this.config.model_type === 'musicgen') {
12603
12632
  // Custom logic (TODO: move to Musicgen class)
12604
12633
  decoder_input_ids = Array.from({
12634
+ // @ts-expect-error TS2339
12605
12635
  length: batch_size * this.config.decoder.num_codebooks
12606
12636
  }, () => [decoder_start_token_id]);
12607
12637
 
@@ -12931,11 +12961,13 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
12931
12961
  async encode_image({ pixel_values }) {
12932
12962
  // image_inputs === { pixel_values }
12933
12963
  const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
12964
+ // @ts-expect-error TS2339
12934
12965
  if (!this.config.num_image_tokens) {
12935
12966
  console.warn(
12936
12967
  'The number of image tokens was not set in the model configuration. ' +
12937
12968
  `Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
12938
12969
  )
12970
+ // @ts-expect-error TS2339
12939
12971
  this.config.num_image_tokens = features.dims[1];
12940
12972
  }
12941
12973
  return features;
@@ -14363,6 +14395,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
14363
14395
 
14364
14396
  if (generation_config.return_token_timestamps) {
14365
14397
  outputs["token_timestamps"] = this._extract_token_timestamps(
14398
+ // @ts-expect-error TS2345
14366
14399
  outputs,
14367
14400
  generation_config.alignment_heads,
14368
14401
  generation_config.num_frames,
@@ -14398,6 +14431,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
14398
14431
  );
14399
14432
  }
14400
14433
 
14434
+ // @ts-expect-error TS2339
14401
14435
  let median_filter_width = this.config.median_filter_width;
14402
14436
  if (median_filter_width === undefined) {
14403
14437
  console.warn("Model config has no `median_filter_width`, using default value of 7.")
@@ -14408,6 +14442,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
14408
14442
  const batch = generate_outputs.cross_attentions;
14409
14443
  // Create a list with `decoder_layers` elements, each a tensor of shape
14410
14444
  // (batch size, attention_heads, output length, input length).
14445
+ // @ts-expect-error TS2339
14411
14446
  const cross_attentions = Array.from({ length: this.config.decoder_layers },
14412
14447
  // Concatenate the cross attentions for each layer across sequence length dimension.
14413
14448
  (_, i) => (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.cat)(batch.map(x => x[i]), 2)
@@ -14551,6 +14586,7 @@ class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
14551
14586
  attention_mask,
14552
14587
  }) {
14553
14588
 
14589
+ // @ts-expect-error TS2339
14554
14590
  const image_token_index = this.config.image_token_index;
14555
14591
 
14556
14592
  const idsList = input_ids.tolist();
@@ -15536,6 +15572,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
15536
15572
  const image_nums = vision_tokens.filter(x => x == image_token_id).length;
15537
15573
  const video_nums = vision_tokens.filter(x => x == video_token_id).length;
15538
15574
 
15575
+ /** @type {number[][]} */
15539
15576
  let llm_pos_ids_list = [];
15540
15577
  let st = 0;
15541
15578
  let remain_images = image_nums;
@@ -15605,6 +15642,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
15605
15642
  // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
15606
15643
  // meaning to perform concatenation along dim=1, we can do the following:
15607
15644
  const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
15645
+ /** @type {number[]} */
15608
15646
  const llm_positions = new Array(num_items);
15609
15647
  let index = 0;
15610
15648
  for (let x = 0; x < 3; ++x) {
@@ -15645,9 +15683,10 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
15645
15683
  { length: 3 * data.length },
15646
15684
  (_, i) => data[i % data.length]
15647
15685
  );
15686
+ /** @type {bigint[]} */
15648
15687
  const mrope_position_deltas = Array.from(
15649
15688
  { length: dims[0] },
15650
- (_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1 + dims[1]
15689
+ (_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
15651
15690
  );
15652
15691
 
15653
15692
  return [
@@ -16218,7 +16257,7 @@ class DPTModel extends DPTPreTrainedModel { }
16218
16257
  *
16219
16258
  * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
16220
16259
  * ```javascript
16221
- * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
16260
+ * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
16222
16261
  *
16223
16262
  * // Load model and processor
16224
16263
  * const model_id = 'Xenova/dpt-hybrid-midas';
@@ -16227,7 +16266,7 @@ class DPTModel extends DPTPreTrainedModel { }
16227
16266
  *
16228
16267
  * // Load image from URL
16229
16268
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
16230
- * const image = await RawImage.fromURL(url);
16269
+ * const image = await RawImage.read(url);
16231
16270
  *
16232
16271
  * // Prepare image for the model
16233
16272
  * const inputs = await processor(image);
@@ -16236,10 +16275,15 @@ class DPTModel extends DPTPreTrainedModel { }
16236
16275
  * const { predicted_depth } = await model(inputs);
16237
16276
  *
16238
16277
  * // Interpolate to original size
16239
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
16278
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
16279
+ * size: image.size.reverse(),
16280
+ * mode: 'bilinear',
16281
+ * })).squeeze(1);
16240
16282
  *
16241
16283
  * // Visualize the prediction
16242
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
16284
+ * const min = prediction.min().item();
16285
+ * const max = prediction.max().item();
16286
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
16243
16287
  * const depth = RawImage.fromTensor(formatted);
16244
16288
  * // RawImage {
16245
16289
  * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
@@ -16289,11 +16333,7 @@ class GLPNPreTrainedModel extends PreTrainedModel { }
16289
16333
  class GLPNModel extends GLPNPreTrainedModel { }
16290
16334
 
16291
16335
  /**
16292
- * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
16293
- *
16294
- * **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
16295
- * ```javascript
16296
- * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
16336
+ * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
16297
16337
  *
16298
16338
  * // Load model and processor
16299
16339
  * const model_id = 'Xenova/glpn-kitti';
@@ -16302,7 +16342,7 @@ class GLPNModel extends GLPNPreTrainedModel { }
16302
16342
  *
16303
16343
  * // Load image from URL
16304
16344
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
16305
- * const image = await RawImage.fromURL(url);
16345
+ * const image = await RawImage.read(url);
16306
16346
  *
16307
16347
  * // Prepare image for the model
16308
16348
  * const inputs = await processor(image);
@@ -16311,13 +16351,18 @@ class GLPNModel extends GLPNPreTrainedModel { }
16311
16351
  * const { predicted_depth } = await model(inputs);
16312
16352
  *
16313
16353
  * // Interpolate to original size
16314
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
16354
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
16355
+ * size: image.size.reverse(),
16356
+ * mode: 'bilinear',
16357
+ * })).squeeze(1);
16315
16358
  *
16316
16359
  * // Visualize the prediction
16317
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
16360
+ * const min = prediction.min().item();
16361
+ * const max = prediction.max().item();
16362
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
16318
16363
  * const depth = RawImage.fromTensor(formatted);
16319
16364
  * // RawImage {
16320
- * // data: Uint8Array(307200) [ 207, 169, 154, ... ],
16365
+ * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
16321
16366
  * // width: 640,
16322
16367
  * // height: 480,
16323
16368
  * // channels: 1
@@ -17284,10 +17329,12 @@ class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
17284
17329
 
17285
17330
  const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);
17286
17331
 
17332
+ // @ts-expect-error TS2339
17287
17333
  const r = encoder_outputs.dims[1] / this.config.reduction_factor;
17288
17334
  const maxlen = Math.floor(r * maxlenratio);
17289
17335
  const minlen = Math.floor(r * minlenratio);
17290
17336
 
17337
+ // @ts-expect-error TS2339
17291
17338
  const num_mel_bins = this.config.num_mel_bins;
17292
17339
 
17293
17340
  let spectrogramParts = [];
@@ -17652,11 +17699,13 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
17652
17699
  */
17653
17700
  _apply_and_filter_by_delay_pattern_mask(outputs) {
17654
17701
  const [bs_x_codebooks, seqLength] = outputs.dims;
17702
+ // @ts-expect-error TS2339
17655
17703
  const num_codebooks = this.config.decoder.num_codebooks;
17656
17704
  const upperBound = (seqLength - num_codebooks);
17657
17705
 
17658
17706
  let newDataSize = 0;
17659
17707
  for (let i = 0; i < outputs.size; ++i) {
17708
+ // @ts-expect-error TS2339
17660
17709
  if (outputs.data[i] === this.config.decoder.pad_token_id) {
17661
17710
  continue;
17662
17711
  }
@@ -17686,7 +17735,9 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
17686
17735
  let clonedInputIds = structuredClone(input_ids);
17687
17736
  for (let i = 0; i < clonedInputIds.length; ++i) {
17688
17737
  for (let j = 0; j < clonedInputIds[i].length; ++j) {
17738
+ // @ts-expect-error TS2339
17689
17739
  if ((i % this.config.decoder.num_codebooks) >= j) {
17740
+ // @ts-expect-error TS2339
17690
17741
  clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
17691
17742
  }
17692
17743
  }
@@ -17843,6 +17894,9 @@ class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
17843
17894
  'past_key_values',
17844
17895
  ];
17845
17896
 
17897
+ /**
17898
+ * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
17899
+ */
17846
17900
  constructor(...args) {
17847
17901
  super(...args);
17848
17902
 
@@ -18811,10 +18865,17 @@ class SequenceClassifierOutput extends ModelOutput {
18811
18865
  /**
18812
18866
  * @param {Object} output The output of the model.
18813
18867
  * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
18868
+ * @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
18869
+ * Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
18814
18870
  */
18815
- constructor({ logits }) {
18871
+ constructor({ logits, ...attentions }) {
18816
18872
  super();
18817
18873
  this.logits = logits;
18874
+ const attentions_list = Object.values(attentions);
18875
+ if (attentions_list.length > 0) {
18876
+ // Only set attentions if they are not empty
18877
+ this.attentions = attentions_list;
18878
+ }
18818
18879
  }
18819
18880
  }
18820
18881
 
@@ -19070,22 +19131,6 @@ __webpack_require__.r(__webpack_exports__);
19070
19131
 
19071
19132
  class AutoFeatureExtractor {
19072
19133
 
19073
- /**
19074
- * Instantiate one of the feature extractor classes of the library from a pretrained model.
19075
- *
19076
- * The processor class to instantiate is selected based on the `feature_extractor_type` property of
19077
- * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
19078
- *
19079
- * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
19080
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
19081
- * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
19082
- * user or organization name, like `dbmdz/bert-base-german-cased`.
19083
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
19084
- * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
19085
- *
19086
- * @returns {Promise<AllFeatureExtractors.ImageProcessor>} A new instance of the Processor class.
19087
- */
19088
-
19089
19134
  /** @type {typeof FeatureExtractor.from_pretrained} */
19090
19135
  static async from_pretrained(pretrained_model_name_or_path, options={}) {
19091
19136
 
@@ -19212,22 +19257,6 @@ __webpack_require__.r(__webpack_exports__);
19212
19257
  */
19213
19258
  class AutoProcessor {
19214
19259
 
19215
- /**
19216
- * Instantiate one of the processor classes of the library from a pretrained model.
19217
- *
19218
- * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
19219
- * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
19220
- *
19221
- * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
19222
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
19223
- * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
19224
- * user or organization name, like `dbmdz/bert-base-german-cased`.
19225
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
19226
- * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
19227
- *
19228
- * @returns {Promise<Processor>} A new instance of the Processor class.
19229
- */
19230
-
19231
19260
  /** @type {typeof Processor.from_pretrained} */
19232
19261
  static async from_pretrained(pretrained_model_name_or_path, options={}) {
19233
19262
 
@@ -19545,6 +19574,7 @@ class ConvNextImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
19545
19574
  /**
19546
19575
  * Percentage of the image to crop. Only has an effect if this.size < 384.
19547
19576
  */
19577
+ // @ts-expect-error TS2339
19548
19578
  this.crop_pct = this.config.crop_pct ?? (224 / 256);
19549
19579
  }
19550
19580
 
@@ -19747,6 +19777,7 @@ __webpack_require__.r(__webpack_exports__);
19747
19777
  class EfficientNetImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor {
19748
19778
  constructor(config) {
19749
19779
  super(config);
19780
+ // @ts-expect-error TS2339
19750
19781
  this.include_top = this.config.include_top ?? true;
19751
19782
  if (this.include_top) {
19752
19783
  this.image_std = this.image_std.map(x => x * x);
@@ -19828,8 +19859,11 @@ class Florence2Processor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
19828
19859
  super(config, components);
19829
19860
 
19830
19861
  const {
19862
+ // @ts-expect-error TS2339
19831
19863
  tasks_answer_post_processing_type,
19864
+ // @ts-expect-error TS2339
19832
19865
  task_prompts_without_inputs,
19866
+ // @ts-expect-error TS2339
19833
19867
  task_prompts_with_input,
19834
19868
  } = this.image_processor.config;
19835
19869
 
@@ -20124,6 +20158,8 @@ class Idefics3ImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
20124
20158
 
20125
20159
  const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
20126
20160
  const end_offset = (i + 1) * pixel_attention_mask_stride;
20161
+
20162
+ // @ts-expect-error
20127
20163
  pixel_attention_mask_data.fill(false, start_offset, end_offset);
20128
20164
  }
20129
20165
  }
@@ -20530,6 +20566,7 @@ class VLMImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTE
20530
20566
  },
20531
20567
  ...config,
20532
20568
  });
20569
+ // @ts-expect-error TS2339
20533
20570
  this.constant_values = this.config.background_color.map(x => x * this.rescale_factor)
20534
20571
  }
20535
20572
 
@@ -20971,6 +21008,8 @@ class MgpstrProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE
20971
21008
  * - bpe_preds: The list of BPE decoded sentences.
20972
21009
  * - wp_preds: The list of wp decoded sentences.
20973
21010
  */
21011
+ // @ts-expect-error The type of this method is not compatible with the one
21012
+ // in the base class. It might be a good idea to fix this.
20974
21013
  batch_decode([char_logits, bpe_logits, wp_logits]) {
20975
21014
  const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
20976
21015
  const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
@@ -21352,6 +21391,7 @@ class PaliGemmaProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
21352
21391
  }
21353
21392
 
21354
21393
  const bos_token = this.tokenizer.bos_token;
21394
+ // @ts-expect-error TS2339
21355
21395
  const image_seq_length = this.image_processor.config.image_seq_length;
21356
21396
  let input_strings;
21357
21397
  if (text.some((t) => t.includes(IMAGE_TOKEN))) {
@@ -21602,7 +21642,7 @@ class Phi3VProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_
21602
21642
  *
21603
21643
  * @param {string|string[]} text
21604
21644
  * @param {RawImage|RawImage[]} images
21605
- * @param {...any} args
21645
+ * @param { { padding?: boolean, truncation?: boolean, num_crops?: number } | undefined } options
21606
21646
  * @returns {Promise<any>}
21607
21647
  */
21608
21648
  async _call(text, images = null, {
@@ -21786,6 +21826,7 @@ class PyAnnoteFeatureExtractor extends _base_feature_extraction_utils_js__WEBPAC
21786
21826
 
21787
21827
  let current_speaker = -1;
21788
21828
  for (let i = 0; i < scores.length; ++i) {
21829
+ /** @type {number[]} */
21789
21830
  const probabilities = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.softmax)(scores[i]);
21790
21831
  const [score, id] = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.max)(probabilities);
21791
21832
  const [start, end] = [i, i + 1];
@@ -21970,6 +22011,7 @@ class Qwen2VLProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODUL
21970
22011
  }
21971
22012
 
21972
22013
  if (image_grid_thw) {
22014
+ // @ts-expect-error TS2551
21973
22015
  let merge_length = this.image_processor.config.merge_size ** 2;
21974
22016
  let index = 0;
21975
22017
 
@@ -22457,8 +22499,8 @@ class SeamlessM4TFeatureExtractor extends _base_feature_extraction_utils_js__WEB
22457
22499
  'int64',
22458
22500
  new BigInt64Array(numPaddedFrames),
22459
22501
  [1, numPaddedFrames],
22460
- )
22461
- padded_attention_mask.data.fill(1n, 0, num_frames);
22502
+ );
22503
+ /** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
22462
22504
  }
22463
22505
  }
22464
22506
  }
@@ -23258,7 +23300,7 @@ class WhisperFeatureExtractor extends _base_feature_extraction_utils_js__WEBPACK
23258
23300
  )
23259
23301
 
23260
23302
  const data = features.data;
23261
- const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(data)[0];
23303
+ const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(/** @type {Float32Array} */(data))[0];
23262
23304
 
23263
23305
  for (let i = 0; i < data.length; ++i) {
23264
23306
  data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
@@ -23517,6 +23559,16 @@ class TensorOpRegistry {
23517
23559
  // executionProviders: ['webgpu'],
23518
23560
  };
23519
23561
 
23562
+ static get nearest_interpolate_4d() {
23563
+ if (!this._nearest_interpolate_4d) {
23564
+ this._nearest_interpolate_4d = wrap(
23565
+ [8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
23566
+ this.session_options,
23567
+ 'y',
23568
+ );
23569
+ }
23570
+ return this._nearest_interpolate_4d;
23571
+ }
23520
23572
  static get bilinear_interpolate_4d() {
23521
23573
  if (!this._bilinear_interpolate_4d) {
23522
23574
  this._bilinear_interpolate_4d = wrap(
@@ -23890,6 +23942,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
23890
23942
 
23891
23943
  // TODO: Use softmax tensor function
23892
23944
  const function_to_apply =
23945
+ // @ts-expect-error TS2339
23893
23946
  this.model.config.problem_type === 'multi_label_classification'
23894
23947
  ? batch => batch.sigmoid()
23895
23948
  : batch => new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor(
@@ -23898,6 +23951,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
23898
23951
  batch.dims,
23899
23952
  ); // single_label_classification (default)
23900
23953
 
23954
+ // @ts-expect-error TS2339
23901
23955
  const id2label = this.model.config.id2label;
23902
23956
 
23903
23957
  const toReturn = [];
@@ -24000,6 +24054,7 @@ class TokenClassificationPipeline extends (/** @type {new (options: TextPipeline
24000
24054
  const outputs = await this.model(model_inputs)
24001
24055
 
24002
24056
  const logits = outputs.logits;
24057
+ // @ts-expect-error TS2339
24003
24058
  const id2label = this.model.config.id2label;
24004
24059
 
24005
24060
  const toReturn = [];
@@ -24339,11 +24394,14 @@ class Text2TextGenerationPipeline extends (/** @type {new (options: TextPipeline
24339
24394
 
24340
24395
 
24341
24396
  // Add global prefix, if present
24397
+ // @ts-expect-error TS2339
24342
24398
  if (this.model.config.prefix) {
24399
+ // @ts-expect-error TS2339
24343
24400
  texts = texts.map(x => this.model.config.prefix + x)
24344
24401
  }
24345
24402
 
24346
24403
  // Handle task specific params:
24404
+ // @ts-expect-error TS2339
24347
24405
  const task_specific_params = this.model.config.task_specific_params
24348
24406
  if (task_specific_params && task_specific_params[this.task]) {
24349
24407
  // Add prefixes, if present
@@ -25082,6 +25140,7 @@ class AudioClassificationPipeline extends (/** @type {new (options: AudioPipelin
25082
25140
  const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
25083
25141
  const preparedAudios = await prepareAudios(audio, sampling_rate);
25084
25142
 
25143
+ // @ts-expect-error TS2339
25085
25144
  const id2label = this.model.config.id2label;
25086
25145
 
25087
25146
  const toReturn = [];
@@ -25392,6 +25451,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
25392
25451
  audio = [/** @type {AudioInput} */ (audio)];
25393
25452
  }
25394
25453
 
25454
+ // @ts-expect-error TS2339
25395
25455
  const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
25396
25456
  const hop_length = this.processor.feature_extractor.config.hop_length;
25397
25457
 
@@ -25457,7 +25517,9 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
25457
25517
 
25458
25518
  // TODO: Right now we only get top beam
25459
25519
  if (return_timestamps === 'word') {
25520
+ // @ts-expect-error TS2339
25460
25521
  chunk.tokens = data.sequences.tolist()[0];
25522
+ // @ts-expect-error TS2339
25461
25523
  chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
25462
25524
  (/** @type {number} */ x) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.round)(x, 2)
25463
25525
  );
@@ -25502,7 +25564,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
25502
25564
  const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
25503
25565
  const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
25504
25566
 
25505
- const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
25567
+ const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
25506
25568
  toReturn.push({ text });
25507
25569
  }
25508
25570
  return single ? toReturn[0] : toReturn;
@@ -25651,6 +25713,7 @@ class ImageClassificationPipeline extends (/** @type {new (options: ImagePipelin
25651
25713
  const { pixel_values } = await this.processor(preparedImages);
25652
25714
  const output = await this.model({ pixel_values });
25653
25715
 
25716
+ // @ts-expect-error TS2339
25654
25717
  const id2label = this.model.config.id2label;
25655
25718
 
25656
25719
  /** @type {ImageClassificationOutput[]} */
@@ -25765,6 +25828,7 @@ class ImageSegmentationPipeline extends (/** @type {new (options: ImagePipelineC
25765
25828
  }
25766
25829
  }
25767
25830
 
25831
+ // @ts-expect-error TS2339
25768
25832
  const id2label = this.model.config.id2label;
25769
25833
 
25770
25834
  /** @type {ImageSegmentationPipelineOutput[]} */
@@ -25991,6 +26055,7 @@ class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipelineCon
25991
26055
  const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);
25992
26056
 
25993
26057
  // Add labels
26058
+ // @ts-expect-error TS2339
25994
26059
  const id2label = this.model.config.id2label;
25995
26060
 
25996
26061
  // Format output
@@ -26210,6 +26275,7 @@ class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextIm
26210
26275
  // Run model
26211
26276
  const output = await this.model.generate({
26212
26277
  inputs: pixel_values,
26278
+ // @ts-expect-error TS2339
26213
26279
  max_length: this.model.config.decoder.max_position_embeddings,
26214
26280
  decoder_input_ids,
26215
26281
  ...generate_kwargs,
@@ -26325,6 +26391,7 @@ class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineC
26325
26391
  // Generate waveform
26326
26392
  const { waveform } = await this.model(inputs);
26327
26393
 
26394
+ // @ts-expect-error TS2339
26328
26395
  const sampling_rate = this.model.config.sampling_rate;
26329
26396
  return {
26330
26397
  audio: waveform.data,
@@ -26482,11 +26549,23 @@ class DepthEstimationPipeline extends (/** @type {new (options: ImagePipelineCon
26482
26549
 
26483
26550
  const toReturn = [];
26484
26551
  for (let i = 0; i < preparedImages.length; ++i) {
26485
- const prediction = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate)(predicted_depth[i], preparedImages[i].size.reverse(), 'bilinear', false);
26486
- const formatted = prediction.mul_(255 / (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.max)(prediction.data)[0]).to('uint8');
26552
+ const batch = predicted_depth[i];
26553
+ const [height, width] = batch.dims.slice(-2);
26554
+ const [new_width, new_height] = preparedImages[i].size;
26555
+
26556
+ // Interpolate to original size
26557
+ const prediction = (await (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate_4d)(batch.view(1, 1, height, width), {
26558
+ size: [new_height, new_width],
26559
+ mode: 'bilinear',
26560
+ })).view(new_height, new_width);
26561
+
26562
+ const minval = /** @type {number} */(prediction.min().item());
26563
+ const maxval = /** @type {number} */(prediction.max().item());
26564
+ const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
26565
+ const depth = _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted);
26487
26566
  toReturn.push({
26488
- predicted_depth: predicted_depth[i],
26489
- depth: _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted),
26567
+ predicted_depth: prediction,
26568
+ depth,
26490
26569
  });
26491
26570
  }
26492
26571
 
@@ -26966,6 +27045,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
26966
27045
  return result;
26967
27046
  }
26968
27047
 
27048
+
26969
27049
  /***/ }),
26970
27050
 
26971
27051
  /***/ "./src/tokenizers.js":
@@ -27034,7 +27114,6 @@ __webpack_require__.r(__webpack_exports__);
27034
27114
  /* harmony import */ var _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./utils/data-structures.js */ "./src/utils/data-structures.js");
27035
27115
  /* harmony import */ var _huggingface_jinja__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! @huggingface/jinja */ "./node_modules/@huggingface/jinja/dist/index.js");
27036
27116
  /* harmony import */ var _models_whisper_common_whisper_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./models/whisper/common_whisper.js */ "./src/models/whisper/common_whisper.js");
27037
- /* harmony import */ var _utils_constants_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./utils/constants.js */ "./src/utils/constants.js");
27038
27117
 
27039
27118
  /**
27040
27119
  * @file Tokenizers are used to prepare textual inputs for a model.
@@ -27071,7 +27150,6 @@ __webpack_require__.r(__webpack_exports__);
27071
27150
 
27072
27151
 
27073
27152
 
27074
-
27075
27153
  /**
27076
27154
  * @typedef {Object} TokenizerProperties Additional tokenizer-specific properties.
27077
27155
  * @property {boolean} [legacy=false] Whether or not the `legacy` behavior of the tokenizer should be used.
@@ -27555,7 +27633,7 @@ class Unigram extends TokenizerModel {
27555
27633
  * Create a new Unigram tokenizer model.
27556
27634
  * @param {Object} config The configuration object for the Unigram model.
27557
27635
  * @param {number} config.unk_id The ID of the unknown token
27558
- * @param {any[][]} config.vocab A 2D array representing a mapping of tokens to scores.
27636
+ * @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
27559
27637
  * @param {Object} moreConfig Additional configuration object for the Unigram model.
27560
27638
  */
27561
27639
  constructor(config, moreConfig) {
@@ -27563,11 +27641,10 @@ class Unigram extends TokenizerModel {
27563
27641
 
27564
27642
  const vocabSize = config.vocab.length;
27565
27643
  this.vocab = new Array(vocabSize);
27644
+ /** @type {number[]} */
27566
27645
  this.scores = new Array(vocabSize);
27567
27646
  for (let i = 0; i < vocabSize; ++i) {
27568
- const piece = config.vocab[i];
27569
- this.vocab[i] = piece[0];
27570
- this.scores[i] = piece[1];
27647
+ [this.vocab[i], this.scores[i]] = config.vocab[i];
27571
27648
  }
27572
27649
 
27573
27650
  this.unk_token_id = config.unk_id;
@@ -32924,6 +33001,8 @@ __webpack_require__.r(__webpack_exports__);
32924
33001
  /* harmony export */ });
32925
33002
  /* harmony import */ var _env_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../env.js */ "./src/env.js");
32926
33003
  /* harmony import */ var _devices_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./devices.js */ "./src/utils/devices.js");
33004
+ /// <reference types="@webgpu/types" />
33005
+
32927
33006
 
32928
33007
 
32929
33008
 
@@ -33177,7 +33256,7 @@ class FileResponse {
33177
33256
  */
33178
33257
  async arrayBuffer() {
33179
33258
  const data = await fs__WEBPACK_IMPORTED_MODULE_0__.promises.readFile(this.filePath);
33180
- return data.buffer;
33259
+ return /** @type {ArrayBuffer} */ (data.buffer);
33181
33260
  }
33182
33261
 
33183
33262
  /**
@@ -34836,8 +34915,9 @@ function magnitude(arr) {
34836
34915
 
34837
34916
  /**
34838
34917
  * Returns the value and index of the minimum element in an array.
34839
- * @param {number[]|TypedArray} arr array of numbers.
34840
- * @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
34918
+ * @template {number[]|bigint[]|AnyTypedArray} T
34919
+ * @param {T} arr array of numbers.
34920
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
34841
34921
  * @throws {Error} If array is empty.
34842
34922
  */
34843
34923
  function min(arr) {
@@ -34850,14 +34930,15 @@ function min(arr) {
34850
34930
  indexOfMin = i;
34851
34931
  }
34852
34932
  }
34853
- return [min, indexOfMin];
34933
+ return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([min, indexOfMin]);
34854
34934
  }
34855
34935
 
34856
34936
 
34857
34937
  /**
34858
34938
  * Returns the value and index of the maximum element in an array.
34859
- * @param {number[]|AnyTypedArray} arr array of numbers.
34860
- * @returns {[number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
34939
+ * @template {number[]|bigint[]|AnyTypedArray} T
34940
+ * @param {T} arr array of numbers.
34941
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
34861
34942
  * @throws {Error} If array is empty.
34862
34943
  */
34863
34944
  function max(arr) {
@@ -34870,7 +34951,7 @@ function max(arr) {
34870
34951
  indexOfMax = i;
34871
34952
  }
34872
34953
  }
34873
- return [Number(max), indexOfMax];
34954
+ return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([max, indexOfMax]);
34874
34955
  }
34875
34956
 
34876
34957
  function isPowerOfTwo(number) {
@@ -36167,8 +36248,6 @@ class Tensor {
36167
36248
  return this.permute(...dims);
36168
36249
  }
36169
36250
 
36170
- // TODO add .max() and .min() methods
36171
-
36172
36251
  /**
36173
36252
  * Returns the sum of each row of the input tensor in the given dimension dim.
36174
36253
  *
@@ -36462,6 +36541,36 @@ class Tensor {
36462
36541
  return mean(this, dim, keepdim);
36463
36542
  }
36464
36543
 
36544
+ min(dim = null, keepdim = false) {
36545
+ if (dim !== null) {
36546
+ throw new Error("`dim !== null` not yet implemented.");
36547
+ }
36548
+ const value = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[0];
36549
+ return new Tensor(this.type, [value], []);
36550
+ }
36551
+ max(dim = null, keepdim = false) {
36552
+ if (dim !== null) {
36553
+ throw new Error("`dim !== null` not yet implemented.");
36554
+ }
36555
+ const value = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[0];
36556
+ return new Tensor(this.type, [value], []);
36557
+ }
36558
+
36559
+ argmin(dim = null, keepdim = false) {
36560
+ if (dim !== null) {
36561
+ throw new Error("`dim !== null` not yet implemented.");
36562
+ }
36563
+ const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[1];
36564
+ return new Tensor('int64', [BigInt(index)], []);
36565
+ }
36566
+ argmax(dim = null, keepdim = false) {
36567
+ if (dim !== null) {
36568
+ throw new Error("`dim !== null` not yet implemented.");
36569
+ }
36570
+ const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[1];
36571
+ return new Tensor('int64', [BigInt(index)], []);
36572
+ }
36573
+
36465
36574
  /**
36466
36575
  * Performs Tensor dtype conversion.
36467
36576
  * @param {DataType} type The desired data type.
@@ -36595,7 +36704,7 @@ function interpolate(input, [out_height, out_width], mode = 'bilinear', align_co
36595
36704
  * @param {Tensor} input the input tensor
36596
36705
  * @param {Object} options the options for the interpolation
36597
36706
  * @param {[number, number]|[number, number, number]|[number, number, number, number]} [options.size=null] output spatial size.
36598
- * @param {"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
36707
+ * @param {"nearest"|"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
36599
36708
  * @returns {Promise<Tensor>} The interpolated tensor.
36600
36709
  */
36601
36710
  async function interpolate_4d(input, {
@@ -36625,7 +36734,9 @@ async function interpolate_4d(input, {
36625
36734
  }
36626
36735
 
36627
36736
  let op;
36628
- if (mode === 'bilinear') {
36737
+ if (mode === 'nearest') {
36738
+ op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.nearest_interpolate_4d;
36739
+ } else if (mode === 'bilinear') {
36629
36740
  op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bilinear_interpolate_4d;
36630
36741
  } else if (mode === 'bicubic') {
36631
36742
  op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bicubic_interpolate_4d;
@@ -36666,13 +36777,13 @@ async function rfft(x, a) {
36666
36777
  * Returns the k largest elements of the given input tensor.
36667
36778
  * Inspired by https://pytorch.org/docs/stable/generated/torch.topk.html
36668
36779
  * @param {Tensor} x the input tensor
36669
- * @param {number} k the k in "top-k"
36780
+ * @param {number} [k] the k in "top-k"
36670
36781
  * @returns {Promise<[Tensor, Tensor]>} the output tuple of (Tensor, LongTensor) of top-k elements and their indices.
36671
36782
  */
36672
36783
  async function topk(x, k) {
36673
36784
  const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.top_k;
36674
36785
 
36675
- if (k === null) {
36786
+ if (k == null) {
36676
36787
  k = x.dims.at(-1);
36677
36788
  } else {
36678
36789
  k = Math.min(k, x.dims.at(-1));
@@ -36701,10 +36812,10 @@ const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length])
36701
36812
  async function slice(data, starts, ends, axes, steps) {
36702
36813
  const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.slice;
36703
36814
  return await op({
36704
- x: data,
36705
- s: arrayToIndexTensor(starts),
36706
- e: arrayToIndexTensor(ends),
36707
- a: arrayToIndexTensor(axes),
36815
+ x: data,
36816
+ s: arrayToIndexTensor(starts),
36817
+ e: arrayToIndexTensor(ends),
36818
+ a: arrayToIndexTensor(axes),
36708
36819
  t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
36709
36820
  });
36710
36821
  }