@huggingface/transformers 3.2.2 → 3.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/README.md +3 -2
  2. package/dist/transformers.cjs +252 -113
  3. package/dist/transformers.cjs.map +1 -1
  4. package/dist/transformers.js +256 -114
  5. package/dist/transformers.js.map +1 -1
  6. package/dist/transformers.min.cjs +1 -1
  7. package/dist/transformers.min.cjs.map +1 -1
  8. package/dist/transformers.min.js +1 -1
  9. package/dist/transformers.min.js.map +1 -1
  10. package/dist/transformers.min.mjs +1 -1
  11. package/dist/transformers.min.mjs.map +1 -1
  12. package/dist/transformers.mjs +256 -114
  13. package/dist/transformers.mjs.map +1 -1
  14. package/package.json +2 -2
  15. package/src/base/feature_extraction_utils.js +9 -9
  16. package/src/base/image_processors_utils.js +11 -0
  17. package/src/base/processing_utils.js +13 -3
  18. package/src/configs.js +5 -0
  19. package/src/env.js +1 -1
  20. package/src/models/auto/feature_extraction_auto.js +0 -16
  21. package/src/models/auto/processing_auto.js +0 -16
  22. package/src/models/convnext/image_processing_convnext.js +1 -0
  23. package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
  24. package/src/models/florence2/processing_florence2.js +3 -0
  25. package/src/models/idefics3/image_processing_idefics3.js +2 -0
  26. package/src/models/janus/image_processing_janus.js +1 -0
  27. package/src/models/mgp_str/processing_mgp_str.js +2 -0
  28. package/src/models/paligemma/processing_paligemma.js +1 -0
  29. package/src/models/phi3_v/processing_phi3_v.js +1 -1
  30. package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
  31. package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
  32. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
  33. package/src/models/whisper/feature_extraction_whisper.js +1 -1
  34. package/src/models.js +93 -36
  35. package/src/ops/registry.js +10 -0
  36. package/src/pipelines.js +34 -7
  37. package/src/tokenizers.js +4 -7
  38. package/src/utils/dtypes.js +2 -0
  39. package/src/utils/hub.js +1 -1
  40. package/src/utils/maths.js +8 -6
  41. package/src/utils/tensor.js +42 -10
  42. package/types/base/feature_extraction_utils.d.ts +7 -7
  43. package/types/base/image_processors_utils.d.ts.map +1 -1
  44. package/types/base/processing_utils.d.ts +17 -19
  45. package/types/base/processing_utils.d.ts.map +1 -1
  46. package/types/configs.d.ts.map +1 -1
  47. package/types/generation/parameters.d.ts +1 -1
  48. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
  49. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  50. package/types/models/auto/processing_auto.d.ts.map +1 -1
  51. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
  52. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
  53. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  54. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
  55. package/types/models/janus/image_processing_janus.d.ts.map +1 -1
  56. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
  57. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
  58. package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
  59. package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
  60. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  61. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  62. package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
  63. package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
  64. package/types/models/whisper/generation_whisper.d.ts +1 -1
  65. package/types/models/whisper/generation_whisper.d.ts.map +1 -1
  66. package/types/models.d.ts +48 -17
  67. package/types/models.d.ts.map +1 -1
  68. package/types/ops/registry.d.ts +1 -0
  69. package/types/ops/registry.d.ts.map +1 -1
  70. package/types/pipelines.d.ts +2 -2
  71. package/types/pipelines.d.ts.map +1 -1
  72. package/types/tokenizers.d.ts.map +1 -1
  73. package/types/tsconfig.tsbuildinfo +1 -0
  74. package/types/utils/dtypes.d.ts.map +1 -1
  75. package/types/utils/hub.d.ts +1 -1
  76. package/types/utils/hub.d.ts.map +1 -1
  77. package/types/utils/image.d.ts +3 -2
  78. package/types/utils/image.d.ts.map +1 -1
  79. package/types/utils/maths.d.ts +8 -6
  80. package/types/utils/maths.d.ts.map +1 -1
  81. package/types/utils/tensor.d.ts +8 -4
  82. package/types/utils/tensor.d.ts.map +1 -1
package/src/models.js CHANGED
@@ -270,8 +270,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
  } else if (session_options.externalData !== undefined) {
  externalDataPromises = session_options.externalData.map(async (ext) => {
  // if the external data is a string, fetch the file and replace the string with its content
+ // @ts-expect-error TS2339
  if (typeof ext.data === "string") {
+ // @ts-expect-error TS2339
  const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options);
+ // @ts-expect-error TS2698
  return { ...ext, data: ext_buffer };
  }
  return ext;
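Note: this release suppresses known config-typing gaps with `@ts-expect-error` rather than widening the types. A minimal sketch of how the directive behaves under `// @ts-check` (the `ExternalDataEntry` shape below is illustrative, not taken from the package):

```javascript
// @ts-check

/** @typedef {{ path: string }} ExternalDataEntry */

/** @param {ExternalDataEntry} ext */
function describe(ext) {
    // `data` is not declared on ExternalDataEntry, so the checker flags the
    // next line with TS2339. The directive suppresses exactly that line, and
    // itself becomes an error ("unused directive") if the line turns clean.
    // @ts-expect-error TS2339
    return typeof ext.data === 'string' ? 'external file' : 'inline buffer';
}

console.log(describe({ path: 'model.onnx_data' })); // 'inline buffer'
```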
@@ -1519,6 +1522,7 @@ export class PreTrainedModel extends Callable {
  if (this.config.model_type === 'musicgen') {
  // Custom logic (TODO: move to Musicgen class)
  decoder_input_ids = Array.from({
+ // @ts-expect-error TS2339
  length: batch_size * this.config.decoder.num_codebooks
  }, () => [decoder_start_token_id]);

@@ -1848,11 +1852,13 @@ export class PreTrainedModel extends Callable {
  async encode_image({ pixel_values }) {
  // image_inputs === { pixel_values }
  const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
+ // @ts-expect-error TS2339
  if (!this.config.num_image_tokens) {
  console.warn(
  'The number of image tokens was not set in the model configuration. ' +
  `Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
  )
+ // @ts-expect-error TS2339
  this.config.num_image_tokens = features.dims[1];
  }
  return features;
@@ -3280,6 +3286,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {

  if (generation_config.return_token_timestamps) {
  outputs["token_timestamps"] = this._extract_token_timestamps(
+ // @ts-expect-error TS2345
  outputs,
  generation_config.alignment_heads,
  generation_config.num_frames,
@@ -3315,6 +3322,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
  );
  }

+ // @ts-expect-error TS2339
  let median_filter_width = this.config.median_filter_width;
  if (median_filter_width === undefined) {
  console.warn("Model config has no `median_filter_width`, using default value of 7.")
@@ -3325,6 +3333,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
  const batch = generate_outputs.cross_attentions;
  // Create a list with `decoder_layers` elements, each a tensor of shape
  // (batch size, attention_heads, output length, input length).
+ // @ts-expect-error TS2339
  const cross_attentions = Array.from({ length: this.config.decoder_layers },
  // Concatenate the cross attentions for each layer across sequence length dimension.
  (_, i) => cat(batch.map(x => x[i]), 2)
@@ -3421,7 +3430,7 @@ export class MoonshinePreTrainedModel extends PreTrainedModel {
  */
  export class MoonshineModel extends MoonshinePreTrainedModel { }

- export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel { }
+ export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel { }
  //////////////////////////////////////////////////


@@ -3468,6 +3477,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
  attention_mask,
  }) {

+ // @ts-expect-error TS2339
  const image_token_index = this.config.image_token_index;

  const idsList = input_ids.tolist();
@@ -3821,9 +3831,9 @@ export class CLIPTextModel extends CLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'text_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'text_model',
  });
  }
  }
@@ -3858,9 +3868,9 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'text_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'text_model',
  });
  }
  }
@@ -3872,9 +3882,9 @@ export class CLIPVisionModel extends CLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'vision_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'vision_model',
  });
  }
  }
@@ -3909,9 +3919,9 @@ export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'vision_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'vision_model',
  });
  }
  }
@@ -3997,9 +4007,9 @@ export class SiglipTextModel extends SiglipPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'text_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'text_model',
  });
  }
  }
@@ -4034,9 +4044,9 @@ export class SiglipVisionModel extends CLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'vision_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'vision_model',
  });
  }
  }
@@ -4093,9 +4103,9 @@ export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'text_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'text_model',
  });
  }
  }
@@ -4104,9 +4114,9 @@ export class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'vision_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'vision_model',
  });
  }
  }
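Note: the reordering in the `from_pretrained` overrides above fixes a subtle default-option bug. With the default listed before `...options`, a caller passing an explicit `model_file_name: undefined` clobbers the default; spreading first and applying a `??` fallback keeps the default in that case while still honoring real overrides. A minimal sketch (names are illustrative):

```javascript
// Old pattern: the spread can overwrite the default with `undefined`.
const oldMerge = (options = {}) => ({ model_file_name: 'text_model', ...options });

// New pattern: the default wins whenever the caller's value is null/undefined.
const newMerge = (options = {}) => ({
    ...options,
    model_file_name: options.model_file_name ?? 'text_model',
});

console.log(oldMerge({ model_file_name: undefined }).model_file_name); // undefined (bug)
console.log(newMerge({ model_file_name: undefined }).model_file_name); // 'text_model'
console.log(newMerge({ model_file_name: 'custom' }).model_file_name);  // 'custom'
```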
@@ -4453,6 +4463,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
  const image_nums = vision_tokens.filter(x => x == image_token_id).length;
  const video_nums = vision_tokens.filter(x => x == video_token_id).length;

+ /** @type {number[][]} */
  let llm_pos_ids_list = [];
  let st = 0;
  let remain_images = image_nums;
@@ -4522,6 +4533,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
  // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
  // meaning to perform concatenation along dim=1, we can do the following:
  const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
+ /** @type {number[]} */
  const llm_positions = new Array(num_items);
  let index = 0;
  for (let x = 0; x < 3; ++x) {
@@ -4562,9 +4574,10 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
  { length: 3 * data.length },
  (_, i) => data[i % data.length]
  );
+ /** @type {bigint[]} */
  const mrope_position_deltas = Array.from(
  { length: dims[0] },
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1 + dims[1]
+ (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
  );

  return [
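Note: the `1n + BigInt(dims[1])` change matters because the position data here holds bigints (see the new `/** @type {bigint[]} */` annotation), so `max(...)[0]` yields a `bigint`, and JavaScript throws rather than coercing when `bigint` and `number` meet in arithmetic. A standalone illustration:

```javascript
const data = new BigInt64Array([3n, 7n, 5n]);
const maxVal = data.reduce((a, b) => (b > a ? b : a)); // 7n (a bigint)

try {
    // Mixing bigint and number in `+` is a TypeError, not a silent coercion.
    const bad = maxVal + 1;
} catch (e) {
    console.log(e instanceof TypeError); // true
}

// Keep the whole expression in bigint arithmetic instead:
const good = maxVal + 1n + BigInt(4);
console.log(good); // 12n
```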
@@ -5135,7 +5148,7 @@ export class DPTModel extends DPTPreTrainedModel { }
  *
  * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
  * ```javascript
- * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
+ * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
  *
  * // Load model and processor
  * const model_id = 'Xenova/dpt-hybrid-midas';
@@ -5144,7 +5157,7 @@ export class DPTModel extends DPTPreTrainedModel { }
  *
  * // Load image from URL
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
- * const image = await RawImage.fromURL(url);
+ * const image = await RawImage.read(url);
  *
  * // Prepare image for the model
  * const inputs = await processor(image);
@@ -5153,10 +5166,15 @@ export class DPTModel extends DPTPreTrainedModel { }
  * const { predicted_depth } = await model(inputs);
  *
  * // Interpolate to original size
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
+ * size: image.size.reverse(),
+ * mode: 'bilinear',
+ * })).squeeze(1);
  *
  * // Visualize the prediction
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+ * const min = prediction.min().item();
+ * const max = prediction.max().item();
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
  * const depth = RawImage.fromTensor(formatted);
  * // RawImage {
  * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
@@ -5206,11 +5224,7 @@ export class GLPNPreTrainedModel extends PreTrainedModel { }
  export class GLPNModel extends GLPNPreTrainedModel { }

  /**
- * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
- *
- * **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
- * ```javascript
- * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
+ * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
  *
  * // Load model and processor
  * const model_id = 'Xenova/glpn-kitti';
@@ -5219,7 +5233,7 @@ export class GLPNModel extends GLPNPreTrainedModel { }
  *
  * // Load image from URL
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
- * const image = await RawImage.fromURL(url);
+ * const image = await RawImage.read(url);
  *
  * // Prepare image for the model
  * const inputs = await processor(image);
@@ -5228,13 +5242,18 @@ export class GLPNModel extends GLPNPreTrainedModel { }
  * const { predicted_depth } = await model(inputs);
  *
  * // Interpolate to original size
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
+ * size: image.size.reverse(),
+ * mode: 'bilinear',
+ * })).squeeze(1);
  *
  * // Visualize the prediction
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+ * const min = prediction.min().item();
+ * const max = prediction.max().item();
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
  * const depth = RawImage.fromTensor(formatted);
  * // RawImage {
- * // data: Uint8Array(307200) [ 207, 169, 154, ... ],
+ * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
  * // width: 640,
  * // height: 480,
  * // channels: 1
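Note: the doc examples above now route through `interpolate_4d`, which operates on rank-4 (batch, channel, height, width) tensors, so a rank-3 depth map must gain and then shed a dimension. A minimal sketch of that shape bookkeeping against the public API (the tiny tensor values are made up for illustration):

```javascript
import { Tensor, interpolate_4d } from '@huggingface/transformers';

// A fake (1, 2, 2) "predicted depth": one image, 2x2 spatial grid.
const predicted_depth = new Tensor('float32', [0, 1, 2, 3], [1, 2, 2]);

// interpolate_4d expects (batch, channels, height, width): insert a
// channel axis, resize spatially, then remove the axis again.
const resized = (await interpolate_4d(predicted_depth.unsqueeze(1), {
    size: [4, 4],    // target [height, width], as in the examples above
    mode: 'bilinear',
})).squeeze(1);

console.log(resized.dims); // [1, 4, 4]
```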
@@ -5389,6 +5408,26 @@ export class Dinov2ForImageClassification extends Dinov2PreTrainedModel {
  }
  //////////////////////////////////////////////////

+ //////////////////////////////////////////////////
+ export class Dinov2WithRegistersPreTrainedModel extends PreTrainedModel { }
+
+ /**
+ * The bare Dinov2WithRegisters Model transformer outputting raw hidden-states without any specific head on top.
+ */
+ export class Dinov2WithRegistersModel extends Dinov2WithRegistersPreTrainedModel { }
+
+ /**
+ * Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet.
+ */
+ export class Dinov2WithRegistersForImageClassification extends Dinov2WithRegistersPreTrainedModel {
+ /**
+ * @param {any} model_inputs
+ */
+ async _call(model_inputs) {
+ return new SequenceClassifierOutput(await super._call(model_inputs));
+ }
+ }
+ //////////////////////////////////////////////////

  //////////////////////////////////////////////////
  export class YolosPreTrainedModel extends PreTrainedModel { }
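Note: with the new classes wired into the auto mappings (see the `dinov2_with_registers` entries further down), such checkpoints become loadable through the usual pipeline path. A hedged sketch — the model id below is a placeholder, not a checkpoint named anywhere in this diff:

```javascript
import { pipeline } from '@huggingface/transformers';

// Hypothetical ONNX checkpoint id; substitute a real dinov2_with_registers
// conversion from the Hugging Face Hub.
const classifier = await pipeline(
    'image-classification',
    'your-org/dinov2-with-registers-imagenet', // placeholder
);

const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
console.log(await classifier(url)); // e.g. [{ label: '...', score: 0.9 }]
```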
@@ -6181,10 +6220,12 @@ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {

  const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);

+ // @ts-expect-error TS2339
  const r = encoder_outputs.dims[1] / this.config.reduction_factor;
  const maxlen = Math.floor(r * maxlenratio);
  const minlen = Math.floor(r * minlenratio);

+ // @ts-expect-error TS2339
  const num_mel_bins = this.config.num_mel_bins;

  let spectrogramParts = [];
@@ -6338,9 +6379,9 @@ export class ClapTextModelWithProjection extends ClapPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'text_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'text_model',
  });
  }
  }
@@ -6375,9 +6416,9 @@ export class ClapAudioModelWithProjection extends ClapPreTrainedModel {
  /** @type {typeof PreTrainedModel.from_pretrained} */
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
  return super.from_pretrained(pretrained_model_name_or_path, {
- // Update default model file name if not provided
- model_file_name: 'audio_model',
  ...options,
+ // Update default model file name if not provided
+ model_file_name: options.model_file_name ?? 'audio_model',
  });
  }
  }
@@ -6549,11 +6590,13 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
  */
  _apply_and_filter_by_delay_pattern_mask(outputs) {
  const [bs_x_codebooks, seqLength] = outputs.dims;
+ // @ts-expect-error TS2339
  const num_codebooks = this.config.decoder.num_codebooks;
  const upperBound = (seqLength - num_codebooks);

  let newDataSize = 0;
  for (let i = 0; i < outputs.size; ++i) {
+ // @ts-expect-error TS2339
  if (outputs.data[i] === this.config.decoder.pad_token_id) {
  continue;
  }
@@ -6583,7 +6626,9 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
  let clonedInputIds = structuredClone(input_ids);
  for (let i = 0; i < clonedInputIds.length; ++i) {
  for (let j = 0; j < clonedInputIds[i].length; ++j) {
+ // @ts-expect-error TS2339
  if ((i % this.config.decoder.num_codebooks) >= j) {
+ // @ts-expect-error TS2339
  clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
  }
  }
@@ -6740,6 +6785,9 @@ export class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
  'past_key_values',
  ];

+ /**
+ * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
+ */
  constructor(...args) {
  super(...args);

@@ -7018,6 +7066,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
  ['convnext', ['ConvNextModel', ConvNextModel]],
  ['convnextv2', ['ConvNextV2Model', ConvNextV2Model]],
  ['dinov2', ['Dinov2Model', Dinov2Model]],
+ ['dinov2_with_registers', ['Dinov2WithRegistersModel', Dinov2WithRegistersModel]],
  ['resnet', ['ResNetModel', ResNetModel]],
  ['swin', ['SwinModel', SwinModel]],
  ['swin2sr', ['Swin2SRModel', Swin2SRModel]],
@@ -7263,6 +7312,7 @@ const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
  ['convnext', ['ConvNextForImageClassification', ConvNextForImageClassification]],
  ['convnextv2', ['ConvNextV2ForImageClassification', ConvNextV2ForImageClassification]],
  ['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]],
+ ['dinov2_with_registers', ['Dinov2WithRegistersForImageClassification', Dinov2WithRegistersForImageClassification]],
  ['resnet', ['ResNetForImageClassification', ResNetForImageClassification]],
  ['swin', ['SwinForImageClassification', SwinForImageClassification]],
  ['segformer', ['SegformerForImageClassification', SegformerForImageClassification]],
@@ -7706,10 +7756,17 @@ export class SequenceClassifierOutput extends ModelOutput {
  /**
  * @param {Object} output The output of the model.
  * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
+ * @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+ * Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
  */
- constructor({ logits }) {
+ constructor({ logits, ...attentions }) {
  super();
  this.logits = logits;
+ const attentions_list = Object.values(attentions);
+ if (attentions_list.length > 0) {
+ // Only set attentions if they are not empty
+ this.attentions = attentions_list;
+ }
  }
  }

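Note: the constructor change above uses rest destructuring to sweep every non-`logits` session output into one object, so per-layer attention tensors are collected without hard-coding their exported names. The mechanics in isolation (plain objects stand in for Tensor instances, and the key names are illustrative):

```javascript
const modelOutput = {
    logits: { dims: [1, 2] },
    attention_0: { dims: [1, 12, 8, 8] },
    attention_1: { dims: [1, 12, 8, 8] },
};

// `logits` is pulled out; everything else lands in `attentions`.
const { logits, ...attentions } = modelOutput;
const attentions_list = Object.values(attentions);

console.log(Object.keys(attentions)); // ['attention_0', 'attention_1']
console.log(attentions_list.length);  // 2 -> this.attentions gets set
```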
package/src/ops/registry.js CHANGED
@@ -36,6 +36,16 @@ export class TensorOpRegistry {
  // executionProviders: ['webgpu'],
  };

+ static get nearest_interpolate_4d() {
+ if (!this._nearest_interpolate_4d) {
+ this._nearest_interpolate_4d = wrap(
+ [8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
+ this.session_options,
+ 'y',
+ );
+ }
+ return this._nearest_interpolate_4d;
+ }
  static get bilinear_interpolate_4d() {
  if (!this._bilinear_interpolate_4d) {
  this._bilinear_interpolate_4d = wrap(
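Note: the new `nearest_interpolate_4d` getter follows the registry's existing lazy-init pattern — the byte array is a pre-serialized single-op ONNX graph (the bytes spell out `Resize` and `mode: "nearest"`), and the getter builds the session once, caching it for later reads. The caching pattern in isolation (the session builder is simulated):

```javascript
let builds = 0;
function buildSession() {
    ++builds;
    return { run: (inputs) => inputs }; // stand-in for an ONNX session
}

class LazyOpRegistry {
    static get expensive_op() {
        if (!this._expensive_op) {
            // Runs once; subsequent property reads reuse the cached session.
            this._expensive_op = buildSession();
        }
        return this._expensive_op;
    }
}

LazyOpRegistry.expensive_op;
LazyOpRegistry.expensive_op;
console.log(builds); // 1
```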
package/src/pipelines.js CHANGED
@@ -69,7 +69,7 @@ import {
  import {
  Tensor,
  mean_pooling,
- interpolate,
+ interpolate_4d,
  quantize_embeddings,
  topk,
  } from './utils/tensor.js';
@@ -294,6 +294,7 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi

  // TODO: Use softmax tensor function
  const function_to_apply =
+ // @ts-expect-error TS2339
  this.model.config.problem_type === 'multi_label_classification'
  ? batch => batch.sigmoid()
  : batch => new Tensor(
@@ -302,6 +303,7 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi
  batch.dims,
  ); // single_label_classification (default)

+ // @ts-expect-error TS2339
  const id2label = this.model.config.id2label;

  const toReturn = [];
@@ -404,6 +406,7 @@ export class TokenClassificationPipeline extends (/** @type {new (options: TextP
  const outputs = await this.model(model_inputs)

  const logits = outputs.logits;
+ // @ts-expect-error TS2339
  const id2label = this.model.config.id2label;

  const toReturn = [];
@@ -743,11 +746,14 @@ export class Text2TextGenerationPipeline extends (/** @type {new (options: TextP


  // Add global prefix, if present
+ // @ts-expect-error TS2339
  if (this.model.config.prefix) {
+ // @ts-expect-error TS2339
  texts = texts.map(x => this.model.config.prefix + x)
  }

  // Handle task specific params:
+ // @ts-expect-error TS2339
  const task_specific_params = this.model.config.task_specific_params
  if (task_specific_params && task_specific_params[this.task]) {
  // Add prefixes, if present
@@ -1486,6 +1492,7 @@ export class AudioClassificationPipeline extends (/** @type {new (options: Audio
  const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
  const preparedAudios = await prepareAudios(audio, sampling_rate);

+ // @ts-expect-error TS2339
  const id2label = this.model.config.id2label;

  const toReturn = [];
@@ -1796,6 +1803,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
  audio = [/** @type {AudioInput} */ (audio)];
  }

+ // @ts-expect-error TS2339
  const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
  const hop_length = this.processor.feature_extractor.config.hop_length;

@@ -1861,7 +1869,9 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options

  // TODO: Right now we only get top beam
  if (return_timestamps === 'word') {
+ // @ts-expect-error TS2339
  chunk.tokens = data.sequences.tolist()[0];
+ // @ts-expect-error TS2339
  chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
  (/** @type {number} */ x) => round(x, 2)
  );
@@ -1906,7 +1916,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
  const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
  const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });

- const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
+ const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
  toReturn.push({ text });
  }
  return single ? toReturn[0] : toReturn;
@@ -2055,6 +2065,7 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image
  const { pixel_values } = await this.processor(preparedImages);
  const output = await this.model({ pixel_values });

+ // @ts-expect-error TS2339
  const id2label = this.model.config.id2label;

  /** @type {ImageClassificationOutput[]} */
@@ -2169,6 +2180,7 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi
  }
  }

+ // @ts-expect-error TS2339
  const id2label = this.model.config.id2label;

  /** @type {ImageSegmentationPipelineOutput[]} */
@@ -2395,6 +2407,7 @@ export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipe
  const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);

  // Add labels
+ // @ts-expect-error TS2339
  const id2label = this.model.config.id2label;

  // Format output
@@ -2614,6 +2627,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options:
  // Run model
  const output = await this.model.generate({
  inputs: pixel_values,
+ // @ts-expect-error TS2339
  max_length: this.model.config.decoder.max_position_embeddings,
  decoder_input_ids,
  ...generate_kwargs,
@@ -2729,6 +2743,7 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi
  // Generate waveform
  const { waveform } = await this.model(inputs);

+ // @ts-expect-error TS2339
  const sampling_rate = this.model.config.sampling_rate;
  return {
  audio: waveform.data,
@@ -2886,11 +2901,23 @@ export class DepthEstimationPipeline extends (/** @type {new (options: ImagePipe

  const toReturn = [];
  for (let i = 0; i < preparedImages.length; ++i) {
- const prediction = interpolate(predicted_depth[i], preparedImages[i].size.reverse(), 'bilinear', false);
- const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+ const batch = predicted_depth[i];
+ const [height, width] = batch.dims.slice(-2);
+ const [new_width, new_height] = preparedImages[i].size;
+
+ // Interpolate to original size
+ const prediction = (await interpolate_4d(batch.view(1, 1, height, width), {
+ size: [new_height, new_width],
+ mode: 'bilinear',
+ })).view(new_height, new_width);
+
+ const minval = /** @type {number} */(prediction.min().item());
+ const maxval = /** @type {number} */(prediction.max().item());
+ const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
+ const depth = RawImage.fromTensor(formatted);
  toReturn.push({
- predicted_depth: predicted_depth[i],
- depth: RawImage.fromTensor(formatted),
+ predicted_depth: prediction,
+ depth,
  });
  }

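Note: the pipeline now normalizes depth with min-max scaling, `(x - min) / (max - min) * 255`, instead of dividing by the max alone; the old form loses contrast when predictions sit on a non-zero floor. Plain-number arithmetic showing the difference:

```javascript
// Depth values that sit on a high floor (e.g. metric depth in millimetres).
const depths = [1000, 1010, 1020, 1030];

const maxOnly = depths.map(x => Math.round(255 * x / Math.max(...depths)));
console.log(maxOnly); // [248, 250, 253, 255] -- almost no contrast

const lo = Math.min(...depths), hi = Math.max(...depths);
const minMax = depths.map(x => Math.round(255 * (x - lo) / (hi - lo)));
console.log(minMax); // [0, 85, 170, 255] -- full dynamic range
```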
@@ -3368,4 +3395,4 @@ async function loadItems(mapping, model, pretrainedOptions) {
  }

  return result;
- }
+ }
package/src/tokenizers.js CHANGED
@@ -47,10 +47,8 @@ import {
  import { Template } from '@huggingface/jinja';

  import {
- WHISPER_LANGUAGE_MAPPING,
- whisper_language_to_code,
+ WHISPER_LANGUAGE_MAPPING
  } from './models/whisper/common_whisper.js';
- import { GITHUB_ISSUE_URL } from './utils/constants.js';

  /**
  * @typedef {Object} TokenizerProperties Additional tokenizer-specific properties.
@@ -535,7 +533,7 @@ class Unigram extends TokenizerModel {
  * Create a new Unigram tokenizer model.
  * @param {Object} config The configuration object for the Unigram model.
  * @param {number} config.unk_id The ID of the unknown token
- * @param {any[][]} config.vocab A 2D array representing a mapping of tokens to scores.
+ * @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
  * @param {Object} moreConfig Additional configuration object for the Unigram model.
  */
  constructor(config, moreConfig) {
@@ -543,11 +541,10 @@ class Unigram extends TokenizerModel {

  const vocabSize = config.vocab.length;
  this.vocab = new Array(vocabSize);
+ /** @type {number[]} */
  this.scores = new Array(vocabSize);
  for (let i = 0; i < vocabSize; ++i) {
- const piece = config.vocab[i];
- this.vocab[i] = piece[0];
- this.scores[i] = piece[1];
+ [this.vocab[i], this.scores[i]] = config.vocab[i];
  }

  this.unk_token_id = config.unk_id;
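Note: the Unigram loop change relies on destructuring assignment targeting existing array slots, unpacking each `[token, score]` pair into two parallel arrays in one statement. The same idiom in isolation:

```javascript
/** @type {[string, number][]} */
const vocab = [['▁hello', -3.2], ['▁world', -4.1]];

const tokens = new Array(vocab.length);
const scores = new Array(vocab.length);
for (let i = 0; i < vocab.length; ++i) {
    // Assigns vocab[i][0] to tokens[i] and vocab[i][1] to scores[i].
    [tokens[i], scores[i]] = vocab[i];
}

console.log(tokens); // ['▁hello', '▁world']
console.log(scores); // [-3.2, -4.1]
```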
package/src/utils/dtypes.js CHANGED
@@ -1,3 +1,5 @@
+ /// <reference types="@webgpu/types" />
+
  import { apis } from "../env.js";

  import { DEVICE_TYPES } from "./devices.js";
package/src/utils/hub.js CHANGED
@@ -121,7 +121,7 @@ class FileResponse {
  */
  async arrayBuffer() {
  const data = await fs.promises.readFile(this.filePath);
- return data.buffer;
+ return /** @type {ArrayBuffer} */ (data.buffer);
  }

  /**
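Note: the cast is needed because a Node `Buffer`'s `.buffer` property is typed `ArrayBufferLike` (it may in principle be a `SharedArrayBuffer`), while the `Response`-like interface promises a plain `ArrayBuffer`. The underlying buffer can also be larger than the view when the Buffer is pool-allocated; a byte-exact copy sidesteps both issues at the cost of a copy. A sketch of that alternative, not what the package does:

```javascript
import fs from 'fs';

// Copy the Buffer's bytes into a fresh, guaranteed-plain ArrayBuffer of
// exactly the file's length.
async function arrayBufferExact(filePath) {
    const data = await fs.promises.readFile(filePath);
    const ab = new ArrayBuffer(data.byteLength);
    new Uint8Array(ab).set(data);
    return ab;
}
```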
package/src/utils/maths.js CHANGED
@@ -225,8 +225,9 @@ export function magnitude(arr) {

  /**
  * Returns the value and index of the minimum element in an array.
- * @param {number[]|TypedArray} arr array of numbers.
- * @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
+ * @template {number[]|bigint[]|AnyTypedArray} T
+ * @param {T} arr array of numbers.
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
  * @throws {Error} If array is empty.
  */
  export function min(arr) {
@@ -239,14 +240,15 @@ export function min(arr) {
  indexOfMin = i;
  }
  }
- return [min, indexOfMin];
+ return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([min, indexOfMin]);
  }


  /**
  * Returns the value and index of the maximum element in an array.
- * @param {number[]|AnyTypedArray} arr array of numbers.
- * @returns {[number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
+ * @template {number[]|bigint[]|AnyTypedArray} T
+ * @param {T} arr array of numbers.
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
  * @throws {Error} If array is empty.
  */
  export function max(arr) {
@@ -259,7 +261,7 @@ export function max(arr) {
  indexOfMax = i;
  }
  }
- return [Number(max), indexOfMax];
+ return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([max, indexOfMax]);
  }

  function isPowerOfTwo(number) {
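Note: with the `@template` signatures, `min`/`max` now flow the element type through to the result tuple — number inputs yield `[number, number]`, bigint inputs `[bigint, number]` — and `max` no longer forces `Number()` on bigint results, which the Qwen2-VL mrope fix above depends on. The expected behavior, sketched against the same `[value, index]` contract:

```javascript
// Same contract as utils/maths.js max: returns [value, index].
function maxWithIndex(arr) {
    if (arr.length === 0) throw Error('Array must not be empty');
    let best = arr[0], indexOfMax = 0;
    for (let i = 1; i < arr.length; ++i) {
        if (arr[i] > best) { best = arr[i]; indexOfMax = i; }
    }
    return [best, indexOfMax]; // no Number() coercion: bigints stay bigints
}

console.log(maxWithIndex([3, 9, 4]));                       // [9, 1]
console.log(maxWithIndex(new BigInt64Array([3n, 9n, 4n]))); // [9n, 1]
```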