@huggingface/transformers 3.2.3 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/README.md +5 -3
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/ort.bundle.min.mjs +2776 -0
  4. package/dist/transformers.cjs +792 -330
  5. package/dist/transformers.cjs.map +1 -1
  6. package/dist/transformers.js +1150 -656
  7. package/dist/transformers.js.map +1 -1
  8. package/dist/transformers.min.cjs +1 -1
  9. package/dist/transformers.min.cjs.map +1 -1
  10. package/dist/transformers.min.js +1 -1
  11. package/dist/transformers.min.js.map +1 -1
  12. package/dist/transformers.min.mjs +1 -1
  13. package/dist/transformers.min.mjs.map +1 -1
  14. package/dist/transformers.mjs +798 -331
  15. package/dist/transformers.mjs.map +1 -1
  16. package/package.json +3 -3
  17. package/src/base/feature_extraction_utils.js +9 -9
  18. package/src/base/image_processors_utils.js +12 -1
  19. package/src/base/processing_utils.js +24 -3
  20. package/src/configs.js +5 -0
  21. package/src/env.js +1 -2
  22. package/src/generation/streamers.js +5 -2
  23. package/src/models/auto/feature_extraction_auto.js +0 -16
  24. package/src/models/auto/processing_auto.js +0 -16
  25. package/src/models/convnext/image_processing_convnext.js +1 -0
  26. package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
  27. package/src/models/florence2/processing_florence2.js +3 -0
  28. package/src/models/grounding_dino/image_processing_grounding_dino.js +29 -0
  29. package/src/models/grounding_dino/processing_grounding_dino.js +101 -0
  30. package/src/models/idefics3/image_processing_idefics3.js +2 -0
  31. package/src/models/image_processors.js +1 -0
  32. package/src/models/janus/image_processing_janus.js +1 -0
  33. package/src/models/mgp_str/processing_mgp_str.js +2 -0
  34. package/src/models/paligemma/processing_paligemma.js +1 -0
  35. package/src/models/phi3_v/processing_phi3_v.js +1 -1
  36. package/src/models/processors.js +3 -2
  37. package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
  38. package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
  39. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
  40. package/src/models/whisper/feature_extraction_whisper.js +1 -1
  41. package/src/models.js +72 -20
  42. package/src/ops/registry.js +10 -0
  43. package/src/pipelines.js +73 -23
  44. package/src/tokenizers.js +4 -7
  45. package/src/utils/audio.js +113 -1
  46. package/src/utils/core.js +26 -0
  47. package/src/utils/dtypes.js +2 -0
  48. package/src/utils/hub.js +1 -1
  49. package/src/utils/image.js +5 -18
  50. package/src/utils/maths.js +8 -6
  51. package/src/utils/tensor.js +134 -114
  52. package/types/base/feature_extraction_utils.d.ts +7 -7
  53. package/types/base/image_processors_utils.d.ts +7 -0
  54. package/types/base/image_processors_utils.d.ts.map +1 -1
  55. package/types/base/processing_utils.d.ts +25 -19
  56. package/types/base/processing_utils.d.ts.map +1 -1
  57. package/types/configs.d.ts.map +1 -1
  58. package/types/generation/parameters.d.ts +1 -1
  59. package/types/generation/streamers.d.ts +3 -1
  60. package/types/generation/streamers.d.ts.map +1 -1
  61. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
  62. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  63. package/types/models/auto/processing_auto.d.ts.map +1 -1
  64. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
  65. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
  66. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  67. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +20 -0
  68. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts.map +1 -0
  69. package/types/models/grounding_dino/processing_grounding_dino.d.ts +27 -0
  70. package/types/models/grounding_dino/processing_grounding_dino.d.ts.map +1 -0
  71. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
  72. package/types/models/image_processors.d.ts +1 -0
  73. package/types/models/janus/image_processing_janus.d.ts.map +1 -1
  74. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
  75. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
  76. package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
  77. package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
  78. package/types/models/processors.d.ts +3 -2
  79. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  80. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  81. package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
  82. package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
  83. package/types/models/whisper/generation_whisper.d.ts +1 -1
  84. package/types/models/whisper/generation_whisper.d.ts.map +1 -1
  85. package/types/models.d.ts +40 -17
  86. package/types/models.d.ts.map +1 -1
  87. package/types/ops/registry.d.ts +1 -0
  88. package/types/ops/registry.d.ts.map +1 -1
  89. package/types/pipelines.d.ts +7 -12
  90. package/types/pipelines.d.ts.map +1 -1
  91. package/types/tokenizers.d.ts.map +1 -1
  92. package/types/tsconfig.tsbuildinfo +1 -0
  93. package/types/utils/audio.d.ts +25 -0
  94. package/types/utils/audio.d.ts.map +1 -1
  95. package/types/utils/core.d.ts +6 -0
  96. package/types/utils/core.d.ts.map +1 -1
  97. package/types/utils/dtypes.d.ts.map +1 -1
  98. package/types/utils/hub.d.ts +1 -1
  99. package/types/utils/hub.d.ts.map +1 -1
  100. package/types/utils/image.d.ts +3 -2
  101. package/types/utils/image.d.ts.map +1 -1
  102. package/types/utils/maths.d.ts +8 -6
  103. package/types/utils/maths.d.ts.map +1 -1
  104. package/types/utils/tensor.d.ts +22 -6
  105. package/types/utils/tensor.d.ts.map +1 -1
package/src/models.js CHANGED
@@ -270,8 +270,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
270
270
  } else if (session_options.externalData !== undefined) {
271
271
  externalDataPromises = session_options.externalData.map(async (ext) => {
272
272
  // if the external data is a string, fetch the file and replace the string with its content
273
+ // @ts-expect-error TS2339
273
274
  if (typeof ext.data === "string") {
275
+ // @ts-expect-error TS2339
274
276
  const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options);
277
+ // @ts-expect-error TS2698
275
278
  return { ...ext, data: ext_buffer };
276
279
  }
277
280
  return ext;
@@ -529,14 +532,23 @@ async function encoderForward(self, model_inputs) {
529
532
  encoderFeeds.inputs_embeds = await self.encode_text({ input_ids: model_inputs.input_ids });
530
533
  }
531
534
  if (session.inputNames.includes('token_type_ids') && !encoderFeeds.token_type_ids) {
535
+ if (!encoderFeeds.input_ids) {
536
+ throw new Error('Both `input_ids` and `token_type_ids` are missing in the model inputs.');
537
+ }
532
538
  // Assign default `token_type_ids` (all zeroes) to the `encoderFeeds` if the model expects it,
533
539
  // but they weren't created by the tokenizer.
534
- encoderFeeds.token_type_ids = new Tensor(
535
- 'int64',
536
- new BigInt64Array(encoderFeeds.input_ids.data.length),
537
- encoderFeeds.input_ids.dims
538
- )
540
+ encoderFeeds.token_type_ids = zeros_like(encoderFeeds.input_ids);
539
541
  }
542
+ if (session.inputNames.includes('pixel_mask') && !encoderFeeds.pixel_mask) {
543
+ if (!encoderFeeds.pixel_values) {
544
+ throw new Error('Both `pixel_values` and `pixel_mask` are missing in the model inputs.');
545
+ }
546
+ // Assign default `pixel_mask` (all ones) to the `encoderFeeds` if the model expects it,
547
+ // but they weren't created by the processor.
548
+ const dims = encoderFeeds.pixel_values.dims;
549
+ encoderFeeds.pixel_mask = ones([dims[0], dims[2], dims[3]]);
550
+ }
551
+
540
552
  return await sessionRun(session, encoderFeeds);
541
553
  }
542
554
 
@@ -1519,6 +1531,7 @@ export class PreTrainedModel extends Callable {
1519
1531
  if (this.config.model_type === 'musicgen') {
1520
1532
  // Custom logic (TODO: move to Musicgen class)
1521
1533
  decoder_input_ids = Array.from({
1534
+ // @ts-expect-error TS2339
1522
1535
  length: batch_size * this.config.decoder.num_codebooks
1523
1536
  }, () => [decoder_start_token_id]);
1524
1537
 
@@ -1848,11 +1861,13 @@ export class PreTrainedModel extends Callable {
1848
1861
  async encode_image({ pixel_values }) {
1849
1862
  // image_inputs === { pixel_values }
1850
1863
  const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
1864
+ // @ts-expect-error TS2339
1851
1865
  if (!this.config.num_image_tokens) {
1852
1866
  console.warn(
1853
1867
  'The number of image tokens was not set in the model configuration. ' +
1854
1868
  `Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
1855
1869
  )
1870
+ // @ts-expect-error TS2339
1856
1871
  this.config.num_image_tokens = features.dims[1];
1857
1872
  }
1858
1873
  return features;
@@ -3280,6 +3295,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
3280
3295
 
3281
3296
  if (generation_config.return_token_timestamps) {
3282
3297
  outputs["token_timestamps"] = this._extract_token_timestamps(
3298
+ // @ts-expect-error TS2345
3283
3299
  outputs,
3284
3300
  generation_config.alignment_heads,
3285
3301
  generation_config.num_frames,
@@ -3315,6 +3331,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
3315
3331
  );
3316
3332
  }
3317
3333
 
3334
+ // @ts-expect-error TS2339
3318
3335
  let median_filter_width = this.config.median_filter_width;
3319
3336
  if (median_filter_width === undefined) {
3320
3337
  console.warn("Model config has no `median_filter_width`, using default value of 7.")
@@ -3325,6 +3342,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
3325
3342
  const batch = generate_outputs.cross_attentions;
3326
3343
  // Create a list with `decoder_layers` elements, each a tensor of shape
3327
3344
  // (batch size, attention_heads, output length, input length).
3345
+ // @ts-expect-error TS2339
3328
3346
  const cross_attentions = Array.from({ length: this.config.decoder_layers },
3329
3347
  // Concatenate the cross attentions for each layer across sequence length dimension.
3330
3348
  (_, i) => cat(batch.map(x => x[i]), 2)
@@ -3468,6 +3486,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
3468
3486
  attention_mask,
3469
3487
  }) {
3470
3488
 
3489
+ // @ts-expect-error TS2339
3471
3490
  const image_token_index = this.config.image_token_index;
3472
3491
 
3473
3492
  const idsList = input_ids.tolist();
@@ -4453,6 +4472,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
4453
4472
  const image_nums = vision_tokens.filter(x => x == image_token_id).length;
4454
4473
  const video_nums = vision_tokens.filter(x => x == video_token_id).length;
4455
4474
 
4475
+ /** @type {number[][]} */
4456
4476
  let llm_pos_ids_list = [];
4457
4477
  let st = 0;
4458
4478
  let remain_images = image_nums;
@@ -4522,6 +4542,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
4522
4542
  // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
4523
4543
  // meaning to perform concatenation along dim=1, we can do the following:
4524
4544
  const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
4545
+ /** @type {number[]} */
4525
4546
  const llm_positions = new Array(num_items);
4526
4547
  let index = 0;
4527
4548
  for (let x = 0; x < 3; ++x) {
@@ -4562,9 +4583,10 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
4562
4583
  { length: 3 * data.length },
4563
4584
  (_, i) => data[i % data.length]
4564
4585
  );
4586
+ /** @type {bigint[]} */
4565
4587
  const mrope_position_deltas = Array.from(
4566
4588
  { length: dims[0] },
4567
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1 + dims[1]
4589
+ (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
4568
4590
  );
4569
4591
 
4570
4592
  return [
@@ -5135,7 +5157,7 @@ export class DPTModel extends DPTPreTrainedModel { }
5135
5157
  *
5136
5158
  * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
5137
5159
  * ```javascript
5138
- * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
5160
+ * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
5139
5161
  *
5140
5162
  * // Load model and processor
5141
5163
  * const model_id = 'Xenova/dpt-hybrid-midas';
@@ -5144,7 +5166,7 @@ export class DPTModel extends DPTPreTrainedModel { }
5144
5166
  *
5145
5167
  * // Load image from URL
5146
5168
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
5147
- * const image = await RawImage.fromURL(url);
5169
+ * const image = await RawImage.read(url);
5148
5170
  *
5149
5171
  * // Prepare image for the model
5150
5172
  * const inputs = await processor(image);
@@ -5153,10 +5175,15 @@ export class DPTModel extends DPTPreTrainedModel { }
5153
5175
  * const { predicted_depth } = await model(inputs);
5154
5176
  *
5155
5177
  * // Interpolate to original size
5156
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
5178
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
5179
+ * size: image.size.reverse(),
5180
+ * mode: 'bilinear',
5181
+ * })).squeeze(1);
5157
5182
  *
5158
5183
  * // Visualize the prediction
5159
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
5184
+ * const min = prediction.min().item();
5185
+ * const max = prediction.max().item();
5186
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
5160
5187
  * const depth = RawImage.fromTensor(formatted);
5161
5188
  * // RawImage {
5162
5189
  * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
@@ -5206,11 +5233,7 @@ export class GLPNPreTrainedModel extends PreTrainedModel { }
5206
5233
  export class GLPNModel extends GLPNPreTrainedModel { }
5207
5234
 
5208
5235
  /**
5209
- * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
5210
- *
5211
- * **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
5212
- * ```javascript
5213
- * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
5236
+ * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
5214
5237
  *
5215
5238
  * // Load model and processor
5216
5239
  * const model_id = 'Xenova/glpn-kitti';
@@ -5219,7 +5242,7 @@ export class GLPNModel extends GLPNPreTrainedModel { }
5219
5242
  *
5220
5243
  * // Load image from URL
5221
5244
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
5222
- * const image = await RawImage.fromURL(url);
5245
+ * const image = await RawImage.read(url);
5223
5246
  *
5224
5247
  * // Prepare image for the model
5225
5248
  * const inputs = await processor(image);
@@ -5228,13 +5251,18 @@ export class GLPNModel extends GLPNPreTrainedModel { }
5228
5251
  * const { predicted_depth } = await model(inputs);
5229
5252
  *
5230
5253
  * // Interpolate to original size
5231
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
5254
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
5255
+ * size: image.size.reverse(),
5256
+ * mode: 'bilinear',
5257
+ * })).squeeze(1);
5232
5258
  *
5233
5259
  * // Visualize the prediction
5234
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
5260
+ * const min = prediction.min().item();
5261
+ * const max = prediction.max().item();
5262
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
5235
5263
  * const depth = RawImage.fromTensor(formatted);
5236
5264
  * // RawImage {
5237
- * // data: Uint8Array(307200) [ 207, 169, 154, ... ],
5265
+ * // data: Uint8Array(307200) [ 85, 85, 84, ... ],
5238
5266
  * // width: 640,
5239
5267
  * // height: 480,
5240
5268
  * // channels: 1
@@ -5409,6 +5437,8 @@ export class Dinov2WithRegistersForImageClassification extends Dinov2WithRegiste
5409
5437
  }
5410
5438
  }
5411
5439
  //////////////////////////////////////////////////
5440
+ export class GroundingDinoPreTrainedModel extends PreTrainedModel { }
5441
+ export class GroundingDinoForObjectDetection extends GroundingDinoPreTrainedModel { }
5412
5442
 
5413
5443
  //////////////////////////////////////////////////
5414
5444
  export class YolosPreTrainedModel extends PreTrainedModel { }
@@ -6107,6 +6137,9 @@ export class WavLMForAudioFrameClassification extends WavLMPreTrainedModel {
6107
6137
  }
6108
6138
  }
6109
6139
 
6140
+ export class StyleTextToSpeech2PreTrainedModel extends PreTrainedModel { }
6141
+ export class StyleTextToSpeech2Model extends StyleTextToSpeech2PreTrainedModel { }
6142
+
6110
6143
  //////////////////////////////////////////////////
6111
6144
  // SpeechT5 models
6112
6145
  /**
@@ -6201,10 +6234,12 @@ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
6201
6234
 
6202
6235
  const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);
6203
6236
 
6237
+ // @ts-expect-error TS2339
6204
6238
  const r = encoder_outputs.dims[1] / this.config.reduction_factor;
6205
6239
  const maxlen = Math.floor(r * maxlenratio);
6206
6240
  const minlen = Math.floor(r * minlenratio);
6207
6241
 
6242
+ // @ts-expect-error TS2339
6208
6243
  const num_mel_bins = this.config.num_mel_bins;
6209
6244
 
6210
6245
  let spectrogramParts = [];
@@ -6569,11 +6604,13 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
6569
6604
  */
6570
6605
  _apply_and_filter_by_delay_pattern_mask(outputs) {
6571
6606
  const [bs_x_codebooks, seqLength] = outputs.dims;
6607
+ // @ts-expect-error TS2339
6572
6608
  const num_codebooks = this.config.decoder.num_codebooks;
6573
6609
  const upperBound = (seqLength - num_codebooks);
6574
6610
 
6575
6611
  let newDataSize = 0;
6576
6612
  for (let i = 0; i < outputs.size; ++i) {
6613
+ // @ts-expect-error TS2339
6577
6614
  if (outputs.data[i] === this.config.decoder.pad_token_id) {
6578
6615
  continue;
6579
6616
  }
@@ -6603,7 +6640,9 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
6603
6640
  let clonedInputIds = structuredClone(input_ids);
6604
6641
  for (let i = 0; i < clonedInputIds.length; ++i) {
6605
6642
  for (let j = 0; j < clonedInputIds[i].length; ++j) {
6643
+ // @ts-expect-error TS2339
6606
6644
  if ((i % this.config.decoder.num_codebooks) >= j) {
6645
+ // @ts-expect-error TS2339
6607
6646
  clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
6608
6647
  }
6609
6648
  }
@@ -6760,6 +6799,9 @@ export class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
6760
6799
  'past_key_values',
6761
6800
  ];
6762
6801
 
6802
+ /**
6803
+ * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
6804
+ */
6763
6805
  constructor(...args) {
6764
6806
  super(...args);
6765
6807
 
@@ -7061,6 +7103,8 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
7061
7103
 
7062
7104
  ['maskformer', ['MaskFormerModel', MaskFormerModel]],
7063
7105
  ['mgp-str', ['MgpstrForSceneTextRecognition', MgpstrForSceneTextRecognition]],
7106
+
7107
+ ['style_text_to_speech_2', ['StyleTextToSpeech2Model', StyleTextToSpeech2Model]],
7064
7108
  ]);
7065
7109
 
7066
7110
  const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
@@ -7305,6 +7349,7 @@ const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([
7305
7349
  const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([
7306
7350
  ['owlvit', ['OwlViTForObjectDetection', OwlViTForObjectDetection]],
7307
7351
  ['owlv2', ['Owlv2ForObjectDetection', Owlv2ForObjectDetection]],
7352
+ ['grounding-dino', ['GroundingDinoForObjectDetection', GroundingDinoForObjectDetection]],
7308
7353
  ]);
7309
7354
 
7310
7355
  const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
@@ -7728,10 +7773,17 @@ export class SequenceClassifierOutput extends ModelOutput {
7728
7773
  /**
7729
7774
  * @param {Object} output The output of the model.
7730
7775
  * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
7776
+ * @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
7777
+ * Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
7731
7778
  */
7732
- constructor({ logits }) {
7779
+ constructor({ logits, ...attentions }) {
7733
7780
  super();
7734
7781
  this.logits = logits;
7782
+ const attentions_list = Object.values(attentions);
7783
+ if (attentions_list.length > 0) {
7784
+ // Only set attentions if they are not empty
7785
+ this.attentions = attentions_list;
7786
+ }
7735
7787
  }
7736
7788
  }
7737
7789
 
@@ -36,6 +36,16 @@ export class TensorOpRegistry {
36
36
  // executionProviders: ['webgpu'],
37
37
  };
38
38
 
39
+ static get nearest_interpolate_4d() {
40
+ if (!this._nearest_interpolate_4d) {
41
+ this._nearest_interpolate_4d = wrap(
42
+ [8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
43
+ this.session_options,
44
+ 'y',
45
+ );
46
+ }
47
+ return this._nearest_interpolate_4d;
48
+ }
39
49
  static get bilinear_interpolate_4d() {
40
50
  if (!this._bilinear_interpolate_4d) {
41
51
  this._bilinear_interpolate_4d = wrap(
package/src/pipelines.js CHANGED
@@ -64,12 +64,13 @@ import {
64
64
  round,
65
65
  } from './utils/maths.js';
66
66
  import {
67
- read_audio
67
+ read_audio,
68
+ RawAudio
68
69
  } from './utils/audio.js';
69
70
  import {
70
71
  Tensor,
71
72
  mean_pooling,
72
- interpolate,
73
+ interpolate_4d,
73
74
  quantize_embeddings,
74
75
  topk,
75
76
  } from './utils/tensor.js';
@@ -294,6 +295,7 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi
294
295
 
295
296
  // TODO: Use softmax tensor function
296
297
  const function_to_apply =
298
+ // @ts-expect-error TS2339
297
299
  this.model.config.problem_type === 'multi_label_classification'
298
300
  ? batch => batch.sigmoid()
299
301
  : batch => new Tensor(
@@ -302,6 +304,7 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi
302
304
  batch.dims,
303
305
  ); // single_label_classification (default)
304
306
 
307
+ // @ts-expect-error TS2339
305
308
  const id2label = this.model.config.id2label;
306
309
 
307
310
  const toReturn = [];
@@ -404,6 +407,7 @@ export class TokenClassificationPipeline extends (/** @type {new (options: TextP
404
407
  const outputs = await this.model(model_inputs)
405
408
 
406
409
  const logits = outputs.logits;
410
+ // @ts-expect-error TS2339
407
411
  const id2label = this.model.config.id2label;
408
412
 
409
413
  const toReturn = [];
@@ -743,11 +747,14 @@ export class Text2TextGenerationPipeline extends (/** @type {new (options: TextP
743
747
 
744
748
 
745
749
  // Add global prefix, if present
750
+ // @ts-expect-error TS2339
746
751
  if (this.model.config.prefix) {
752
+ // @ts-expect-error TS2339
747
753
  texts = texts.map(x => this.model.config.prefix + x)
748
754
  }
749
755
 
750
756
  // Handle task specific params:
757
+ // @ts-expect-error TS2339
751
758
  const task_specific_params = this.model.config.task_specific_params
752
759
  if (task_specific_params && task_specific_params[this.task]) {
753
760
  // Add prefixes, if present
@@ -1486,6 +1493,7 @@ export class AudioClassificationPipeline extends (/** @type {new (options: Audio
1486
1493
  const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
1487
1494
  const preparedAudios = await prepareAudios(audio, sampling_rate);
1488
1495
 
1496
+ // @ts-expect-error TS2339
1489
1497
  const id2label = this.model.config.id2label;
1490
1498
 
1491
1499
  const toReturn = [];
@@ -1796,6 +1804,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
1796
1804
  audio = [/** @type {AudioInput} */ (audio)];
1797
1805
  }
1798
1806
 
1807
+ // @ts-expect-error TS2339
1799
1808
  const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
1800
1809
  const hop_length = this.processor.feature_extractor.config.hop_length;
1801
1810
 
@@ -1861,7 +1870,9 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
1861
1870
 
1862
1871
  // TODO: Right now we only get top beam
1863
1872
  if (return_timestamps === 'word') {
1873
+ // @ts-expect-error TS2339
1864
1874
  chunk.tokens = data.sequences.tolist()[0];
1875
+ // @ts-expect-error TS2339
1865
1876
  chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
1866
1877
  (/** @type {number} */ x) => round(x, 2)
1867
1878
  );
@@ -1906,7 +1917,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
1906
1917
  const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
1907
1918
  const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
1908
1919
 
1909
- const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
1920
+ const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
1910
1921
  toReturn.push({ text });
1911
1922
  }
1912
1923
  return single ? toReturn[0] : toReturn;
@@ -2055,6 +2066,7 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image
2055
2066
  const { pixel_values } = await this.processor(preparedImages);
2056
2067
  const output = await this.model({ pixel_values });
2057
2068
 
2069
+ // @ts-expect-error TS2339
2058
2070
  const id2label = this.model.config.id2label;
2059
2071
 
2060
2072
  /** @type {ImageClassificationOutput[]} */
@@ -2169,6 +2181,7 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi
2169
2181
  }
2170
2182
  }
2171
2183
 
2184
+ // @ts-expect-error TS2339
2172
2185
  const id2label = this.model.config.id2label;
2173
2186
 
2174
2187
  /** @type {ImageSegmentationPipelineOutput[]} */
@@ -2395,6 +2408,7 @@ export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipe
2395
2408
  const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);
2396
2409
 
2397
2410
  // Add labels
2411
+ // @ts-expect-error TS2339
2398
2412
  const id2label = this.model.config.id2label;
2399
2413
 
2400
2414
  // Format output
@@ -2539,13 +2553,35 @@ export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: T
2539
2553
  // Run model with both text and pixel inputs
2540
2554
  const output = await this.model({ ...text_inputs, pixel_values });
2541
2555
 
2542
- // @ts-ignore
2543
- const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSize, true)[0];
2544
- let result = processed.boxes.map((box, i) => ({
2545
- score: processed.scores[i],
2546
- label: candidate_labels[processed.classes[i]],
2547
- box: get_bounding_box(box, !percentage),
2548
- })).sort((a, b) => b.score - a.score);
2556
+ let result;
2557
+ if('post_process_grounded_object_detection' in this.processor) {
2558
+ // @ts-ignore
2559
+ const processed = this.processor.post_process_grounded_object_detection(
2560
+ output,
2561
+ text_inputs.input_ids,
2562
+ {
2563
+ // TODO: support separate threshold values
2564
+ box_threshold: threshold,
2565
+ text_threshold: threshold,
2566
+ target_sizes: imageSize,
2567
+ },
2568
+ )[0];
2569
+ result = processed.boxes.map((box, i) => ({
2570
+ score: processed.scores[i],
2571
+ label: processed.labels[i],
2572
+ box: get_bounding_box(box, !percentage),
2573
+ }))
2574
+ } else {
2575
+ // @ts-ignore
2576
+ const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSize, true)[0];
2577
+ result = processed.boxes.map((box, i) => ({
2578
+ score: processed.scores[i],
2579
+ label: candidate_labels[processed.classes[i]],
2580
+ box: get_bounding_box(box, !percentage),
2581
+ }))
2582
+ }
2583
+ result.sort((a, b) => b.score - a.score);
2584
+
2549
2585
  if (top_k !== null) {
2550
2586
  result = result.slice(0, top_k);
2551
2587
  }
@@ -2614,6 +2650,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options:
2614
2650
  // Run model
2615
2651
  const output = await this.model.generate({
2616
2652
  inputs: pixel_values,
2653
+ // @ts-expect-error TS2339
2617
2654
  max_length: this.model.config.decoder.max_position_embeddings,
2618
2655
  decoder_input_ids,
2619
2656
  ...generate_kwargs,
@@ -2664,7 +2701,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options:
2664
2701
  * const synthesizer = await pipeline('text-to-speech', 'Xenova/speecht5_tts', { quantized: false });
2665
2702
  * const speaker_embeddings = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin';
2666
2703
  * const out = await synthesizer('Hello, my dog is cute', { speaker_embeddings });
2667
- * // {
2704
+ * // RawAudio {
2668
2705
  * // audio: Float32Array(26112) [-0.00005657337896991521, 0.00020583874720614403, ...],
2669
2706
  * // sampling_rate: 16000
2670
2707
  * // }
@@ -2684,7 +2721,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options:
2684
2721
  * ```javascript
2685
2722
  * const synthesizer = await pipeline('text-to-speech', 'Xenova/mms-tts-fra');
2686
2723
  * const out = await synthesizer('Bonjour');
2687
- * // {
2724
+ * // RawAudio {
2688
2725
  * // audio: Float32Array(23808) [-0.00037693005288019776, 0.0003325853613205254, ...],
2689
2726
  * // sampling_rate: 16000
2690
2727
  * // }
@@ -2729,11 +2766,12 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi
2729
2766
  // Generate waveform
2730
2767
  const { waveform } = await this.model(inputs);
2731
2768
 
2769
+ // @ts-expect-error TS2339
2732
2770
  const sampling_rate = this.model.config.sampling_rate;
2733
- return {
2734
- audio: waveform.data,
2771
+ return new RawAudio(
2772
+ waveform.data,
2735
2773
  sampling_rate,
2736
- }
2774
+ )
2737
2775
  }
2738
2776
 
2739
2777
  async _call_text_to_spectrogram(text_inputs, { speaker_embeddings }) {
@@ -2773,10 +2811,10 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi
2773
2811
  const { waveform } = await this.model.generate_speech(input_ids, speaker_embeddings, { vocoder: this.vocoder });
2774
2812
 
2775
2813
  const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
2776
- return {
2777
- audio: waveform.data,
2814
+ return new RawAudio(
2815
+ waveform.data,
2778
2816
  sampling_rate,
2779
- }
2817
+ )
2780
2818
  }
2781
2819
  }
2782
2820
 
@@ -2886,11 +2924,23 @@ export class DepthEstimationPipeline extends (/** @type {new (options: ImagePipe
2886
2924
 
2887
2925
  const toReturn = [];
2888
2926
  for (let i = 0; i < preparedImages.length; ++i) {
2889
- const prediction = interpolate(predicted_depth[i], preparedImages[i].size.reverse(), 'bilinear', false);
2890
- const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
2927
+ const batch = predicted_depth[i];
2928
+ const [height, width] = batch.dims.slice(-2);
2929
+ const [new_width, new_height] = preparedImages[i].size;
2930
+
2931
+ // Interpolate to original size
2932
+ const prediction = (await interpolate_4d(batch.view(1, 1, height, width), {
2933
+ size: [new_height, new_width],
2934
+ mode: 'bilinear',
2935
+ })).view(new_height, new_width);
2936
+
2937
+ const minval = /** @type {number} */(prediction.min().item());
2938
+ const maxval = /** @type {number} */(prediction.max().item());
2939
+ const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
2940
+ const depth = RawImage.fromTensor(formatted);
2891
2941
  toReturn.push({
2892
- predicted_depth: predicted_depth[i],
2893
- depth: RawImage.fromTensor(formatted),
2942
+ predicted_depth: prediction,
2943
+ depth,
2894
2944
  });
2895
2945
  }
2896
2946
 
@@ -3368,4 +3418,4 @@ async function loadItems(mapping, model, pretrainedOptions) {
3368
3418
  }
3369
3419
 
3370
3420
  return result;
3371
- }
3421
+ }
package/src/tokenizers.js CHANGED
@@ -47,10 +47,8 @@ import {
47
47
  import { Template } from '@huggingface/jinja';
48
48
 
49
49
  import {
50
- WHISPER_LANGUAGE_MAPPING,
51
- whisper_language_to_code,
50
+ WHISPER_LANGUAGE_MAPPING
52
51
  } from './models/whisper/common_whisper.js';
53
- import { GITHUB_ISSUE_URL } from './utils/constants.js';
54
52
 
55
53
  /**
56
54
  * @typedef {Object} TokenizerProperties Additional tokenizer-specific properties.
@@ -535,7 +533,7 @@ class Unigram extends TokenizerModel {
535
533
  * Create a new Unigram tokenizer model.
536
534
  * @param {Object} config The configuration object for the Unigram model.
537
535
  * @param {number} config.unk_id The ID of the unknown token
538
- * @param {any[][]} config.vocab A 2D array representing a mapping of tokens to scores.
536
+ * @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
539
537
  * @param {Object} moreConfig Additional configuration object for the Unigram model.
540
538
  */
541
539
  constructor(config, moreConfig) {
@@ -543,11 +541,10 @@ class Unigram extends TokenizerModel {
543
541
 
544
542
  const vocabSize = config.vocab.length;
545
543
  this.vocab = new Array(vocabSize);
544
+ /** @type {number[]} */
546
545
  this.scores = new Array(vocabSize);
547
546
  for (let i = 0; i < vocabSize; ++i) {
548
- const piece = config.vocab[i];
549
- this.vocab[i] = piece[0];
550
- this.scores[i] = piece[1];
547
+ [this.vocab[i], this.scores[i]] = config.vocab[i];
551
548
  }
552
549
 
553
550
  this.unk_token_id = config.unk_id;