@huggingface/transformers 3.3.3 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/README.md +9 -3
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +124 -115
  3. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  4. package/dist/transformers.js +2480 -1457
  5. package/dist/transformers.js.map +1 -1
  6. package/dist/transformers.min.js +1 -1
  7. package/dist/transformers.min.js.map +1 -1
  8. package/dist/{transformers.cjs → transformers.node.cjs} +1412 -2395
  9. package/dist/transformers.node.cjs.map +1 -0
  10. package/dist/transformers.node.min.cjs +2 -0
  11. package/dist/transformers.node.min.cjs.map +1 -0
  12. package/dist/transformers.node.min.mjs +2 -0
  13. package/dist/transformers.node.min.mjs.map +1 -0
  14. package/dist/{transformers.mjs → transformers.node.mjs} +1440 -2375
  15. package/dist/transformers.node.mjs.map +1 -0
  16. package/dist/transformers.web.js +35713 -0
  17. package/dist/transformers.web.js.map +1 -0
  18. package/dist/transformers.web.min.js +2 -0
  19. package/dist/transformers.web.min.js.map +1 -0
  20. package/package.json +6 -6
  21. package/src/backends/onnx.js +14 -15
  22. package/src/configs.js +4 -1
  23. package/src/env.js +1 -1
  24. package/src/generation/streamers.js +4 -3
  25. package/src/models/dac/feature_extraction_dac.js +3 -0
  26. package/src/models/encodec/feature_extraction_encodec.js +32 -0
  27. package/src/models/feature_extractors.js +2 -0
  28. package/src/models/idefics3/image_processing_idefics3.js +1 -1
  29. package/src/models/image_processors.js +1 -0
  30. package/src/models/processors.js +2 -0
  31. package/src/models/smolvlm/image_processing_smolvlm.js +2 -0
  32. package/src/models/smolvlm/processing_smolvlm.js +2 -0
  33. package/src/models/ultravox/processing_ultravox.js +54 -0
  34. package/src/models/whisper/common_whisper.js +7 -1
  35. package/src/models/whisper/feature_extraction_whisper.js +18 -10
  36. package/src/models.js +456 -76
  37. package/src/pipelines.js +111 -7
  38. package/src/tokenizers.js +42 -28
  39. package/src/transformers.js +1 -0
  40. package/src/utils/audio.js +2 -0
  41. package/src/utils/hub.js +140 -80
  42. package/src/utils/maths.js +1 -1
  43. package/src/utils/tensor.js +6 -3
  44. package/src/utils/video.js +128 -0
  45. package/types/backends/onnx.d.ts +2 -2
  46. package/types/backends/onnx.d.ts.map +1 -1
  47. package/types/configs.d.ts +1 -1
  48. package/types/configs.d.ts.map +1 -1
  49. package/types/generation/streamers.d.ts.map +1 -1
  50. package/types/models/dac/feature_extraction_dac.d.ts +4 -0
  51. package/types/models/dac/feature_extraction_dac.d.ts.map +1 -0
  52. package/types/models/encodec/feature_extraction_encodec.d.ts +13 -0
  53. package/types/models/encodec/feature_extraction_encodec.d.ts.map +1 -0
  54. package/types/models/feature_extractors.d.ts +2 -0
  55. package/types/models/florence2/processing_florence2.d.ts +1 -1
  56. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  57. package/types/models/image_processors.d.ts +1 -0
  58. package/types/models/processors.d.ts +2 -0
  59. package/types/models/smolvlm/image_processing_smolvlm.d.ts +2 -0
  60. package/types/models/smolvlm/image_processing_smolvlm.d.ts.map +1 -0
  61. package/types/models/smolvlm/processing_smolvlm.d.ts +2 -0
  62. package/types/models/smolvlm/processing_smolvlm.d.ts.map +1 -0
  63. package/types/models/ultravox/processing_ultravox.d.ts +16 -0
  64. package/types/models/ultravox/processing_ultravox.d.ts.map +1 -0
  65. package/types/models/whisper/common_whisper.d.ts.map +1 -1
  66. package/types/models/whisper/feature_extraction_whisper.d.ts +3 -1
  67. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  68. package/types/models.d.ts +132 -4
  69. package/types/models.d.ts.map +1 -1
  70. package/types/pipelines.d.ts +50 -4
  71. package/types/pipelines.d.ts.map +1 -1
  72. package/types/tokenizers.d.ts.map +1 -1
  73. package/types/transformers.d.ts +1 -0
  74. package/types/tsconfig.tsbuildinfo +1 -1
  75. package/types/utils/audio.d.ts.map +1 -1
  76. package/types/utils/hub.d.ts +19 -7
  77. package/types/utils/hub.d.ts.map +1 -1
  78. package/types/utils/maths.d.ts +2 -2
  79. package/types/utils/maths.d.ts.map +1 -1
  80. package/types/utils/tensor.d.ts +17 -18
  81. package/types/utils/tensor.d.ts.map +1 -1
  82. package/types/utils/video.d.ts +37 -0
  83. package/types/utils/video.d.ts.map +1 -0
  84. package/dist/transformers.cjs.map +0 -1
  85. package/dist/transformers.min.cjs +0 -2
  86. package/dist/transformers.min.cjs.map +0 -1
  87. package/dist/transformers.min.mjs +0 -2
  88. package/dist/transformers.min.mjs.map +0 -1
  89. package/dist/transformers.mjs.map +0 -1
package/src/models.js CHANGED
@@ -68,6 +68,7 @@ import {
68
68
  import {
69
69
  getModelFile,
70
70
  getModelJSON,
71
+ MAX_EXTERNAL_DATA_CHUNKS,
71
72
  } from './utils/hub.js';
72
73
 
73
74
  import {
@@ -108,6 +109,7 @@ import {
108
109
  stack,
109
110
  std_mean,
110
111
  Tensor,
112
+ DataTypeMap,
111
113
  } from './utils/tensor.js';
112
114
  import { RawImage } from './utils/image.js';
113
115
 
@@ -132,6 +134,8 @@ const MODEL_TYPES = {
132
134
  Musicgen: 7,
133
135
  MultiModality: 8,
134
136
  Phi3V: 9,
137
+ AudioTextToText: 10,
138
+ AutoEncoder: 11,
135
139
  }
136
140
  //////////////////////////////////////////////////
137
141
 
@@ -150,7 +154,7 @@ const MODEL_CLASS_TO_NAME_MAPPING = new Map();
150
154
  * @param {string} pretrained_model_name_or_path The path to the directory containing the model file.
151
155
  * @param {string} fileName The name of the model file.
152
156
  * @param {import('./utils/hub.js').PretrainedModelOptions} options Additional options for loading the model.
153
- * @returns {Promise<{buffer: Uint8Array, session_options: Object, session_config: Object}>} A Promise that resolves to the data needed to create an InferenceSession object.
157
+ * @returns {Promise<{buffer_or_path: Uint8Array|string, session_options: Object, session_config: Object}>} A Promise that resolves to the data needed to create an InferenceSession object.
154
158
  * @private
155
159
  */
156
160
  async function getSession(pretrained_model_name_or_path, fileName, options) {
@@ -225,7 +229,8 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
225
229
 
226
230
  // Construct the model file name
227
231
  const suffix = DEFAULT_DTYPE_SUFFIX_MAPPING[selectedDtype];
228
- const modelFileName = `${options.subfolder ?? ''}/${fileName}${suffix}.onnx`;
232
+ const baseName = `${fileName}${suffix}.onnx`;
233
+ const modelFileName = `${options.subfolder ?? ''}/${baseName}`;
229
234
 
230
235
  const session_options = { ...options.session_options };
231
236
 
@@ -243,29 +248,38 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
243
248
  );
244
249
  }
245
250
 
246
- const bufferPromise = getModelFile(pretrained_model_name_or_path, modelFileName, true, options);
251
+ const bufferOrPathPromise = getModelFile(pretrained_model_name_or_path, modelFileName, true, options, apis.IS_NODE_ENV);
247
252
 
248
253
  // handle onnx external data files
249
254
  const use_external_data_format = options.use_external_data_format ?? custom_config.use_external_data_format;
250
- /** @type {Promise<{path: string, data: Uint8Array}>[]} */
255
+ /** @type {Promise<string|{path: string, data: Uint8Array}>[]} */
251
256
  let externalDataPromises = [];
252
- if (use_external_data_format && (
253
- use_external_data_format === true ||
254
- (
255
- typeof use_external_data_format === 'object' &&
256
- use_external_data_format.hasOwnProperty(fileName) &&
257
- use_external_data_format[fileName] === true
258
- )
259
- )) {
260
- if (apis.IS_NODE_ENV) {
261
- throw new Error('External data format is not yet supported in Node.js');
257
+ if (use_external_data_format) {
258
+ let external_data_format;
259
+ if (typeof use_external_data_format === 'object') {
260
+ if (use_external_data_format.hasOwnProperty(baseName)) {
261
+ external_data_format = use_external_data_format[baseName];
262
+ } else if (use_external_data_format.hasOwnProperty(fileName)) {
263
+ external_data_format = use_external_data_format[fileName];
264
+ } else {
265
+ external_data_format = false;
266
+ }
267
+ } else {
268
+ external_data_format = use_external_data_format;
269
+ }
270
+
271
+ const num_chunks = +external_data_format; // (false=0, true=1, number remains the same)
272
+ if (num_chunks > MAX_EXTERNAL_DATA_CHUNKS) {
273
+ throw new Error(`The number of external data chunks (${num_chunks}) exceeds the maximum allowed value (${MAX_EXTERNAL_DATA_CHUNKS}).`);
274
+ }
275
+ for (let i = 0; i < num_chunks; ++i) {
276
+ const path = `${baseName}_data${i === 0 ? '' : '_' + i}`;
277
+ const fullPath = `${options.subfolder ?? ''}/${path}`;
278
+ externalDataPromises.push(new Promise(async (resolve, reject) => {
279
+ const data = await getModelFile(pretrained_model_name_or_path, fullPath, true, options, apis.IS_NODE_ENV);
280
+ resolve(data instanceof Uint8Array ? { path, data } : path);
281
+ }));
262
282
  }
263
- const path = `${fileName}${suffix}.onnx_data`;
264
- const fullPath = `${options.subfolder ?? ''}/${path}`;
265
- externalDataPromises.push(new Promise(async (resolve, reject) => {
266
- const data = await getModelFile(pretrained_model_name_or_path, fullPath, true, options);
267
- resolve({ path, data })
268
- }));
269
283
 
270
284
  } else if (session_options.externalData !== undefined) {
271
285
  externalDataPromises = session_options.externalData.map(async (ext) => {
@@ -282,7 +296,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
282
296
  }
283
297
 
284
298
  if (externalDataPromises.length > 0) {
285
- session_options.externalData = await Promise.all(externalDataPromises);
299
+ const externalData = await Promise.all(externalDataPromises);
300
+ if (!apis.IS_NODE_ENV) {
301
+ session_options.externalData = externalData;
302
+ }
286
303
  }
287
304
 
288
305
  if (selectedDevice === 'webgpu') {
@@ -300,9 +317,9 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
300
317
  }
301
318
  }
302
319
 
303
- const buffer = await bufferPromise;
320
+ const buffer_or_path = await bufferOrPathPromise;
304
321
 
305
- return { buffer, session_options, session_config };
322
+ return { buffer_or_path, session_options, session_config };
306
323
  }
307
324
 
308
325
  /**
@@ -317,8 +334,8 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
317
334
  async function constructSessions(pretrained_model_name_or_path, names, options) {
318
335
  return Object.fromEntries(await Promise.all(
319
336
  Object.keys(names).map(async (name) => {
320
- const { buffer, session_options, session_config } = await getSession(pretrained_model_name_or_path, names[name], options);
321
- const session = await createInferenceSession(buffer, session_options, session_config);
337
+ const { buffer_or_path, session_options, session_config } = await getSession(pretrained_model_name_or_path, names[name], options);
338
+ const session = await createInferenceSession(buffer_or_path, session_options, session_config);
322
339
  return [name, session];
323
340
  })
324
341
  ));
@@ -548,10 +565,16 @@ async function encoderForward(self, model_inputs) {
548
565
  const dims = encoderFeeds.pixel_values.dims;
549
566
  encoderFeeds.pixel_mask = ones([dims[0], dims[2], dims[3]]);
550
567
  }
551
-
568
+
552
569
  return await sessionRun(session, encoderFeeds);
553
570
  }
554
571
 
572
+ async function autoEncoderForward(self, model_inputs) {
573
+ const encoded = await self.encode(model_inputs);
574
+ const decoded = await self.decode(encoded);
575
+ return decoded;
576
+ }
577
+
555
578
  /**
556
579
  * Forward pass of a decoder model.
557
580
  * @param {Object} self The decoder model.
@@ -586,58 +609,98 @@ async function decoderForward(self, model_inputs, is_encoder_decoder = false) {
586
609
 
587
610
 
588
611
 
589
- function default_merge_input_ids_with_image_features({
590
- image_token_id,
612
+ function default_merge_input_ids_with_features({
613
+ modality_token_id,
591
614
  inputs_embeds,
592
- image_features,
615
+ modality_features,
593
616
  input_ids,
594
617
  attention_mask,
595
618
  }) {
596
- const image_tokens = input_ids.tolist().map(ids =>
619
+ const token_positions = input_ids.tolist().map(ids =>
597
620
  ids.reduce((acc, x, idx) => {
598
- if (x == image_token_id) acc.push(idx);
621
+ if (x == modality_token_id) acc.push(idx);
599
622
  return acc;
600
623
  }, [])
601
624
  );
602
- const n_image_tokens = image_tokens.reduce((acc, x) => acc + x.length, 0);
603
- const n_image_features = image_features.dims[0];
604
- if (n_image_tokens !== n_image_features) {
605
- throw new Error(`Image features and image tokens do not match: tokens: ${n_image_tokens}, features ${n_image_features}`);
625
+ const n_tokens = token_positions.reduce((acc, x) => acc + x.length, 0);
626
+ const n_features = modality_features.dims[0];
627
+ if (n_tokens !== n_features) {
628
+ throw new Error(`Number of tokens and features do not match: tokens: ${n_tokens}, features ${n_features}`);
606
629
  }
607
630
 
608
631
  // Equivalent to performing a masked_scatter
609
632
  let img = 0;
610
- for (let i = 0; i < image_tokens.length; ++i) {
611
- const tokens = image_tokens[i];
633
+ for (let i = 0; i < token_positions.length; ++i) {
634
+ const tokens = token_positions[i];
612
635
  const embeds = inputs_embeds[i];
613
636
  for (let j = 0; j < tokens.length; ++j) {
614
- embeds[tokens[j]].data.set(image_features[img++].data)
637
+ embeds[tokens[j]].data.set(modality_features[img++].data)
615
638
  }
616
639
  }
617
640
  return { inputs_embeds, attention_mask }
618
641
  }
619
642
 
620
643
 
621
- /**
622
- * Forward pass of an image-text-to-text model.
623
- * @param {Object} self The image-text-to-text model model.
624
- * @param {Object} model_inputs The input data to be used for the forward pass.
625
- * @param {Tensor} [model_inputs.input_ids=null]
626
- * @param {Tensor} [model_inputs.attention_mask=null]
627
- * @param {Tensor} [model_inputs.pixel_values=null]
628
- * @param {Tensor} [model_inputs.position_ids=null]
629
- * @param {Tensor} [model_inputs.inputs_embeds=null]
630
- * @param {Tensor} [model_inputs.past_key_values=null]
631
- * @param {Object} [model_inputs.generation_config=null]
632
- * @param {Object} [model_inputs.logits_processor=null]
644
+ function default_merge_input_ids_with_image_features({
645
+ image_token_id,
646
+ inputs_embeds,
647
+ image_features,
648
+ input_ids,
649
+ attention_mask,
650
+ }) {
651
+ return default_merge_input_ids_with_features({
652
+ modality_token_id: image_token_id,
653
+ inputs_embeds,
654
+ modality_features: image_features,
655
+ input_ids,
656
+ attention_mask,
657
+ })
658
+ }
659
+
660
+ function default_merge_input_ids_with_audio_features({
661
+ audio_token_id,
662
+ inputs_embeds,
663
+ audio_features,
664
+ input_ids,
665
+ attention_mask,
666
+ }) {
667
+ return default_merge_input_ids_with_features({
668
+ modality_token_id: audio_token_id,
669
+ inputs_embeds,
670
+ modality_features: audio_features,
671
+ input_ids,
672
+ attention_mask,
673
+ })
674
+ }
675
+
676
+ /**
677
+ * Abstract forward pass function for image-text-to-text or audio-text-to-text models.
678
+ * @param {Object} self The model object.
679
+ * @param {Object} params Additional parameters.
680
+ * @param {Function} [params.encode_function] The function to encode the modality values.
681
+ * @param {Function} [params.merge_function] The function to merge the modality features with the input embeddings.
682
+ * @param {string} [params.modality_input_name] The modality input name.
683
+ * @param {string} [params.modality_output_name] The modality output name.
684
+ * @param {Tensor} [params.input_ids=null]
685
+ * @param {Tensor} [params.attention_mask=null]
686
+ * @param {Tensor} [params.position_ids=null]
687
+ * @param {Tensor} [params.inputs_embeds=null]
688
+ * @param {Tensor} [params.past_key_values=null]
689
+ * @param {Object} [params.generation_config=null]
690
+ * @param {Object} [params.logits_processor=null]
633
691
  * @returns {Promise<Tensor>} The model's output tensor
634
692
  * @private
635
693
  */
636
- async function imageTextToTextForward(self, {
694
+ async function genericTextToTextForward(self, {
695
+ // Generic parameters:
696
+ encode_function,
697
+ merge_function,
698
+ modality_input_name,
699
+ modality_output_name,
700
+
637
701
  // Produced by the tokenizer/processor:
638
702
  input_ids = null,
639
703
  attention_mask = null,
640
- pixel_values = null,
641
704
 
642
705
  // Used during generation:
643
706
  position_ids = null,
@@ -648,27 +711,31 @@ async function imageTextToTextForward(self, {
648
711
  generation_config = null,
649
712
  logits_processor = null,
650
713
 
651
- // TODO: needed?
714
+ // Additional parameters
652
715
  ...kwargs
653
716
  }) {
654
-
717
+ const modality_values = kwargs[modality_input_name];
655
718
  if (!inputs_embeds) {
656
- // 1. Extract the input embeddings
719
+ // 1. Extract the text embeddings.
657
720
  inputs_embeds = await self.encode_text({ input_ids, ...kwargs });
658
721
 
659
- // 2. Possibly, merge text and images
660
- if (pixel_values && input_ids.dims[1] !== 1) {
661
- const image_features = await self.encode_image({ pixel_values, ...kwargs });
662
-
663
- ({ inputs_embeds, attention_mask } = self._merge_input_ids_with_image_features({
664
- image_features,
722
+ // 2. Possibly, merge text and modality values
723
+ if (modality_values && input_ids.dims[1] !== 1) {
724
+ const modality_features = await encode_function({
725
+ // Pass the modality values under its expected key.
726
+ // The caller knows whether this is audio or image.
727
+ [modality_input_name]: modality_values,
728
+ ...kwargs
729
+ });
730
+ ({ inputs_embeds, attention_mask } = merge_function({
731
+ [modality_output_name]: modality_features,
665
732
  inputs_embeds,
666
733
  input_ids,
667
734
  attention_mask,
668
735
  }));
669
736
 
670
- } else if (past_key_values && pixel_values && input_ids.dims[1] === 1) {
671
- // This is the case when we are generating with cache
737
+ } else if (past_key_values && modality_values && input_ids.dims[1] === 1) {
738
+ // This branch handles the cache case.
672
739
  const target_length = input_ids.dims[1]; // always 1
673
740
  const past_length = Object.values(past_key_values)[0].dims.at(-2);
674
741
 
@@ -689,6 +756,7 @@ async function imageTextToTextForward(self, {
689
756
  }
690
757
  }
691
758
 
759
+ // 3. Call the decoder forward using the updated inputs.
692
760
  const outputs = await decoderForward(self, {
693
761
  inputs_embeds,
694
762
  past_key_values,
@@ -700,6 +768,40 @@ async function imageTextToTextForward(self, {
700
768
  return outputs;
701
769
  }
702
770
 
771
+ /**
772
+ * Forward pass of an audio-text-to-text model.
773
+ * @param {Object} self The audio-text-to-text model.
774
+ * @param {Object} params The inputs for the audio-text-to-text forward pass.
775
+ * @returns {Promise<Tensor>} The model's output tensor.
776
+ * @private
777
+ */
778
+ async function audioTextToTextForward(self, params) {
779
+ return await genericTextToTextForward(self, {
780
+ ...params,
781
+ modality_input_name: 'audio_values',
782
+ modality_output_name: 'audio_features',
783
+ encode_function: self.encode_audio.bind(self),
784
+ merge_function: self._merge_input_ids_with_audio_features.bind(self),
785
+ });
786
+ }
787
+
788
+ /**
789
+ * Forward pass of an image-text-to-text model.
790
+ * @param {Object} self The image-text-to-text model.
791
+ * @param {Object} params The inputs for the image-text-to-text forward pass.
792
+ * @returns {Promise<Tensor>} The model's output tensor.
793
+ * @private
794
+ */
795
+ async function imageTextToTextForward(self, params) {
796
+ return await genericTextToTextForward(self, {
797
+ ...params,
798
+ modality_input_name: 'pixel_values',
799
+ modality_output_name: 'image_features',
800
+ encode_function: self.encode_image.bind(self),
801
+ merge_function: self._merge_input_ids_with_image_features.bind(self),
802
+ });
803
+ }
804
+
703
805
  /**
704
806
  * Helper function to perform the following:
705
807
  * ```python
@@ -813,7 +915,7 @@ function encoder_decoder_prepare_inputs_for_generation(self, input_ids, model_in
813
915
  };
814
916
  }
815
917
 
816
- function image_text_to_text_prepare_inputs_for_generation(self, ...args) {
918
+ function multimodal_text_to_text_prepare_inputs_for_generation(self, ...args) {
817
919
  if (self.config.is_encoder_decoder) {
818
920
  return encoder_decoder_prepare_inputs_for_generation(self, ...args);
819
921
  } else {
@@ -917,18 +1019,24 @@ export class PreTrainedModel extends Callable {
917
1019
  case MODEL_TYPES.ImageTextToText:
918
1020
  this.can_generate = true;
919
1021
  this._forward = imageTextToTextForward;
920
- this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
1022
+ this._prepare_inputs_for_generation = multimodal_text_to_text_prepare_inputs_for_generation;
1023
+ break;
1024
+ case MODEL_TYPES.AudioTextToText:
1025
+ this.can_generate = true;
1026
+ this._forward = audioTextToTextForward;
1027
+ this._prepare_inputs_for_generation = multimodal_text_to_text_prepare_inputs_for_generation;
921
1028
  break;
922
1029
  case MODEL_TYPES.Phi3V:
923
1030
  this.can_generate = true;
924
- this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
1031
+ this._prepare_inputs_for_generation = multimodal_text_to_text_prepare_inputs_for_generation;
925
1032
  break;
926
-
927
1033
  case MODEL_TYPES.MultiModality:
928
1034
  this.can_generate = true;
929
1035
  this._prepare_inputs_for_generation = multimodality_prepare_inputs_for_generation;
930
1036
  break;
931
-
1037
+ case MODEL_TYPES.AutoEncoder:
1038
+ this._forward = autoEncoderForward;
1039
+ break;
932
1040
  default:
933
1041
  // should be MODEL_TYPES.EncoderOnly
934
1042
  this._forward = encoderForward;
@@ -1060,6 +1168,19 @@ export class PreTrainedModel extends Callable {
1060
1168
  }, options),
1061
1169
  ]);
1062
1170
 
1171
+ } else if (modelType === MODEL_TYPES.AudioTextToText) {
1172
+ const sessions = {
1173
+ embed_tokens: 'embed_tokens',
1174
+ audio_encoder: 'audio_encoder',
1175
+ decoder_model_merged: 'decoder_model_merged',
1176
+ }
1177
+ info = await Promise.all([
1178
+ constructSessions(pretrained_model_name_or_path, sessions, options),
1179
+ getOptionalConfigs(pretrained_model_name_or_path, {
1180
+ generation_config: 'generation_config.json',
1181
+ }, options),
1182
+ ]);
1183
+
1063
1184
  } else if (modelType === MODEL_TYPES.Musicgen) {
1064
1185
  info = await Promise.all([
1065
1186
  constructSessions(pretrained_model_name_or_path, {
@@ -1098,7 +1219,13 @@ export class PreTrainedModel extends Callable {
1098
1219
  generation_config: 'generation_config.json',
1099
1220
  }, options),
1100
1221
  ]);
1101
-
1222
+ } else if (modelType === MODEL_TYPES.AutoEncoder) {
1223
+ info = await Promise.all([
1224
+ constructSessions(pretrained_model_name_or_path, {
1225
+ encoder_model: 'encoder_model',
1226
+ decoder_model: 'decoder_model',
1227
+ }, options),
1228
+ ]);
1102
1229
  } else { // should be MODEL_TYPES.EncoderOnly
1103
1230
  if (modelType !== MODEL_TYPES.EncoderOnly) {
1104
1231
  const type = modelName ?? config?.model_type;
@@ -1847,7 +1974,7 @@ export class PreTrainedModel extends Callable {
1847
1974
  } else {
1848
1975
  const session = this.sessions['decoder_model_merged'] ?? this.sessions['model'];
1849
1976
  const dtype = session?.config?.kv_cache_dtype ?? 'float32';
1850
- const empty = (dtype === 'float16') ? new Uint16Array() : [];
1977
+ const empty = (dtype === 'float16') ? new DataTypeMap.float16() : [];
1851
1978
 
1852
1979
  const batch_size = (decoderFeeds[this.main_input_name] ?? decoderFeeds.attention_mask)?.dims?.[0] ?? 1;
1853
1980
  const shapes = getKeyValueShapes(this.config, { batch_size });
@@ -1877,6 +2004,11 @@ export class PreTrainedModel extends Callable {
1877
2004
  // text_inputs === { input_ids, attention_mask }
1878
2005
  return (await sessionRun(this.sessions['embed_tokens'], { input_ids })).inputs_embeds;
1879
2006
  }
2007
+
2008
+ async encode_audio({ audio_values }) {
2009
+ // audio_inputs === { audio_values }
2010
+ return (await sessionRun(this.sessions['audio_encoder'], { audio_values })).audio_features;
2011
+ }
1880
2012
  }
1881
2013
 
1882
2014
  //////////////////////////////////////////////////
@@ -3420,6 +3552,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
3420
3552
  }
3421
3553
  //////////////////////////////////////////////////
3422
3554
 
3555
+ export class LiteWhisperForConditionalGeneration extends WhisperForConditionalGeneration { }
3423
3556
 
3424
3557
  //////////////////////////////////////////////////
3425
3558
  // Moonshine models
@@ -3691,7 +3824,7 @@ export class Idefics3PreTrainedModel extends PreTrainedModel {
3691
3824
  }
3692
3825
 
3693
3826
  /**
3694
- * The LLAVA model which consists of a vision backbone and a language model.
3827
+ * The Idefics3 model which consists of a vision backbone and a language model.
3695
3828
  */
3696
3829
  export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
3697
3830
 
@@ -3714,6 +3847,13 @@ export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
3714
3847
  }
3715
3848
  //////////////////////////////////////////////////
3716
3849
 
3850
+ /**
3851
+ * The SmolVLM Model with a language modeling head.
3852
+ * It is made up of a SigLIP vision encoder, with a language modeling head on top.
3853
+ */
3854
+ export class SmolVLMForConditionalGeneration extends Idefics3ForConditionalGeneration { }
3855
+
3856
+ //////////////////////////////////////////////////
3717
3857
  export class Phi3VPreTrainedModel extends PreTrainedModel {
3718
3858
  forward_params = [
3719
3859
  'input_ids',
@@ -5112,6 +5252,7 @@ export class SwinForImageClassification extends SwinPreTrainedModel {
5112
5252
  return new SequenceClassifierOutput(await super._call(model_inputs));
5113
5253
  }
5114
5254
  }
5255
+ export class SwinForSemanticSegmentation extends SwinPreTrainedModel { }
5115
5256
  //////////////////////////////////////////////////
5116
5257
 
5117
5258
  //////////////////////////////////////////////////
@@ -6714,6 +6855,8 @@ export class MobileNetV1ForImageClassification extends MobileNetV1PreTrainedMode
6714
6855
  return new SequenceClassifierOutput(await super._call(model_inputs));
6715
6856
  }
6716
6857
  }
6858
+
6859
+ export class MobileNetV1ForSemanticSegmentation extends MobileNetV1PreTrainedModel { }
6717
6860
  //////////////////////////////////////////////////
6718
6861
 
6719
6862
  //////////////////////////////////////////////////
@@ -6737,6 +6880,7 @@ export class MobileNetV2ForImageClassification extends MobileNetV2PreTrainedMode
6737
6880
  return new SequenceClassifierOutput(await super._call(model_inputs));
6738
6881
  }
6739
6882
  }
6883
+ export class MobileNetV2ForSemanticSegmentation extends MobileNetV2PreTrainedModel { }
6740
6884
  //////////////////////////////////////////////////
6741
6885
 
6742
6886
  //////////////////////////////////////////////////
@@ -6760,6 +6904,7 @@ export class MobileNetV3ForImageClassification extends MobileNetV3PreTrainedMode
6760
6904
  return new SequenceClassifierOutput(await super._call(model_inputs));
6761
6905
  }
6762
6906
  }
6907
+ export class MobileNetV3ForSemanticSegmentation extends MobileNetV3PreTrainedModel { }
6763
6908
  //////////////////////////////////////////////////
6764
6909
 
6765
6910
  //////////////////////////////////////////////////
@@ -6783,6 +6928,7 @@ export class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedMode
6783
6928
  return new SequenceClassifierOutput(await super._call(model_inputs));
6784
6929
  }
6785
6930
  }
6931
+ export class MobileNetV4ForSemanticSegmentation extends MobileNetV4PreTrainedModel { }
6786
6932
  //////////////////////////////////////////////////
6787
6933
 
6788
6934
  //////////////////////////////////////////////////
@@ -6963,6 +7109,183 @@ export class PatchTSMixerModel extends PatchTSMixerPreTrainedModel { }
6963
7109
  export class PatchTSMixerForPrediction extends PatchTSMixerPreTrainedModel { }
6964
7110
  //////////////////////////////////////////////////
6965
7111
 
7112
+ //////////////////////////////////////////////////
7113
+ export class UltravoxPreTrainedModel extends PreTrainedModel {
7114
+ forward_params = [
7115
+ 'input_ids',
7116
+ 'attention_mask',
7117
+ 'position_ids',
7118
+ 'audio_values',
7119
+ 'past_key_values',
7120
+ ];
7121
+ }
7122
+
7123
+ export class UltravoxModel extends UltravoxPreTrainedModel {
7124
+
7125
+ _merge_input_ids_with_audio_features(kwargs) {
7126
+ const audio_hidden_size = kwargs.audio_features.dims.at(-1);
7127
+ const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
7128
+
7129
+ return default_merge_input_ids_with_audio_features({
7130
+ // @ts-ignore
7131
+ audio_token_id: this.config.ignore_index,
7132
+ ...kwargs,
7133
+ audio_features: reshaped_audio_features,
7134
+ })
7135
+ }
7136
+ }
7137
+ //////////////////////////////////////////////////
7138
+
7139
+ //////////////////////////////////////////////////
7140
+ // Mimi models
7141
+ export class MimiPreTrainedModel extends PreTrainedModel {
7142
+ main_input_name = 'input_values';
7143
+ forward_params = ['input_values'];
7144
+ }
7145
+
7146
+ export class MimiEncoderOutput extends ModelOutput {
7147
+ /**
7148
+ * @param {Object} output The output of the model.
7149
+ * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`.
7150
+ */
7151
+ constructor({ audio_codes }) {
7152
+ super();
7153
+ this.audio_codes = audio_codes;
7154
+ }
7155
+ }
7156
+
7157
+ export class MimiDecoderOutput extends ModelOutput {
7158
+ /**
7159
+ * @param {Object} output The output of the model.
7160
+ * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`.
7161
+ */
7162
+ constructor({ audio_values }) {
7163
+ super();
7164
+ this.audio_values = audio_values;
7165
+ }
7166
+ }
7167
+
7168
+ /**
7169
+ * The Mimi neural audio codec model.
7170
+ */
7171
+ export class MimiModel extends MimiPreTrainedModel {
7172
+ /**
7173
+ * Encodes the input audio waveform into discrete codes.
7174
+ * @param {Object} inputs Model inputs
7175
+ * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`.
7176
+ * @returns {Promise<MimiEncoderOutput>} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`.
7177
+ */
7178
+ async encode(inputs) {
7179
+ return new MimiEncoderOutput(await sessionRun(this.sessions['encoder_model'], inputs));
7180
+ }
7181
+
7182
+ /**
7183
+ * Decodes the given frames into an output audio waveform.
7184
+ * @param {MimiEncoderOutput} inputs The encoded audio codes.
7185
+ * @returns {Promise<MimiDecoderOutput>} The output tensor of shape `(batch_size, num_channels, sequence_length)`.
7186
+ */
7187
+ async decode(inputs) {
7188
+ return new MimiDecoderOutput(await sessionRun(this.sessions['decoder_model'], inputs));
7189
+ }
7190
+ }
7191
+
7192
+ export class MimiEncoderModel extends MimiPreTrainedModel {
7193
+ /** @type {typeof PreTrainedModel.from_pretrained} */
7194
+ static async from_pretrained(pretrained_model_name_or_path, options = {}) {
7195
+ return super.from_pretrained(pretrained_model_name_or_path, {
7196
+ ...options,
7197
+ // Update default model file name if not provided
7198
+ model_file_name: options.model_file_name ?? 'encoder_model',
7199
+ });
7200
+ }
7201
+ }
7202
+ export class MimiDecoderModel extends MimiPreTrainedModel {
7203
+ /** @type {typeof PreTrainedModel.from_pretrained} */
7204
+ static async from_pretrained(pretrained_model_name_or_path, options = {}) {
7205
+ return super.from_pretrained(pretrained_model_name_or_path, {
7206
+ ...options,
7207
+ // Update default model file name if not provided
7208
+ model_file_name: options.model_file_name ?? 'decoder_model',
7209
+ });
7210
+ }
7211
+ }
7212
+ //////////////////////////////////////////////////
7213
+
7214
+
7215
+ //////////////////////////////////////////////////
7216
+ // Dac models
7217
+ export class DacPreTrainedModel extends PreTrainedModel {
7218
+ main_input_name = 'input_values';
7219
+ forward_params = ['input_values'];
7220
+ }
7221
+
7222
+ export class DacEncoderOutput extends ModelOutput {
7223
+ /**
7224
+ * @param {Object} output The output of the model.
7225
+ * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`.
7226
+ */
7227
+ constructor({ audio_codes }) {
7228
+ super();
7229
+ this.audio_codes = audio_codes;
7230
+ }
7231
+ }
7232
+
7233
+ export class DacDecoderOutput extends ModelOutput {
7234
+ /**
7235
+ * @param {Object} output The output of the model.
7236
+ * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`.
7237
+ */
7238
+ constructor({ audio_values }) {
7239
+ super();
7240
+ this.audio_values = audio_values;
7241
+ }
7242
+ }
7243
+
7244
+ /**
7245
+ * The DAC (Descript Audio Codec) model.
7246
+ */
7247
+ export class DacModel extends DacPreTrainedModel {
7248
+ /**
7249
+ * Encodes the input audio waveform into discrete codes.
7250
+ * @param {Object} inputs Model inputs
7251
+ * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`.
7252
+ * @returns {Promise<DacEncoderOutput>} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`.
7253
+ */
7254
+ async encode(inputs) {
7255
+ return new DacEncoderOutput(await sessionRun(this.sessions['encoder_model'], inputs));
7256
+ }
7257
+
7258
+ /**
7259
+ * Decodes the given frames into an output audio waveform.
7260
+ * @param {DacEncoderOutput} inputs The encoded audio codes.
7261
+ * @returns {Promise<DacDecoderOutput>} The output tensor of shape `(batch_size, num_channels, sequence_length)`.
7262
+ */
7263
+ async decode(inputs) {
7264
+ return new DacDecoderOutput(await sessionRun(this.sessions['decoder_model'], inputs));
7265
+ }
7266
+ }
7267
+
7268
+ export class DacEncoderModel extends DacPreTrainedModel {
7269
+ /** @type {typeof PreTrainedModel.from_pretrained} */
7270
+ static async from_pretrained(pretrained_model_name_or_path, options = {}) {
7271
+ return super.from_pretrained(pretrained_model_name_or_path, {
7272
+ ...options,
7273
+ // Update default model file name if not provided
7274
+ model_file_name: options.model_file_name ?? 'encoder_model',
7275
+ });
7276
+ }
7277
+ }
7278
+ export class DacDecoderModel extends DacPreTrainedModel {
7279
+ /** @type {typeof PreTrainedModel.from_pretrained} */
7280
+ static async from_pretrained(pretrained_model_name_or_path, options = {}) {
7281
+ return super.from_pretrained(pretrained_model_name_or_path, {
7282
+ ...options,
7283
+ // Update default model file name if not provided
7284
+ model_file_name: options.model_file_name ?? 'decoder_model',
7285
+ });
7286
+ }
7287
+ }
7288
+ //////////////////////////////////////////////////
6966
7289
 
6967
7290
  //////////////////////////////////////////////////
6968
7291
  // AutoModels, used to simplify construction of PreTrainedModels
@@ -7019,20 +7342,29 @@ export class PretrainedMixin {
7019
7342
  if (!this.MODEL_CLASS_MAPPINGS) {
7020
7343
  throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: " + this.name);
7021
7344
  }
7022
-
7345
+ const model_type = options.config.model_type;
7023
7346
  for (const MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
7024
- const modelInfo = MODEL_CLASS_MAPPING.get(options.config.model_type);
7347
+ let modelInfo = MODEL_CLASS_MAPPING.get(model_type);
7025
7348
  if (!modelInfo) {
7026
- continue; // Item not found in this mapping
7349
+ // As a fallback, we check if model_type is specified as the exact class
7350
+ for (const cls of MODEL_CLASS_MAPPING.values()) {
7351
+ if (cls[0] === model_type) {
7352
+ modelInfo = cls;
7353
+ break;
7354
+ }
7355
+ }
7356
+ if (!modelInfo) continue; // Item not found in this mapping
7027
7357
  }
7028
7358
  return await modelInfo[1].from_pretrained(pretrained_model_name_or_path, options);
7029
7359
  }
7030
7360
 
7031
7361
  if (this.BASE_IF_FAIL) {
7032
- console.warn(`Unknown model class "${options.config.model_type}", attempting to construct from base class.`);
7362
+ if (!(CUSTOM_ARCHITECTURES.has(model_type))) {
7363
+ console.warn(`Unknown model class "${model_type}", attempting to construct from base class.`);
7364
+ }
7033
7365
  return await PreTrainedModel.from_pretrained(pretrained_model_name_or_path, options);
7034
7366
  } else {
7035
- throw Error(`Unsupported model type: ${options.config.model_type}`)
7367
+ throw Error(`Unsupported model type: ${model_type}`)
7036
7368
  }
7037
7369
  }
7038
7370
  }
@@ -7133,6 +7465,10 @@ const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
7133
7465
  ['blenderbot-small', ['BlenderbotSmallModel', BlenderbotSmallModel]],
7134
7466
  ]);
7135
7467
 
7468
+ const MODEL_MAPPING_NAMES_AUTO_ENCODER = new Map([
7469
+ ['mimi', ['MimiModel', MimiModel]],
7470
+ ['dac', ['DacModel', DacModel]],
7471
+ ]);
7136
7472
 
7137
7473
  const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
7138
7474
  ['bloom', ['BloomModel', BloomModel]],
@@ -7169,6 +7505,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
7169
7505
  const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([
7170
7506
  ['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]],
7171
7507
  ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]],
7508
+ ['lite-whisper', ['LiteWhisperForConditionalGeneration', LiteWhisperForConditionalGeneration]],
7172
7509
  ['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]],
7173
7510
  ]);
7174
7511
 
@@ -7315,6 +7652,7 @@ const MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
7315
7652
  const MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = new Map([
7316
7653
  ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]],
7317
7654
  ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
7655
+ ['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]],
7318
7656
  ]);
7319
7657
 
7320
7658
  const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
@@ -7324,9 +7662,15 @@ const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
7324
7662
  ['florence2', ['Florence2ForConditionalGeneration', Florence2ForConditionalGeneration]],
7325
7663
  ['qwen2-vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]],
7326
7664
  ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
7665
+ ['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]],
7327
7666
  ['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]],
7328
7667
  ]);
7329
7668
 
7669
+ const MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
7670
+ ['ultravox', ['UltravoxModel', UltravoxModel]],
7671
+ ]);
7672
+
7673
+
7330
7674
  const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
7331
7675
  ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]],
7332
7676
  ]);
@@ -7378,6 +7722,12 @@ const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
7378
7722
  const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([
7379
7723
  ['segformer', ['SegformerForSemanticSegmentation', SegformerForSemanticSegmentation]],
7380
7724
  ['sapiens', ['SapiensForSemanticSegmentation', SapiensForSemanticSegmentation]],
7725
+
7726
+ ['swin', ['SwinForSemanticSegmentation', SwinForSemanticSegmentation]],
7727
+ ['mobilenet_v1', ['MobileNetV1ForSemanticSegmentation', MobileNetV1ForSemanticSegmentation]],
7728
+ ['mobilenet_v2', ['MobileNetV2ForSemanticSegmentation', MobileNetV2ForSemanticSegmentation]],
7729
+ ['mobilenet_v3', ['MobileNetV3ForSemanticSegmentation', MobileNetV3ForSemanticSegmentation]],
7730
+ ['mobilenet_v4', ['MobileNetV4ForSemanticSegmentation', MobileNetV4ForSemanticSegmentation]],
7381
7731
  ]);
7382
7732
 
7383
7733
  const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([
@@ -7457,9 +7807,12 @@ const MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES = new Map([
7457
7807
  ])
7458
7808
 
7459
7809
  const MODEL_CLASS_TYPE_MAPPING = [
7810
+ // MODEL_MAPPING_NAMES:
7460
7811
  [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES.EncoderOnly],
7461
7812
  [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES.EncoderDecoder],
7462
7813
  [MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES.DecoderOnly],
7814
+ [MODEL_MAPPING_NAMES_AUTO_ENCODER, MODEL_TYPES.AutoEncoder],
7815
+
7463
7816
  [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
7464
7817
  [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
7465
7818
  [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq],
@@ -7470,6 +7823,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
7470
7823
  [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
7471
7824
  [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Vision2Seq],
7472
7825
  [MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.ImageTextToText],
7826
+ [MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.AudioTextToText],
7473
7827
  [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
7474
7828
  [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
7475
7829
  [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
@@ -7514,6 +7868,11 @@ const CUSTOM_MAPPING = [
7514
7868
  ['JinaCLIPTextModel', JinaCLIPTextModel, MODEL_TYPES.EncoderOnly],
7515
7869
  ['ClapTextModelWithProjection', ClapTextModelWithProjection, MODEL_TYPES.EncoderOnly],
7516
7870
  ['ClapAudioModelWithProjection', ClapAudioModelWithProjection, MODEL_TYPES.EncoderOnly],
7871
+
7872
+ ['DacEncoderModel', DacEncoderModel, MODEL_TYPES.EncoderOnly],
7873
+ ['DacDecoderModel', DacDecoderModel, MODEL_TYPES.EncoderOnly],
7874
+ ['MimiEncoderModel', MimiEncoderModel, MODEL_TYPES.EncoderOnly],
7875
+ ['MimiDecoderModel', MimiDecoderModel, MODEL_TYPES.EncoderOnly],
7517
7876
  ]
7518
7877
  for (const [name, model, type] of CUSTOM_MAPPING) {
7519
7878
  MODEL_TYPE_MAPPING.set(name, type);
@@ -7521,6 +7880,19 @@ for (const [name, model, type] of CUSTOM_MAPPING) {
7521
7880
  MODEL_NAME_TO_CLASS_MAPPING.set(name, model);
7522
7881
  }
7523
7882
 
7883
+ const CUSTOM_ARCHITECTURES = new Map([
7884
+ ['modnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
7885
+ ['birefnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
7886
+ ['isnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
7887
+ ['ben', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
7888
+ ]);
7889
+ for (const [name, mapping] of CUSTOM_ARCHITECTURES.entries()) {
7890
+ mapping.set(name, ['PreTrainedModel', PreTrainedModel])
7891
+ MODEL_TYPE_MAPPING.set(name, MODEL_TYPES.EncoderOnly);
7892
+ MODEL_CLASS_TO_NAME_MAPPING.set(PreTrainedModel, name);
7893
+ MODEL_NAME_TO_CLASS_MAPPING.set(name, PreTrainedModel);
7894
+ }
7895
+
7524
7896
 
7525
7897
  /**
7526
7898
  * Helper class which is used to instantiate pretrained models with the `from_pretrained` function.
@@ -7761,6 +8133,14 @@ export class AutoModelForImageFeatureExtraction extends PretrainedMixin {
7761
8133
  static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES];
7762
8134
  }
7763
8135
 
8136
+ export class AutoModelForImageTextToText extends PretrainedMixin {
8137
+ static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES];
8138
+ }
8139
+
8140
+ export class AutoModelForAudioTextToText extends PretrainedMixin {
8141
+ static MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES];
8142
+ }
8143
+
7764
8144
  //////////////////////////////////////////////////
7765
8145
 
7766
8146
  //////////////////////////////////////////////////