@huggingface/transformers 3.3.3 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -3
- package/dist/ort-wasm-simd-threaded.jsep.mjs +124 -115
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.js +2480 -1457
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/{transformers.cjs → transformers.node.cjs} +1412 -2395
- package/dist/transformers.node.cjs.map +1 -0
- package/dist/transformers.node.min.cjs +2 -0
- package/dist/transformers.node.min.cjs.map +1 -0
- package/dist/transformers.node.min.mjs +2 -0
- package/dist/transformers.node.min.mjs.map +1 -0
- package/dist/{transformers.mjs → transformers.node.mjs} +1440 -2375
- package/dist/transformers.node.mjs.map +1 -0
- package/dist/transformers.web.js +35713 -0
- package/dist/transformers.web.js.map +1 -0
- package/dist/transformers.web.min.js +2 -0
- package/dist/transformers.web.min.js.map +1 -0
- package/package.json +6 -6
- package/src/backends/onnx.js +14 -15
- package/src/configs.js +4 -1
- package/src/env.js +1 -1
- package/src/generation/streamers.js +4 -3
- package/src/models/dac/feature_extraction_dac.js +3 -0
- package/src/models/encodec/feature_extraction_encodec.js +32 -0
- package/src/models/feature_extractors.js +2 -0
- package/src/models/idefics3/image_processing_idefics3.js +1 -1
- package/src/models/image_processors.js +1 -0
- package/src/models/processors.js +2 -0
- package/src/models/smolvlm/image_processing_smolvlm.js +2 -0
- package/src/models/smolvlm/processing_smolvlm.js +2 -0
- package/src/models/ultravox/processing_ultravox.js +54 -0
- package/src/models/whisper/common_whisper.js +7 -1
- package/src/models/whisper/feature_extraction_whisper.js +18 -10
- package/src/models.js +456 -76
- package/src/pipelines.js +111 -7
- package/src/tokenizers.js +42 -28
- package/src/transformers.js +1 -0
- package/src/utils/audio.js +2 -0
- package/src/utils/hub.js +140 -80
- package/src/utils/maths.js +1 -1
- package/src/utils/tensor.js +6 -3
- package/src/utils/video.js +128 -0
- package/types/backends/onnx.d.ts +2 -2
- package/types/backends/onnx.d.ts.map +1 -1
- package/types/configs.d.ts +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/generation/streamers.d.ts.map +1 -1
- package/types/models/dac/feature_extraction_dac.d.ts +4 -0
- package/types/models/dac/feature_extraction_dac.d.ts.map +1 -0
- package/types/models/encodec/feature_extraction_encodec.d.ts +13 -0
- package/types/models/encodec/feature_extraction_encodec.d.ts.map +1 -0
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/florence2/processing_florence2.d.ts +1 -1
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/processors.d.ts +2 -0
- package/types/models/smolvlm/image_processing_smolvlm.d.ts +2 -0
- package/types/models/smolvlm/image_processing_smolvlm.d.ts.map +1 -0
- package/types/models/smolvlm/processing_smolvlm.d.ts +2 -0
- package/types/models/smolvlm/processing_smolvlm.d.ts.map +1 -0
- package/types/models/ultravox/processing_ultravox.d.ts +16 -0
- package/types/models/ultravox/processing_ultravox.d.ts.map +1 -0
- package/types/models/whisper/common_whisper.d.ts.map +1 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts +3 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/models.d.ts +132 -4
- package/types/models.d.ts.map +1 -1
- package/types/pipelines.d.ts +50 -4
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/tsconfig.tsbuildinfo +1 -1
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/hub.d.ts +19 -7
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/maths.d.ts +2 -2
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +17 -18
- package/types/utils/tensor.d.ts.map +1 -1
- package/types/utils/video.d.ts +37 -0
- package/types/utils/video.d.ts.map +1 -0
- package/dist/transformers.cjs.map +0 -1
- package/dist/transformers.min.cjs +0 -2
- package/dist/transformers.min.cjs.map +0 -1
- package/dist/transformers.min.mjs +0 -2
- package/dist/transformers.min.mjs.map +0 -1
- package/dist/transformers.mjs.map +0 -1
package/src/models.js
CHANGED
|
@@ -68,6 +68,7 @@ import {
|
|
|
68
68
|
import {
|
|
69
69
|
getModelFile,
|
|
70
70
|
getModelJSON,
|
|
71
|
+
MAX_EXTERNAL_DATA_CHUNKS,
|
|
71
72
|
} from './utils/hub.js';
|
|
72
73
|
|
|
73
74
|
import {
|
|
@@ -108,6 +109,7 @@ import {
|
|
|
108
109
|
stack,
|
|
109
110
|
std_mean,
|
|
110
111
|
Tensor,
|
|
112
|
+
DataTypeMap,
|
|
111
113
|
} from './utils/tensor.js';
|
|
112
114
|
import { RawImage } from './utils/image.js';
|
|
113
115
|
|
|
@@ -132,6 +134,8 @@ const MODEL_TYPES = {
|
|
|
132
134
|
Musicgen: 7,
|
|
133
135
|
MultiModality: 8,
|
|
134
136
|
Phi3V: 9,
|
|
137
|
+
AudioTextToText: 10,
|
|
138
|
+
AutoEncoder: 11,
|
|
135
139
|
}
|
|
136
140
|
//////////////////////////////////////////////////
|
|
137
141
|
|
|
@@ -150,7 +154,7 @@ const MODEL_CLASS_TO_NAME_MAPPING = new Map();
|
|
|
150
154
|
* @param {string} pretrained_model_name_or_path The path to the directory containing the model file.
|
|
151
155
|
* @param {string} fileName The name of the model file.
|
|
152
156
|
* @param {import('./utils/hub.js').PretrainedModelOptions} options Additional options for loading the model.
|
|
153
|
-
* @returns {Promise<{
|
|
157
|
+
* @returns {Promise<{buffer_or_path: Uint8Array|string, session_options: Object, session_config: Object}>} A Promise that resolves to the data needed to create an InferenceSession object.
|
|
154
158
|
* @private
|
|
155
159
|
*/
|
|
156
160
|
async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
@@ -225,7 +229,8 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
225
229
|
|
|
226
230
|
// Construct the model file name
|
|
227
231
|
const suffix = DEFAULT_DTYPE_SUFFIX_MAPPING[selectedDtype];
|
|
228
|
-
const
|
|
232
|
+
const baseName = `${fileName}${suffix}.onnx`;
|
|
233
|
+
const modelFileName = `${options.subfolder ?? ''}/${baseName}`;
|
|
229
234
|
|
|
230
235
|
const session_options = { ...options.session_options };
|
|
231
236
|
|
|
@@ -243,29 +248,38 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
243
248
|
);
|
|
244
249
|
}
|
|
245
250
|
|
|
246
|
-
const
|
|
251
|
+
const bufferOrPathPromise = getModelFile(pretrained_model_name_or_path, modelFileName, true, options, apis.IS_NODE_ENV);
|
|
247
252
|
|
|
248
253
|
// handle onnx external data files
|
|
249
254
|
const use_external_data_format = options.use_external_data_format ?? custom_config.use_external_data_format;
|
|
250
|
-
/** @type {Promise<{path: string, data: Uint8Array}>[]} */
|
|
255
|
+
/** @type {Promise<string|{path: string, data: Uint8Array}>[]} */
|
|
251
256
|
let externalDataPromises = [];
|
|
252
|
-
if (use_external_data_format
|
|
253
|
-
|
|
254
|
-
(
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
use_external_data_format
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
257
|
+
if (use_external_data_format) {
|
|
258
|
+
let external_data_format;
|
|
259
|
+
if (typeof use_external_data_format === 'object') {
|
|
260
|
+
if (use_external_data_format.hasOwnProperty(baseName)) {
|
|
261
|
+
external_data_format = use_external_data_format[baseName];
|
|
262
|
+
} else if (use_external_data_format.hasOwnProperty(fileName)) {
|
|
263
|
+
external_data_format = use_external_data_format[fileName];
|
|
264
|
+
} else {
|
|
265
|
+
external_data_format = false;
|
|
266
|
+
}
|
|
267
|
+
} else {
|
|
268
|
+
external_data_format = use_external_data_format;
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
const num_chunks = +external_data_format; // (false=0, true=1, number remains the same)
|
|
272
|
+
if (num_chunks > MAX_EXTERNAL_DATA_CHUNKS) {
|
|
273
|
+
throw new Error(`The number of external data chunks (${num_chunks}) exceeds the maximum allowed value (${MAX_EXTERNAL_DATA_CHUNKS}).`);
|
|
274
|
+
}
|
|
275
|
+
for (let i = 0; i < num_chunks; ++i) {
|
|
276
|
+
const path = `${baseName}_data${i === 0 ? '' : '_' + i}`;
|
|
277
|
+
const fullPath = `${options.subfolder ?? ''}/${path}`;
|
|
278
|
+
externalDataPromises.push(new Promise(async (resolve, reject) => {
|
|
279
|
+
const data = await getModelFile(pretrained_model_name_or_path, fullPath, true, options, apis.IS_NODE_ENV);
|
|
280
|
+
resolve(data instanceof Uint8Array ? { path, data } : path);
|
|
281
|
+
}));
|
|
262
282
|
}
|
|
263
|
-
const path = `${fileName}${suffix}.onnx_data`;
|
|
264
|
-
const fullPath = `${options.subfolder ?? ''}/${path}`;
|
|
265
|
-
externalDataPromises.push(new Promise(async (resolve, reject) => {
|
|
266
|
-
const data = await getModelFile(pretrained_model_name_or_path, fullPath, true, options);
|
|
267
|
-
resolve({ path, data })
|
|
268
|
-
}));
|
|
269
283
|
|
|
270
284
|
} else if (session_options.externalData !== undefined) {
|
|
271
285
|
externalDataPromises = session_options.externalData.map(async (ext) => {
|
|
@@ -282,7 +296,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
282
296
|
}
|
|
283
297
|
|
|
284
298
|
if (externalDataPromises.length > 0) {
|
|
285
|
-
|
|
299
|
+
const externalData = await Promise.all(externalDataPromises);
|
|
300
|
+
if (!apis.IS_NODE_ENV) {
|
|
301
|
+
session_options.externalData = externalData;
|
|
302
|
+
}
|
|
286
303
|
}
|
|
287
304
|
|
|
288
305
|
if (selectedDevice === 'webgpu') {
|
|
@@ -300,9 +317,9 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
300
317
|
}
|
|
301
318
|
}
|
|
302
319
|
|
|
303
|
-
const
|
|
320
|
+
const buffer_or_path = await bufferOrPathPromise;
|
|
304
321
|
|
|
305
|
-
return {
|
|
322
|
+
return { buffer_or_path, session_options, session_config };
|
|
306
323
|
}
|
|
307
324
|
|
|
308
325
|
/**
|
|
@@ -317,8 +334,8 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
317
334
|
async function constructSessions(pretrained_model_name_or_path, names, options) {
|
|
318
335
|
return Object.fromEntries(await Promise.all(
|
|
319
336
|
Object.keys(names).map(async (name) => {
|
|
320
|
-
const {
|
|
321
|
-
const session = await createInferenceSession(
|
|
337
|
+
const { buffer_or_path, session_options, session_config } = await getSession(pretrained_model_name_or_path, names[name], options);
|
|
338
|
+
const session = await createInferenceSession(buffer_or_path, session_options, session_config);
|
|
322
339
|
return [name, session];
|
|
323
340
|
})
|
|
324
341
|
));
|
|
@@ -548,10 +565,16 @@ async function encoderForward(self, model_inputs) {
|
|
|
548
565
|
const dims = encoderFeeds.pixel_values.dims;
|
|
549
566
|
encoderFeeds.pixel_mask = ones([dims[0], dims[2], dims[3]]);
|
|
550
567
|
}
|
|
551
|
-
|
|
568
|
+
|
|
552
569
|
return await sessionRun(session, encoderFeeds);
|
|
553
570
|
}
|
|
554
571
|
|
|
572
|
+
async function autoEncoderForward(self, model_inputs) {
|
|
573
|
+
const encoded = await self.encode(model_inputs);
|
|
574
|
+
const decoded = await self.decode(encoded);
|
|
575
|
+
return decoded;
|
|
576
|
+
}
|
|
577
|
+
|
|
555
578
|
/**
|
|
556
579
|
* Forward pass of a decoder model.
|
|
557
580
|
* @param {Object} self The decoder model.
|
|
@@ -586,58 +609,98 @@ async function decoderForward(self, model_inputs, is_encoder_decoder = false) {
|
|
|
586
609
|
|
|
587
610
|
|
|
588
611
|
|
|
589
|
-
function
|
|
590
|
-
|
|
612
|
+
function default_merge_input_ids_with_features({
|
|
613
|
+
modality_token_id,
|
|
591
614
|
inputs_embeds,
|
|
592
|
-
|
|
615
|
+
modality_features,
|
|
593
616
|
input_ids,
|
|
594
617
|
attention_mask,
|
|
595
618
|
}) {
|
|
596
|
-
const
|
|
619
|
+
const token_positions = input_ids.tolist().map(ids =>
|
|
597
620
|
ids.reduce((acc, x, idx) => {
|
|
598
|
-
if (x ==
|
|
621
|
+
if (x == modality_token_id) acc.push(idx);
|
|
599
622
|
return acc;
|
|
600
623
|
}, [])
|
|
601
624
|
);
|
|
602
|
-
const
|
|
603
|
-
const
|
|
604
|
-
if (
|
|
605
|
-
throw new Error(`
|
|
625
|
+
const n_tokens = token_positions.reduce((acc, x) => acc + x.length, 0);
|
|
626
|
+
const n_features = modality_features.dims[0];
|
|
627
|
+
if (n_tokens !== n_features) {
|
|
628
|
+
throw new Error(`Number of tokens and features do not match: tokens: ${n_tokens}, features ${n_features}`);
|
|
606
629
|
}
|
|
607
630
|
|
|
608
631
|
// Equivalent to performing a masked_scatter
|
|
609
632
|
let img = 0;
|
|
610
|
-
for (let i = 0; i <
|
|
611
|
-
const tokens =
|
|
633
|
+
for (let i = 0; i < token_positions.length; ++i) {
|
|
634
|
+
const tokens = token_positions[i];
|
|
612
635
|
const embeds = inputs_embeds[i];
|
|
613
636
|
for (let j = 0; j < tokens.length; ++j) {
|
|
614
|
-
embeds[tokens[j]].data.set(
|
|
637
|
+
embeds[tokens[j]].data.set(modality_features[img++].data)
|
|
615
638
|
}
|
|
616
639
|
}
|
|
617
640
|
return { inputs_embeds, attention_mask }
|
|
618
641
|
}
|
|
619
642
|
|
|
620
643
|
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
644
|
+
function default_merge_input_ids_with_image_features({
|
|
645
|
+
image_token_id,
|
|
646
|
+
inputs_embeds,
|
|
647
|
+
image_features,
|
|
648
|
+
input_ids,
|
|
649
|
+
attention_mask,
|
|
650
|
+
}) {
|
|
651
|
+
return default_merge_input_ids_with_features({
|
|
652
|
+
modality_token_id: image_token_id,
|
|
653
|
+
inputs_embeds,
|
|
654
|
+
modality_features: image_features,
|
|
655
|
+
input_ids,
|
|
656
|
+
attention_mask,
|
|
657
|
+
})
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
function default_merge_input_ids_with_audio_features({
|
|
661
|
+
audio_token_id,
|
|
662
|
+
inputs_embeds,
|
|
663
|
+
audio_features,
|
|
664
|
+
input_ids,
|
|
665
|
+
attention_mask,
|
|
666
|
+
}) {
|
|
667
|
+
return default_merge_input_ids_with_features({
|
|
668
|
+
modality_token_id: audio_token_id,
|
|
669
|
+
inputs_embeds,
|
|
670
|
+
modality_features: audio_features,
|
|
671
|
+
input_ids,
|
|
672
|
+
attention_mask,
|
|
673
|
+
})
|
|
674
|
+
}
|
|
675
|
+
|
|
676
|
+
/**
|
|
677
|
+
* Abstract forward pass function for image-text-to-text or audio-text-to-text models.
|
|
678
|
+
* @param {Object} self The model object.
|
|
679
|
+
* @param {Object} params Additional parameters.
|
|
680
|
+
* @param {Function} [params.encode_function] The function to encode the modality values.
|
|
681
|
+
* @param {Function} [params.merge_function] The function to merge the modality features with the input embeddings.
|
|
682
|
+
* @param {string} [params.modality_input_name] The modality input name.
|
|
683
|
+
* @param {string} [params.modality_output_name] The modality output name.
|
|
684
|
+
* @param {Tensor} [params.input_ids=null]
|
|
685
|
+
* @param {Tensor} [params.attention_mask=null]
|
|
686
|
+
* @param {Tensor} [params.position_ids=null]
|
|
687
|
+
* @param {Tensor} [params.inputs_embeds=null]
|
|
688
|
+
* @param {Tensor} [params.past_key_values=null]
|
|
689
|
+
* @param {Object} [params.generation_config=null]
|
|
690
|
+
* @param {Object} [params.logits_processor=null]
|
|
633
691
|
* @returns {Promise<Tensor>} The model's output tensor
|
|
634
692
|
* @private
|
|
635
693
|
*/
|
|
636
|
-
async function
|
|
694
|
+
async function genericTextToTextForward(self, {
|
|
695
|
+
// Generic parameters:
|
|
696
|
+
encode_function,
|
|
697
|
+
merge_function,
|
|
698
|
+
modality_input_name,
|
|
699
|
+
modality_output_name,
|
|
700
|
+
|
|
637
701
|
// Produced by the tokenizer/processor:
|
|
638
702
|
input_ids = null,
|
|
639
703
|
attention_mask = null,
|
|
640
|
-
pixel_values = null,
|
|
641
704
|
|
|
642
705
|
// Used during generation:
|
|
643
706
|
position_ids = null,
|
|
@@ -648,27 +711,31 @@ async function imageTextToTextForward(self, {
|
|
|
648
711
|
generation_config = null,
|
|
649
712
|
logits_processor = null,
|
|
650
713
|
|
|
651
|
-
//
|
|
714
|
+
// Additional parameters
|
|
652
715
|
...kwargs
|
|
653
716
|
}) {
|
|
654
|
-
|
|
717
|
+
const modality_values = kwargs[modality_input_name];
|
|
655
718
|
if (!inputs_embeds) {
|
|
656
|
-
// 1. Extract the
|
|
719
|
+
// 1. Extract the text embeddings.
|
|
657
720
|
inputs_embeds = await self.encode_text({ input_ids, ...kwargs });
|
|
658
721
|
|
|
659
|
-
// 2. Possibly, merge text and
|
|
660
|
-
if (
|
|
661
|
-
const
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
722
|
+
// 2. Possibly, merge text and modality values
|
|
723
|
+
if (modality_values && input_ids.dims[1] !== 1) {
|
|
724
|
+
const modality_features = await encode_function({
|
|
725
|
+
// Pass the modality values under its expected key.
|
|
726
|
+
// The caller knows whether this is audio or image.
|
|
727
|
+
[modality_input_name]: modality_values,
|
|
728
|
+
...kwargs
|
|
729
|
+
});
|
|
730
|
+
({ inputs_embeds, attention_mask } = merge_function({
|
|
731
|
+
[modality_output_name]: modality_features,
|
|
665
732
|
inputs_embeds,
|
|
666
733
|
input_ids,
|
|
667
734
|
attention_mask,
|
|
668
735
|
}));
|
|
669
736
|
|
|
670
|
-
} else if (past_key_values &&
|
|
671
|
-
// This
|
|
737
|
+
} else if (past_key_values && modality_values && input_ids.dims[1] === 1) {
|
|
738
|
+
// This branch handles the cache case.
|
|
672
739
|
const target_length = input_ids.dims[1]; // always 1
|
|
673
740
|
const past_length = Object.values(past_key_values)[0].dims.at(-2);
|
|
674
741
|
|
|
@@ -689,6 +756,7 @@ async function imageTextToTextForward(self, {
|
|
|
689
756
|
}
|
|
690
757
|
}
|
|
691
758
|
|
|
759
|
+
// 3. Call the decoder forward using the updated inputs.
|
|
692
760
|
const outputs = await decoderForward(self, {
|
|
693
761
|
inputs_embeds,
|
|
694
762
|
past_key_values,
|
|
@@ -700,6 +768,40 @@ async function imageTextToTextForward(self, {
|
|
|
700
768
|
return outputs;
|
|
701
769
|
}
|
|
702
770
|
|
|
771
|
+
/**
|
|
772
|
+
* Forward pass of an audio-text-to-text model.
|
|
773
|
+
* @param {Object} self The audio-text-to-text model.
|
|
774
|
+
* @param {Object} params The inputs for the audio-text-to-text forward pass.
|
|
775
|
+
* @returns {Promise<Tensor>} The model's output tensor.
|
|
776
|
+
* @private
|
|
777
|
+
*/
|
|
778
|
+
async function audioTextToTextForward(self, params) {
|
|
779
|
+
return await genericTextToTextForward(self, {
|
|
780
|
+
...params,
|
|
781
|
+
modality_input_name: 'audio_values',
|
|
782
|
+
modality_output_name: 'audio_features',
|
|
783
|
+
encode_function: self.encode_audio.bind(self),
|
|
784
|
+
merge_function: self._merge_input_ids_with_audio_features.bind(self),
|
|
785
|
+
});
|
|
786
|
+
}
|
|
787
|
+
|
|
788
|
+
/**
|
|
789
|
+
* Forward pass of an image-text-to-text model.
|
|
790
|
+
* @param {Object} self The image-text-to-text model.
|
|
791
|
+
* @param {Object} params The inputs for the image-text-to-text forward pass.
|
|
792
|
+
* @returns {Promise<Tensor>} The model's output tensor.
|
|
793
|
+
* @private
|
|
794
|
+
*/
|
|
795
|
+
async function imageTextToTextForward(self, params) {
|
|
796
|
+
return await genericTextToTextForward(self, {
|
|
797
|
+
...params,
|
|
798
|
+
modality_input_name: 'pixel_values',
|
|
799
|
+
modality_output_name: 'image_features',
|
|
800
|
+
encode_function: self.encode_image.bind(self),
|
|
801
|
+
merge_function: self._merge_input_ids_with_image_features.bind(self),
|
|
802
|
+
});
|
|
803
|
+
}
|
|
804
|
+
|
|
703
805
|
/**
|
|
704
806
|
* Helper function to perform the following:
|
|
705
807
|
* ```python
|
|
@@ -813,7 +915,7 @@ function encoder_decoder_prepare_inputs_for_generation(self, input_ids, model_in
|
|
|
813
915
|
};
|
|
814
916
|
}
|
|
815
917
|
|
|
816
|
-
function
|
|
918
|
+
function multimodal_text_to_text_prepare_inputs_for_generation(self, ...args) {
|
|
817
919
|
if (self.config.is_encoder_decoder) {
|
|
818
920
|
return encoder_decoder_prepare_inputs_for_generation(self, ...args);
|
|
819
921
|
} else {
|
|
@@ -917,18 +1019,24 @@ export class PreTrainedModel extends Callable {
|
|
|
917
1019
|
case MODEL_TYPES.ImageTextToText:
|
|
918
1020
|
this.can_generate = true;
|
|
919
1021
|
this._forward = imageTextToTextForward;
|
|
920
|
-
this._prepare_inputs_for_generation =
|
|
1022
|
+
this._prepare_inputs_for_generation = multimodal_text_to_text_prepare_inputs_for_generation;
|
|
1023
|
+
break;
|
|
1024
|
+
case MODEL_TYPES.AudioTextToText:
|
|
1025
|
+
this.can_generate = true;
|
|
1026
|
+
this._forward = audioTextToTextForward;
|
|
1027
|
+
this._prepare_inputs_for_generation = multimodal_text_to_text_prepare_inputs_for_generation;
|
|
921
1028
|
break;
|
|
922
1029
|
case MODEL_TYPES.Phi3V:
|
|
923
1030
|
this.can_generate = true;
|
|
924
|
-
this._prepare_inputs_for_generation =
|
|
1031
|
+
this._prepare_inputs_for_generation = multimodal_text_to_text_prepare_inputs_for_generation;
|
|
925
1032
|
break;
|
|
926
|
-
|
|
927
1033
|
case MODEL_TYPES.MultiModality:
|
|
928
1034
|
this.can_generate = true;
|
|
929
1035
|
this._prepare_inputs_for_generation = multimodality_prepare_inputs_for_generation;
|
|
930
1036
|
break;
|
|
931
|
-
|
|
1037
|
+
case MODEL_TYPES.AutoEncoder:
|
|
1038
|
+
this._forward = autoEncoderForward;
|
|
1039
|
+
break;
|
|
932
1040
|
default:
|
|
933
1041
|
// should be MODEL_TYPES.EncoderOnly
|
|
934
1042
|
this._forward = encoderForward;
|
|
@@ -1060,6 +1168,19 @@ export class PreTrainedModel extends Callable {
|
|
|
1060
1168
|
}, options),
|
|
1061
1169
|
]);
|
|
1062
1170
|
|
|
1171
|
+
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
1172
|
+
const sessions = {
|
|
1173
|
+
embed_tokens: 'embed_tokens',
|
|
1174
|
+
audio_encoder: 'audio_encoder',
|
|
1175
|
+
decoder_model_merged: 'decoder_model_merged',
|
|
1176
|
+
}
|
|
1177
|
+
info = await Promise.all([
|
|
1178
|
+
constructSessions(pretrained_model_name_or_path, sessions, options),
|
|
1179
|
+
getOptionalConfigs(pretrained_model_name_or_path, {
|
|
1180
|
+
generation_config: 'generation_config.json',
|
|
1181
|
+
}, options),
|
|
1182
|
+
]);
|
|
1183
|
+
|
|
1063
1184
|
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
1064
1185
|
info = await Promise.all([
|
|
1065
1186
|
constructSessions(pretrained_model_name_or_path, {
|
|
@@ -1098,7 +1219,13 @@ export class PreTrainedModel extends Callable {
|
|
|
1098
1219
|
generation_config: 'generation_config.json',
|
|
1099
1220
|
}, options),
|
|
1100
1221
|
]);
|
|
1101
|
-
|
|
1222
|
+
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
1223
|
+
info = await Promise.all([
|
|
1224
|
+
constructSessions(pretrained_model_name_or_path, {
|
|
1225
|
+
encoder_model: 'encoder_model',
|
|
1226
|
+
decoder_model: 'decoder_model',
|
|
1227
|
+
}, options),
|
|
1228
|
+
]);
|
|
1102
1229
|
} else { // should be MODEL_TYPES.EncoderOnly
|
|
1103
1230
|
if (modelType !== MODEL_TYPES.EncoderOnly) {
|
|
1104
1231
|
const type = modelName ?? config?.model_type;
|
|
@@ -1847,7 +1974,7 @@ export class PreTrainedModel extends Callable {
|
|
|
1847
1974
|
} else {
|
|
1848
1975
|
const session = this.sessions['decoder_model_merged'] ?? this.sessions['model'];
|
|
1849
1976
|
const dtype = session?.config?.kv_cache_dtype ?? 'float32';
|
|
1850
|
-
const empty = (dtype === 'float16') ? new
|
|
1977
|
+
const empty = (dtype === 'float16') ? new DataTypeMap.float16() : [];
|
|
1851
1978
|
|
|
1852
1979
|
const batch_size = (decoderFeeds[this.main_input_name] ?? decoderFeeds.attention_mask)?.dims?.[0] ?? 1;
|
|
1853
1980
|
const shapes = getKeyValueShapes(this.config, { batch_size });
|
|
@@ -1877,6 +2004,11 @@ export class PreTrainedModel extends Callable {
|
|
|
1877
2004
|
// text_inputs === { input_ids, attention_mask }
|
|
1878
2005
|
return (await sessionRun(this.sessions['embed_tokens'], { input_ids })).inputs_embeds;
|
|
1879
2006
|
}
|
|
2007
|
+
|
|
2008
|
+
async encode_audio({ audio_values }) {
|
|
2009
|
+
// audio_inputs === { audio_values }
|
|
2010
|
+
return (await sessionRun(this.sessions['audio_encoder'], { audio_values })).audio_features;
|
|
2011
|
+
}
|
|
1880
2012
|
}
|
|
1881
2013
|
|
|
1882
2014
|
//////////////////////////////////////////////////
|
|
@@ -3420,6 +3552,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
3420
3552
|
}
|
|
3421
3553
|
//////////////////////////////////////////////////
|
|
3422
3554
|
|
|
3555
|
+
export class LiteWhisperForConditionalGeneration extends WhisperForConditionalGeneration { }
|
|
3423
3556
|
|
|
3424
3557
|
//////////////////////////////////////////////////
|
|
3425
3558
|
// Moonshine models
|
|
@@ -3691,7 +3824,7 @@ export class Idefics3PreTrainedModel extends PreTrainedModel {
|
|
|
3691
3824
|
}
|
|
3692
3825
|
|
|
3693
3826
|
/**
|
|
3694
|
-
* The
|
|
3827
|
+
* The Idefics3 model which consists of a vision backbone and a language model.
|
|
3695
3828
|
*/
|
|
3696
3829
|
export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
|
|
3697
3830
|
|
|
@@ -3714,6 +3847,13 @@ export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
|
|
|
3714
3847
|
}
|
|
3715
3848
|
//////////////////////////////////////////////////
|
|
3716
3849
|
|
|
3850
|
+
/**
|
|
3851
|
+
* The SmolVLM Model with a language modeling head.
|
|
3852
|
+
* It is made up a SigLIP vision encoder, with a language modeling head on top.
|
|
3853
|
+
*/
|
|
3854
|
+
export class SmolVLMForConditionalGeneration extends Idefics3ForConditionalGeneration { }
|
|
3855
|
+
|
|
3856
|
+
//////////////////////////////////////////////////
|
|
3717
3857
|
export class Phi3VPreTrainedModel extends PreTrainedModel {
|
|
3718
3858
|
forward_params = [
|
|
3719
3859
|
'input_ids',
|
|
@@ -5112,6 +5252,7 @@ export class SwinForImageClassification extends SwinPreTrainedModel {
|
|
|
5112
5252
|
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
5113
5253
|
}
|
|
5114
5254
|
}
|
|
5255
|
+
export class SwinForSemanticSegmentation extends SwinPreTrainedModel { }
|
|
5115
5256
|
//////////////////////////////////////////////////
|
|
5116
5257
|
|
|
5117
5258
|
//////////////////////////////////////////////////
|
|
@@ -6714,6 +6855,8 @@ export class MobileNetV1ForImageClassification extends MobileNetV1PreTrainedMode
|
|
|
6714
6855
|
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
6715
6856
|
}
|
|
6716
6857
|
}
|
|
6858
|
+
|
|
6859
|
+
export class MobileNetV1ForSemanticSegmentation extends MobileNetV1PreTrainedModel { }
|
|
6717
6860
|
//////////////////////////////////////////////////
|
|
6718
6861
|
|
|
6719
6862
|
//////////////////////////////////////////////////
|
|
@@ -6737,6 +6880,7 @@ export class MobileNetV2ForImageClassification extends MobileNetV2PreTrainedMode
|
|
|
6737
6880
|
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
6738
6881
|
}
|
|
6739
6882
|
}
|
|
6883
|
+
export class MobileNetV2ForSemanticSegmentation extends MobileNetV2PreTrainedModel { }
|
|
6740
6884
|
//////////////////////////////////////////////////
|
|
6741
6885
|
|
|
6742
6886
|
//////////////////////////////////////////////////
|
|
@@ -6760,6 +6904,7 @@ export class MobileNetV3ForImageClassification extends MobileNetV3PreTrainedMode
|
|
|
6760
6904
|
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
6761
6905
|
}
|
|
6762
6906
|
}
|
|
6907
|
+
export class MobileNetV3ForSemanticSegmentation extends MobileNetV3PreTrainedModel { }
|
|
6763
6908
|
//////////////////////////////////////////////////
|
|
6764
6909
|
|
|
6765
6910
|
//////////////////////////////////////////////////
|
|
@@ -6783,6 +6928,7 @@ export class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedMode
|
|
|
6783
6928
|
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
6784
6929
|
}
|
|
6785
6930
|
}
|
|
6931
|
+
export class MobileNetV4ForSemanticSegmentation extends MobileNetV4PreTrainedModel { }
|
|
6786
6932
|
//////////////////////////////////////////////////
|
|
6787
6933
|
|
|
6788
6934
|
//////////////////////////////////////////////////
|
|
@@ -6963,6 +7109,183 @@ export class PatchTSMixerModel extends PatchTSMixerPreTrainedModel { }
|
|
|
6963
7109
|
export class PatchTSMixerForPrediction extends PatchTSMixerPreTrainedModel { }
|
|
6964
7110
|
//////////////////////////////////////////////////
|
|
6965
7111
|
|
|
7112
|
+
//////////////////////////////////////////////////
|
|
7113
|
+
export class UltravoxPreTrainedModel extends PreTrainedModel {
|
|
7114
|
+
forward_params = [
|
|
7115
|
+
'input_ids',
|
|
7116
|
+
'attention_mask',
|
|
7117
|
+
'position_ids',
|
|
7118
|
+
'audio_values',
|
|
7119
|
+
'past_key_values',
|
|
7120
|
+
];
|
|
7121
|
+
}
|
|
7122
|
+
|
|
7123
|
+
export class UltravoxModel extends UltravoxPreTrainedModel {
|
|
7124
|
+
|
|
7125
|
+
_merge_input_ids_with_audio_features(kwargs) {
|
|
7126
|
+
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
7127
|
+
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
7128
|
+
|
|
7129
|
+
return default_merge_input_ids_with_audio_features({
|
|
7130
|
+
// @ts-ignore
|
|
7131
|
+
audio_token_id: this.config.ignore_index,
|
|
7132
|
+
...kwargs,
|
|
7133
|
+
audio_features: reshaped_audio_features,
|
|
7134
|
+
})
|
|
7135
|
+
}
|
|
7136
|
+
}
|
|
7137
|
+
//////////////////////////////////////////////////
|
|
7138
|
+
|
|
7139
|
+
//////////////////////////////////////////////////
|
|
7140
|
+
// Mimi models
|
|
7141
|
+
export class MimiPreTrainedModel extends PreTrainedModel {
|
|
7142
|
+
main_input_name = 'input_values';
|
|
7143
|
+
forward_params = ['input_values'];
|
|
7144
|
+
}
|
|
7145
|
+
|
|
7146
|
+
export class MimiEncoderOutput extends ModelOutput {
|
|
7147
|
+
/**
|
|
7148
|
+
* @param {Object} output The output of the model.
|
|
7149
|
+
* @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`.
|
|
7150
|
+
*/
|
|
7151
|
+
constructor({ audio_codes }) {
|
|
7152
|
+
super();
|
|
7153
|
+
this.audio_codes = audio_codes;
|
|
7154
|
+
}
|
|
7155
|
+
}
|
|
7156
|
+
|
|
7157
|
+
export class MimiDecoderOutput extends ModelOutput {
|
|
7158
|
+
/**
|
|
7159
|
+
* @param {Object} output The output of the model.
|
|
7160
|
+
* @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`.
|
|
7161
|
+
*/
|
|
7162
|
+
constructor({ audio_values }) {
|
|
7163
|
+
super();
|
|
7164
|
+
this.audio_values = audio_values;
|
|
7165
|
+
}
|
|
7166
|
+
}
|
|
7167
|
+
|
|
7168
|
+
/**
 * The Mimi neural audio codec model.
 */
export class MimiModel extends MimiPreTrainedModel {
    /**
     * Encodes the input audio waveform into discrete codes.
     * @param {Object} inputs Model inputs
     * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`).
     * @returns {Promise<MimiEncoderOutput>} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`.
     */
    async encode(inputs) {
        // Run the dedicated encoder ONNX session and wrap its outputs.
        const encoder_outputs = await sessionRun(this.sessions['encoder_model'], inputs);
        return new MimiEncoderOutput(encoder_outputs);
    }

    /**
     * Decodes the given frames into an output audio waveform.
     * @param {MimiEncoderOutput} inputs The encoded audio codes.
     * @returns {Promise<MimiDecoderOutput>} The output tensor of shape `(batch_size, num_channels, sequence_length)`.
     */
    async decode(inputs) {
        // Run the dedicated decoder ONNX session and wrap its outputs.
        const decoder_outputs = await sessionRun(this.sessions['decoder_model'], inputs);
        return new MimiDecoderOutput(decoder_outputs);
    }
}
|
|
7191
|
+
|
|
7192
|
+
export class MimiEncoderModel extends MimiPreTrainedModel {
    /** @type {typeof PreTrainedModel.from_pretrained} */
    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
        // Default to the encoder-only weights unless the caller overrides the file name.
        const model_file_name = options.model_file_name ?? 'encoder_model';
        return super.from_pretrained(pretrained_model_name_or_path, { ...options, model_file_name });
    }
}
|
|
7202
|
+
export class MimiDecoderModel extends MimiPreTrainedModel {
    /** @type {typeof PreTrainedModel.from_pretrained} */
    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
        // Default to the decoder-only weights unless the caller overrides the file name.
        const model_file_name = options.model_file_name ?? 'decoder_model';
        return super.from_pretrained(pretrained_model_name_or_path, { ...options, model_file_name });
    }
}
|
|
7212
|
+
//////////////////////////////////////////////////
|
|
7213
|
+
|
|
7214
|
+
|
|
7215
|
+
//////////////////////////////////////////////////
|
|
7216
|
+
// Dac models
|
|
7217
|
+
/**
 * Base class for DAC (Descript Audio Codec) model variants.
 */
export class DacPreTrainedModel extends PreTrainedModel {
    // The raw audio waveform is the model's primary input.
    main_input_name = 'input_values';
    forward_params = ['input_values'];
}
|
|
7221
|
+
|
|
7222
|
+
export class DacEncoderOutput extends ModelOutput {
    /**
     * @param {Object} output The output of the model.
     * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`.
     */
    constructor(output) {
        super();
        // Discrete audio codes produced by the DAC encoder.
        this.audio_codes = output.audio_codes;
    }
}
|
|
7232
|
+
|
|
7233
|
+
export class DacDecoderOutput extends ModelOutput {
    /**
     * @param {Object} output The output of the model.
     * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`.
     */
    constructor(output) {
        super();
        // Waveform reconstructed by the DAC decoder.
        this.audio_values = output.audio_values;
    }
}
|
|
7243
|
+
|
|
7244
|
+
/**
 * The DAC (Descript Audio Codec) model.
 */
export class DacModel extends DacPreTrainedModel {
    /**
     * Encodes the input audio waveform into discrete codes.
     * @param {Object} inputs Model inputs
     * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`).
     * @returns {Promise<DacEncoderOutput>} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`.
     */
    async encode(inputs) {
        // Run the dedicated encoder ONNX session and wrap its outputs.
        const encoder_outputs = await sessionRun(this.sessions['encoder_model'], inputs);
        return new DacEncoderOutput(encoder_outputs);
    }

    /**
     * Decodes the given frames into an output audio waveform.
     * @param {DacEncoderOutput} inputs The encoded audio codes.
     * @returns {Promise<DacDecoderOutput>} The output tensor of shape `(batch_size, num_channels, sequence_length)`.
     */
    async decode(inputs) {
        // Run the dedicated decoder ONNX session and wrap its outputs.
        const decoder_outputs = await sessionRun(this.sessions['decoder_model'], inputs);
        return new DacDecoderOutput(decoder_outputs);
    }
}
|
|
7267
|
+
|
|
7268
|
+
export class DacEncoderModel extends DacPreTrainedModel {
    /** @type {typeof PreTrainedModel.from_pretrained} */
    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
        // Default to the encoder-only weights unless the caller overrides the file name.
        const model_file_name = options.model_file_name ?? 'encoder_model';
        return super.from_pretrained(pretrained_model_name_or_path, { ...options, model_file_name });
    }
}
|
|
7278
|
+
export class DacDecoderModel extends DacPreTrainedModel {
    /** @type {typeof PreTrainedModel.from_pretrained} */
    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
        // Default to the decoder-only weights unless the caller overrides the file name.
        const model_file_name = options.model_file_name ?? 'decoder_model';
        return super.from_pretrained(pretrained_model_name_or_path, { ...options, model_file_name });
    }
}
|
|
7288
|
+
//////////////////////////////////////////////////
|
|
6966
7289
|
|
|
6967
7290
|
//////////////////////////////////////////////////
|
|
6968
7291
|
// AutoModels, used to simplify construction of PreTrainedModels
|
|
@@ -7019,20 +7342,29 @@ export class PretrainedMixin {
|
|
|
7019
7342
|
if (!this.MODEL_CLASS_MAPPINGS) {
|
|
7020
7343
|
throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: " + this.name);
|
|
7021
7344
|
}
|
|
7022
|
-
|
|
7345
|
+
const model_type = options.config.model_type;
|
|
7023
7346
|
for (const MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
|
|
7024
|
-
|
|
7347
|
+
let modelInfo = MODEL_CLASS_MAPPING.get(model_type);
|
|
7025
7348
|
if (!modelInfo) {
|
|
7026
|
-
|
|
7349
|
+
// As a fallback, we check if model_type is specified as the exact class
|
|
7350
|
+
for (const cls of MODEL_CLASS_MAPPING.values()) {
|
|
7351
|
+
if (cls[0] === model_type) {
|
|
7352
|
+
modelInfo = cls;
|
|
7353
|
+
break;
|
|
7354
|
+
}
|
|
7355
|
+
}
|
|
7356
|
+
if (!modelInfo) continue; // Item not found in this mapping
|
|
7027
7357
|
}
|
|
7028
7358
|
return await modelInfo[1].from_pretrained(pretrained_model_name_or_path, options);
|
|
7029
7359
|
}
|
|
7030
7360
|
|
|
7031
7361
|
if (this.BASE_IF_FAIL) {
|
|
7032
|
-
|
|
7362
|
+
if (!(CUSTOM_ARCHITECTURES.has(model_type))) {
|
|
7363
|
+
console.warn(`Unknown model class "${model_type}", attempting to construct from base class.`);
|
|
7364
|
+
}
|
|
7033
7365
|
return await PreTrainedModel.from_pretrained(pretrained_model_name_or_path, options);
|
|
7034
7366
|
} else {
|
|
7035
|
-
throw Error(`Unsupported model type: ${
|
|
7367
|
+
throw Error(`Unsupported model type: ${model_type}`)
|
|
7036
7368
|
}
|
|
7037
7369
|
}
|
|
7038
7370
|
}
|
|
@@ -7133,6 +7465,10 @@ const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
|
|
|
7133
7465
|
['blenderbot-small', ['BlenderbotSmallModel', BlenderbotSmallModel]],
|
|
7134
7466
|
]);
|
|
7135
7467
|
|
|
7468
|
+
// Neural audio codec (auto-encoder) models: model_type -> [class name, class].
const MODEL_MAPPING_NAMES_AUTO_ENCODER = new Map([
    ['mimi', ['MimiModel', MimiModel]],
    ['dac', ['DacModel', DacModel]],
]);
|
|
7136
7472
|
|
|
7137
7473
|
const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
|
|
7138
7474
|
['bloom', ['BloomModel', BloomModel]],
|
|
@@ -7169,6 +7505,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
|
|
|
7169
7505
|
const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([
|
|
7170
7506
|
['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]],
|
|
7171
7507
|
['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]],
|
|
7508
|
+
['lite-whisper', ['LiteWhisperForConditionalGeneration', LiteWhisperForConditionalGeneration]],
|
|
7172
7509
|
['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]],
|
|
7173
7510
|
]);
|
|
7174
7511
|
|
|
@@ -7315,6 +7652,7 @@ const MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
|
|
|
7315
7652
|
const MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = new Map([
|
|
7316
7653
|
['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]],
|
|
7317
7654
|
['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
|
|
7655
|
+
['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]],
|
|
7318
7656
|
]);
|
|
7319
7657
|
|
|
7320
7658
|
const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
|
|
@@ -7324,9 +7662,15 @@ const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
|
|
|
7324
7662
|
['florence2', ['Florence2ForConditionalGeneration', Florence2ForConditionalGeneration]],
|
|
7325
7663
|
['qwen2-vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]],
|
|
7326
7664
|
['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
|
|
7665
|
+
['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]],
|
|
7327
7666
|
['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]],
|
|
7328
7667
|
]);
|
|
7329
7668
|
|
|
7669
|
+
// Audio-text-to-text models: model_type -> [class name, class].
const MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
    ['ultravox', ['UltravoxModel', UltravoxModel]],
]);
|
|
7672
|
+
|
|
7673
|
+
|
|
7330
7674
|
const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
|
|
7331
7675
|
['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]],
|
|
7332
7676
|
]);
|
|
@@ -7378,6 +7722,12 @@ const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
|
|
|
7378
7722
|
const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([
|
|
7379
7723
|
['segformer', ['SegformerForSemanticSegmentation', SegformerForSemanticSegmentation]],
|
|
7380
7724
|
['sapiens', ['SapiensForSemanticSegmentation', SapiensForSemanticSegmentation]],
|
|
7725
|
+
|
|
7726
|
+
['swin', ['SwinForSemanticSegmentation', SwinForSemanticSegmentation]],
|
|
7727
|
+
['mobilenet_v1', ['MobileNetV1ForSemanticSegmentation', MobileNetV1ForSemanticSegmentation]],
|
|
7728
|
+
['mobilenet_v2', ['MobileNetV2ForSemanticSegmentation', MobileNetV2ForSemanticSegmentation]],
|
|
7729
|
+
['mobilenet_v3', ['MobileNetV3ForSemanticSegmentation', MobileNetV3ForSemanticSegmentation]],
|
|
7730
|
+
['mobilenet_v4', ['MobileNetV4ForSemanticSegmentation', MobileNetV4ForSemanticSegmentation]],
|
|
7381
7731
|
]);
|
|
7382
7732
|
|
|
7383
7733
|
const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([
|
|
@@ -7457,9 +7807,12 @@ const MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES = new Map([
|
|
|
7457
7807
|
])
|
|
7458
7808
|
|
|
7459
7809
|
const MODEL_CLASS_TYPE_MAPPING = [
|
|
7810
|
+
// MODEL_MAPPING_NAMES:
|
|
7460
7811
|
[MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES.EncoderOnly],
|
|
7461
7812
|
[MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES.EncoderDecoder],
|
|
7462
7813
|
[MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES.DecoderOnly],
|
|
7814
|
+
[MODEL_MAPPING_NAMES_AUTO_ENCODER, MODEL_TYPES.AutoEncoder],
|
|
7815
|
+
|
|
7463
7816
|
[MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
|
|
7464
7817
|
[MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
|
|
7465
7818
|
[MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq],
|
|
@@ -7470,6 +7823,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
|
|
|
7470
7823
|
[MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
|
|
7471
7824
|
[MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Vision2Seq],
|
|
7472
7825
|
[MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.ImageTextToText],
|
|
7826
|
+
[MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.AudioTextToText],
|
|
7473
7827
|
[MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
|
|
7474
7828
|
[MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
|
|
7475
7829
|
[MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
|
|
@@ -7514,6 +7868,11 @@ const CUSTOM_MAPPING = [
|
|
|
7514
7868
|
['JinaCLIPTextModel', JinaCLIPTextModel, MODEL_TYPES.EncoderOnly],
|
|
7515
7869
|
['ClapTextModelWithProjection', ClapTextModelWithProjection, MODEL_TYPES.EncoderOnly],
|
|
7516
7870
|
['ClapAudioModelWithProjection', ClapAudioModelWithProjection, MODEL_TYPES.EncoderOnly],
|
|
7871
|
+
|
|
7872
|
+
['DacEncoderModel', DacEncoderModel, MODEL_TYPES.EncoderOnly],
|
|
7873
|
+
['DacDecoderModel', DacDecoderModel, MODEL_TYPES.EncoderOnly],
|
|
7874
|
+
['MimiEncoderModel', MimiEncoderModel, MODEL_TYPES.EncoderOnly],
|
|
7875
|
+
['MimiDecoderModel', MimiDecoderModel, MODEL_TYPES.EncoderOnly],
|
|
7517
7876
|
]
|
|
7518
7877
|
for (const [name, model, type] of CUSTOM_MAPPING) {
|
|
7519
7878
|
MODEL_TYPE_MAPPING.set(name, type);
|
|
@@ -7521,6 +7880,19 @@ for (const [name, model, type] of CUSTOM_MAPPING) {
|
|
|
7521
7880
|
MODEL_NAME_TO_CLASS_MAPPING.set(name, model);
|
|
7522
7881
|
}
|
|
7523
7882
|
|
|
7883
|
+
// Model types that ship without a dedicated class: each maps a model_type to
// the task mapping it should be registered in, backed by the generic
// `PreTrainedModel`. `PretrainedMixin` also consults this map to suppress the
// "unknown model class" warning for these types.
const CUSTOM_ARCHITECTURES = new Map([
    ['modnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
    ['birefnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
    ['isnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
    ['ben', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
]);
for (const [name, mapping] of CUSTOM_ARCHITECTURES.entries()) {
    // Register the generic class under this model_type so Auto* lookups resolve.
    mapping.set(name, ['PreTrainedModel', PreTrainedModel])
    MODEL_TYPE_MAPPING.set(name, MODEL_TYPES.EncoderOnly);
    // NOTE(review): `PreTrainedModel` is a single key, so each iteration
    // overwrites the previous entry and only the last name ('ben') survives
    // in MODEL_CLASS_TO_NAME_MAPPING — confirm this is intended.
    MODEL_CLASS_TO_NAME_MAPPING.set(PreTrainedModel, name);
    MODEL_NAME_TO_CLASS_MAPPING.set(name, PreTrainedModel);
}
|
|
7895
|
+
|
|
7524
7896
|
|
|
7525
7897
|
/**
|
|
7526
7898
|
* Helper class which is used to instantiate pretrained models with the `from_pretrained` function.
|
|
@@ -7761,6 +8133,14 @@ export class AutoModelForImageFeatureExtraction extends PretrainedMixin {
|
|
|
7761
8133
|
static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES];
|
|
7762
8134
|
}
|
|
7763
8135
|
|
|
8136
|
+
/**
 * Helper class used to instantiate pretrained image-text-to-text models
 * with the `from_pretrained` function (resolved via `PretrainedMixin`).
 */
export class AutoModelForImageTextToText extends PretrainedMixin {
    static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES];
}
|
|
8139
|
+
|
|
8140
|
+
/**
 * Helper class used to instantiate pretrained audio-text-to-text models
 * with the `from_pretrained` function (resolved via `PretrainedMixin`).
 */
export class AutoModelForAudioTextToText extends PretrainedMixin {
    static MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES];
}
|
|
8143
|
+
|
|
7764
8144
|
//////////////////////////////////////////////////
|
|
7765
8145
|
|
|
7766
8146
|
//////////////////////////////////////////////////
|