@huggingface/transformers 4.0.0-next.5 → 4.0.0-next.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -4
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +2189 -1015
- package/dist/transformers.min.js +16 -16
- package/dist/transformers.node.cjs +2234 -1029
- package/dist/transformers.node.min.cjs +20 -20
- package/dist/transformers.node.min.mjs +20 -20
- package/dist/transformers.node.mjs +2194 -1017
- package/dist/transformers.web.js +2175 -1001
- package/dist/transformers.web.min.js +18 -18
- package/package.json +4 -4
- package/src/backends/onnx.js +77 -58
- package/src/backends/utils/cacheWasm.js +22 -43
- package/src/cache_utils.js +62 -0
- package/src/configs.js +32 -5
- package/src/env.js +36 -6
- package/src/image_processors_utils.js +3 -3
- package/src/models/auto/modeling_auto.js +14 -1
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +1 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/modeling_utils.js +234 -292
- package/src/models/models.js +9 -0
- package/src/models/olmo_hybrid/modeling_olmo_hybrid.js +5 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +3 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_moe/modeling_qwen2_moe.js +5 -0
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +36 -3
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_moe/modeling_qwen3_moe.js +5 -0
- package/src/models/qwen3_next/modeling_qwen3_next.js +5 -0
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +4 -0
- package/src/models/registry.js +39 -4
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/pipelines/index.js +2 -84
- package/src/pipelines.js +40 -77
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/FileCache.js +128 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +8 -3
- package/src/utils/hub/{files.js → FileResponse.js} +0 -105
- package/src/utils/hub/utils.js +35 -1
- package/src/utils/hub.js +6 -5
- package/src/utils/image.js +12 -13
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/ModelRegistry.js +70 -23
- package/src/utils/model_registry/get_file_metadata.js +14 -2
- package/src/utils/model_registry/get_model_files.js +63 -78
- package/src/utils/model_registry/get_pipeline_files.js +15 -24
- package/src/utils/model_registry/is_cached.js +81 -4
- package/src/utils/tensor.js +18 -2
- package/types/backends/onnx.d.ts.map +1 -1
- package/types/backends/utils/cacheWasm.d.ts +3 -17
- package/types/backends/utils/cacheWasm.d.ts.map +1 -1
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +18 -3
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +17 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/auto/modeling_auto.d.ts +6 -0
- package/types/models/auto/modeling_auto.d.ts.map +1 -1
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/modeling_utils.d.ts +44 -24
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +9 -0
- package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts +8 -0
- package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts.map +1 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts +8 -0
- package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts.map +1 -0
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts +8 -0
- package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts.map +1 -0
- package/types/models/qwen3_next/modeling_qwen3_next.d.ts +8 -0
- package/types/models/qwen3_next/modeling_qwen3_next.d.ts.map +1 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +7 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -0
- package/types/models/registry.d.ts +2 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/pipelines/index.d.ts +0 -34
- package/types/pipelines/index.d.ts.map +1 -1
- package/types/pipelines.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache/FileCache.d.ts +39 -0
- package/types/utils/cache/FileCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts +4 -4
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/hub/{files.d.ts → FileResponse.d.ts} +1 -38
- package/types/utils/hub/FileResponse.d.ts.map +1 -0
- package/types/utils/hub/utils.d.ts +17 -2
- package/types/utils/hub/utils.d.ts.map +1 -1
- package/types/utils/hub.d.ts +7 -7
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/ModelRegistry.d.ts +66 -6
- package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/model_registry/get_pipeline_files.d.ts +2 -1
- package/types/utils/model_registry/get_pipeline_files.d.ts.map +1 -1
- package/types/utils/model_registry/is_cached.d.ts +47 -4
- package/types/utils/model_registry/is_cached.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
- package/types/utils/hub/files.d.ts.map +0 -1
|
@@ -8,7 +8,7 @@ import {
|
|
|
8
8
|
import { full } from '../../utils/tensor.js';
|
|
9
9
|
|
|
10
10
|
/**
|
|
11
|
-
* @typedef {
|
|
11
|
+
* @typedef {Object} DetrFeatureExtractorResultProps
|
|
12
12
|
* @property {import('../../utils/tensor.js').Tensor} pixel_mask
|
|
13
13
|
* @typedef {import('../../image_processors_utils.js').ImageProcessorResult & DetrFeatureExtractorResultProps} DetrFeatureExtractorResult
|
|
14
14
|
*/
|
|
@@ -4,6 +4,7 @@ export * from './chatterbox/feature_extraction_chatterbox.js';
|
|
|
4
4
|
export * from './clap/feature_extraction_clap.js';
|
|
5
5
|
export * from './dac/feature_extraction_dac.js';
|
|
6
6
|
export * from './gemma3n/feature_extraction_gemma3n.js';
|
|
7
|
+
export * from './granite_speech/feature_extraction_granite_speech.js';
|
|
7
8
|
export * from './moonshine/feature_extraction_moonshine.js';
|
|
8
9
|
export * from './parakeet/feature_extraction_parakeet.js';
|
|
9
10
|
export * from './pyannote/feature_extraction_pyannote.js';
|
|
@@ -12,6 +13,7 @@ export * from './snac/feature_extraction_snac.js';
|
|
|
12
13
|
export * from './speecht5/feature_extraction_speecht5.js';
|
|
13
14
|
export * from './wav2vec2/feature_extraction_wav2vec2.js';
|
|
14
15
|
export * from './wespeaker/feature_extraction_wespeaker.js';
|
|
16
|
+
export * from './voxtral_realtime/feature_extraction_voxtral_realtime.js';
|
|
15
17
|
export * from './whisper/feature_extraction_whisper.js';
|
|
16
18
|
|
|
17
19
|
export { FeatureExtractor } from '../feature_extraction_utils.js';
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js';
|
|
2
|
+
import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
|
|
3
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
4
|
+
|
|
5
|
+
export class GraniteSpeechFeatureExtractor extends FeatureExtractor {
|
|
6
|
+
constructor(config) {
|
|
7
|
+
super(config);
|
|
8
|
+
|
|
9
|
+
const { n_fft, win_length, n_mels, sample_rate } = config.melspec_kwargs;
|
|
10
|
+
|
|
11
|
+
// torchaudio uses HTK mel scale with no norm by default
|
|
12
|
+
this.mel_filters = mel_filter_bank(
|
|
13
|
+
Math.floor(1 + n_fft / 2), // num_frequency_bins = 257
|
|
14
|
+
n_mels, // 80
|
|
15
|
+
0, // min_frequency
|
|
16
|
+
sample_rate / 2, // max_frequency = 8000
|
|
17
|
+
sample_rate, // 16000
|
|
18
|
+
null, // norm (torchaudio default: no norm)
|
|
19
|
+
'htk', // mel_scale (torchaudio default)
|
|
20
|
+
);
|
|
21
|
+
|
|
22
|
+
// torchaudio center-pads the window when win_length < n_fft:
|
|
23
|
+
// pad_amount = (n_fft - win_length) // 2 on each side
|
|
24
|
+
const raw_window = window_function(win_length, 'hann');
|
|
25
|
+
this.window = new Float64Array(n_fft);
|
|
26
|
+
const pad = Math.floor((n_fft - win_length) / 2);
|
|
27
|
+
this.window.set(raw_window, pad);
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
|
|
32
|
+
* @param {Float32Array|Float64Array} audio The audio waveform.
|
|
33
|
+
* @returns {Promise<{input_features: Tensor}>}
|
|
34
|
+
*/
|
|
35
|
+
async _call(audio) {
|
|
36
|
+
validate_audio_inputs(audio, 'GraniteSpeechFeatureExtractor');
|
|
37
|
+
|
|
38
|
+
const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;
|
|
39
|
+
|
|
40
|
+
// Truncate to even number of frames for pair-stacking
|
|
41
|
+
const num_frames = 1 + Math.floor((audio.length - 1) / hop_length);
|
|
42
|
+
const max_num_frames = num_frames - (num_frames % 2);
|
|
43
|
+
|
|
44
|
+
const mel = await spectrogram(audio, this.window, n_fft, hop_length, {
|
|
45
|
+
power: 2.0,
|
|
46
|
+
mel_filters: this.mel_filters,
|
|
47
|
+
log_mel: 'log10_max_norm',
|
|
48
|
+
transpose: true, // [time, n_mels]
|
|
49
|
+
max_num_frames,
|
|
50
|
+
do_pad: false,
|
|
51
|
+
});
|
|
52
|
+
|
|
53
|
+
// Stack adjacent frame pairs: [time, n_mels] → [1, time/2, 2*n_mels]
|
|
54
|
+
const input_features = mel.view(-1, 2 * n_mels).unsqueeze_(0);
|
|
55
|
+
|
|
56
|
+
return { input_features };
|
|
57
|
+
}
|
|
58
|
+
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
|
|
2
|
+
import { AutoTokenizer } from '../auto/tokenization_auto.js';
|
|
3
|
+
import { Processor } from '../../processing_utils.js';
|
|
4
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
5
|
+
|
|
6
|
+
export class GraniteSpeechProcessor extends Processor {
|
|
7
|
+
static tokenizer_class = AutoTokenizer;
|
|
8
|
+
static feature_extractor_class = AutoFeatureExtractor;
|
|
9
|
+
static uses_processor_config = true;
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* Compute the number of audio tokens for a given raw audio length.
|
|
13
|
+
* @param {number} audioLength Raw audio sample count.
|
|
14
|
+
* @returns {number} Number of projector output tokens.
|
|
15
|
+
*/
|
|
16
|
+
_get_num_audio_features(audioLength) {
|
|
17
|
+
const { hop_length } = this.feature_extractor.config.melspec_kwargs;
|
|
18
|
+
const { projector_window_size, projector_downsample_rate } = this.feature_extractor.config;
|
|
19
|
+
const effective_window_size = Math.floor(projector_window_size / projector_downsample_rate);
|
|
20
|
+
const mel_length = Math.floor(audioLength / hop_length) + 1;
|
|
21
|
+
const encoder_length = Math.floor(mel_length / 2);
|
|
22
|
+
const nblocks = Math.ceil(encoder_length / projector_window_size);
|
|
23
|
+
return nblocks * effective_window_size;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* @param {string} text The text input to process.
|
|
28
|
+
* @param {Float32Array} audio The audio input to process.
|
|
29
|
+
*/
|
|
30
|
+
async _call(text, audio = null, kwargs = {}) {
|
|
31
|
+
if (Array.isArray(text)) {
|
|
32
|
+
throw new Error('Batched inputs are not supported yet.');
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
let audio_inputs = {};
|
|
36
|
+
if (audio) {
|
|
37
|
+
const { input_features } = await this.feature_extractor(audio);
|
|
38
|
+
audio_inputs['input_features'] = input_features;
|
|
39
|
+
|
|
40
|
+
// Compute audio embed sizes and mask in the processor
|
|
41
|
+
const audio_embed_size = this._get_num_audio_features(audio.length);
|
|
42
|
+
const mask_data = new Uint8Array(audio_embed_size).fill(1);
|
|
43
|
+
audio_inputs['input_features_mask'] = new Tensor('bool', mask_data, [1, audio_embed_size]);
|
|
44
|
+
|
|
45
|
+
const audio_token = this.config.audio_token ?? '<|audio|>';
|
|
46
|
+
if (!text.includes(audio_token)) {
|
|
47
|
+
throw new Error(`The input text does not contain the audio token ${audio_token}.`);
|
|
48
|
+
}
|
|
49
|
+
text = text.replaceAll(audio_token, audio_token.repeat(audio_embed_size));
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
const text_inputs = this.tokenizer(text, {
|
|
53
|
+
add_special_tokens: false,
|
|
54
|
+
...kwargs,
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
return {
|
|
58
|
+
...text_inputs,
|
|
59
|
+
...audio_inputs,
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
}
|
|
@@ -2,7 +2,7 @@ import { ImageProcessor } from '../../image_processors_utils.js';
|
|
|
2
2
|
import { ones } from '../../utils/tensor.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
|
-
* @typedef {
|
|
5
|
+
* @typedef {Object} GroundingDinoFeatureExtractorResultProps
|
|
6
6
|
* @property {import('../../utils/tensor.js').Tensor} pixel_mask
|
|
7
7
|
* @typedef {import('../../image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult
|
|
8
8
|
*/
|
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { sessionRun } from '../session.js';
|
|
1
|
+
import { LlavaForConditionalGeneration } from '../llava/modeling_llava.js';
|
|
3
2
|
|
|
4
|
-
|
|
3
|
+
/**
|
|
4
|
+
* The Idefics3 model which consists of a vision backbone and a language model.
|
|
5
|
+
*/
|
|
6
|
+
export class Idefics3ForConditionalGeneration extends LlavaForConditionalGeneration {
|
|
5
7
|
forward_params = [
|
|
6
8
|
'input_ids',
|
|
7
9
|
'attention_mask',
|
|
@@ -11,32 +13,3 @@ export class Idefics3PreTrainedModel extends PreTrainedModel {
|
|
|
11
13
|
'past_key_values',
|
|
12
14
|
];
|
|
13
15
|
}
|
|
14
|
-
|
|
15
|
-
/**
|
|
16
|
-
* The Idefics3 model which consists of a vision backbone and a language model.
|
|
17
|
-
*/
|
|
18
|
-
export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
|
|
19
|
-
async encode_image({ pixel_values, pixel_attention_mask }) {
|
|
20
|
-
const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, pixel_attention_mask }))
|
|
21
|
-
.image_features;
|
|
22
|
-
return features;
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
26
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
27
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
28
|
-
|
|
29
|
-
return default_merge_input_ids_with_image_features({
|
|
30
|
-
// @ts-ignore
|
|
31
|
-
image_token_id: this.config.image_token_id,
|
|
32
|
-
...kwargs,
|
|
33
|
-
image_features: reshaped_image_hidden_states,
|
|
34
|
-
});
|
|
35
|
-
}
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
/**
|
|
39
|
-
* The SmolVLM Model with a language modeling head.
|
|
40
|
-
* It is made up a SigLIP vision encoder, with a language modeling head on top.
|
|
41
|
-
*/
|
|
42
|
-
export class SmolVLMForConditionalGeneration extends Idefics3ForConditionalGeneration {}
|
|
@@ -14,6 +14,7 @@ export * from './grounding_dino/image_processing_grounding_dino.js';
|
|
|
14
14
|
export * from './idefics3/image_processing_idefics3.js';
|
|
15
15
|
export * from './janus/image_processing_janus.js';
|
|
16
16
|
export * from './jina_clip/image_processing_jina_clip.js';
|
|
17
|
+
export * from './lfm2_vl/image_processing_lfm2_vl.js';
|
|
17
18
|
export * from './llava_onevision/image_processing_llava_onevision.js';
|
|
18
19
|
export * from './mask2former/image_processing_mask2former.js';
|
|
19
20
|
export * from './maskformer/image_processing_maskformer.js';
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
import { ImageProcessor, smart_resize } from '../../image_processors_utils.js';
|
|
2
|
+
import { Tensor, cat, interpolate_4d, stack } from '../../utils/tensor.js';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* @typedef {import('../../utils/image.js').RawImage} RawImage
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Returns the closest integer to `number` that is divisible by `factor`.
|
|
10
|
+
* @param {number} number
|
|
11
|
+
* @param {number} factor
|
|
12
|
+
* @returns {number}
|
|
13
|
+
*/
|
|
14
|
+
function round_by_factor(number, factor) {
|
|
15
|
+
return Math.round(number / factor) * factor;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* Find the closest aspect ratio from target_ratios to match the input aspect ratio.
|
|
20
|
+
* @param {number} aspect_ratio
|
|
21
|
+
* @param {number[][]} target_ratios
|
|
22
|
+
* @param {number} width
|
|
23
|
+
* @param {number} height
|
|
24
|
+
* @param {number} image_size
|
|
25
|
+
* @returns {number[]}
|
|
26
|
+
*/
|
|
27
|
+
function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
|
|
28
|
+
let best_ratio_diff = Infinity;
|
|
29
|
+
let best_ratio = [1, 1];
|
|
30
|
+
const area = width * height;
|
|
31
|
+
for (const ratio of target_ratios) {
|
|
32
|
+
const ratio_diff = Math.abs(aspect_ratio - ratio[0] / ratio[1]);
|
|
33
|
+
if (ratio_diff < best_ratio_diff) {
|
|
34
|
+
best_ratio_diff = ratio_diff;
|
|
35
|
+
best_ratio = ratio;
|
|
36
|
+
} else if (ratio_diff === best_ratio_diff && area > 0.5 * image_size * image_size * ratio[0] * ratio[1]) {
|
|
37
|
+
best_ratio = ratio;
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
return best_ratio;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Compute all valid (width, height) tile ratios for the given range.
|
|
45
|
+
* @param {number} min_tiles
|
|
46
|
+
* @param {number} max_tiles
|
|
47
|
+
* @returns {number[][]}
|
|
48
|
+
*/
|
|
49
|
+
function get_target_ratios(min_tiles, max_tiles) {
|
|
50
|
+
/** @type {number[][]} */
|
|
51
|
+
const ratios = [];
|
|
52
|
+
const seen = new Set();
|
|
53
|
+
for (let n = min_tiles; n <= max_tiles; ++n) {
|
|
54
|
+
for (let w = 1; w <= n; ++w) {
|
|
55
|
+
for (let h = 1; h <= n; ++h) {
|
|
56
|
+
const product = w * h;
|
|
57
|
+
if (product >= min_tiles && product <= max_tiles) {
|
|
58
|
+
const key = (w << 16) | h;
|
|
59
|
+
if (!seen.has(key)) {
|
|
60
|
+
seen.add(key);
|
|
61
|
+
ratios.push([w, h]);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
return ratios.sort((a, b) => a[0] * a[1] - b[0] * b[1]);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* Convert image tensor to flattened patches.
|
|
72
|
+
*
|
|
73
|
+
* Equivalent to PyTorch: `images.reshape(B, C, ph, ps, pw, ps).permute(0, 2, 4, 3, 5, 1).reshape(B, ph*pw, -1)`
|
|
74
|
+
* @param {Tensor} images Shape: [batch, channels, height, width]
|
|
75
|
+
* @param {number} patch_size
|
|
76
|
+
* @returns {Tensor} Shape: [batch, num_patches, patch_size * patch_size * channels]
|
|
77
|
+
*/
|
|
78
|
+
function convert_image_to_patches(images, patch_size) {
|
|
79
|
+
const [B, C, H, W] = images.dims;
|
|
80
|
+
const ph = Math.floor(H / patch_size),
|
|
81
|
+
pw = Math.floor(W / patch_size);
|
|
82
|
+
const patch_dim = patch_size * patch_size * C;
|
|
83
|
+
const data = /** @type {Float32Array} */ (images.data);
|
|
84
|
+
const result = new Float32Array(B * ph * pw * patch_dim);
|
|
85
|
+
const ch_stride = H * W;
|
|
86
|
+
|
|
87
|
+
for (let b = 0; b < B; ++b) {
|
|
88
|
+
const b_src = b * C * ch_stride;
|
|
89
|
+
const b_dst = b * ph * pw * patch_dim;
|
|
90
|
+
for (let py = 0; py < ph; ++py) {
|
|
91
|
+
for (let px = 0; px < pw; ++px) {
|
|
92
|
+
let off = b_dst + (py * pw + px) * patch_dim;
|
|
93
|
+
for (let dy = 0; dy < patch_size; ++dy) {
|
|
94
|
+
const row = (py * patch_size + dy) * W + px * patch_size;
|
|
95
|
+
for (let dx = 0; dx < patch_size; ++dx) {
|
|
96
|
+
const pixel = row + dx;
|
|
97
|
+
for (let c = 0; c < C; ++c) {
|
|
98
|
+
result[off++] = data[b_src + c * ch_stride + pixel];
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
return new Tensor('float32', result, [B, ph * pw, patch_dim]);
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* Pad patches along the patch dimension to `target_length`.
|
|
111
|
+
* @param {Tensor} patches Shape: [1, current_length, patch_dim]
|
|
112
|
+
* @param {number} target_length
|
|
113
|
+
* @returns {{ padded: Tensor, mask: Tensor }}
|
|
114
|
+
*/
|
|
115
|
+
function pad_along_first_dim(patches, target_length) {
|
|
116
|
+
const [, len, dim] = patches.dims;
|
|
117
|
+
const mask_data = new BigInt64Array(target_length);
|
|
118
|
+
mask_data.fill(1n, 0, len);
|
|
119
|
+
|
|
120
|
+
let padded = patches;
|
|
121
|
+
if (len < target_length) {
|
|
122
|
+
const padded_data = new Float32Array(target_length * dim);
|
|
123
|
+
padded_data.set(/** @type {Float32Array} */ (patches.data));
|
|
124
|
+
padded = new Tensor('float32', padded_data, [1, target_length, dim]);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
return { padded, mask: new Tensor('int64', mask_data, [target_length]) };
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
export class Lfm2VlImageProcessor extends ImageProcessor {
|
|
131
|
+
constructor(/** @type {Record<string, any>} */ config) {
|
|
132
|
+
super(config);
|
|
133
|
+
this.downsample_factor = config.downsample_factor ?? 2;
|
|
134
|
+
this.do_image_splitting = config.do_image_splitting ?? true;
|
|
135
|
+
this.min_tiles = config.min_tiles ?? 2;
|
|
136
|
+
this.max_tiles = config.max_tiles ?? 10;
|
|
137
|
+
this.use_thumbnail = config.use_thumbnail ?? true;
|
|
138
|
+
this.min_image_tokens = config.min_image_tokens ?? 64;
|
|
139
|
+
this.max_image_tokens = config.max_image_tokens ?? 256;
|
|
140
|
+
this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
|
|
141
|
+
this.tile_size = config.tile_size ?? 512;
|
|
142
|
+
this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2.0;
|
|
143
|
+
this.return_row_col_info = config.return_row_col_info ?? false;
|
|
144
|
+
|
|
145
|
+
const max_thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
|
|
146
|
+
const tile_size_patches = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
|
|
147
|
+
this.max_num_patches = Math.max(max_thumbnail_patches, tile_size_patches);
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
/**
|
|
151
|
+
* Check if the image is too large to be processed as a single tile.
|
|
152
|
+
* @param {number} height
|
|
153
|
+
* @param {number} width
|
|
154
|
+
* @returns {boolean}
|
|
155
|
+
*/
|
|
156
|
+
_is_image_too_large(height, width) {
|
|
157
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
158
|
+
const h_bar = Math.max(this.encoder_patch_size, round_by_factor(height, total_factor));
|
|
159
|
+
const w_bar = Math.max(this.encoder_patch_size, round_by_factor(width, total_factor));
|
|
160
|
+
return (
|
|
161
|
+
h_bar * w_bar >
|
|
162
|
+
this.max_image_tokens * (this.encoder_patch_size * this.downsample_factor) ** 2 * this.max_pixels_tolerance
|
|
163
|
+
);
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
/**
|
|
167
|
+
* Get the grid layout for tiling a large image.
|
|
168
|
+
* @param {number} height
|
|
169
|
+
* @param {number} width
|
|
170
|
+
* @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
|
|
171
|
+
*/
|
|
172
|
+
_get_grid_layout(height, width) {
|
|
173
|
+
const target_ratios = get_target_ratios(this.min_tiles, this.max_tiles);
|
|
174
|
+
const [grid_width, grid_height] = find_closest_aspect_ratio(
|
|
175
|
+
width / height,
|
|
176
|
+
target_ratios,
|
|
177
|
+
width,
|
|
178
|
+
height,
|
|
179
|
+
this.tile_size,
|
|
180
|
+
);
|
|
181
|
+
return {
|
|
182
|
+
grid_width,
|
|
183
|
+
grid_height,
|
|
184
|
+
target_width: this.tile_size * grid_width,
|
|
185
|
+
target_height: this.tile_size * grid_height,
|
|
186
|
+
};
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
/** @param {RawImage|RawImage[]|RawImage[][]} images */
|
|
190
|
+
// @ts-expect-error
|
|
191
|
+
async _call(images, { return_row_col_info = null } = {}) {
|
|
192
|
+
/** @type {RawImage[][]} */
|
|
193
|
+
let batched_images;
|
|
194
|
+
if (!Array.isArray(images)) {
|
|
195
|
+
batched_images = [[images]];
|
|
196
|
+
} else if (!Array.isArray(images[0])) {
|
|
197
|
+
batched_images = [/** @type {RawImage[]} */ (images)];
|
|
198
|
+
} else {
|
|
199
|
+
batched_images = /** @type {RawImage[][]} */ (images);
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
/** @type {Tensor[]} */
|
|
203
|
+
const all_pixel_values = [];
|
|
204
|
+
/** @type {Tensor[]} */
|
|
205
|
+
const all_pixel_masks = [];
|
|
206
|
+
/** @type {number[][]} */
|
|
207
|
+
const all_spatial_shapes = [];
|
|
208
|
+
/** @type {number[]} */
|
|
209
|
+
const all_rows = [];
|
|
210
|
+
/** @type {number[]} */
|
|
211
|
+
const all_cols = [];
|
|
212
|
+
/** @type {number[][]} */
|
|
213
|
+
const all_image_sizes = [];
|
|
214
|
+
|
|
215
|
+
for (const image_batch of batched_images) {
|
|
216
|
+
const preprocessed = await Promise.all(image_batch.map((x) => this.preprocess(x, { do_pad: false })));
|
|
217
|
+
|
|
218
|
+
for (const { pixel_values } of preprocessed) {
|
|
219
|
+
const [, height, width] = pixel_values.dims;
|
|
220
|
+
const img = pixel_values.unsqueeze_(0);
|
|
221
|
+
|
|
222
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
223
|
+
const f2 = total_factor ** 2;
|
|
224
|
+
const [new_height, new_width] = smart_resize(
|
|
225
|
+
Math.max(total_factor, height),
|
|
226
|
+
Math.max(total_factor, width),
|
|
227
|
+
total_factor,
|
|
228
|
+
this.min_image_tokens * f2,
|
|
229
|
+
this.max_image_tokens * f2,
|
|
230
|
+
).map((x) => Math.max(total_factor, x));
|
|
231
|
+
|
|
232
|
+
/** @type {Tensor[]} */
|
|
233
|
+
let tiles;
|
|
234
|
+
let num_rows = 1,
|
|
235
|
+
num_cols = 1;
|
|
236
|
+
|
|
237
|
+
const is_large = this._is_image_too_large(height, width);
|
|
238
|
+
const do_splitting = this.do_image_splitting && !(this.min_tiles === 1 && this.max_tiles === 1);
|
|
239
|
+
|
|
240
|
+
if (is_large && do_splitting) {
|
|
241
|
+
const { grid_width, grid_height, target_width, target_height } = this._get_grid_layout(
|
|
242
|
+
height,
|
|
243
|
+
width,
|
|
244
|
+
);
|
|
245
|
+
num_rows = grid_height;
|
|
246
|
+
num_cols = grid_width;
|
|
247
|
+
|
|
248
|
+
const resized = await interpolate_4d(img, {
|
|
249
|
+
size: [target_height, target_width],
|
|
250
|
+
});
|
|
251
|
+
|
|
252
|
+
tiles = [];
|
|
253
|
+
for (let r = 0; r < grid_height; ++r) {
|
|
254
|
+
for (let c = 0; c < grid_width; ++c) {
|
|
255
|
+
const y = r * this.tile_size;
|
|
256
|
+
const x = c * this.tile_size;
|
|
257
|
+
tiles.push(resized.slice(null, null, [y, y + this.tile_size], [x, x + this.tile_size]));
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
if (this.use_thumbnail && grid_width * grid_height !== 1) {
|
|
262
|
+
tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
|
|
263
|
+
}
|
|
264
|
+
} else {
|
|
265
|
+
tiles = [await interpolate_4d(img, { size: [new_height, new_width] })];
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
for (const tile of tiles) {
|
|
269
|
+
const [, , th, tw] = tile.dims;
|
|
270
|
+
const patches = convert_image_to_patches(tile, this.encoder_patch_size);
|
|
271
|
+
const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
|
|
272
|
+
|
|
273
|
+
all_pixel_values.push(padded);
|
|
274
|
+
all_pixel_masks.push(mask);
|
|
275
|
+
all_spatial_shapes.push([
|
|
276
|
+
Math.floor(th / this.encoder_patch_size),
|
|
277
|
+
Math.floor(tw / this.encoder_patch_size),
|
|
278
|
+
]);
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
all_rows.push(num_rows);
|
|
282
|
+
all_cols.push(num_cols);
|
|
283
|
+
all_image_sizes.push([new_height, new_width]);
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
/** @type {Record<string, any>} */
|
|
288
|
+
const result = {
|
|
289
|
+
pixel_values: cat(all_pixel_values, 0),
|
|
290
|
+
pixel_attention_mask: stack(all_pixel_masks, 0),
|
|
291
|
+
spatial_shapes: new Tensor('int64', BigInt64Array.from(all_spatial_shapes.flat(), BigInt), [
|
|
292
|
+
all_spatial_shapes.length,
|
|
293
|
+
2,
|
|
294
|
+
]),
|
|
295
|
+
};
|
|
296
|
+
|
|
297
|
+
if (return_row_col_info ?? this.return_row_col_info) {
|
|
298
|
+
result.image_rows = all_rows;
|
|
299
|
+
result.image_cols = all_cols;
|
|
300
|
+
result.image_sizes = all_image_sizes;
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
return result;
|
|
304
|
+
}
|
|
305
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { LlavaForConditionalGeneration } from '../llava/modeling_llava.js';
|
|
2
|
+
|
|
3
|
+
/**
 * LFM2-VL vision-language model for conditional generation.
 *
 * Inherits the full forward/merging implementation from
 * `LlavaForConditionalGeneration`; only the set of inputs forwarded to the
 * underlying ONNX session differs. Compared to the base Llava list, this model
 * additionally consumes `pixel_attention_mask` and `spatial_shapes`, which the
 * LFM2-VL image processor produces (padded patch tensors plus their per-tile
 * patch-grid dimensions).
 */
export class Lfm2VlForConditionalGeneration extends LlavaForConditionalGeneration {
    // NOTE: order matters — these names are matched positionally against the
    // exported model's input signature.
    forward_params = [
        'input_ids',
        'attention_mask',
        'pixel_values',
        'pixel_attention_mask',
        'spatial_shapes',
        'position_ids',
        'past_key_values',
    ];
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { Processor } from '../../processing_utils.js';
|
|
2
|
+
import { AutoImageProcessor } from '../auto/image_processing_auto.js';
|
|
3
|
+
import { AutoTokenizer } from '../auto/tokenization_auto.js';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* @typedef {import('../../utils/image.js').RawImage} RawImage
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * Processor for LFM2-VL: runs the image processor, then expands each `<image>`
 * placeholder in the text into the exact sequence of image tokens the model
 * expects (per-tile grids, optional thumbnail, start/end markers) before
 * tokenizing.
 */
export class Lfm2VlProcessor extends Processor {
    static tokenizer_class = AutoTokenizer;
    static image_processor_class = AutoImageProcessor;

    /**
     * Preprocess images and (optionally) text into model-ready inputs.
     *
     * @param {RawImage|RawImage[]} images - Input image(s); forwarded to the image processor.
     * @param {string|string[]|null} [text] - Prompt(s) containing `<image>` placeholders, one per image.
     * @param {Record<string, any>} [kwargs] - Extra options, forwarded to both the image processor and the tokenizer.
     * @returns {Promise<Record<string, any>>} Image tensors merged with tokenizer outputs (when `text` is given).
     */
    async _call(images, text = null, kwargs = {}) {
        // Force row/col/size info from the image processor: it is required below
        // to compute how many image tokens each image expands to. The extra keys
        // are destructured out so they are not returned to the caller.
        const { image_rows, image_cols, image_sizes, ...image_inputs } = await this.image_processor(images, {
            ...kwargs,
            return_row_col_info: true,
        });

        if (text) {
            const image_token = this.config.image_token ?? '<image>';
            // Defaults mirror the image processor's own defaults; the real values
            // come from the loaded image-processor config when present.
            const {
                tile_size = 512,
                downsample_factor = 2,
                encoder_patch_size = 16,
                use_thumbnail = true,
            } = /** @type {Record<string, any>} */ (this.image_processor.config);

            // Number of model tokens along one spatial dimension of size `s`:
            // patchify (floor), then downsample (ceil).
            const ds = (/** @type {number} */ s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
            const tokens_per_tile = ds(tile_size) ** 2;
            const image_start = this.config.image_start_token ?? '<|image_start|>';
            const image_end = this.config.image_end_token ?? '<|image_end|>';
            const thumbnail_token = this.config.image_thumbnail ?? '<|img_thumbnail|>';

            if (!Array.isArray(text)) text = [text];

            // Global image counter: placeholders are matched to images in order
            // across ALL text samples, so the total placeholder count must equal
            // the number of processed images.
            let image_idx = 0;
            text = text.map((sample) => {
                // Each split boundary is one `<image>` placeholder to expand.
                const parts = sample.split(image_token);
                return (
                    parts[0] +
                    parts
                        .slice(1)
                        .map((part) => {
                            const idx = image_idx++;
                            // [h, w] is the resized size recorded by the image processor.
                            const [h, w] = image_sizes[idx];
                            const rows = image_rows[idx],
                                cols = image_cols[idx];
                            const tokens_for_image = ds(h) * ds(w);

                            let expanded = image_start;
                            if (rows > 1 || cols > 1) {
                                // Split image: one marker + fixed-size token run per tile,
                                // in row-major order (1-indexed markers).
                                const tile_str = image_token.repeat(tokens_per_tile);
                                for (let r = 0; r < rows; ++r)
                                    for (let c = 0; c < cols; ++c)
                                        expanded += `<|img_row_${r + 1}_col_${c + 1}|>` + tile_str;
                                // Optional full-image thumbnail appended after the tiles.
                                if (use_thumbnail) expanded += thumbnail_token + image_token.repeat(tokens_for_image);
                            } else {
                                // Unsplit image: a single token run sized from the resized image.
                                expanded += image_token.repeat(tokens_for_image);
                            }
                            return expanded + image_end + part;
                        })
                        .join('')
                );
            });
        }

        return {
            ...image_inputs,
            ...(text ? this.tokenizer(text, kwargs) : {}),
        };
    }
}
|
|
@@ -14,7 +14,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
|
|
|
14
14
|
|
|
15
15
|
return default_merge_input_ids_with_image_features({
|
|
16
16
|
// @ts-ignore
|
|
17
|
-
image_token_id: this.config.image_token_index,
|
|
17
|
+
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
18
18
|
...kwargs,
|
|
19
19
|
image_features: reshaped_image_hidden_states,
|
|
20
20
|
});
|
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { LlavaForConditionalGeneration } from '../llava/modeling_llava.js';
|
|
2
2
|
|
|
3
|
/**
 * Mistral3 vision-language model for conditional generation.
 *
 * An empty subclass: the entire implementation (including image-feature
 * merging) is inherited unchanged from `LlavaForConditionalGeneration`; the
 * class exists so the architecture name resolves in the auto-model registry.
 */
export class Mistral3ForConditionalGeneration extends LlavaForConditionalGeneration {}