@huggingface/transformers 4.0.0-next.5 → 4.0.0-next.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. package/README.md +12 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
  3. package/dist/transformers.js +2189 -1015
  4. package/dist/transformers.min.js +16 -16
  5. package/dist/transformers.node.cjs +2234 -1029
  6. package/dist/transformers.node.min.cjs +20 -20
  7. package/dist/transformers.node.min.mjs +20 -20
  8. package/dist/transformers.node.mjs +2194 -1017
  9. package/dist/transformers.web.js +2175 -1001
  10. package/dist/transformers.web.min.js +18 -18
  11. package/package.json +4 -4
  12. package/src/backends/onnx.js +77 -58
  13. package/src/backends/utils/cacheWasm.js +22 -43
  14. package/src/cache_utils.js +62 -0
  15. package/src/configs.js +32 -5
  16. package/src/env.js +36 -6
  17. package/src/image_processors_utils.js +3 -3
  18. package/src/models/auto/modeling_auto.js +14 -1
  19. package/src/models/chatterbox/modeling_chatterbox.js +1 -1
  20. package/src/models/detr/image_processing_detr.js +1 -1
  21. package/src/models/feature_extractors.js +2 -0
  22. package/src/models/gemma3n/modeling_gemma3n.js +2 -0
  23. package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
  24. package/src/models/granite_speech/modeling_granite_speech.js +5 -0
  25. package/src/models/granite_speech/processing_granite_speech.js +62 -0
  26. package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
  27. package/src/models/idefics3/modeling_idefics3.js +5 -32
  28. package/src/models/image_processors.js +1 -0
  29. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
  30. package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
  31. package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
  32. package/src/models/llava/modeling_llava.js +1 -1
  33. package/src/models/mistral3/modeling_mistral3.js +2 -2
  34. package/src/models/modeling_utils.js +234 -292
  35. package/src/models/models.js +9 -0
  36. package/src/models/olmo_hybrid/modeling_olmo_hybrid.js +5 -0
  37. package/src/models/paligemma/modeling_paligemma.js +2 -25
  38. package/src/models/processors.js +3 -0
  39. package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
  40. package/src/models/qwen2_moe/modeling_qwen2_moe.js +5 -0
  41. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
  42. package/src/models/qwen2_vl/modeling_qwen2_vl.js +36 -3
  43. package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
  44. package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
  45. package/src/models/qwen3_moe/modeling_qwen3_moe.js +5 -0
  46. package/src/models/qwen3_next/modeling_qwen3_next.js +5 -0
  47. package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
  48. package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +4 -0
  49. package/src/models/registry.js +39 -4
  50. package/src/models/sam/image_processing_sam.js +1 -1
  51. package/src/models/session.js +17 -6
  52. package/src/models/smolvlm/modeling_smolvlm.js +7 -0
  53. package/src/models/ultravox/modeling_ultravox.js +1 -3
  54. package/src/models/voxtral/modeling_voxtral.js +3 -0
  55. package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
  56. package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
  57. package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
  58. package/src/models/whisper/feature_extraction_whisper.js +2 -12
  59. package/src/pipelines/index.js +2 -84
  60. package/src/pipelines.js +40 -77
  61. package/src/transformers.js +2 -0
  62. package/src/utils/audio.js +18 -2
  63. package/src/utils/cache/CrossOriginStorageCache.js +251 -0
  64. package/src/utils/cache/FileCache.js +128 -0
  65. package/src/utils/cache/cross-origin-storage.d.ts +38 -0
  66. package/src/utils/cache.js +8 -3
  67. package/src/utils/hub/{files.js → FileResponse.js} +0 -105
  68. package/src/utils/hub/utils.js +35 -1
  69. package/src/utils/hub.js +6 -5
  70. package/src/utils/image.js +12 -13
  71. package/src/utils/lru_cache.js +67 -0
  72. package/src/utils/memoize_promise.js +45 -0
  73. package/src/utils/model_registry/ModelRegistry.js +70 -23
  74. package/src/utils/model_registry/get_file_metadata.js +14 -2
  75. package/src/utils/model_registry/get_model_files.js +63 -78
  76. package/src/utils/model_registry/get_pipeline_files.js +15 -24
  77. package/src/utils/model_registry/is_cached.js +81 -4
  78. package/src/utils/tensor.js +18 -2
  79. package/types/backends/onnx.d.ts.map +1 -1
  80. package/types/backends/utils/cacheWasm.d.ts +3 -17
  81. package/types/backends/utils/cacheWasm.d.ts.map +1 -1
  82. package/types/cache_utils.d.ts +29 -0
  83. package/types/cache_utils.d.ts.map +1 -0
  84. package/types/configs.d.ts.map +1 -1
  85. package/types/env.d.ts +18 -3
  86. package/types/env.d.ts.map +1 -1
  87. package/types/image_processors_utils.d.ts +17 -1
  88. package/types/image_processors_utils.d.ts.map +1 -1
  89. package/types/models/auto/modeling_auto.d.ts +6 -0
  90. package/types/models/auto/modeling_auto.d.ts.map +1 -1
  91. package/types/models/detr/image_processing_detr.d.ts +1 -1
  92. package/types/models/feature_extractors.d.ts +2 -0
  93. package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
  94. package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
  95. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
  96. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
  97. package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
  98. package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
  99. package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
  100. package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
  101. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
  102. package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
  103. package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
  104. package/types/models/image_processors.d.ts +1 -0
  105. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
  106. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
  107. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
  108. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
  109. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
  110. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
  111. package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
  112. package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
  113. package/types/models/modeling_utils.d.ts +44 -24
  114. package/types/models/modeling_utils.d.ts.map +1 -1
  115. package/types/models/models.d.ts +9 -0
  116. package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts +8 -0
  117. package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts.map +1 -0
  118. package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
  119. package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
  120. package/types/models/processors.d.ts +3 -0
  121. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
  122. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
  123. package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts +8 -0
  124. package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts.map +1 -0
  125. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
  126. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
  127. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  128. package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
  129. package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
  130. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
  131. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
  132. package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts +8 -0
  133. package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts.map +1 -0
  134. package/types/models/qwen3_next/modeling_qwen3_next.d.ts +8 -0
  135. package/types/models/qwen3_next/modeling_qwen3_next.d.ts.map +1 -0
  136. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
  137. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
  138. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +7 -0
  139. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -0
  140. package/types/models/registry.d.ts +2 -1
  141. package/types/models/registry.d.ts.map +1 -1
  142. package/types/models/sam/image_processing_sam.d.ts +1 -1
  143. package/types/models/session.d.ts +3 -2
  144. package/types/models/session.d.ts.map +1 -1
  145. package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
  146. package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
  147. package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
  148. package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
  149. package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
  150. package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
  151. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
  152. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
  153. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
  154. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
  155. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
  156. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
  157. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  158. package/types/pipelines/index.d.ts +0 -34
  159. package/types/pipelines/index.d.ts.map +1 -1
  160. package/types/pipelines.d.ts.map +1 -1
  161. package/types/transformers.d.ts +1 -0
  162. package/types/transformers.d.ts.map +1 -1
  163. package/types/utils/audio.d.ts +5 -2
  164. package/types/utils/audio.d.ts.map +1 -1
  165. package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
  166. package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
  167. package/types/utils/cache/FileCache.d.ts +39 -0
  168. package/types/utils/cache/FileCache.d.ts.map +1 -0
  169. package/types/utils/cache.d.ts +4 -4
  170. package/types/utils/cache.d.ts.map +1 -1
  171. package/types/utils/dtypes.d.ts +1 -1
  172. package/types/utils/hub/{files.d.ts → FileResponse.d.ts} +1 -38
  173. package/types/utils/hub/FileResponse.d.ts.map +1 -0
  174. package/types/utils/hub/utils.d.ts +17 -2
  175. package/types/utils/hub/utils.d.ts.map +1 -1
  176. package/types/utils/hub.d.ts +7 -7
  177. package/types/utils/hub.d.ts.map +1 -1
  178. package/types/utils/image.d.ts +1 -1
  179. package/types/utils/image.d.ts.map +1 -1
  180. package/types/utils/lru_cache.d.ts +38 -0
  181. package/types/utils/lru_cache.d.ts.map +1 -0
  182. package/types/utils/memoize_promise.d.ts +14 -0
  183. package/types/utils/memoize_promise.d.ts.map +1 -0
  184. package/types/utils/model_registry/ModelRegistry.d.ts +66 -6
  185. package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
  186. package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
  187. package/types/utils/model_registry/get_model_files.d.ts +1 -0
  188. package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
  189. package/types/utils/model_registry/get_pipeline_files.d.ts +2 -1
  190. package/types/utils/model_registry/get_pipeline_files.d.ts.map +1 -1
  191. package/types/utils/model_registry/is_cached.d.ts +47 -4
  192. package/types/utils/model_registry/is_cached.d.ts.map +1 -1
  193. package/types/utils/tensor.d.ts.map +1 -1
  194. package/src/utils/data-structures.js +0 -572
  195. package/types/utils/data-structures.d.ts +0 -294
  196. package/types/utils/data-structures.d.ts.map +0 -1
  197. package/types/utils/hub/files.d.ts.map +0 -1
@@ -8,7 +8,7 @@ import {
8
8
  import { full } from '../../utils/tensor.js';
9
9
 
10
10
  /**
11
- * @typedef {object} DetrFeatureExtractorResultProps
11
+ * @typedef {Object} DetrFeatureExtractorResultProps
12
12
  * @property {import('../../utils/tensor.js').Tensor} pixel_mask
13
13
  * @typedef {import('../../image_processors_utils.js').ImageProcessorResult & DetrFeatureExtractorResultProps} DetrFeatureExtractorResult
14
14
  */
@@ -4,6 +4,7 @@ export * from './chatterbox/feature_extraction_chatterbox.js';
4
4
  export * from './clap/feature_extraction_clap.js';
5
5
  export * from './dac/feature_extraction_dac.js';
6
6
  export * from './gemma3n/feature_extraction_gemma3n.js';
7
+ export * from './granite_speech/feature_extraction_granite_speech.js';
7
8
  export * from './moonshine/feature_extraction_moonshine.js';
8
9
  export * from './parakeet/feature_extraction_parakeet.js';
9
10
  export * from './pyannote/feature_extraction_pyannote.js';
@@ -12,6 +13,7 @@ export * from './snac/feature_extraction_snac.js';
12
13
  export * from './speecht5/feature_extraction_speecht5.js';
13
14
  export * from './wav2vec2/feature_extraction_wav2vec2.js';
14
15
  export * from './wespeaker/feature_extraction_wespeaker.js';
16
+ export * from './voxtral_realtime/feature_extraction_voxtral_realtime.js';
15
17
  export * from './whisper/feature_extraction_whisper.js';
16
18
 
17
19
  export { FeatureExtractor } from '../feature_extraction_utils.js';
@@ -115,3 +115,5 @@ export class Gemma3nForConditionalGeneration extends Gemma3nPreTrainedModel {
115
115
  });
116
116
  }
117
117
  }
118
+
119
+ export class Gemma3nForCausalLM extends Gemma3nForConditionalGeneration {}
@@ -0,0 +1,58 @@
1
+ import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js';
2
+ import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
3
+ import { Tensor } from '../../utils/tensor.js';
4
+
5
+ export class GraniteSpeechFeatureExtractor extends FeatureExtractor {
6
+ constructor(config) {
7
+ super(config);
8
+
9
+ const { n_fft, win_length, n_mels, sample_rate } = config.melspec_kwargs;
10
+
11
+ // torchaudio uses HTK mel scale with no norm by default
12
+ this.mel_filters = mel_filter_bank(
13
+ Math.floor(1 + n_fft / 2), // num_frequency_bins = 257
14
+ n_mels, // 80
15
+ 0, // min_frequency
16
+ sample_rate / 2, // max_frequency = 8000
17
+ sample_rate, // 16000
18
+ null, // norm (torchaudio default: no norm)
19
+ 'htk', // mel_scale (torchaudio default)
20
+ );
21
+
22
+ // torchaudio center-pads the window when win_length < n_fft:
23
+ // pad_amount = (n_fft - win_length) // 2 on each side
24
+ const raw_window = window_function(win_length, 'hann');
25
+ this.window = new Float64Array(n_fft);
26
+ const pad = Math.floor((n_fft - win_length) / 2);
27
+ this.window.set(raw_window, pad);
28
+ }
29
+
30
+ /**
31
+ * Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
32
+ * @param {Float32Array|Float64Array} audio The audio waveform.
33
+ * @returns {Promise<{input_features: Tensor}>}
34
+ */
35
+ async _call(audio) {
36
+ validate_audio_inputs(audio, 'GraniteSpeechFeatureExtractor');
37
+
38
+ const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;
39
+
40
+ // Truncate to even number of frames for pair-stacking
41
+ const num_frames = 1 + Math.floor((audio.length - 1) / hop_length);
42
+ const max_num_frames = num_frames - (num_frames % 2);
43
+
44
+ const mel = await spectrogram(audio, this.window, n_fft, hop_length, {
45
+ power: 2.0,
46
+ mel_filters: this.mel_filters,
47
+ log_mel: 'log10_max_norm',
48
+ transpose: true, // [time, n_mels]
49
+ max_num_frames,
50
+ do_pad: false,
51
+ });
52
+
53
+ // Stack adjacent frame pairs: [time, n_mels] → [1, time/2, 2*n_mels]
54
+ const input_features = mel.view(-1, 2 * n_mels).unsqueeze_(0);
55
+
56
+ return { input_features };
57
+ }
58
+ }
@@ -0,0 +1,5 @@
1
+ import { UltravoxModel } from '../ultravox/modeling_ultravox.js';
2
+
3
+ export class GraniteSpeechForConditionalGeneration extends UltravoxModel {
4
+ forward_params = ['input_ids', 'attention_mask', 'input_features', 'past_key_values'];
5
+ }
@@ -0,0 +1,62 @@
1
+ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
2
+ import { AutoTokenizer } from '../auto/tokenization_auto.js';
3
+ import { Processor } from '../../processing_utils.js';
4
+ import { Tensor } from '../../utils/tensor.js';
5
+
6
+ export class GraniteSpeechProcessor extends Processor {
7
+ static tokenizer_class = AutoTokenizer;
8
+ static feature_extractor_class = AutoFeatureExtractor;
9
+ static uses_processor_config = true;
10
+
11
+ /**
12
+ * Compute the number of audio tokens for a given raw audio length.
13
+ * @param {number} audioLength Raw audio sample count.
14
+ * @returns {number} Number of projector output tokens.
15
+ */
16
+ _get_num_audio_features(audioLength) {
17
+ const { hop_length } = this.feature_extractor.config.melspec_kwargs;
18
+ const { projector_window_size, projector_downsample_rate } = this.feature_extractor.config;
19
+ const effective_window_size = Math.floor(projector_window_size / projector_downsample_rate);
20
+ const mel_length = Math.floor(audioLength / hop_length) + 1;
21
+ const encoder_length = Math.floor(mel_length / 2);
22
+ const nblocks = Math.ceil(encoder_length / projector_window_size);
23
+ return nblocks * effective_window_size;
24
+ }
25
+
26
+ /**
27
+ * @param {string} text The text input to process.
28
+ * @param {Float32Array} audio The audio input to process.
29
+ */
30
+ async _call(text, audio = null, kwargs = {}) {
31
+ if (Array.isArray(text)) {
32
+ throw new Error('Batched inputs are not supported yet.');
33
+ }
34
+
35
+ let audio_inputs = {};
36
+ if (audio) {
37
+ const { input_features } = await this.feature_extractor(audio);
38
+ audio_inputs['input_features'] = input_features;
39
+
40
+ // Compute audio embed sizes and mask in the processor
41
+ const audio_embed_size = this._get_num_audio_features(audio.length);
42
+ const mask_data = new Uint8Array(audio_embed_size).fill(1);
43
+ audio_inputs['input_features_mask'] = new Tensor('bool', mask_data, [1, audio_embed_size]);
44
+
45
+ const audio_token = this.config.audio_token ?? '<|audio|>';
46
+ if (!text.includes(audio_token)) {
47
+ throw new Error(`The input text does not contain the audio token ${audio_token}.`);
48
+ }
49
+ text = text.replaceAll(audio_token, audio_token.repeat(audio_embed_size));
50
+ }
51
+
52
+ const text_inputs = this.tokenizer(text, {
53
+ add_special_tokens: false,
54
+ ...kwargs,
55
+ });
56
+
57
+ return {
58
+ ...text_inputs,
59
+ ...audio_inputs,
60
+ };
61
+ }
62
+ }
@@ -2,7 +2,7 @@ import { ImageProcessor } from '../../image_processors_utils.js';
2
2
  import { ones } from '../../utils/tensor.js';
3
3
 
4
4
  /**
5
- * @typedef {object} GroundingDinoFeatureExtractorResultProps
5
+ * @typedef {Object} GroundingDinoFeatureExtractorResultProps
6
6
  * @property {import('../../utils/tensor.js').Tensor} pixel_mask
7
7
  * @typedef {import('../../image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult
8
8
  */
@@ -1,7 +1,9 @@
1
- import { PreTrainedModel, default_merge_input_ids_with_image_features } from '../modeling_utils.js';
2
- import { sessionRun } from '../session.js';
1
+ import { LlavaForConditionalGeneration } from '../llava/modeling_llava.js';
3
2
 
4
- export class Idefics3PreTrainedModel extends PreTrainedModel {
3
+ /**
4
+ * The Idefics3 model which consists of a vision backbone and a language model.
5
+ */
6
+ export class Idefics3ForConditionalGeneration extends LlavaForConditionalGeneration {
5
7
  forward_params = [
6
8
  'input_ids',
7
9
  'attention_mask',
@@ -11,32 +13,3 @@ export class Idefics3PreTrainedModel extends PreTrainedModel {
11
13
  'past_key_values',
12
14
  ];
13
15
  }
14
-
15
- /**
16
- * The Idefics3 model which consists of a vision backbone and a language model.
17
- */
18
- export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
19
- async encode_image({ pixel_values, pixel_attention_mask }) {
20
- const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, pixel_attention_mask }))
21
- .image_features;
22
- return features;
23
- }
24
-
25
- _merge_input_ids_with_image_features(kwargs) {
26
- const vision_hidden_size = kwargs.image_features.dims.at(-1);
27
- const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
28
-
29
- return default_merge_input_ids_with_image_features({
30
- // @ts-ignore
31
- image_token_id: this.config.image_token_id,
32
- ...kwargs,
33
- image_features: reshaped_image_hidden_states,
34
- });
35
- }
36
- }
37
-
38
- /**
39
- * The SmolVLM Model with a language modeling head.
40
- * It is made up a SigLIP vision encoder, with a language modeling head on top.
41
- */
42
- export class SmolVLMForConditionalGeneration extends Idefics3ForConditionalGeneration {}
@@ -14,6 +14,7 @@ export * from './grounding_dino/image_processing_grounding_dino.js';
14
14
  export * from './idefics3/image_processing_idefics3.js';
15
15
  export * from './janus/image_processing_janus.js';
16
16
  export * from './jina_clip/image_processing_jina_clip.js';
17
+ export * from './lfm2_vl/image_processing_lfm2_vl.js';
17
18
  export * from './llava_onevision/image_processing_llava_onevision.js';
18
19
  export * from './mask2former/image_processing_mask2former.js';
19
20
  export * from './maskformer/image_processing_maskformer.js';
@@ -0,0 +1,305 @@
1
+ import { ImageProcessor, smart_resize } from '../../image_processors_utils.js';
2
+ import { Tensor, cat, interpolate_4d, stack } from '../../utils/tensor.js';
3
+
4
+ /**
5
+ * @typedef {import('../../utils/image.js').RawImage} RawImage
6
+ */
7
+
8
+ /**
9
+ * Returns the closest integer to `number` that is divisible by `factor`.
10
+ * @param {number} number
11
+ * @param {number} factor
12
+ * @returns {number}
13
+ */
14
+ function round_by_factor(number, factor) {
15
+ return Math.round(number / factor) * factor;
16
+ }
17
+
18
+ /**
19
+ * Find the closest aspect ratio from target_ratios to match the input aspect ratio.
20
+ * @param {number} aspect_ratio
21
+ * @param {number[][]} target_ratios
22
+ * @param {number} width
23
+ * @param {number} height
24
+ * @param {number} image_size
25
+ * @returns {number[]}
26
+ */
27
+ function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
28
+ let best_ratio_diff = Infinity;
29
+ let best_ratio = [1, 1];
30
+ const area = width * height;
31
+ for (const ratio of target_ratios) {
32
+ const ratio_diff = Math.abs(aspect_ratio - ratio[0] / ratio[1]);
33
+ if (ratio_diff < best_ratio_diff) {
34
+ best_ratio_diff = ratio_diff;
35
+ best_ratio = ratio;
36
+ } else if (ratio_diff === best_ratio_diff && area > 0.5 * image_size * image_size * ratio[0] * ratio[1]) {
37
+ best_ratio = ratio;
38
+ }
39
+ }
40
+ return best_ratio;
41
+ }
42
+
43
+ /**
44
+ * Compute all valid (width, height) tile ratios for the given range.
45
+ * @param {number} min_tiles
46
+ * @param {number} max_tiles
47
+ * @returns {number[][]}
48
+ */
49
+ function get_target_ratios(min_tiles, max_tiles) {
50
+ /** @type {number[][]} */
51
+ const ratios = [];
52
+ const seen = new Set();
53
+ for (let n = min_tiles; n <= max_tiles; ++n) {
54
+ for (let w = 1; w <= n; ++w) {
55
+ for (let h = 1; h <= n; ++h) {
56
+ const product = w * h;
57
+ if (product >= min_tiles && product <= max_tiles) {
58
+ const key = (w << 16) | h;
59
+ if (!seen.has(key)) {
60
+ seen.add(key);
61
+ ratios.push([w, h]);
62
+ }
63
+ }
64
+ }
65
+ }
66
+ }
67
+ return ratios.sort((a, b) => a[0] * a[1] - b[0] * b[1]);
68
+ }
69
+
70
+ /**
71
+ * Convert image tensor to flattened patches.
72
+ *
73
+ * Equivalent to PyTorch: `images.reshape(B, C, ph, ps, pw, ps).permute(0, 2, 4, 3, 5, 1).reshape(B, ph*pw, -1)`
74
+ * @param {Tensor} images Shape: [batch, channels, height, width]
75
+ * @param {number} patch_size
76
+ * @returns {Tensor} Shape: [batch, num_patches, patch_size * patch_size * channels]
77
+ */
78
+ function convert_image_to_patches(images, patch_size) {
79
+ const [B, C, H, W] = images.dims;
80
+ const ph = Math.floor(H / patch_size),
81
+ pw = Math.floor(W / patch_size);
82
+ const patch_dim = patch_size * patch_size * C;
83
+ const data = /** @type {Float32Array} */ (images.data);
84
+ const result = new Float32Array(B * ph * pw * patch_dim);
85
+ const ch_stride = H * W;
86
+
87
+ for (let b = 0; b < B; ++b) {
88
+ const b_src = b * C * ch_stride;
89
+ const b_dst = b * ph * pw * patch_dim;
90
+ for (let py = 0; py < ph; ++py) {
91
+ for (let px = 0; px < pw; ++px) {
92
+ let off = b_dst + (py * pw + px) * patch_dim;
93
+ for (let dy = 0; dy < patch_size; ++dy) {
94
+ const row = (py * patch_size + dy) * W + px * patch_size;
95
+ for (let dx = 0; dx < patch_size; ++dx) {
96
+ const pixel = row + dx;
97
+ for (let c = 0; c < C; ++c) {
98
+ result[off++] = data[b_src + c * ch_stride + pixel];
99
+ }
100
+ }
101
+ }
102
+ }
103
+ }
104
+ }
105
+
106
+ return new Tensor('float32', result, [B, ph * pw, patch_dim]);
107
+ }
108
+
109
+ /**
110
+ * Pad patches along the patch dimension to `target_length`.
111
+ * @param {Tensor} patches Shape: [1, current_length, patch_dim]
112
+ * @param {number} target_length
113
+ * @returns {{ padded: Tensor, mask: Tensor }}
114
+ */
115
+ function pad_along_first_dim(patches, target_length) {
116
+ const [, len, dim] = patches.dims;
117
+ const mask_data = new BigInt64Array(target_length);
118
+ mask_data.fill(1n, 0, len);
119
+
120
+ let padded = patches;
121
+ if (len < target_length) {
122
+ const padded_data = new Float32Array(target_length * dim);
123
+ padded_data.set(/** @type {Float32Array} */ (patches.data));
124
+ padded = new Tensor('float32', padded_data, [1, target_length, dim]);
125
+ }
126
+
127
+ return { padded, mask: new Tensor('int64', mask_data, [target_length]) };
128
+ }
129
+
130
+ export class Lfm2VlImageProcessor extends ImageProcessor {
131
+ constructor(/** @type {Record<string, any>} */ config) {
132
+ super(config);
133
+ this.downsample_factor = config.downsample_factor ?? 2;
134
+ this.do_image_splitting = config.do_image_splitting ?? true;
135
+ this.min_tiles = config.min_tiles ?? 2;
136
+ this.max_tiles = config.max_tiles ?? 10;
137
+ this.use_thumbnail = config.use_thumbnail ?? true;
138
+ this.min_image_tokens = config.min_image_tokens ?? 64;
139
+ this.max_image_tokens = config.max_image_tokens ?? 256;
140
+ this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
141
+ this.tile_size = config.tile_size ?? 512;
142
+ this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2.0;
143
+ this.return_row_col_info = config.return_row_col_info ?? false;
144
+
145
+ const max_thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
146
+ const tile_size_patches = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
147
+ this.max_num_patches = Math.max(max_thumbnail_patches, tile_size_patches);
148
+ }
149
+
150
+ /**
151
+ * Check if the image is too large to be processed as a single tile.
152
+ * @param {number} height
153
+ * @param {number} width
154
+ * @returns {boolean}
155
+ */
156
+ _is_image_too_large(height, width) {
157
+ const total_factor = this.encoder_patch_size * this.downsample_factor;
158
+ const h_bar = Math.max(this.encoder_patch_size, round_by_factor(height, total_factor));
159
+ const w_bar = Math.max(this.encoder_patch_size, round_by_factor(width, total_factor));
160
+ return (
161
+ h_bar * w_bar >
162
+ this.max_image_tokens * (this.encoder_patch_size * this.downsample_factor) ** 2 * this.max_pixels_tolerance
163
+ );
164
+ }
165
+
166
+ /**
167
+ * Get the grid layout for tiling a large image.
168
+ * @param {number} height
169
+ * @param {number} width
170
+ * @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
171
+ */
172
+ _get_grid_layout(height, width) {
173
+ const target_ratios = get_target_ratios(this.min_tiles, this.max_tiles);
174
+ const [grid_width, grid_height] = find_closest_aspect_ratio(
175
+ width / height,
176
+ target_ratios,
177
+ width,
178
+ height,
179
+ this.tile_size,
180
+ );
181
+ return {
182
+ grid_width,
183
+ grid_height,
184
+ target_width: this.tile_size * grid_width,
185
+ target_height: this.tile_size * grid_height,
186
+ };
187
+ }
188
+
189
+ /** @param {RawImage|RawImage[]|RawImage[][]} images */
190
+ // @ts-expect-error
191
+ async _call(images, { return_row_col_info = null } = {}) {
192
+ /** @type {RawImage[][]} */
193
+ let batched_images;
194
+ if (!Array.isArray(images)) {
195
+ batched_images = [[images]];
196
+ } else if (!Array.isArray(images[0])) {
197
+ batched_images = [/** @type {RawImage[]} */ (images)];
198
+ } else {
199
+ batched_images = /** @type {RawImage[][]} */ (images);
200
+ }
201
+
202
+ /** @type {Tensor[]} */
203
+ const all_pixel_values = [];
204
+ /** @type {Tensor[]} */
205
+ const all_pixel_masks = [];
206
+ /** @type {number[][]} */
207
+ const all_spatial_shapes = [];
208
+ /** @type {number[]} */
209
+ const all_rows = [];
210
+ /** @type {number[]} */
211
+ const all_cols = [];
212
+ /** @type {number[][]} */
213
+ const all_image_sizes = [];
214
+
215
+ for (const image_batch of batched_images) {
216
+ const preprocessed = await Promise.all(image_batch.map((x) => this.preprocess(x, { do_pad: false })));
217
+
218
+ for (const { pixel_values } of preprocessed) {
219
+ const [, height, width] = pixel_values.dims;
220
+ const img = pixel_values.unsqueeze_(0);
221
+
222
+ const total_factor = this.encoder_patch_size * this.downsample_factor;
223
+ const f2 = total_factor ** 2;
224
+ const [new_height, new_width] = smart_resize(
225
+ Math.max(total_factor, height),
226
+ Math.max(total_factor, width),
227
+ total_factor,
228
+ this.min_image_tokens * f2,
229
+ this.max_image_tokens * f2,
230
+ ).map((x) => Math.max(total_factor, x));
231
+
232
+ /** @type {Tensor[]} */
233
+ let tiles;
234
+ let num_rows = 1,
235
+ num_cols = 1;
236
+
237
+ const is_large = this._is_image_too_large(height, width);
238
+ const do_splitting = this.do_image_splitting && !(this.min_tiles === 1 && this.max_tiles === 1);
239
+
240
+ if (is_large && do_splitting) {
241
+ const { grid_width, grid_height, target_width, target_height } = this._get_grid_layout(
242
+ height,
243
+ width,
244
+ );
245
+ num_rows = grid_height;
246
+ num_cols = grid_width;
247
+
248
+ const resized = await interpolate_4d(img, {
249
+ size: [target_height, target_width],
250
+ });
251
+
252
+ tiles = [];
253
+ for (let r = 0; r < grid_height; ++r) {
254
+ for (let c = 0; c < grid_width; ++c) {
255
+ const y = r * this.tile_size;
256
+ const x = c * this.tile_size;
257
+ tiles.push(resized.slice(null, null, [y, y + this.tile_size], [x, x + this.tile_size]));
258
+ }
259
+ }
260
+
261
+ if (this.use_thumbnail && grid_width * grid_height !== 1) {
262
+ tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
263
+ }
264
+ } else {
265
+ tiles = [await interpolate_4d(img, { size: [new_height, new_width] })];
266
+ }
267
+
268
+ for (const tile of tiles) {
269
+ const [, , th, tw] = tile.dims;
270
+ const patches = convert_image_to_patches(tile, this.encoder_patch_size);
271
+ const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
272
+
273
+ all_pixel_values.push(padded);
274
+ all_pixel_masks.push(mask);
275
+ all_spatial_shapes.push([
276
+ Math.floor(th / this.encoder_patch_size),
277
+ Math.floor(tw / this.encoder_patch_size),
278
+ ]);
279
+ }
280
+
281
+ all_rows.push(num_rows);
282
+ all_cols.push(num_cols);
283
+ all_image_sizes.push([new_height, new_width]);
284
+ }
285
+ }
286
+
287
+ /** @type {Record<string, any>} */
288
+ const result = {
289
+ pixel_values: cat(all_pixel_values, 0),
290
+ pixel_attention_mask: stack(all_pixel_masks, 0),
291
+ spatial_shapes: new Tensor('int64', BigInt64Array.from(all_spatial_shapes.flat(), BigInt), [
292
+ all_spatial_shapes.length,
293
+ 2,
294
+ ]),
295
+ };
296
+
297
+ if (return_row_col_info ?? this.return_row_col_info) {
298
+ result.image_rows = all_rows;
299
+ result.image_cols = all_cols;
300
+ result.image_sizes = all_image_sizes;
301
+ }
302
+
303
+ return result;
304
+ }
305
+ }
@@ -0,0 +1,13 @@
1
+ import { LlavaForConditionalGeneration } from '../llava/modeling_llava.js';
2
+
3
export class Lfm2VlForConditionalGeneration extends LlavaForConditionalGeneration {
  // Inputs collected for each forward pass. Compared with the Llava base
  // class, this adds `pixel_attention_mask` and `spatial_shapes`, which the
  // LFM2-VL image processor emits for its tiled, variable-resolution patches.
  // NOTE(review): the order of these names presumably mirrors the ONNX
  // session's expected input ordering — do not reorder without confirming.
  forward_params = [
    'input_ids',
    'attention_mask',
    'pixel_values',
    'pixel_attention_mask',
    'spatial_shapes',
    'position_ids',
    'past_key_values',
  ];
}
@@ -0,0 +1,77 @@
1
+ import { Processor } from '../../processing_utils.js';
2
+ import { AutoImageProcessor } from '../auto/image_processing_auto.js';
3
+ import { AutoTokenizer } from '../auto/tokenization_auto.js';
4
+
5
+ /**
6
+ * @typedef {import('../../utils/image.js').RawImage} RawImage
7
+ */
8
+
9
export class Lfm2VlProcessor extends Processor {
  static tokenizer_class = AutoTokenizer;
  static image_processor_class = AutoImageProcessor;

  /**
   * Prepare images (and, optionally, text) for the LFM2-VL model.
   *
   * Images are run through the image processor with `return_row_col_info`
   * forced on, so the tile layout is available here. Every occurrence of the
   * image placeholder token in `text` is then expanded into the full token
   * sequence the model expects (start token, per-tile rows/cols, optional
   * thumbnail, end token).
   *
   * @param {RawImage|RawImage[]} images
   * @param {string|string[]|null} [text]
   * @param {Record<string, any>} [kwargs]
   */
  async _call(images, text = null, kwargs = {}) {
    // Row/col layout info is required for placeholder expansion below, so it
    // is always requested regardless of caller options.
    const processed = await this.image_processor(images, {
      ...kwargs,
      return_row_col_info: true,
    });
    const { image_rows, image_cols, image_sizes, ...image_inputs } = processed;

    if (text) {
      const image_token = this.config.image_token ?? '<image>';
      const {
        tile_size = 512,
        downsample_factor = 2,
        encoder_patch_size = 16,
        use_thumbnail = true,
      } = /** @type {Record<string, any>} */ (this.image_processor.config);

      // Number of downsampled encoder patches spanning a side of length `s`.
      const ds = (/** @type {number} */ s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
      const tokens_per_tile = ds(tile_size) ** 2;
      const image_start = this.config.image_start_token ?? '<|image_start|>';
      const image_end = this.config.image_end_token ?? '<|image_end|>';
      const thumbnail_token = this.config.image_thumbnail ?? '<|img_thumbnail|>';

      if (!Array.isArray(text)) text = [text];

      // Placeholder index advances across ALL samples, matching the flat
      // ordering of `image_sizes` / `image_rows` / `image_cols`.
      let image_idx = 0;
      text = text.map((sample) => {
        const segments = sample.split(image_token);
        let out = segments[0];
        for (let i = 1; i < segments.length; ++i) {
          const idx = image_idx++;
          const [h, w] = image_sizes[idx];
          const rows = image_rows[idx];
          const cols = image_cols[idx];
          const tokens_for_image = ds(h) * ds(w);

          let expanded = image_start;
          if (rows > 1 || cols > 1) {
            // Tiled image: one marker plus a run of image tokens per tile,
            // optionally followed by a downscaled thumbnail of the whole image.
            const tile_str = image_token.repeat(tokens_per_tile);
            for (let r = 0; r < rows; ++r) {
              for (let c = 0; c < cols; ++c) {
                expanded += `<|img_row_${r + 1}_col_${c + 1}|>` + tile_str;
              }
            }
            if (use_thumbnail) expanded += thumbnail_token + image_token.repeat(tokens_for_image);
          } else {
            // Single-tile image: just the tokens for the (resized) image.
            expanded += image_token.repeat(tokens_for_image);
          }
          out += expanded + image_end + segments[i];
        }
        return out;
      });
    }

    return {
      ...image_inputs,
      ...(text ? this.tokenizer(text, kwargs) : {}),
    };
  }
}
@@ -14,7 +14,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
14
14
 
15
15
  return default_merge_input_ids_with_image_features({
16
16
  // @ts-ignore
17
- image_token_id: this.config.image_token_index,
17
+ image_token_id: this.config.image_token_index ?? this.config.image_token_id,
18
18
  ...kwargs,
19
19
  image_features: reshaped_image_hidden_states,
20
20
  });
@@ -1,3 +1,3 @@
1
- import { LlavaQwen2ForCausalLM } from '../llava/modeling_llava.js';
1
+ import { LlavaForConditionalGeneration } from '../llava/modeling_llava.js';
2
2
 
3
- export class Mistral3ForConditionalGeneration extends LlavaQwen2ForCausalLM {}
3
+ export class Mistral3ForConditionalGeneration extends LlavaForConditionalGeneration {}