@huggingface/transformers 4.0.0-next.5 → 4.0.0-next.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. package/README.md +12 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
  3. package/dist/transformers.js +2189 -1015
  4. package/dist/transformers.min.js +16 -16
  5. package/dist/transformers.node.cjs +2234 -1029
  6. package/dist/transformers.node.min.cjs +20 -20
  7. package/dist/transformers.node.min.mjs +20 -20
  8. package/dist/transformers.node.mjs +2194 -1017
  9. package/dist/transformers.web.js +2175 -1001
  10. package/dist/transformers.web.min.js +18 -18
  11. package/package.json +4 -4
  12. package/src/backends/onnx.js +77 -58
  13. package/src/backends/utils/cacheWasm.js +22 -43
  14. package/src/cache_utils.js +62 -0
  15. package/src/configs.js +32 -5
  16. package/src/env.js +36 -6
  17. package/src/image_processors_utils.js +3 -3
  18. package/src/models/auto/modeling_auto.js +14 -1
  19. package/src/models/chatterbox/modeling_chatterbox.js +1 -1
  20. package/src/models/detr/image_processing_detr.js +1 -1
  21. package/src/models/feature_extractors.js +2 -0
  22. package/src/models/gemma3n/modeling_gemma3n.js +2 -0
  23. package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
  24. package/src/models/granite_speech/modeling_granite_speech.js +5 -0
  25. package/src/models/granite_speech/processing_granite_speech.js +62 -0
  26. package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
  27. package/src/models/idefics3/modeling_idefics3.js +5 -32
  28. package/src/models/image_processors.js +1 -0
  29. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
  30. package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
  31. package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
  32. package/src/models/llava/modeling_llava.js +1 -1
  33. package/src/models/mistral3/modeling_mistral3.js +2 -2
  34. package/src/models/modeling_utils.js +234 -292
  35. package/src/models/models.js +9 -0
  36. package/src/models/olmo_hybrid/modeling_olmo_hybrid.js +5 -0
  37. package/src/models/paligemma/modeling_paligemma.js +2 -25
  38. package/src/models/processors.js +3 -0
  39. package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
  40. package/src/models/qwen2_moe/modeling_qwen2_moe.js +5 -0
  41. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
  42. package/src/models/qwen2_vl/modeling_qwen2_vl.js +36 -3
  43. package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
  44. package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
  45. package/src/models/qwen3_moe/modeling_qwen3_moe.js +5 -0
  46. package/src/models/qwen3_next/modeling_qwen3_next.js +5 -0
  47. package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
  48. package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +4 -0
  49. package/src/models/registry.js +39 -4
  50. package/src/models/sam/image_processing_sam.js +1 -1
  51. package/src/models/session.js +17 -6
  52. package/src/models/smolvlm/modeling_smolvlm.js +7 -0
  53. package/src/models/ultravox/modeling_ultravox.js +1 -3
  54. package/src/models/voxtral/modeling_voxtral.js +3 -0
  55. package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
  56. package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
  57. package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
  58. package/src/models/whisper/feature_extraction_whisper.js +2 -12
  59. package/src/pipelines/index.js +2 -84
  60. package/src/pipelines.js +40 -77
  61. package/src/transformers.js +2 -0
  62. package/src/utils/audio.js +18 -2
  63. package/src/utils/cache/CrossOriginStorageCache.js +251 -0
  64. package/src/utils/cache/FileCache.js +128 -0
  65. package/src/utils/cache/cross-origin-storage.d.ts +38 -0
  66. package/src/utils/cache.js +8 -3
  67. package/src/utils/hub/{files.js → FileResponse.js} +0 -105
  68. package/src/utils/hub/utils.js +35 -1
  69. package/src/utils/hub.js +6 -5
  70. package/src/utils/image.js +12 -13
  71. package/src/utils/lru_cache.js +67 -0
  72. package/src/utils/memoize_promise.js +45 -0
  73. package/src/utils/model_registry/ModelRegistry.js +70 -23
  74. package/src/utils/model_registry/get_file_metadata.js +14 -2
  75. package/src/utils/model_registry/get_model_files.js +63 -78
  76. package/src/utils/model_registry/get_pipeline_files.js +15 -24
  77. package/src/utils/model_registry/is_cached.js +81 -4
  78. package/src/utils/tensor.js +18 -2
  79. package/types/backends/onnx.d.ts.map +1 -1
  80. package/types/backends/utils/cacheWasm.d.ts +3 -17
  81. package/types/backends/utils/cacheWasm.d.ts.map +1 -1
  82. package/types/cache_utils.d.ts +29 -0
  83. package/types/cache_utils.d.ts.map +1 -0
  84. package/types/configs.d.ts.map +1 -1
  85. package/types/env.d.ts +18 -3
  86. package/types/env.d.ts.map +1 -1
  87. package/types/image_processors_utils.d.ts +17 -1
  88. package/types/image_processors_utils.d.ts.map +1 -1
  89. package/types/models/auto/modeling_auto.d.ts +6 -0
  90. package/types/models/auto/modeling_auto.d.ts.map +1 -1
  91. package/types/models/detr/image_processing_detr.d.ts +1 -1
  92. package/types/models/feature_extractors.d.ts +2 -0
  93. package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
  94. package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
  95. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
  96. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
  97. package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
  98. package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
  99. package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
  100. package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
  101. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
  102. package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
  103. package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
  104. package/types/models/image_processors.d.ts +1 -0
  105. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
  106. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
  107. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
  108. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
  109. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
  110. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
  111. package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
  112. package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
  113. package/types/models/modeling_utils.d.ts +44 -24
  114. package/types/models/modeling_utils.d.ts.map +1 -1
  115. package/types/models/models.d.ts +9 -0
  116. package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts +8 -0
  117. package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts.map +1 -0
  118. package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
  119. package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
  120. package/types/models/processors.d.ts +3 -0
  121. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
  122. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
  123. package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts +8 -0
  124. package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts.map +1 -0
  125. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
  126. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
  127. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  128. package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
  129. package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
  130. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
  131. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
  132. package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts +8 -0
  133. package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts.map +1 -0
  134. package/types/models/qwen3_next/modeling_qwen3_next.d.ts +8 -0
  135. package/types/models/qwen3_next/modeling_qwen3_next.d.ts.map +1 -0
  136. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
  137. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
  138. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +7 -0
  139. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -0
  140. package/types/models/registry.d.ts +2 -1
  141. package/types/models/registry.d.ts.map +1 -1
  142. package/types/models/sam/image_processing_sam.d.ts +1 -1
  143. package/types/models/session.d.ts +3 -2
  144. package/types/models/session.d.ts.map +1 -1
  145. package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
  146. package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
  147. package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
  148. package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
  149. package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
  150. package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
  151. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
  152. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
  153. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
  154. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
  155. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
  156. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
  157. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  158. package/types/pipelines/index.d.ts +0 -34
  159. package/types/pipelines/index.d.ts.map +1 -1
  160. package/types/pipelines.d.ts.map +1 -1
  161. package/types/transformers.d.ts +1 -0
  162. package/types/transformers.d.ts.map +1 -1
  163. package/types/utils/audio.d.ts +5 -2
  164. package/types/utils/audio.d.ts.map +1 -1
  165. package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
  166. package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
  167. package/types/utils/cache/FileCache.d.ts +39 -0
  168. package/types/utils/cache/FileCache.d.ts.map +1 -0
  169. package/types/utils/cache.d.ts +4 -4
  170. package/types/utils/cache.d.ts.map +1 -1
  171. package/types/utils/dtypes.d.ts +1 -1
  172. package/types/utils/hub/{files.d.ts → FileResponse.d.ts} +1 -38
  173. package/types/utils/hub/FileResponse.d.ts.map +1 -0
  174. package/types/utils/hub/utils.d.ts +17 -2
  175. package/types/utils/hub/utils.d.ts.map +1 -1
  176. package/types/utils/hub.d.ts +7 -7
  177. package/types/utils/hub.d.ts.map +1 -1
  178. package/types/utils/image.d.ts +1 -1
  179. package/types/utils/image.d.ts.map +1 -1
  180. package/types/utils/lru_cache.d.ts +38 -0
  181. package/types/utils/lru_cache.d.ts.map +1 -0
  182. package/types/utils/memoize_promise.d.ts +14 -0
  183. package/types/utils/memoize_promise.d.ts.map +1 -0
  184. package/types/utils/model_registry/ModelRegistry.d.ts +66 -6
  185. package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
  186. package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
  187. package/types/utils/model_registry/get_model_files.d.ts +1 -0
  188. package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
  189. package/types/utils/model_registry/get_pipeline_files.d.ts +2 -1
  190. package/types/utils/model_registry/get_pipeline_files.d.ts.map +1 -1
  191. package/types/utils/model_registry/is_cached.d.ts +47 -4
  192. package/types/utils/model_registry/is_cached.d.ts.map +1 -1
  193. package/types/utils/tensor.d.ts.map +1 -1
  194. package/src/utils/data-structures.js +0 -572
  195. package/types/utils/data-structures.d.ts +0 -294
  196. package/types/utils/data-structures.d.ts.map +0 -1
  197. package/types/utils/hub/files.d.ts.map +0 -1
@@ -0,0 +1,239 @@
1
+ import { PreTrainedModel } from '../modeling_utils.js';
2
+ import { sessionRun } from '../session.js';
3
+ import { getCacheShapes } from '../../configs.js';
4
+ import { Tensor, ones } from '../../utils/tensor.js';
5
+ import { DataTypeMap } from '../../utils/dtypes.js';
6
+ import { pick } from '../../utils/core.js';
7
+ import { DynamicCache } from '../../cache_utils.js';
8
+ import { StoppingCriteria, StoppingCriteriaList } from '../../generation/stopping_criteria.js';
9
+
10
+ // Causal conv padding constants
11
+ const CONV1_LEFT_PAD = 2;
12
+ const CONV2_LEFT_PAD = 1;
13
+
14
+ /**
15
+ * WeakMap to hold encoder streaming states for each model instance during generation.
16
+ * This allows the state to be accessed and modified across the generation process
17
+ * without exposing it on the model instance itself.
18
+ * @private
19
+ * @type {WeakMap<VoxtralRealtimeForConditionalGeneration, Object>}
20
+ */
21
+ const states = new WeakMap();
22
+
/**
 * Builds the per-generation streaming state for the audio encoder: a zero-filled
 * encoder KV cache, a zero-filled conv padding cache, an iterator over the incoming
 * feature chunks, and the counters used to track queued/consumed audio embeddings.
 * @param {VoxtralRealtimeForConditionalGeneration} model
 * @param {Iterable<Tensor>|AsyncIterable<Tensor>} input_features
 * @returns {Object} Encoder state object.
 * @throws {Error} If `input_features` is neither iterable nor async iterable.
 * @private
 */
function createEncoderState(model, input_features) {
    const model_config = /** @type {any} */ (model.config);
    const { text_config, audio_config } = model_config;
    const encoder_session = model.sessions['audio_encoder'];

    // The padding cache has one channel per mel bin plus one per encoder hidden unit.
    const padding_channels = audio_config.num_mel_bins + audio_config.hidden_size;

    // Pick the typed-array constructor matching the session's cache dtype (float32 unless float16).
    const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? 'float32';
    const ArrayCtor = enc_dtype === 'float16' ? DataTypeMap.float16 : DataTypeMap.float32;
    const zeroTensor = (dims) => {
        const num_elements = dims.reduce((acc, dim) => acc * dim, 1);
        return new Tensor(enc_dtype, new ArrayCtor(num_elements), dims);
    };

    // Zero-initialised encoder KV cache, one entry per shape reported for the audio config.
    const enc_kv_cache = new DynamicCache();
    const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
    for (const name in enc_shapes) {
        enc_kv_cache[name] = zeroTensor(enc_shapes[name]);
    }

    const enc_padding_cache = zeroTensor([1, padding_channels, CONV1_LEFT_PAD]);

    // Prefer the async iteration protocol, falling back to the sync one.
    let chunks_iter = input_features[Symbol.asyncIterator]?.();
    if (chunks_iter == null) {
        chunks_iter = input_features[Symbol.iterator]?.();
    }
    if (!chunks_iter) {
        throw new Error('input_features must be iterable or async iterable');
    }

    return {
        encoder_session,
        enc_kv_cache,
        enc_padding_cache,
        enc_past_seq_len: 0,
        audio_embed_queue: [],
        audio_embed_total_tokens: 0,
        audio_queue_offset: 0,
        audio_consumed: 0,
        stream_exhausted: false,
        chunks_iter,
        text_hidden_size: text_config.hidden_size,
    };
}
73
+
/**
 * Runs one feature chunk through the audio encoder session and rolls the encoder
 * state forward (padding cache, KV cache and past sequence length).
 * @param {Object} s Encoder state (mutated in place).
 * @param {Tensor} chunk_features Mel spectrogram chunk [1, num_mel_bins, seq_len].
 * @returns {Promise<Tensor>} Audio embeddings.
 * @private
 */
async function encodeChunk(s, chunk_features) {
    const num_frames = chunk_features.dims[2];
    // NOTE(review): presumably the output length of a kernel-3, stride-2 convolution
    // with CONV2_LEFT_PAD frames of left padding — confirm against the encoder export.
    const num_new_positions = Math.floor((CONV2_LEFT_PAD + num_frames - 3) / 2) + 1;

    const past_len = s.enc_past_seq_len;
    const total_seq_len = past_len + num_new_positions;

    // New positions continue right after the ones already processed.
    const position_ids = new Tensor(
        'int64',
        BigInt64Array.from({ length: num_new_positions }, (_, offset) => BigInt(past_len + offset)),
        [1, num_new_positions],
    );

    const outputs = await sessionRun(s.encoder_session, {
        input_features: chunk_features,
        attention_mask: ones([1, total_seq_len]),
        position_ids,
        past_padding_cache: s.enc_padding_cache,
        ...s.enc_kv_cache,
    });
    const { audio_embeds, present_padding_cache, ...present_cache } = outputs;

    // Swap in the new padding cache, releasing the old one if it is a GPU buffer.
    if (s.enc_padding_cache.location === 'gpu-buffer') {
        s.enc_padding_cache.dispose();
    }
    s.enc_padding_cache = present_padding_cache;

    // Each `present.*` output becomes the matching `past_key_values.*` input of the next call.
    for (const [name, tensor] of Object.entries(present_cache)) {
        if (!name.startsWith('present.')) continue;

        const past_name = 'past_key_values' + name.slice('present'.length);
        const stale = s.enc_kv_cache[past_name];
        if (stale?.location === 'gpu-buffer') {
            stale.dispose();
        }
        s.enc_kv_cache[past_name] = tensor;
    }

    s.enc_past_seq_len = total_seq_len;
    return audio_embeds;
}
120
+
/**
 * Pulls chunks from the input iterator and encodes them until the total number of
 * queued audio tokens reaches `needed`, or the iterator reports it is done
 * (in which case `s.stream_exhausted` is set).
 * @param {Object} s Encoder state (mutated in place).
 * @param {number} needed Total number of audio tokens needed.
 * @private
 */
async function fillAudioBuffer(s, needed) {
    for (;;) {
        if (!(s.audio_embed_total_tokens < needed) || s.stream_exhausted) {
            return;
        }

        const { done, value } = await s.chunks_iter.next();
        if (done) {
            s.stream_exhausted = true;
            return;
        }

        const embeds = await encodeChunk(s, value);
        // Queue the raw embedding data together with its token count (dims[1]).
        const entry = { data: embeds.data, tokens: embeds.dims[1] };
        s.audio_embed_queue.push(entry);
        s.audio_embed_total_tokens += entry.tokens;
    }
}
139
+
140
+ /**
141
+ * Adds audio embeddings to text embeddings from the queue.
142
+ * @param {Object} s Encoder state.
143
+ * @param {Tensor} inputs_embeds Text embeddings tensor (modified in-place).
144
+ * @param {number} current_len Number of tokens to consume.
145
+ * @private
146
+ */
147
+ function addAudioEmbeddings(s, inputs_embeds, current_len) {
148
+ if (s.audio_embed_queue.length === 0) return;
149
+
150
+ const embed_data = inputs_embeds.data;
151
+ let embed_write_pos = 0;
152
+ let remaining = current_len;
153
+
154
+ while (remaining > 0 && s.audio_embed_queue.length > 0) {
155
+ const front = s.audio_embed_queue[0];
156
+ const available = front.tokens - s.audio_queue_offset;
157
+ const n = Math.min(remaining, available);
158
+
159
+ const src_offset = s.audio_queue_offset * s.text_hidden_size;
160
+ for (let i = 0; i < n * s.text_hidden_size; ++i) {
161
+ embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
162
+ }
163
+
164
+ embed_write_pos += n;
165
+ remaining -= n;
166
+ s.audio_queue_offset += n;
167
+
168
+ if (s.audio_queue_offset >= front.tokens) {
169
+ s.audio_embed_queue.shift();
170
+ s.audio_queue_offset = 0;
171
+ }
172
+ }
173
+ s.audio_consumed += current_len - remaining;
174
+ }
175
+
/**
 * Stopping criterion tied to an encoder state: it reports "stop" for every sequence
 * once the input stream has been marked exhausted and no audio embeddings remain queued.
 * @private
 */
class AudioExhaustedCriteria extends StoppingCriteria {
    /**
     * @param {Object} enc_state Encoder state to observe.
     */
    constructor(enc_state) {
        super();
        this._s = enc_state;
    }

    /**
     * @param {Array} input_ids One entry per sequence being generated.
     * @returns {Array} The same stop decision repeated for every entry of `input_ids`.
     */
    _call(input_ids) {
        const { stream_exhausted, audio_embed_queue } = this._s;
        const finished = stream_exhausted && audio_embed_queue.length === 0;
        return input_ids.map(() => finished);
    }
}
191
+
/**
 * Base class for Voxtral Realtime models. It only sets `forward_params`,
 * the list of input names shown below.
 */
export class VoxtralRealtimePreTrainedModel extends PreTrainedModel {
    forward_params = [
        'input_ids',
        'attention_mask',
        'position_ids',
        'past_key_values',
    ];
}
195
+
/**
 * Voxtral Realtime model for streaming generation conditioned on audio.
 *
 * During `generate`, audio feature chunks are pulled lazily from `input_features`,
 * encoded, and added onto the token embeddings inside `forward`. The per-call encoder
 * state lives in the module-level `states` WeakMap, keyed by the model instance.
 */
export class VoxtralRealtimeForConditionalGeneration extends VoxtralRealtimePreTrainedModel {
    /**
     * Runs one decoder step. When an encoder state is registered for this model
     * (i.e. inside `generate`), enough audio is encoded to cover the current tokens
     * and the audio embeddings are added onto the token embeddings before decoding.
     * @param {Object} model_inputs
     * @param {Tensor} model_inputs.input_ids Token ids; `dims[1]` is the number of tokens in this step.
     * @param {Object} [model_inputs.past_key_values] Decoder cache from previous steps.
     * @returns {Promise<Object>} Outputs of the `decoder_model_merged` session.
     */
    async forward({ input_ids, past_key_values, ...kwargs }) {
        const current_len = input_ids.dims[1];

        const enc = states.get(this);
        if (enc) {
            // Make sure the queue holds audio for every token of this step (or the stream has ended).
            await fillAudioBuffer(enc, enc.audio_consumed + current_len);
        }

        const { inputs_embeds } = await sessionRun(this.sessions['embed_tokens'], { input_ids });
        if (enc) {
            addAudioEmbeddings(enc, inputs_embeds, current_len);
        }

        const decoder_feeds = { inputs_embeds, ...kwargs };
        this.addPastKeyValues(decoder_feeds, past_key_values);

        // Only forward the inputs the decoder session actually declares.
        const session = this.sessions['decoder_model_merged'];
        const fixed = pick(decoder_feeds, session.inputNames);
        return await sessionRun(session, fixed);
    }

    /**
     * Generates tokens while streaming audio from `input_features`. An extra stopping
     * criterion ends generation once the stream is exhausted and all buffered audio
     * embeddings have been consumed; user-supplied criteria are appended after it.
     * @param {Object} options
     * @param {Iterable<Tensor>|AsyncIterable<Tensor>} options.input_features Source of feature chunks.
     * @param {Object} [options.stopping_criteria] Additional stopping criteria.
     * @returns {Promise<any>} Whatever the parent `generate` returns.
     * @throws {Error} If `input_features` is missing.
     */
    async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
        if (!input_features) {
            throw new Error('input_features (generator/iterable) must be provided');
        }

        const enc_state = createEncoderState(this, input_features);
        states.set(this, enc_state);

        const stopping_criteria = new StoppingCriteriaList();
        stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
        if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);

        try {
            return await super.generate({ ...kwargs, stopping_criteria });
        } finally {
            // Unregister first so a failing dispose cannot leave stale state attached to the model.
            states.delete(this);

            enc_state.enc_kv_cache.dispose();

            // `encodeChunk` only releases the *previous* padding cache on each step, so the
            // last one is still alive here. Release it too, using the same GPU-buffer check.
            const padding_cache = enc_state.enc_padding_cache;
            if (padding_cache?.location === 'gpu-buffer') {
                padding_cache.dispose();
            }
        }
    }
}
@@ -0,0 +1,113 @@
1
+ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
2
+ import { AutoTokenizer } from '../auto/tokenization_auto.js';
3
+ import { Processor } from '../../processing_utils.js';
4
+ import { Tensor } from '../../utils/tensor.js';
5
+ import { validate_audio_inputs } from '../../feature_extraction_utils.js';
6
+
7
+ // Voxtral Realtime audio config constants (from mistral_common AudioConfig)
8
+ const NUM_LEFT_PAD_TOKENS = 32;
9
+ const NUM_DELAY_TOKENS = 6;
10
+ const AUDIO_LENGTH_PER_TOK = 8;
11
+ const OFFLINE_STREAMING_BUFFER_TOKENS = 10;
12
+
13
+ /** Token ID for [STREAMING_PAD] in the Voxtral tokenizer. */
14
+ const STREAMING_PAD_TOKEN_ID = 32;
15
+
/**
 * Processor for Voxtral Realtime: pads the raw waveform as required by the chosen
 * mode and runs the feature extractor; for the first streaming chunk it also builds
 * the initial `input_ids` prompt.
 */
export class VoxtralRealtimeProcessor extends Processor {
    static tokenizer_class = AutoTokenizer;
    static feature_extractor_class = AutoFeatureExtractor;
    static uses_processor_config = false;

    /** Number of mel frames in the first audio chunk. */
    get num_mel_frames_first_audio_chunk() {
        return AUDIO_LENGTH_PER_TOK * (NUM_DELAY_TOKENS + 1);
    }

    /** Number of raw audio samples in the first audio chunk. */
    get num_samples_first_audio_chunk() {
        const { hop_length, n_fft } = this.feature_extractor.config;
        const num_frames = this.num_mel_frames_first_audio_chunk;
        return (num_frames - 1) * hop_length + Math.floor(n_fft / 2);
    }

    /** Number of raw audio samples per subsequent audio chunk. */
    get num_samples_per_audio_chunk() {
        const { hop_length, n_fft } = this.feature_extractor.config;
        return n_fft + AUDIO_LENGTH_PER_TOK * hop_length;
    }

    /** Number of right-pad tokens for non-streaming mode. */
    get num_right_pad_tokens() {
        return OFFLINE_STREAMING_BUFFER_TOKENS + (NUM_DELAY_TOKENS + 1);
    }

    /** Number of mel frames per text token. */
    get audio_length_per_tok() {
        return AUDIO_LENGTH_PER_TOK;
    }

    /** Number of raw audio samples per token. */
    get raw_audio_length_per_tok() {
        const { hop_length } = this.feature_extractor.config;
        return AUDIO_LENGTH_PER_TOK * hop_length;
    }

    /**
     * Process audio input for VoxtralRealtime.
     *
     * - Streaming, first chunk (`is_streaming=true`, `is_first_audio_chunk=true`):
     *   the audio is left-padded with silence, features are extracted with `center=true`,
     *   and `{ input_ids, ...features }` is returned, where `input_ids` is BOS followed
     *   by streaming-pad tokens.
     * - Streaming, later chunks (`is_first_audio_chunk=false`): the chunk is passed to
     *   the feature extractor unchanged with `center=false`.
     * - Non-streaming: the audio is right-padded with silence and features are
     *   extracted with `center=true`.
     *
     * @param {Float32Array|Float64Array} audio The audio waveform.
     * @param {Object} [options]
     * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
     * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
     * @returns {Promise<Object>}
     * @throws {Error} If `is_first_audio_chunk` is `false` while `is_streaming` is `false`.
     */
    async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
        validate_audio_inputs(audio, 'VoxtralRealtimeProcessor');

        if (!is_first_audio_chunk) {
            if (!is_streaming) {
                throw new Error('In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.');
            }
            // Subsequent streaming chunk: no padding, no centering.
            return await this.feature_extractor(audio, { center: false });
        }

        if (!is_streaming) {
            // Non-streaming: append trailing silence (the Float32Array is zero-initialised).
            const num_right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
            const padded = new Float32Array(audio.length + num_right_pad_samples);
            padded.set(audio);

            return await this.feature_extractor(padded, { center: true });
        }

        // First streaming chunk: prepend silence so the audio starts after the left-pad tokens.
        const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
        const padded = new Float32Array(num_left_pad_samples + audio.length);
        padded.set(audio, num_left_pad_samples);

        const audio_encoding = await this.feature_extractor(padded, { center: true });

        // Prompt: BOS followed by (left-pad + delay) [STREAMING_PAD] tokens.
        const num_input_tokens = 1 + (NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS);
        const input_ids_data = BigInt64Array.from({ length: num_input_tokens }, (_, index) =>
            index === 0 ? 1n : BigInt(STREAMING_PAD_TOKEN_ID),
        );
        const input_ids = new Tensor('int64', input_ids_data, [1, num_input_tokens]);

        return {
            input_ids,
            ...audio_encoding,
        };
    }
}
@@ -1,7 +1,6 @@
1
1
  import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js';
2
2
  import { Tensor } from '../../utils/tensor.js';
3
3
  import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
4
- import { max } from '../../utils/maths.js';
5
4
  import { logger } from '../../utils/logger.js';
6
5
 
7
6
  export class WhisperFeatureExtractor extends FeatureExtractor {
@@ -28,7 +27,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
28
27
  * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
29
28
  */
30
29
  async _extract_fbank_features(waveform) {
31
- const features = await spectrogram(
30
+ return await spectrogram(
32
31
  waveform,
33
32
  this.window, // window
34
33
  this.config.n_fft, // frame_length
@@ -36,7 +35,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
36
35
  {
37
36
  power: 2.0,
38
37
  mel_filters: this.config.mel_filters,
39
- log_mel: 'log10',
38
+ log_mel: 'log10_max_norm',
40
39
 
41
40
  // Custom
42
41
  max_num_frames: Math.min(
@@ -45,15 +44,6 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
45
44
  ),
46
45
  },
47
46
  );
48
-
49
- const data = features.data;
50
- const maxValue = max(/** @type {Float32Array} */ (data))[0];
51
-
52
- for (let i = 0; i < data.length; ++i) {
53
- data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
54
- }
55
-
56
- return features;
57
47
  }
58
48
 
59
49
  /**
@@ -1,11 +1,10 @@
1
1
  /**
2
2
  * @file Pipeline task configurations and aliases
3
3
  *
4
- * Defines which components (tokenizer, processor, model) each pipeline task needs.
4
+ * Defines which pipeline class and model class(es) each pipeline task needs.
5
+ * Tokenizer and processor loading is determined automatically from the model's files.
5
6
  */
6
7
 
7
- import { AutoTokenizer } from '../models/auto/tokenization_auto.js';
8
- import { AutoProcessor } from '../models/auto/processing_auto.js';
9
8
  import {
10
9
  AutoModel,
11
10
  AutoModelForSequenceClassification,
@@ -60,41 +59,30 @@ import { ImageFeatureExtractionPipeline } from './image-feature-extraction.js';
60
59
 
61
60
  export const SUPPORTED_TASKS = Object.freeze({
62
61
  'text-classification': {
63
- tokenizer: AutoTokenizer,
64
62
  pipeline: TextClassificationPipeline,
65
63
  model: AutoModelForSequenceClassification,
66
64
  default: {
67
- // TODO: replace with original
68
- // "model": "distilbert-base-uncased-finetuned-sst-2-english",
69
65
  model: 'Xenova/distilbert-base-uncased-finetuned-sst-2-english',
70
66
  },
71
67
  type: 'text',
72
68
  },
73
69
  'token-classification': {
74
- tokenizer: AutoTokenizer,
75
70
  pipeline: TokenClassificationPipeline,
76
71
  model: AutoModelForTokenClassification,
77
72
  default: {
78
- // TODO: replace with original
79
- // "model": "Davlan/bert-base-multilingual-cased-ner-hrl",
80
73
  model: 'Xenova/bert-base-multilingual-cased-ner-hrl',
81
74
  },
82
75
  type: 'text',
83
76
  },
84
77
  'question-answering': {
85
- tokenizer: AutoTokenizer,
86
78
  pipeline: QuestionAnsweringPipeline,
87
79
  model: AutoModelForQuestionAnswering,
88
80
  default: {
89
- // TODO: replace with original
90
- // "model": "distilbert-base-cased-distilled-squad",
91
81
  model: 'Xenova/distilbert-base-cased-distilled-squad',
92
82
  },
93
83
  type: 'text',
94
84
  },
95
-
96
85
  'fill-mask': {
97
- tokenizer: AutoTokenizer,
98
86
  pipeline: FillMaskPipeline,
99
87
  model: AutoModelForMaskedLM,
100
88
  default: {
@@ -104,40 +92,30 @@ export const SUPPORTED_TASKS = Object.freeze({
104
92
  type: 'text',
105
93
  },
106
94
  summarization: {
107
- tokenizer: AutoTokenizer,
108
95
  pipeline: SummarizationPipeline,
109
96
  model: AutoModelForSeq2SeqLM,
110
97
  default: {
111
- // TODO: replace with original
112
- // "model": "sshleifer/distilbart-cnn-6-6",
113
98
  model: 'Xenova/distilbart-cnn-6-6',
114
99
  },
115
100
  type: 'text',
116
101
  },
117
102
  translation: {
118
- tokenizer: AutoTokenizer,
119
103
  pipeline: TranslationPipeline,
120
104
  model: AutoModelForSeq2SeqLM,
121
105
  default: {
122
- // TODO: replace with original
123
- // "model": "t5-small",
124
106
  model: 'Xenova/t5-small',
125
107
  },
126
108
  type: 'text',
127
109
  },
128
110
  'text2text-generation': {
129
- tokenizer: AutoTokenizer,
130
111
  pipeline: Text2TextGenerationPipeline,
131
112
  model: AutoModelForSeq2SeqLM,
132
113
  default: {
133
- // TODO: replace with original
134
- // "model": "google/flan-t5-small",
135
114
  model: 'Xenova/flan-t5-small',
136
115
  },
137
116
  type: 'text',
138
117
  },
139
118
  'text-generation': {
140
- tokenizer: AutoTokenizer,
141
119
  pipeline: TextGenerationPipeline,
142
120
  model: AutoModelForCausalLM,
143
121
  default: {
@@ -147,12 +125,9 @@ export const SUPPORTED_TASKS = Object.freeze({
147
125
  type: 'text',
148
126
  },
149
127
  'zero-shot-classification': {
150
- tokenizer: AutoTokenizer,
151
128
  pipeline: ZeroShotClassificationPipeline,
152
129
  model: AutoModelForSequenceClassification,
153
130
  default: {
154
- // TODO: replace with original
155
- // "model": "typeform/distilbert-base-uncased-mnli",
156
131
  model: 'Xenova/distilbert-base-uncased-mnli',
157
132
  },
158
133
  type: 'text',
@@ -160,43 +135,30 @@ export const SUPPORTED_TASKS = Object.freeze({
160
135
  'audio-classification': {
161
136
  pipeline: AudioClassificationPipeline,
162
137
  model: AutoModelForAudioClassification,
163
- processor: AutoProcessor,
164
138
  default: {
165
- // TODO: replace with original
166
- // "model": "superb/wav2vec2-base-superb-ks",
167
139
  model: 'Xenova/wav2vec2-base-superb-ks',
168
140
  },
169
141
  type: 'audio',
170
142
  },
171
143
  'zero-shot-audio-classification': {
172
- tokenizer: AutoTokenizer,
173
144
  pipeline: ZeroShotAudioClassificationPipeline,
174
145
  model: AutoModel,
175
- processor: AutoProcessor,
176
146
  default: {
177
- // TODO: replace with original
178
- // "model": "laion/clap-htsat-fused",
179
147
  model: 'Xenova/clap-htsat-unfused',
180
148
  },
181
149
  type: 'multimodal',
182
150
  },
183
151
  'automatic-speech-recognition': {
184
- tokenizer: AutoTokenizer,
185
152
  pipeline: AutomaticSpeechRecognitionPipeline,
186
153
  model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
187
- processor: AutoProcessor,
188
154
  default: {
189
- // TODO: replace with original
190
- // "model": "openai/whisper-tiny.en",
191
155
  model: 'Xenova/whisper-tiny.en',
192
156
  },
193
157
  type: 'multimodal',
194
158
  },
195
159
  'text-to-audio': {
196
- tokenizer: AutoTokenizer,
197
160
  pipeline: TextToAudioPipeline,
198
161
  model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
199
- processor: [AutoProcessor, /* Some don't use a processor */ null],
200
162
  default: {
201
163
  model: 'onnx-community/Supertonic-TTS-ONNX',
202
164
  dtype: 'fp32',
@@ -204,129 +166,86 @@ export const SUPPORTED_TASKS = Object.freeze({
204
166
  type: 'text',
205
167
  },
206
168
  'image-to-text': {
207
- tokenizer: AutoTokenizer,
208
169
  pipeline: ImageToTextPipeline,
209
170
  model: AutoModelForVision2Seq,
210
- processor: AutoProcessor,
211
171
  default: {
212
- // TODO: replace with original
213
- // "model": "nlpconnect/vit-gpt2-image-captioning",
214
172
  model: 'Xenova/vit-gpt2-image-captioning',
215
173
  },
216
174
  type: 'multimodal',
217
175
  },
218
-
219
176
  'image-classification': {
220
- // no tokenizer
221
177
  pipeline: ImageClassificationPipeline,
222
178
  model: AutoModelForImageClassification,
223
- processor: AutoProcessor,
224
179
  default: {
225
- // TODO: replace with original
226
- // "model": "google/vit-base-patch16-224",
227
180
  model: 'Xenova/vit-base-patch16-224',
228
181
  },
229
182
  type: 'multimodal',
230
183
  },
231
-
232
184
  'image-segmentation': {
233
- // no tokenizer
234
185
  pipeline: ImageSegmentationPipeline,
235
186
  model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
236
- processor: AutoProcessor,
237
187
  default: {
238
- // TODO: replace with original
239
- // "model": "facebook/detr-resnet-50-panoptic",
240
188
  model: 'Xenova/detr-resnet-50-panoptic',
241
189
  },
242
190
  type: 'multimodal',
243
191
  },
244
192
  'background-removal': {
245
- // no tokenizer
246
193
  pipeline: BackgroundRemovalPipeline,
247
194
  model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
248
- processor: AutoProcessor,
249
195
  default: {
250
196
  model: 'Xenova/modnet',
251
197
  },
252
198
  type: 'image',
253
199
  },
254
-
255
200
  'zero-shot-image-classification': {
256
- tokenizer: AutoTokenizer,
257
201
  pipeline: ZeroShotImageClassificationPipeline,
258
202
  model: AutoModel,
259
- processor: AutoProcessor,
260
203
  default: {
261
- // TODO: replace with original
262
- // "model": "openai/clip-vit-base-patch32",
263
204
  model: 'Xenova/clip-vit-base-patch32',
264
205
  },
265
206
  type: 'multimodal',
266
207
  },
267
-
268
208
  'object-detection': {
269
- // no tokenizer
270
209
  pipeline: ObjectDetectionPipeline,
271
210
  model: AutoModelForObjectDetection,
272
- processor: AutoProcessor,
273
211
  default: {
274
- // TODO: replace with original
275
- // "model": "facebook/detr-resnet-50",
276
212
  model: 'Xenova/detr-resnet-50',
277
213
  },
278
214
  type: 'multimodal',
279
215
  },
280
216
  'zero-shot-object-detection': {
281
- tokenizer: AutoTokenizer,
282
217
  pipeline: ZeroShotObjectDetectionPipeline,
283
218
  model: AutoModelForZeroShotObjectDetection,
284
- processor: AutoProcessor,
285
219
  default: {
286
- // TODO: replace with original
287
- // "model": "google/owlvit-base-patch32",
288
220
  model: 'Xenova/owlvit-base-patch32',
289
221
  },
290
222
  type: 'multimodal',
291
223
  },
292
224
  'document-question-answering': {
293
- tokenizer: AutoTokenizer,
294
225
  pipeline: DocumentQuestionAnsweringPipeline,
295
226
  model: AutoModelForDocumentQuestionAnswering,
296
- processor: AutoProcessor,
297
227
  default: {
298
- // TODO: replace with original
299
- // "model": "naver-clova-ix/donut-base-finetuned-docvqa",
300
228
  model: 'Xenova/donut-base-finetuned-docvqa',
301
229
  },
302
230
  type: 'multimodal',
303
231
  },
304
232
  'image-to-image': {
305
- // no tokenizer
306
233
  pipeline: ImageToImagePipeline,
307
234
  model: AutoModelForImageToImage,
308
- processor: AutoProcessor,
309
235
  default: {
310
- // TODO: replace with original
311
- // "model": "caidas/swin2SR-classical-sr-x2-64",
312
236
  model: 'Xenova/swin2SR-classical-sr-x2-64',
313
237
  },
314
238
  type: 'image',
315
239
  },
316
240
  'depth-estimation': {
317
- // no tokenizer
318
241
  pipeline: DepthEstimationPipeline,
319
242
  model: AutoModelForDepthEstimation,
320
- processor: AutoProcessor,
321
243
  default: {
322
244
  model: 'onnx-community/depth-anything-v2-small',
323
245
  },
324
246
  type: 'image',
325
247
  },
326
-
327
- // This task serves as a useful interface for dealing with sentence-transformers (https://huggingface.co/sentence-transformers).
328
248
  'feature-extraction': {
329
- tokenizer: AutoTokenizer,
330
249
  pipeline: FeatureExtractionPipeline,
331
250
  model: AutoModel,
332
251
  default: {
@@ -336,7 +255,6 @@ export const SUPPORTED_TASKS = Object.freeze({
336
255
  type: 'text',
337
256
  },
338
257
  'image-feature-extraction': {
339
- processor: AutoProcessor,
340
258
  pipeline: ImageFeatureExtractionPipeline,
341
259
  model: [AutoModelForImageFeatureExtraction, AutoModel],
342
260
  default: {