@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +1587 -570
- package/dist/transformers.min.js +17 -17
- package/dist/transformers.node.cjs +1605 -573
- package/dist/transformers.node.min.cjs +21 -21
- package/dist/transformers.node.min.mjs +21 -21
- package/dist/transformers.node.mjs +1600 -583
- package/dist/transformers.web.js +1592 -575
- package/dist/transformers.web.min.js +15 -15
- package/package.json +3 -3
- package/src/cache_utils.js +62 -0
- package/src/configs.js +17 -2
- package/src/env.js +8 -1
- package/src/image_processors_utils.js +3 -3
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +1 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/modeling_utils.js +222 -308
- package/src/models/models.js +4 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +3 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +7 -7
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
- package/src/models/registry.js +25 -0
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +5 -0
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/get_file_metadata.js +14 -2
- package/src/utils/model_registry/get_model_files.js +52 -78
- package/src/utils/tensor.js +18 -2
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +8 -0
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +17 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/modeling_utils.d.ts +44 -35
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +4 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import { PreTrainedModel } from '../modeling_utils.js';
|
|
2
|
+
import { sessionRun } from '../session.js';
|
|
3
|
+
import { getCacheShapes } from '../../configs.js';
|
|
4
|
+
import { Tensor, ones } from '../../utils/tensor.js';
|
|
5
|
+
import { DataTypeMap } from '../../utils/dtypes.js';
|
|
6
|
+
import { pick } from '../../utils/core.js';
|
|
7
|
+
import { DynamicCache } from '../../cache_utils.js';
|
|
8
|
+
import { StoppingCriteria, StoppingCriteriaList } from '../../generation/stopping_criteria.js';
|
|
9
|
+
|
|
10
|
+
// Causal conv padding constants
// Frames of left padding for the encoder's first and second causal convolutions.
const CONV1_LEFT_PAD = 2;
const CONV2_LEFT_PAD = 1;

/**
 * WeakMap to hold encoder streaming states for each model instance during generation.
 * This allows the state to be accessed and modified across the generation process
 * without exposing it on the model instance itself.
 * (WeakMap keys do not prevent model instances from being garbage-collected.)
 * @private
 * @type {WeakMap<VoxtralRealtimeForConditionalGeneration, Object>}
 */
const states = new WeakMap();
|
|
22
|
+
|
|
23
|
+
/**
 * Creates encoder streaming state for a VoxtralRealtime generation session.
 * @param {VoxtralRealtimeForConditionalGeneration} model
 * @param {Iterable<Tensor>|AsyncIterable<Tensor>} input_features
 * @returns {Object} Encoder state object.
 * @private
 */
function createEncoderState(model, input_features) {
    const { text_config, audio_config } = /** @type {any} */ (model.config);
    const encoder_session = model.sessions['audio_encoder'];
    const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;

    // Channels held in the causal-conv padding cache: mel bins + encoder hidden size.
    const pad_channels = num_mel_bins + enc_hidden_size;

    // Allocate a zero-filled KV cache for the streaming audio encoder.
    const dtype = encoder_session?.config?.kv_cache_dtype ?? 'float32';
    const TypedArrayCls = dtype === 'float16' ? DataTypeMap.float16 : DataTypeMap.float32;
    const enc_kv_cache = new DynamicCache();
    const shapes = getCacheShapes(audio_config, { batch_size: 1 });
    for (const [name, shape] of Object.entries(shapes)) {
        const numel = shape.reduce((acc, dim) => acc * dim, 1);
        enc_kv_cache[name] = new Tensor(dtype, new TypedArrayCls(numel), shape);
    }

    // Zero-initialized padding cache for the first causal convolution.
    const enc_padding_cache = new Tensor(dtype, new TypedArrayCls(pad_channels * CONV1_LEFT_PAD), [
        1,
        pad_channels,
        CONV1_LEFT_PAD,
    ]);

    // Accept either an async iterable or a sync iterable of mel chunks
    // (async takes precedence when both protocols are implemented).
    const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
    if (!chunks_iter) {
        throw new Error('input_features must be iterable or async iterable');
    }

    return {
        encoder_session,
        enc_kv_cache,
        enc_padding_cache,
        enc_past_seq_len: 0, // total encoder positions processed so far
        audio_embed_queue: [], // FIFO of { data, tokens } embedding chunks
        audio_embed_total_tokens: 0, // tokens currently buffered in the queue
        audio_queue_offset: 0, // tokens already consumed from the queue head
        audio_consumed: 0, // total audio tokens merged into text embeddings
        stream_exhausted: false, // whether the input iterator reported done
        chunks_iter,
        text_hidden_size: text_config.hidden_size,
    };
}
|
|
73
|
+
|
|
74
|
+
/**
 * Encodes one audio chunk through the audio encoder.
 * Updates the encoder state in-place: KV cache, conv padding cache,
 * and the running past sequence length.
 * @param {Object} s Encoder state.
 * @param {Tensor} chunk_features Mel spectrogram chunk [1, num_mel_bins, seq_len].
 * @returns {Promise<Tensor>} Audio embeddings.
 * @private
 */
async function encodeChunk(s, chunk_features) {
    const audio_seq_len = chunk_features.dims[2];
    // Output length after the encoder's stride-2 convolution, accounting for
    // CONV2_LEFT_PAD frames of causal left padding.
    // NOTE(review): the `- 3` implies a kernel size of 3 — confirm against the exported model.
    const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;

    // Absolute positions continue from the previously encoded frames.
    const position_ids = new Tensor(
        'int64',
        BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
        [1, conv2_output_len],
    );

    // Attend over the full (past + new) encoder sequence.
    const total_seq_len = s.enc_past_seq_len + conv2_output_len;
    const attention_mask = ones([1, total_seq_len]);
    const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
        input_features: chunk_features,
        attention_mask,
        position_ids,
        past_padding_cache: s.enc_padding_cache,
        ...s.enc_kv_cache,
    });
    // Dispose previous padding cache and update
    // (only GPU buffers need explicit disposal; CPU tensors are GC'd).
    if (s.enc_padding_cache.location === 'gpu-buffer') {
        s.enc_padding_cache.dispose();
    }
    s.enc_padding_cache = present_padding_cache;

    // Update encoder KV cache, disposing previous tensors
    for (const name in present_cache) {
        if (name.startsWith('present.')) {
            // Session outputs are named `present.*`; cache entries are keyed `past_key_values.*`.
            const pastName = name.replace('present', 'past_key_values');
            const prev = s.enc_kv_cache[pastName];
            if (prev?.location === 'gpu-buffer') {
                prev.dispose();
            }
            s.enc_kv_cache[pastName] = present_cache[name];
        }
    }
    s.enc_past_seq_len = total_seq_len;
    return audio_embeds;
}
|
|
120
|
+
|
|
121
|
+
/**
 * Fills the audio embedding buffer until it has enough tokens, pulling and
 * encoding chunks from the input iterator as needed.
 * @param {Object} s Encoder state.
 * @param {number} needed Total number of audio tokens needed.
 * @private
 */
async function fillAudioBuffer(s, needed) {
    while (!s.stream_exhausted && s.audio_embed_total_tokens < needed) {
        const { done, value } = await s.chunks_iter.next();
        if (done) {
            s.stream_exhausted = true;
            return;
        }
        const embeds = await encodeChunk(s, value);
        const tokens = embeds.dims[1];
        s.audio_embed_queue.push({ data: embeds.data, tokens });
        s.audio_embed_total_tokens += tokens;
    }
}
|
|
139
|
+
|
|
140
|
+
/**
 * Adds audio embeddings to text embeddings from the queue.
 * Consumes up to `current_len` tokens from the front of the queue, summing
 * each audio embedding into the corresponding text-token embedding.
 * @param {Object} s Encoder state.
 * @param {Tensor} inputs_embeds Text embeddings tensor (modified in-place).
 * @param {number} current_len Number of tokens to consume.
 * @private
 */
function addAudioEmbeddings(s, inputs_embeds, current_len) {
    const queue = s.audio_embed_queue;
    if (queue.length === 0) return;

    const dest = inputs_embeds.data;
    const hidden = s.text_hidden_size;
    let write_pos = 0;
    let remaining = current_len;

    while (remaining > 0 && queue.length > 0) {
        const head = queue[0];
        // Take as many tokens as the head chunk still has available.
        const take = Math.min(remaining, head.tokens - s.audio_queue_offset);

        const src_base = s.audio_queue_offset * hidden;
        const dst_base = write_pos * hidden;
        const count = take * hidden;
        for (let i = 0; i < count; ++i) {
            dest[dst_base + i] += head.data[src_base + i];
        }

        write_pos += take;
        remaining -= take;
        s.audio_queue_offset += take;

        // Drop the head chunk once fully consumed.
        if (s.audio_queue_offset >= head.tokens) {
            queue.shift();
            s.audio_queue_offset = 0;
        }
    }
    s.audio_consumed += current_len - remaining;
}
|
|
175
|
+
|
|
176
|
+
/**
 * Stopping criterion that triggers when the audio stream is exhausted
 * and all buffered audio embeddings have been consumed.
 * @private
 */
class AudioExhaustedCriteria extends StoppingCriteria {
    constructor(enc_state) {
        super();
        this._s = enc_state;
    }

    _call(input_ids) {
        const { stream_exhausted, audio_embed_queue } = this._s;
        // Stop every sequence once there is no more audio left to attend to.
        const finished = stream_exhausted && audio_embed_queue.length === 0;
        return input_ids.map(() => finished);
    }
}
|
|
191
|
+
|
|
192
|
+
/**
 * Base class for VoxtralRealtime models.
 */
export class VoxtralRealtimePreTrainedModel extends PreTrainedModel {
    // Inputs forwarded to the decoder by the shared generation utilities.
    forward_params = ['input_ids', 'attention_mask', 'position_ids', 'past_key_values'];
}
|
|
195
|
+
|
|
196
|
+
export class VoxtralRealtimeForConditionalGeneration extends VoxtralRealtimePreTrainedModel {
    /**
     * Forward pass: embeds the current text tokens, mixes buffered streaming
     * audio embeddings into them, then runs the merged decoder.
     * @param {Object} inputs
     * @param {Tensor} inputs.input_ids Text token ids [batch, seq_len].
     * @param {Object} [inputs.past_key_values] Decoder KV cache.
     * @returns {Promise<Object>} Decoder session outputs.
     */
    async forward({ input_ids, past_key_values, ...kwargs }) {
        const current_len = input_ids.dims[1];

        const enc = states.get(this);
        if (enc) {
            // Fill audio buffer and embed tokens with audio:
            // encode enough chunks to cover the tokens being decoded now.
            await fillAudioBuffer(enc, enc.audio_consumed + current_len);
        }

        const { inputs_embeds } = await sessionRun(this.sessions['embed_tokens'], { input_ids });
        if (enc) {
            addAudioEmbeddings(enc, inputs_embeds, current_len);
        }

        const decoder_feeds = { inputs_embeds, ...kwargs };
        this.addPastKeyValues(decoder_feeds, past_key_values);

        // Only feed inputs the decoder session actually declares.
        const session = this.sessions['decoder_model_merged'];
        const fixed = pick(decoder_feeds, session.inputNames);
        return await sessionRun(session, fixed);
    }

    /**
     * Streaming generation driven by an (async) iterable of mel-spectrogram chunks.
     * Generation stops automatically once the audio stream is exhausted and all
     * buffered audio has been consumed (in addition to any user-provided criteria).
     * @param {Object} inputs
     * @param {Iterable<Tensor>|AsyncIterable<Tensor>} inputs.input_features Audio chunk iterator.
     * @param {Object} [inputs.stopping_criteria] Additional user stopping criteria.
     * @returns {Promise<Tensor>} Generated token ids.
     * @throws {Error} If `input_features` is not provided.
     */
    async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
        if (!input_features) {
            throw new Error('input_features (generator/iterable) must be provided');
        }

        const enc_state = createEncoderState(this, input_features);
        states.set(this, enc_state);

        const stopping_criteria = new StoppingCriteriaList();
        stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
        if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);

        try {
            return await super.generate({ ...kwargs, stopping_criteria });
        } finally {
            // Cleanup encoder state
            enc_state.enc_kv_cache.dispose();
            // Fix: also release the final conv padding cache. Each encoder step
            // disposes the superseded padding cache when it lives in a GPU buffer,
            // but the last one produced was previously leaked here.
            if (enc_state.enc_padding_cache?.location === 'gpu-buffer') {
                enc_state.enc_padding_cache.dispose();
            }
            states.delete(this);
        }
    }
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
|
|
2
|
+
import { AutoTokenizer } from '../auto/tokenization_auto.js';
|
|
3
|
+
import { Processor } from '../../processing_utils.js';
|
|
4
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
5
|
+
import { validate_audio_inputs } from '../../feature_extraction_utils.js';
|
|
6
|
+
|
|
7
|
+
// Voxtral Realtime audio config constants (from mistral_common AudioConfig)
// Tokens of silence left-padded before the first streaming audio chunk.
const NUM_LEFT_PAD_TOKENS = 32;
// Delay tokens between the audio and text streams — presumably look-ahead
// for the model; confirm against mistral_common AudioConfig.
const NUM_DELAY_TOKENS = 6;
// Mel frames of audio per text token.
const AUDIO_LENGTH_PER_TOK = 8;
// Extra right-pad tokens appended in non-streaming (offline) mode.
const OFFLINE_STREAMING_BUFFER_TOKENS = 10;

/** Token ID for [STREAMING_PAD] in the Voxtral tokenizer. */
const STREAMING_PAD_TOKEN_ID = 32;
|
|
15
|
+
|
|
16
|
+
export class VoxtralRealtimeProcessor extends Processor {
    static tokenizer_class = AutoTokenizer;
    static feature_extractor_class = AutoFeatureExtractor;
    static uses_processor_config = false;

    /** Number of mel frames in the first audio chunk. */
    get num_mel_frames_first_audio_chunk() {
        return AUDIO_LENGTH_PER_TOK * (NUM_DELAY_TOKENS + 1);
    }

    /** Number of raw audio samples in the first audio chunk. */
    get num_samples_first_audio_chunk() {
        const { hop_length, n_fft } = this.feature_extractor.config;
        return (this.num_mel_frames_first_audio_chunk - 1) * hop_length + Math.floor(n_fft / 2);
    }

    /** Number of raw audio samples per subsequent audio chunk. */
    get num_samples_per_audio_chunk() {
        const { hop_length, n_fft } = this.feature_extractor.config;
        return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
    }

    /** Number of right-pad tokens for non-streaming mode. */
    get num_right_pad_tokens() {
        return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
    }

    /** Number of mel frames per text token. */
    get audio_length_per_tok() {
        return AUDIO_LENGTH_PER_TOK;
    }

    /** Number of raw audio samples per token. */
    get raw_audio_length_per_tok() {
        return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
    }

    /**
     * Process audio input for VoxtralRealtime.
     *
     * In streaming mode with `is_first_audio_chunk=true`, the audio is left-padded
     * with silence and mel features are extracted with `center=true`.
     * Returns `{ input_ids, input_features }`.
     *
     * In streaming mode with `is_first_audio_chunk=false`, the audio chunk is
     * processed with `center=false` and only `{ input_features }` is returned.
     *
     * In non-streaming mode, the audio is right-padded to ensure the model
     * transcribes the full audio, then processed with `center=true`.
     * Returns `{ input_features }`.
     *
     * @param {Float32Array|Float64Array} audio The audio waveform.
     * @param {Object} [options]
     * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
     * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
     * @returns {Promise<Object>}
     */
    async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
        validate_audio_inputs(audio, 'VoxtralRealtimeProcessor');

        if (!is_streaming && !is_first_audio_chunk) {
            throw new Error('In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.');
        }

        // Subsequent streaming chunks: extract mel with center=false, no padding.
        if (!is_first_audio_chunk) {
            return await this.feature_extractor(audio, { center: false });
        }

        if (!is_streaming) {
            // Non-streaming: right-pad audio to ensure full transcription, extract mel with center=true.
            const num_right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
            const padded_audio = new Float32Array(audio.length + num_right_pad_samples);
            padded_audio.set(audio);
            return await this.feature_extractor(padded_audio, { center: true });
        }

        // Streaming first chunk: left-pad audio with silence, extract mel with center=true, build input_ids.
        const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
        const padded_audio = new Float32Array(num_left_pad_samples + audio.length);
        padded_audio.set(audio, num_left_pad_samples);

        const audio_encoding = await this.feature_extractor(padded_audio, { center: true });

        // Build input_ids: BOS + (num_left_pad_tokens + num_delay_tokens) * [STREAMING_PAD]
        const num_input_tokens = 1 + NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
        const ids = new BigInt64Array(num_input_tokens).fill(BigInt(STREAMING_PAD_TOKEN_ID));
        ids[0] = 1n; // BOS
        const input_ids = new Tensor('int64', ids, [1, num_input_tokens]);

        return {
            input_ids,
            ...audio_encoding,
        };
    }
}
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js';
|
|
2
2
|
import { Tensor } from '../../utils/tensor.js';
|
|
3
3
|
import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
|
|
4
|
-
import { max } from '../../utils/maths.js';
|
|
5
4
|
import { logger } from '../../utils/logger.js';
|
|
6
5
|
|
|
7
6
|
export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
@@ -28,7 +27,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
|
28
27
|
* @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
|
|
29
28
|
*/
|
|
30
29
|
async _extract_fbank_features(waveform) {
|
|
31
|
-
|
|
30
|
+
return await spectrogram(
|
|
32
31
|
waveform,
|
|
33
32
|
this.window, // window
|
|
34
33
|
this.config.n_fft, // frame_length
|
|
@@ -36,7 +35,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
|
36
35
|
{
|
|
37
36
|
power: 2.0,
|
|
38
37
|
mel_filters: this.config.mel_filters,
|
|
39
|
-
log_mel: 'log10',
|
|
38
|
+
log_mel: 'log10_max_norm',
|
|
40
39
|
|
|
41
40
|
// Custom
|
|
42
41
|
max_num_frames: Math.min(
|
|
@@ -45,15 +44,6 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
|
45
44
|
),
|
|
46
45
|
},
|
|
47
46
|
);
|
|
48
|
-
|
|
49
|
-
const data = features.data;
|
|
50
|
-
const maxValue = max(/** @type {Float32Array} */ (data))[0];
|
|
51
|
-
|
|
52
|
-
for (let i = 0; i < data.length; ++i) {
|
|
53
|
-
data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
return features;
|
|
57
47
|
}
|
|
58
48
|
|
|
59
49
|
/**
|
package/src/transformers.js
CHANGED
|
@@ -53,6 +53,8 @@ export * from './utils/tensor.js';
|
|
|
53
53
|
export { softmax, log_softmax, dot, cos_sim } from './utils/maths.js';
|
|
54
54
|
export { random } from './utils/random.js';
|
|
55
55
|
|
|
56
|
+
export { DynamicCache } from './cache_utils.js';
|
|
57
|
+
|
|
56
58
|
// Cache and file management
|
|
57
59
|
export { ModelRegistry } from './utils/model_registry/ModelRegistry.js';
|
|
58
60
|
|
package/src/utils/audio.js
CHANGED
|
@@ -452,8 +452,10 @@ function power_to_db(spectrogram, reference = 1.0, min_value = 1e-10, db_range =
|
|
|
452
452
|
* If supplied, applies this filter bank to create a mel spectrogram.
|
|
453
453
|
* @param {number} [options.mel_floor=1e-10] Minimum value of mel frequency banks.
|
|
454
454
|
* @param {string} [options.log_mel=null] How to convert the spectrogram to log scale. Possible options are:
|
|
455
|
-
* `null` (don't convert), `"log"` (take the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels)
|
|
455
|
+
* `null` (don't convert), `"log"` (take the natural logarithm), `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels),
|
|
456
|
+
* `"log10_max_norm"` (take `log10`, then apply `(max(x, maxVal - 8) + 4) / 4` normalization, where `maxVal` is computed from data or given by `max_log_mel`).
|
|
456
457
|
* Can only be used when `power` is not `null`.
|
|
458
|
+
* @param {number} [options.max_log_mel=null] When `log_mel` is `"log10_max_norm"`, use this fixed value as the max instead of computing from data.
|
|
457
459
|
* @param {number} [options.reference=1.0] Sets the input spectrogram value that corresponds to 0 dB. For example, use `max(spectrogram)[0]` to set
|
|
458
460
|
* the loudest part to 0 dB. Must be greater than zero.
|
|
459
461
|
* @param {number} [options.min_value=1e-10] The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking `log(0)`.
|
|
@@ -486,6 +488,7 @@ export async function spectrogram(
|
|
|
486
488
|
mel_filters = null,
|
|
487
489
|
mel_floor = 1e-10,
|
|
488
490
|
log_mel = null,
|
|
491
|
+
max_log_mel = null,
|
|
489
492
|
reference = 1.0,
|
|
490
493
|
min_value = 1e-10,
|
|
491
494
|
db_range = null,
|
|
@@ -669,6 +672,17 @@ export async function spectrogram(
|
|
|
669
672
|
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
670
673
|
}
|
|
671
674
|
break;
|
|
675
|
+
case 'log10_max_norm': {
|
|
676
|
+
for (let i = 0; i < o; ++i) {
|
|
677
|
+
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
678
|
+
}
|
|
679
|
+
const logMax = max_log_mel ?? max(mel_spec_data)[0];
|
|
680
|
+
const threshold = logMax - 8.0;
|
|
681
|
+
for (let i = 0; i < o; ++i) {
|
|
682
|
+
mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4.0) / 4.0;
|
|
683
|
+
}
|
|
684
|
+
break;
|
|
685
|
+
}
|
|
672
686
|
case 'dB':
|
|
673
687
|
if (power === 1.0) {
|
|
674
688
|
amplitude_to_db(mel_spec_data, reference, min_value, db_range);
|
|
@@ -679,7 +693,9 @@ export async function spectrogram(
|
|
|
679
693
|
}
|
|
680
694
|
break;
|
|
681
695
|
default:
|
|
682
|
-
throw new Error(
|
|
696
|
+
throw new Error(
|
|
697
|
+
`log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`,
|
|
698
|
+
);
|
|
683
699
|
}
|
|
684
700
|
}
|
|
685
701
|
|