@huggingface/transformers 4.0.0-next.5 → 4.0.0-next.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -4
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +2189 -1015
- package/dist/transformers.min.js +16 -16
- package/dist/transformers.node.cjs +2234 -1029
- package/dist/transformers.node.min.cjs +20 -20
- package/dist/transformers.node.min.mjs +20 -20
- package/dist/transformers.node.mjs +2194 -1017
- package/dist/transformers.web.js +2175 -1001
- package/dist/transformers.web.min.js +18 -18
- package/package.json +4 -4
- package/src/backends/onnx.js +77 -58
- package/src/backends/utils/cacheWasm.js +22 -43
- package/src/cache_utils.js +62 -0
- package/src/configs.js +32 -5
- package/src/env.js +36 -6
- package/src/image_processors_utils.js +3 -3
- package/src/models/auto/modeling_auto.js +14 -1
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +1 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/modeling_utils.js +234 -292
- package/src/models/models.js +9 -0
- package/src/models/olmo_hybrid/modeling_olmo_hybrid.js +5 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +3 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_moe/modeling_qwen2_moe.js +5 -0
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +36 -3
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_moe/modeling_qwen3_moe.js +5 -0
- package/src/models/qwen3_next/modeling_qwen3_next.js +5 -0
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +4 -0
- package/src/models/registry.js +39 -4
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/pipelines/index.js +2 -84
- package/src/pipelines.js +40 -77
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/FileCache.js +128 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +8 -3
- package/src/utils/hub/{files.js → FileResponse.js} +0 -105
- package/src/utils/hub/utils.js +35 -1
- package/src/utils/hub.js +6 -5
- package/src/utils/image.js +12 -13
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/ModelRegistry.js +70 -23
- package/src/utils/model_registry/get_file_metadata.js +14 -2
- package/src/utils/model_registry/get_model_files.js +63 -78
- package/src/utils/model_registry/get_pipeline_files.js +15 -24
- package/src/utils/model_registry/is_cached.js +81 -4
- package/src/utils/tensor.js +18 -2
- package/types/backends/onnx.d.ts.map +1 -1
- package/types/backends/utils/cacheWasm.d.ts +3 -17
- package/types/backends/utils/cacheWasm.d.ts.map +1 -1
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +18 -3
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +17 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/auto/modeling_auto.d.ts +6 -0
- package/types/models/auto/modeling_auto.d.ts.map +1 -1
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/modeling_utils.d.ts +44 -24
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +9 -0
- package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts +8 -0
- package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts.map +1 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts +8 -0
- package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts.map +1 -0
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts +8 -0
- package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts.map +1 -0
- package/types/models/qwen3_next/modeling_qwen3_next.d.ts +8 -0
- package/types/models/qwen3_next/modeling_qwen3_next.d.ts.map +1 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +7 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -0
- package/types/models/registry.d.ts +2 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/pipelines/index.d.ts +0 -34
- package/types/pipelines/index.d.ts.map +1 -1
- package/types/pipelines.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache/FileCache.d.ts +39 -0
- package/types/utils/cache/FileCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts +4 -4
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/hub/{files.d.ts → FileResponse.d.ts} +1 -38
- package/types/utils/hub/FileResponse.d.ts.map +1 -0
- package/types/utils/hub/utils.d.ts +17 -2
- package/types/utils/hub/utils.d.ts.map +1 -1
- package/types/utils/hub.d.ts +7 -7
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/ModelRegistry.d.ts +66 -6
- package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/model_registry/get_pipeline_files.d.ts +2 -1
- package/types/utils/model_registry/get_pipeline_files.d.ts.map +1 -1
- package/types/utils/model_registry/is_cached.d.ts +47 -4
- package/types/utils/model_registry/is_cached.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
- package/types/utils/hub/files.d.ts.map +0 -1
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import { PreTrainedModel } from '../modeling_utils.js';
|
|
2
|
+
import { sessionRun } from '../session.js';
|
|
3
|
+
import { getCacheShapes } from '../../configs.js';
|
|
4
|
+
import { Tensor, ones } from '../../utils/tensor.js';
|
|
5
|
+
import { DataTypeMap } from '../../utils/dtypes.js';
|
|
6
|
+
import { pick } from '../../utils/core.js';
|
|
7
|
+
import { DynamicCache } from '../../cache_utils.js';
|
|
8
|
+
import { StoppingCriteria, StoppingCriteriaList } from '../../generation/stopping_criteria.js';
|
|
9
|
+
|
|
10
|
+
// Left-padding sizes of the two causal convolutions.
const CONV1_LEFT_PAD = 2;
const CONV2_LEFT_PAD = 1;

/**
 * Encoder streaming state for each model instance that is currently generating.
 * Keeping it in a module-private WeakMap lets the generation code read and update
 * the state without storing it as a property on the model instance.
 * @private
 * @type {WeakMap<VoxtralRealtimeForConditionalGeneration, Object>}
 */
const states = new WeakMap();
|
|
22
|
+
|
|
23
|
+
/**
 * Builds the mutable encoder streaming state for one VoxtralRealtime generation session.
 *
 * The state bundles the audio encoder session, a freshly allocated encoder KV cache
 * (one tensor per entry returned by `getCacheShapes` for batch size 1), a freshly
 * allocated padding cache of shape `[1, num_mel_bins + hidden_size, CONV1_LEFT_PAD]`,
 * an iterator over `input_features`, and the bookkeeping counters used while
 * audio embeddings are queued and consumed.
 *
 * @param {VoxtralRealtimeForConditionalGeneration} model
 * @param {Iterable<Tensor>|AsyncIterable<Tensor>} input_features
 * @returns {Object} Encoder state object.
 * @throws {Error} If `input_features` is neither iterable nor async iterable.
 * @private
 */
function createEncoderState(model, input_features) {
    const config = /** @type {any} */ (model.config);
    const { text_config, audio_config } = config;
    const encoder_session = model.sessions['audio_encoder'];

    const padding_channels = audio_config.num_mel_bins + audio_config.hidden_size;

    // Cache dtype comes from the session config when present; anything other than
    // 'float16' is backed by the float32 array type.
    const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? 'float32';
    const enc_cls = DataTypeMap[enc_dtype === 'float16' ? 'float16' : 'float32'];

    // Initialize encoder KV cache
    const enc_kv_cache = new DynamicCache();
    const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
    for (const name in enc_shapes) {
        const dims = enc_shapes[name];
        const numel = dims.reduce((acc, dim) => acc * dim, 1);
        enc_kv_cache[name] = new Tensor(enc_dtype, new enc_cls(numel), dims);
    }

    const padding_dims = [1, padding_channels, CONV1_LEFT_PAD];
    const enc_padding_cache = new Tensor(enc_dtype, new enc_cls(padding_channels * CONV1_LEFT_PAD), padding_dims);

    // Prefer the async iterator; fall back to the sync one.
    let chunks_iter = input_features[Symbol.asyncIterator]?.();
    if (chunks_iter == null) {
        chunks_iter = input_features[Symbol.iterator]?.();
    }
    if (!chunks_iter) {
        throw new Error('input_features must be iterable or async iterable');
    }

    return {
        encoder_session,
        enc_kv_cache,
        enc_padding_cache,
        enc_past_seq_len: 0,
        audio_embed_queue: [],
        audio_embed_total_tokens: 0,
        audio_queue_offset: 0,
        audio_consumed: 0,
        stream_exhausted: false,
        chunks_iter,
        text_hidden_size: text_config.hidden_size,
    };
}
|
|
73
|
+
|
|
74
|
+
/**
 * Runs a single audio chunk through the audio encoder session and advances the
 * encoder state (padding cache, KV cache and past sequence length).
 *
 * @param {Object} s Encoder state (mutated).
 * @param {Tensor} chunk_features Mel spectrogram chunk [1, num_mel_bins, seq_len].
 * @returns {Promise<Tensor>} Audio embeddings.
 * @private
 */
async function encodeChunk(s, chunk_features) {
    const audio_seq_len = chunk_features.dims[2];
    const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;

    // Positions continue from where the previous chunk stopped.
    const first_position = s.enc_past_seq_len;
    const position_data = BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(first_position + i));
    const position_ids = new Tensor('int64', position_data, [1, conv2_output_len]);

    const total_seq_len = first_position + conv2_output_len;
    const encoder_feeds = {
        input_features: chunk_features,
        attention_mask: ones([1, total_seq_len]),
        position_ids,
        past_padding_cache: s.enc_padding_cache,
        ...s.enc_kv_cache,
    };
    const outputs = await sessionRun(s.encoder_session, encoder_feeds);
    const { audio_embeds, present_padding_cache, ...present_cache } = outputs;

    // Dispose previous padding cache and update
    const old_padding_cache = s.enc_padding_cache;
    if (old_padding_cache.location === 'gpu-buffer') {
        old_padding_cache.dispose();
    }
    s.enc_padding_cache = present_padding_cache;

    // Update encoder KV cache, disposing previous tensors
    for (const [name, tensor] of Object.entries(present_cache)) {
        if (!name.startsWith('present.')) continue;

        const pastName = name.replace('present', 'past_key_values');
        const previous = s.enc_kv_cache[pastName];
        if (previous?.location === 'gpu-buffer') {
            previous.dispose();
        }
        s.enc_kv_cache[pastName] = tensor;
    }

    s.enc_past_seq_len = total_seq_len;
    return audio_embeds;
}
|
|
120
|
+
|
|
121
|
+
/**
 * Pulls chunks from the input iterator and encodes them until the queue holds at
 * least `needed` audio tokens in total, or the iterator reports that it is done
 * (in which case `s.stream_exhausted` is set).
 *
 * @param {Object} s Encoder state (mutated).
 * @param {number} needed Total number of audio tokens needed.
 * @private
 */
async function fillAudioBuffer(s, needed) {
    while (!s.stream_exhausted && s.audio_embed_total_tokens < needed) {
        const { done, value } = await s.chunks_iter.next();
        if (done) {
            s.stream_exhausted = true;
            return;
        }

        const embeds = await encodeChunk(s, value);
        const tokens = embeds.dims[1];
        s.audio_embed_queue.push({ data: embeds.data, tokens });
        s.audio_embed_total_tokens += tokens;
    }
}
|
|
139
|
+
|
|
140
|
+
/**
 * Adds queued audio embeddings element-wise onto the text embeddings.
 *
 * Consumes up to `current_len` tokens from the front of `s.audio_embed_queue`,
 * starting `s.audio_queue_offset` tokens into the front entry. Fully consumed
 * entries are removed from the queue. If the queue runs dry first, the remaining
 * token positions of `inputs_embeds` are left untouched. `s.audio_consumed` is
 * increased by the number of tokens actually consumed.
 *
 * @param {Object} s Encoder state (mutated).
 * @param {Tensor} inputs_embeds Text embeddings tensor (modified in-place).
 * @param {number} current_len Number of tokens to consume.
 * @private
 */
function addAudioEmbeddings(s, inputs_embeds, current_len) {
    const queue = s.audio_embed_queue;
    if (queue.length === 0) return;

    const hidden_size = s.text_hidden_size;
    const embed_data = inputs_embeds.data;
    let embed_write_pos = 0;
    let remaining = current_len;

    while (remaining > 0 && queue.length > 0) {
        const front = queue[0];
        const available = front.tokens - s.audio_queue_offset;
        const n = Math.min(remaining, available);

        // Offsets and the element count do not change inside the copy loop,
        // so compute them once per queue entry rather than once per element.
        const src_data = front.data;
        const src_offset = s.audio_queue_offset * hidden_size;
        const dst_offset = embed_write_pos * hidden_size;
        const num_elements = n * hidden_size;
        for (let i = 0; i < num_elements; ++i) {
            embed_data[dst_offset + i] += src_data[src_offset + i];
        }

        embed_write_pos += n;
        remaining -= n;
        s.audio_queue_offset += n;

        // Drop the front entry once every one of its tokens has been used.
        if (s.audio_queue_offset >= front.tokens) {
            queue.shift();
            s.audio_queue_offset = 0;
        }
    }
    s.audio_consumed += current_len - remaining;
}
|
|
175
|
+
|
|
176
|
+
/**
 * Stopping criterion bound to an encoder state: it reports "stop" for every
 * sequence once the audio stream has been exhausted and the queue of buffered
 * audio embeddings is empty.
 * @private
 */
class AudioExhaustedCriteria extends StoppingCriteria {
    /**
     * @param {Object} enc_state Encoder state to watch.
     */
    constructor(enc_state) {
        super();
        this._s = enc_state;
    }

    /**
     * @param {Array} input_ids One entry per sequence; only the number of entries is used.
     * @returns {boolean[]} The same stop flag repeated for each entry of `input_ids`.
     */
    _call(input_ids) {
        const { stream_exhausted, audio_embed_queue } = this._s;
        const done = stream_exhausted && audio_embed_queue.length === 0;
        return input_ids.map(() => done);
    }
}
|
|
191
|
+
|
|
192
|
+
/**
 * Base class for VoxtralRealtime models; declares the input names used by `forward`.
 */
export class VoxtralRealtimePreTrainedModel extends PreTrainedModel {
    forward_params = [
        'input_ids',
        'attention_mask',
        'position_ids',
        'past_key_values',
    ];
}
|
|
195
|
+
|
|
196
|
+
/**
 * VoxtralRealtime model that generates text while consuming a stream of audio
 * feature chunks. Audio embeddings are added onto the token embeddings at every
 * decoding step.
 */
export class VoxtralRealtimeForConditionalGeneration extends VoxtralRealtimePreTrainedModel {
    /**
     * Runs one decoder step.
     *
     * When an encoder state is registered for this model (i.e. during `generate`),
     * enough audio chunks are encoded to cover the tokens in `input_ids`, and the
     * resulting audio embeddings are added in-place to the token embeddings before
     * the decoder session is run.
     *
     * @param {Object} model_inputs
     * @param {Tensor} model_inputs.input_ids Token ids; `dims[1]` is the number of tokens in this step.
     * @param {Object} [model_inputs.past_key_values] Decoder cache, forwarded to `addPastKeyValues`.
     * @returns {Promise<Object>} Outputs of the `decoder_model_merged` session.
     */
    async forward({ input_ids, past_key_values, ...kwargs }) {
        const current_len = input_ids.dims[1];

        const enc = states.get(this);
        if (enc) {
            // Fill audio buffer and embed tokens with audio
            await fillAudioBuffer(enc, enc.audio_consumed + current_len);
        }

        const { inputs_embeds } = await sessionRun(this.sessions['embed_tokens'], { input_ids });
        if (enc) {
            addAudioEmbeddings(enc, inputs_embeds, current_len);
        }

        const decoder_feeds = { inputs_embeds, ...kwargs };
        this.addPastKeyValues(decoder_feeds, past_key_values);

        // Only pass the feeds that the decoder session actually declares as inputs.
        const session = this.sessions['decoder_model_merged'];
        const fixed = pick(decoder_feeds, session.inputNames);
        return await sessionRun(session, fixed);
    }

    /**
     * Generates from a stream of audio feature chunks.
     *
     * Registers a fresh encoder state for this model, adds a stopping criterion that
     * fires once the audio stream and the embedding queue are both exhausted (any
     * user-supplied criteria are appended after it), and delegates to the parent
     * `generate`. The encoder state is always released afterwards.
     *
     * @param {Object} options
     * @param {Iterable<Tensor>|AsyncIterable<Tensor>} options.input_features Audio feature chunks.
     * @param {Object} [options.stopping_criteria] Extra stopping criteria to apply.
     * @returns {Promise<Object>} Whatever the parent `generate` returns.
     * @throws {Error} If `input_features` is missing.
     */
    async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
        if (!input_features) {
            throw new Error('input_features (generator/iterable) must be provided');
        }

        const enc_state = createEncoderState(this, input_features);
        states.set(this, enc_state);

        const stopping_criteria = new StoppingCriteriaList();
        stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
        if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);

        try {
            return await super.generate({ ...kwargs, stopping_criteria });
        } finally {
            // Cleanup encoder state
            try {
                enc_state.enc_kv_cache.dispose();

                // The padding cache is replaced by a session output on every encoded
                // chunk; the last one is released here, using the same `gpu-buffer`
                // check that `encodeChunk` applies to the previous padding cache.
                const padding_cache = enc_state.enc_padding_cache;
                if (padding_cache?.location === 'gpu-buffer') {
                    padding_cache.dispose();
                }
            } finally {
                // Always unregister the state, even if disposing a cache throws, so a
                // later `forward` call outside `generate` does not see a stale state.
                states.delete(this);
            }
        }
    }
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
|
|
2
|
+
import { AutoTokenizer } from '../auto/tokenization_auto.js';
|
|
3
|
+
import { Processor } from '../../processing_utils.js';
|
|
4
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
5
|
+
import { validate_audio_inputs } from '../../feature_extraction_utils.js';
|
|
6
|
+
|
|
7
|
+
// Audio config constants for Voxtral Realtime (values taken from mistral_common AudioConfig).
const NUM_LEFT_PAD_TOKENS = 32;
const NUM_DELAY_TOKENS = 6;
const AUDIO_LENGTH_PER_TOK = 8;
const OFFLINE_STREAMING_BUFFER_TOKENS = 10;

/** Id of the [STREAMING_PAD] token in the Voxtral tokenizer. */
const STREAMING_PAD_TOKEN_ID = 32;
|
|
15
|
+
|
|
16
|
+
/**
 * Processor for VoxtralRealtime: pads raw audio as required by the selected mode,
 * extracts features with the feature extractor and, for the first streaming chunk,
 * also builds the initial `input_ids`.
 */
export class VoxtralRealtimeProcessor extends Processor {
    static tokenizer_class = AutoTokenizer;
    static feature_extractor_class = AutoFeatureExtractor;
    static uses_processor_config = false;

    /** Number of mel frames in the first audio chunk. */
    get num_mel_frames_first_audio_chunk() {
        const num_tokens = NUM_DELAY_TOKENS + 1;
        return num_tokens * AUDIO_LENGTH_PER_TOK;
    }

    /** Number of raw audio samples in the first audio chunk. */
    get num_samples_first_audio_chunk() {
        const { hop_length, n_fft } = this.feature_extractor.config;
        const num_frames = this.num_mel_frames_first_audio_chunk;
        return (num_frames - 1) * hop_length + Math.floor(n_fft / 2);
    }

    /** Number of raw audio samples per subsequent audio chunk. */
    get num_samples_per_audio_chunk() {
        const { hop_length, n_fft } = this.feature_extractor.config;
        return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
    }

    /** Number of right-pad tokens for non-streaming mode. */
    get num_right_pad_tokens() {
        return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
    }

    /** Number of mel frames per text token. */
    get audio_length_per_tok() {
        return AUDIO_LENGTH_PER_TOK;
    }

    /** Number of raw audio samples per token. */
    get raw_audio_length_per_tok() {
        const { hop_length } = this.feature_extractor.config;
        return AUDIO_LENGTH_PER_TOK * hop_length;
    }

    /**
     * Process audio input for VoxtralRealtime.
     *
     * Three modes are supported:
     *  - Streaming, first chunk (`is_streaming=true`, `is_first_audio_chunk=true`):
     *    the audio is left-padded with silence, features are extracted with
     *    `center=true`, and `{ input_ids, input_features }` is returned.
     *  - Streaming, later chunks (`is_streaming=true`, `is_first_audio_chunk=false`):
     *    the chunk is processed as-is with `center=false` and only
     *    `{ input_features }` is returned.
     *  - Non-streaming (`is_streaming=false`): the audio is right-padded so the model
     *    transcribes the full audio, then processed with `center=true`.
     *    Returns `{ input_features }`.
     *
     * @param {Float32Array|Float64Array} audio The audio waveform.
     * @param {Object} [options]
     * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
     * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
     * @returns {Promise<Object>}
     * @throws {Error} If `is_first_audio_chunk` is false in non-streaming mode.
     */
    async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
        validate_audio_inputs(audio, 'VoxtralRealtimeProcessor');

        if (!is_first_audio_chunk) {
            if (!is_streaming) {
                throw new Error('In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.');
            }
            // Subsequent streaming chunks: extract mel with center=false
            return await this.feature_extractor(audio, { center: false });
        }

        if (!is_streaming) {
            // Non-streaming: right-pad audio to ensure full transcription, extract mel with center=true
            const right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
            const right_padded = new Float32Array(audio.length + right_pad_samples);
            right_padded.set(audio);

            return await this.feature_extractor(right_padded, { center: true });
        }

        // Streaming first chunk: left-pad audio with silence, extract mel with center=true, build input_ids
        const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
        const left_padded = new Float32Array(num_left_pad_samples + audio.length);
        left_padded.set(audio, num_left_pad_samples);

        const audio_encoding = await this.feature_extractor(left_padded, { center: true });

        // input_ids: BOS followed by (num_left_pad_tokens + num_delay_tokens) * [STREAMING_PAD]
        const num_input_tokens = 1 + NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
        const pad_id = BigInt(STREAMING_PAD_TOKEN_ID);
        const input_ids_data = BigInt64Array.from(
            { length: num_input_tokens },
            (_, i) => (i === 0 ? 1n : pad_id), // index 0 is BOS
        );
        const input_ids = new Tensor('int64', input_ids_data, [1, num_input_tokens]);

        return {
            input_ids,
            ...audio_encoding,
        };
    }
}
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js';
|
|
2
2
|
import { Tensor } from '../../utils/tensor.js';
|
|
3
3
|
import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
|
|
4
|
-
import { max } from '../../utils/maths.js';
|
|
5
4
|
import { logger } from '../../utils/logger.js';
|
|
6
5
|
|
|
7
6
|
export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
@@ -28,7 +27,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
|
28
27
|
* @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
|
|
29
28
|
*/
|
|
30
29
|
async _extract_fbank_features(waveform) {
|
|
31
|
-
|
|
30
|
+
return await spectrogram(
|
|
32
31
|
waveform,
|
|
33
32
|
this.window, // window
|
|
34
33
|
this.config.n_fft, // frame_length
|
|
@@ -36,7 +35,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
|
36
35
|
{
|
|
37
36
|
power: 2.0,
|
|
38
37
|
mel_filters: this.config.mel_filters,
|
|
39
|
-
log_mel: '
|
|
38
|
+
log_mel: 'log10_max_norm',
|
|
40
39
|
|
|
41
40
|
// Custom
|
|
42
41
|
max_num_frames: Math.min(
|
|
@@ -45,15 +44,6 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
|
45
44
|
),
|
|
46
45
|
},
|
|
47
46
|
);
|
|
48
|
-
|
|
49
|
-
const data = features.data;
|
|
50
|
-
const maxValue = max(/** @type {Float32Array} */ (data))[0];
|
|
51
|
-
|
|
52
|
-
for (let i = 0; i < data.length; ++i) {
|
|
53
|
-
data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
return features;
|
|
57
47
|
}
|
|
58
48
|
|
|
59
49
|
/**
|
package/src/pipelines/index.js
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @file Pipeline task configurations and aliases
|
|
3
3
|
*
|
|
4
|
-
* Defines which
|
|
4
|
+
* Defines which pipeline class and model class(es) each pipeline task needs.
|
|
5
|
+
* Tokenizer and processor loading is determined automatically from the model's files.
|
|
5
6
|
*/
|
|
6
7
|
|
|
7
|
-
import { AutoTokenizer } from '../models/auto/tokenization_auto.js';
|
|
8
|
-
import { AutoProcessor } from '../models/auto/processing_auto.js';
|
|
9
8
|
import {
|
|
10
9
|
AutoModel,
|
|
11
10
|
AutoModelForSequenceClassification,
|
|
@@ -60,41 +59,30 @@ import { ImageFeatureExtractionPipeline } from './image-feature-extraction.js';
|
|
|
60
59
|
|
|
61
60
|
export const SUPPORTED_TASKS = Object.freeze({
|
|
62
61
|
'text-classification': {
|
|
63
|
-
tokenizer: AutoTokenizer,
|
|
64
62
|
pipeline: TextClassificationPipeline,
|
|
65
63
|
model: AutoModelForSequenceClassification,
|
|
66
64
|
default: {
|
|
67
|
-
// TODO: replace with original
|
|
68
|
-
// "model": "distilbert-base-uncased-finetuned-sst-2-english",
|
|
69
65
|
model: 'Xenova/distilbert-base-uncased-finetuned-sst-2-english',
|
|
70
66
|
},
|
|
71
67
|
type: 'text',
|
|
72
68
|
},
|
|
73
69
|
'token-classification': {
|
|
74
|
-
tokenizer: AutoTokenizer,
|
|
75
70
|
pipeline: TokenClassificationPipeline,
|
|
76
71
|
model: AutoModelForTokenClassification,
|
|
77
72
|
default: {
|
|
78
|
-
// TODO: replace with original
|
|
79
|
-
// "model": "Davlan/bert-base-multilingual-cased-ner-hrl",
|
|
80
73
|
model: 'Xenova/bert-base-multilingual-cased-ner-hrl',
|
|
81
74
|
},
|
|
82
75
|
type: 'text',
|
|
83
76
|
},
|
|
84
77
|
'question-answering': {
|
|
85
|
-
tokenizer: AutoTokenizer,
|
|
86
78
|
pipeline: QuestionAnsweringPipeline,
|
|
87
79
|
model: AutoModelForQuestionAnswering,
|
|
88
80
|
default: {
|
|
89
|
-
// TODO: replace with original
|
|
90
|
-
// "model": "distilbert-base-cased-distilled-squad",
|
|
91
81
|
model: 'Xenova/distilbert-base-cased-distilled-squad',
|
|
92
82
|
},
|
|
93
83
|
type: 'text',
|
|
94
84
|
},
|
|
95
|
-
|
|
96
85
|
'fill-mask': {
|
|
97
|
-
tokenizer: AutoTokenizer,
|
|
98
86
|
pipeline: FillMaskPipeline,
|
|
99
87
|
model: AutoModelForMaskedLM,
|
|
100
88
|
default: {
|
|
@@ -104,40 +92,30 @@ export const SUPPORTED_TASKS = Object.freeze({
|
|
|
104
92
|
type: 'text',
|
|
105
93
|
},
|
|
106
94
|
summarization: {
|
|
107
|
-
tokenizer: AutoTokenizer,
|
|
108
95
|
pipeline: SummarizationPipeline,
|
|
109
96
|
model: AutoModelForSeq2SeqLM,
|
|
110
97
|
default: {
|
|
111
|
-
// TODO: replace with original
|
|
112
|
-
// "model": "sshleifer/distilbart-cnn-6-6",
|
|
113
98
|
model: 'Xenova/distilbart-cnn-6-6',
|
|
114
99
|
},
|
|
115
100
|
type: 'text',
|
|
116
101
|
},
|
|
117
102
|
translation: {
|
|
118
|
-
tokenizer: AutoTokenizer,
|
|
119
103
|
pipeline: TranslationPipeline,
|
|
120
104
|
model: AutoModelForSeq2SeqLM,
|
|
121
105
|
default: {
|
|
122
|
-
// TODO: replace with original
|
|
123
|
-
// "model": "t5-small",
|
|
124
106
|
model: 'Xenova/t5-small',
|
|
125
107
|
},
|
|
126
108
|
type: 'text',
|
|
127
109
|
},
|
|
128
110
|
'text2text-generation': {
|
|
129
|
-
tokenizer: AutoTokenizer,
|
|
130
111
|
pipeline: Text2TextGenerationPipeline,
|
|
131
112
|
model: AutoModelForSeq2SeqLM,
|
|
132
113
|
default: {
|
|
133
|
-
// TODO: replace with original
|
|
134
|
-
// "model": "google/flan-t5-small",
|
|
135
114
|
model: 'Xenova/flan-t5-small',
|
|
136
115
|
},
|
|
137
116
|
type: 'text',
|
|
138
117
|
},
|
|
139
118
|
'text-generation': {
|
|
140
|
-
tokenizer: AutoTokenizer,
|
|
141
119
|
pipeline: TextGenerationPipeline,
|
|
142
120
|
model: AutoModelForCausalLM,
|
|
143
121
|
default: {
|
|
@@ -147,12 +125,9 @@ export const SUPPORTED_TASKS = Object.freeze({
|
|
|
147
125
|
type: 'text',
|
|
148
126
|
},
|
|
149
127
|
'zero-shot-classification': {
|
|
150
|
-
tokenizer: AutoTokenizer,
|
|
151
128
|
pipeline: ZeroShotClassificationPipeline,
|
|
152
129
|
model: AutoModelForSequenceClassification,
|
|
153
130
|
default: {
|
|
154
|
-
// TODO: replace with original
|
|
155
|
-
// "model": "typeform/distilbert-base-uncased-mnli",
|
|
156
131
|
model: 'Xenova/distilbert-base-uncased-mnli',
|
|
157
132
|
},
|
|
158
133
|
type: 'text',
|
|
@@ -160,43 +135,30 @@ export const SUPPORTED_TASKS = Object.freeze({
|
|
|
160
135
|
'audio-classification': {
|
|
161
136
|
pipeline: AudioClassificationPipeline,
|
|
162
137
|
model: AutoModelForAudioClassification,
|
|
163
|
-
processor: AutoProcessor,
|
|
164
138
|
default: {
|
|
165
|
-
// TODO: replace with original
|
|
166
|
-
// "model": "superb/wav2vec2-base-superb-ks",
|
|
167
139
|
model: 'Xenova/wav2vec2-base-superb-ks',
|
|
168
140
|
},
|
|
169
141
|
type: 'audio',
|
|
170
142
|
},
|
|
171
143
|
'zero-shot-audio-classification': {
|
|
172
|
-
tokenizer: AutoTokenizer,
|
|
173
144
|
pipeline: ZeroShotAudioClassificationPipeline,
|
|
174
145
|
model: AutoModel,
|
|
175
|
-
processor: AutoProcessor,
|
|
176
146
|
default: {
|
|
177
|
-
// TODO: replace with original
|
|
178
|
-
// "model": "laion/clap-htsat-fused",
|
|
179
147
|
model: 'Xenova/clap-htsat-unfused',
|
|
180
148
|
},
|
|
181
149
|
type: 'multimodal',
|
|
182
150
|
},
|
|
183
151
|
'automatic-speech-recognition': {
|
|
184
|
-
tokenizer: AutoTokenizer,
|
|
185
152
|
pipeline: AutomaticSpeechRecognitionPipeline,
|
|
186
153
|
model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
|
|
187
|
-
processor: AutoProcessor,
|
|
188
154
|
default: {
|
|
189
|
-
// TODO: replace with original
|
|
190
|
-
// "model": "openai/whisper-tiny.en",
|
|
191
155
|
model: 'Xenova/whisper-tiny.en',
|
|
192
156
|
},
|
|
193
157
|
type: 'multimodal',
|
|
194
158
|
},
|
|
195
159
|
'text-to-audio': {
|
|
196
|
-
tokenizer: AutoTokenizer,
|
|
197
160
|
pipeline: TextToAudioPipeline,
|
|
198
161
|
model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
|
|
199
|
-
processor: [AutoProcessor, /* Some don't use a processor */ null],
|
|
200
162
|
default: {
|
|
201
163
|
model: 'onnx-community/Supertonic-TTS-ONNX',
|
|
202
164
|
dtype: 'fp32',
|
|
@@ -204,129 +166,86 @@ export const SUPPORTED_TASKS = Object.freeze({
|
|
|
204
166
|
type: 'text',
|
|
205
167
|
},
|
|
206
168
|
'image-to-text': {
|
|
207
|
-
tokenizer: AutoTokenizer,
|
|
208
169
|
pipeline: ImageToTextPipeline,
|
|
209
170
|
model: AutoModelForVision2Seq,
|
|
210
|
-
processor: AutoProcessor,
|
|
211
171
|
default: {
|
|
212
|
-
// TODO: replace with original
|
|
213
|
-
// "model": "nlpconnect/vit-gpt2-image-captioning",
|
|
214
172
|
model: 'Xenova/vit-gpt2-image-captioning',
|
|
215
173
|
},
|
|
216
174
|
type: 'multimodal',
|
|
217
175
|
},
|
|
218
|
-
|
|
219
176
|
'image-classification': {
|
|
220
|
-
// no tokenizer
|
|
221
177
|
pipeline: ImageClassificationPipeline,
|
|
222
178
|
model: AutoModelForImageClassification,
|
|
223
|
-
processor: AutoProcessor,
|
|
224
179
|
default: {
|
|
225
|
-
// TODO: replace with original
|
|
226
|
-
// "model": "google/vit-base-patch16-224",
|
|
227
180
|
model: 'Xenova/vit-base-patch16-224',
|
|
228
181
|
},
|
|
229
182
|
type: 'multimodal',
|
|
230
183
|
},
|
|
231
|
-
|
|
232
184
|
'image-segmentation': {
|
|
233
|
-
// no tokenizer
|
|
234
185
|
pipeline: ImageSegmentationPipeline,
|
|
235
186
|
model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
|
|
236
|
-
processor: AutoProcessor,
|
|
237
187
|
default: {
|
|
238
|
-
// TODO: replace with original
|
|
239
|
-
// "model": "facebook/detr-resnet-50-panoptic",
|
|
240
188
|
model: 'Xenova/detr-resnet-50-panoptic',
|
|
241
189
|
},
|
|
242
190
|
type: 'multimodal',
|
|
243
191
|
},
|
|
244
192
|
'background-removal': {
|
|
245
|
-
// no tokenizer
|
|
246
193
|
pipeline: BackgroundRemovalPipeline,
|
|
247
194
|
model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
|
|
248
|
-
processor: AutoProcessor,
|
|
249
195
|
default: {
|
|
250
196
|
model: 'Xenova/modnet',
|
|
251
197
|
},
|
|
252
198
|
type: 'image',
|
|
253
199
|
},
|
|
254
|
-
|
|
255
200
|
'zero-shot-image-classification': {
|
|
256
|
-
tokenizer: AutoTokenizer,
|
|
257
201
|
pipeline: ZeroShotImageClassificationPipeline,
|
|
258
202
|
model: AutoModel,
|
|
259
|
-
processor: AutoProcessor,
|
|
260
203
|
default: {
|
|
261
|
-
// TODO: replace with original
|
|
262
|
-
// "model": "openai/clip-vit-base-patch32",
|
|
263
204
|
model: 'Xenova/clip-vit-base-patch32',
|
|
264
205
|
},
|
|
265
206
|
type: 'multimodal',
|
|
266
207
|
},
|
|
267
|
-
|
|
268
208
|
'object-detection': {
|
|
269
|
-
// no tokenizer
|
|
270
209
|
pipeline: ObjectDetectionPipeline,
|
|
271
210
|
model: AutoModelForObjectDetection,
|
|
272
|
-
processor: AutoProcessor,
|
|
273
211
|
default: {
|
|
274
|
-
// TODO: replace with original
|
|
275
|
-
// "model": "facebook/detr-resnet-50",
|
|
276
212
|
model: 'Xenova/detr-resnet-50',
|
|
277
213
|
},
|
|
278
214
|
type: 'multimodal',
|
|
279
215
|
},
|
|
280
216
|
'zero-shot-object-detection': {
|
|
281
|
-
tokenizer: AutoTokenizer,
|
|
282
217
|
pipeline: ZeroShotObjectDetectionPipeline,
|
|
283
218
|
model: AutoModelForZeroShotObjectDetection,
|
|
284
|
-
processor: AutoProcessor,
|
|
285
219
|
default: {
|
|
286
|
-
// TODO: replace with original
|
|
287
|
-
// "model": "google/owlvit-base-patch32",
|
|
288
220
|
model: 'Xenova/owlvit-base-patch32',
|
|
289
221
|
},
|
|
290
222
|
type: 'multimodal',
|
|
291
223
|
},
|
|
292
224
|
'document-question-answering': {
|
|
293
|
-
tokenizer: AutoTokenizer,
|
|
294
225
|
pipeline: DocumentQuestionAnsweringPipeline,
|
|
295
226
|
model: AutoModelForDocumentQuestionAnswering,
|
|
296
|
-
processor: AutoProcessor,
|
|
297
227
|
default: {
|
|
298
|
-
// TODO: replace with original
|
|
299
|
-
// "model": "naver-clova-ix/donut-base-finetuned-docvqa",
|
|
300
228
|
model: 'Xenova/donut-base-finetuned-docvqa',
|
|
301
229
|
},
|
|
302
230
|
type: 'multimodal',
|
|
303
231
|
},
|
|
304
232
|
'image-to-image': {
|
|
305
|
-
// no tokenizer
|
|
306
233
|
pipeline: ImageToImagePipeline,
|
|
307
234
|
model: AutoModelForImageToImage,
|
|
308
|
-
processor: AutoProcessor,
|
|
309
235
|
default: {
|
|
310
|
-
// TODO: replace with original
|
|
311
|
-
// "model": "caidas/swin2SR-classical-sr-x2-64",
|
|
312
236
|
model: 'Xenova/swin2SR-classical-sr-x2-64',
|
|
313
237
|
},
|
|
314
238
|
type: 'image',
|
|
315
239
|
},
|
|
316
240
|
'depth-estimation': {
|
|
317
|
-
// no tokenizer
|
|
318
241
|
pipeline: DepthEstimationPipeline,
|
|
319
242
|
model: AutoModelForDepthEstimation,
|
|
320
|
-
processor: AutoProcessor,
|
|
321
243
|
default: {
|
|
322
244
|
model: 'onnx-community/depth-anything-v2-small',
|
|
323
245
|
},
|
|
324
246
|
type: 'image',
|
|
325
247
|
},
|
|
326
|
-
|
|
327
|
-
// This task serves as a useful interface for dealing with sentence-transformers (https://huggingface.co/sentence-transformers).
|
|
328
248
|
'feature-extraction': {
|
|
329
|
-
tokenizer: AutoTokenizer,
|
|
330
249
|
pipeline: FeatureExtractionPipeline,
|
|
331
250
|
model: AutoModel,
|
|
332
251
|
default: {
|
|
@@ -336,7 +255,6 @@ export const SUPPORTED_TASKS = Object.freeze({
|
|
|
336
255
|
type: 'text',
|
|
337
256
|
},
|
|
338
257
|
'image-feature-extraction': {
|
|
339
|
-
processor: AutoProcessor,
|
|
340
258
|
pipeline: ImageFeatureExtractionPipeline,
|
|
341
259
|
model: [AutoModelForImageFeatureExtraction, AutoModel],
|
|
342
260
|
default: {
|