@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +1587 -570
- package/dist/transformers.min.js +17 -17
- package/dist/transformers.node.cjs +1605 -573
- package/dist/transformers.node.min.cjs +21 -21
- package/dist/transformers.node.min.mjs +21 -21
- package/dist/transformers.node.mjs +1600 -583
- package/dist/transformers.web.js +1592 -575
- package/dist/transformers.web.min.js +15 -15
- package/package.json +3 -3
- package/src/cache_utils.js +62 -0
- package/src/configs.js +17 -2
- package/src/env.js +8 -1
- package/src/image_processors_utils.js +3 -3
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +1 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/modeling_utils.js +222 -308
- package/src/models/models.js +4 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +3 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +7 -7
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
- package/src/models/registry.js +25 -0
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +5 -0
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/get_file_metadata.js +14 -2
- package/src/utils/model_registry/get_model_files.js +52 -78
- package/src/utils/tensor.js +18 -2
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +8 -0
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +17 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/modeling_utils.d.ts +44 -35
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +4 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@huggingface/transformers",
|
|
3
|
-
"version": "4.0.0-next.6",
|
|
3
|
+
"version": "4.0.0-next.7",
|
|
4
4
|
"description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
|
|
5
5
|
"main": "./dist/transformers.node.cjs",
|
|
6
6
|
"types": "./types/transformers.d.ts",
|
|
@@ -43,10 +43,10 @@
|
|
|
43
43
|
},
|
|
44
44
|
"homepage": "https://github.com/huggingface/transformers.js#readme",
|
|
45
45
|
"dependencies": {
|
|
46
|
-
"@huggingface/jinja": "^0.5.
|
|
46
|
+
"@huggingface/jinja": "^0.5.6",
|
|
47
47
|
"@huggingface/tokenizers": "^0.1.2",
|
|
48
48
|
"onnxruntime-node": "1.24.3",
|
|
49
|
-
"onnxruntime-web": "1.25.0-dev.
|
|
49
|
+
"onnxruntime-web": "1.25.0-dev.20260307-d626b568e0",
|
|
50
50
|
"sharp": "^0.34.5"
|
|
51
51
|
},
|
|
52
52
|
"devDependencies": {
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { Tensor } from './utils/tensor.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* A cache class that stores past key values as named tensors.
|
|
5
|
+
*/
|
|
6
|
+
class _DynamicCache {
|
|
7
|
+
/**
|
|
8
|
+
* Create a DynamicCache, optionally pre-populated with entries.
|
|
9
|
+
* @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
|
|
10
|
+
*/
|
|
11
|
+
constructor(entries) {
|
|
12
|
+
if (!entries) return;
|
|
13
|
+
for (const key in entries) {
|
|
14
|
+
if (key in this) {
|
|
15
|
+
throw new TypeError(`Key "${key}" conflicts with an existing property on DynamicCache`);
|
|
16
|
+
}
|
|
17
|
+
const value = entries[key];
|
|
18
|
+
if (!(value instanceof Tensor)) {
|
|
19
|
+
throw new TypeError(`Expected a Tensor for key "${key}", got ${typeof value}`);
|
|
20
|
+
}
|
|
21
|
+
this[key] = value;
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* Get the cached sequence length. This requires at least one attention cache entry to be present.
|
|
27
|
+
* @returns {number} The past sequence length.
|
|
28
|
+
*/
|
|
29
|
+
get_seq_length() {
|
|
30
|
+
/** @type {Record<string, Tensor>} */
|
|
31
|
+
const self = /** @type {any} */ (this);
|
|
32
|
+
for (const name in self) {
|
|
33
|
+
if (name.startsWith('past_key_values.')) {
|
|
34
|
+
return self[name].dims.at(-2);
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
throw new Error('Unable to determine sequence length from the cache.');
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* Dispose all contained tensors whose data resides on the GPU.
|
|
42
|
+
* Returns a promise that resolves when all disposals are complete.
|
|
43
|
+
* @returns {Promise<void>} Promise that resolves when all GPU tensors are disposed.
|
|
44
|
+
*/
|
|
45
|
+
async dispose() {
|
|
46
|
+
const promises = [];
|
|
47
|
+
for (const t of /** @type {Tensor[]} */ (Object.values(this))) {
|
|
48
|
+
if (t.location === 'gpu-buffer') {
|
|
49
|
+
promises.push(t.dispose());
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
await Promise.all(promises);
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* @typedef {_DynamicCache & Record<string, Tensor>} DynamicCache
|
|
58
|
+
*/
|
|
59
|
+
|
|
60
|
+
export const DynamicCache = /** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */ (
|
|
61
|
+
/** @type {unknown} */ (_DynamicCache)
|
|
62
|
+
);
|
package/src/configs.js
CHANGED
|
@@ -68,10 +68,13 @@ function getNormalizedConfig(config) {
|
|
|
68
68
|
case 'florence2':
|
|
69
69
|
case 'llava_onevision':
|
|
70
70
|
case 'idefics3':
|
|
71
|
+
case 'granite_speech':
|
|
71
72
|
case 'ultravox':
|
|
72
73
|
case 'voxtral':
|
|
74
|
+
case 'voxtral_realtime':
|
|
73
75
|
case 'smolvlm':
|
|
74
76
|
case 'gemma3n':
|
|
77
|
+
case 'lfm2_vl':
|
|
75
78
|
case 'chatterbox':
|
|
76
79
|
case 'mistral3':
|
|
77
80
|
case 'qwen2_5_vl':
|
|
@@ -131,10 +134,13 @@ function getNormalizedConfig(config) {
|
|
|
131
134
|
case 'cohere':
|
|
132
135
|
case 'cohere2':
|
|
133
136
|
case 'mistral':
|
|
137
|
+
case 'voxtral_realtime_text':
|
|
138
|
+
case 'voxtral_realtime_encoder':
|
|
134
139
|
case 'starcoder2':
|
|
135
140
|
case 'qwen2':
|
|
136
141
|
case 'qwen2_moe':
|
|
137
142
|
case 'qwen2_vl':
|
|
143
|
+
case 'qwen2_vl_text':
|
|
138
144
|
case 'qwen2_5_vl_text':
|
|
139
145
|
case 'qwen3_moe':
|
|
140
146
|
case 'qwen3_vl_text':
|
|
@@ -293,6 +299,9 @@ function getNormalizedConfig(config) {
|
|
|
293
299
|
* @returns {Record<string, number[]>}
|
|
294
300
|
*/
|
|
295
301
|
export function getCacheShapes(config, options) {
|
|
302
|
+
if (!(config instanceof PretrainedConfig)) {
|
|
303
|
+
config = new PretrainedConfig(config);
|
|
304
|
+
}
|
|
296
305
|
if (['lfm2', 'lfm2_moe'].includes(config.model_type)) {
|
|
297
306
|
const pkv_prefix = options?.prefix ?? 'past_key_values';
|
|
298
307
|
const conv_prefix = pkv_prefix === 'present' ? 'present' : 'past';
|
|
@@ -401,8 +410,14 @@ export function getCacheShapes(config, options) {
|
|
|
401
410
|
}
|
|
402
411
|
}
|
|
403
412
|
return cache_values;
|
|
404
|
-
} else if (['qwen3_5', 'qwen3_5_moe'].includes(config.model_type)) {
|
|
405
|
-
|
|
413
|
+
} else if (['lfm2_vl', 'qwen3_5', 'qwen3_5_moe', 'voxtral_realtime'].includes(config.model_type)) {
|
|
414
|
+
let subConfig;
|
|
415
|
+
if (config.model_type === 'voxtral_realtime' && options?.session_name === 'audio_encoder') {
|
|
416
|
+
subConfig = /** @type {any} */ (config).audio_config;
|
|
417
|
+
} else {
|
|
418
|
+
subConfig = /** @type {any} */ (config).text_config;
|
|
419
|
+
}
|
|
420
|
+
return getCacheShapes(subConfig, options);
|
|
406
421
|
}
|
|
407
422
|
|
|
408
423
|
return getKeyValueShapes(config, options);
|
package/src/env.js
CHANGED
|
@@ -26,7 +26,7 @@ import fs from 'node:fs';
|
|
|
26
26
|
import path from 'node:path';
|
|
27
27
|
import url from 'node:url';
|
|
28
28
|
|
|
29
|
-
const VERSION = '4.0.0-next.6';
|
|
29
|
+
const VERSION = '4.0.0-next.7';
|
|
30
30
|
|
|
31
31
|
const HAS_SELF = typeof self !== 'undefined';
|
|
32
32
|
|
|
@@ -224,6 +224,11 @@ export const LogLevel = Object.freeze({
|
|
|
224
224
|
* @property {boolean} useWasmCache Whether to pre-load and cache WASM binaries and the WASM factory (.mjs) for ONNX Runtime.
|
|
225
225
|
* Defaults to `true` when cache is available. This can improve performance and enables offline usage by avoiding repeated downloads.
|
|
226
226
|
* @property {string} cacheKey The cache key to use for storing models and WASM binaries. Defaults to 'transformers-cache'.
|
|
227
|
+
* @property {boolean} experimental_useCrossOriginStorage Whether to use the Cross-Origin Storage API to cache model files
|
|
228
|
+
* across origins, allowing different sites to share the same cached model weights. Defaults to `false`.
|
|
229
|
+
* Requires the Cross-Origin Storage Chrome extension: {@link https://chromewebstore.google.com/detail/cross-origin-storage/denpnpcgjgikjpoglpjefakmdcbmlgih}.
|
|
230
|
+
* The `experimental_` prefix indicates that the underlying browser API is not yet standardised and may change or be
|
|
231
|
+
* removed without a major version bump. For more information, see {@link https://github.com/WICG/cross-origin-storage}.
|
|
227
232
|
* @property {(input: string | URL, init?: any) => Promise<any>} fetch The fetch function to use. Defaults to `fetch`.
|
|
228
233
|
*/
|
|
229
234
|
|
|
@@ -270,6 +275,8 @@ export const env = {
|
|
|
270
275
|
useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
|
|
271
276
|
cacheKey: 'transformers-cache',
|
|
272
277
|
|
|
278
|
+
experimental_useCrossOriginStorage: false,
|
|
279
|
+
|
|
273
280
|
/////////////////// Custom fetch /////////////////////
|
|
274
281
|
fetch: DEFAULT_FETCH,
|
|
275
282
|
|
|
@@ -14,7 +14,7 @@ import { logger } from './utils/logger.js';
|
|
|
14
14
|
*/
|
|
15
15
|
|
|
16
16
|
/**
|
|
17
|
-
* @typedef {
|
|
17
|
+
* @typedef {Object} ImageProcessorResult
|
|
18
18
|
* @property {Tensor} pixel_values The pixel values of the batched preprocessed images.
|
|
19
19
|
* @property {HeightWidth[]} original_sizes Array of two-dimensional tuples like [[480, 640]].
|
|
20
20
|
* @property {HeightWidth[]} reshaped_input_sizes Array of two-dimensional tuples like [[1000, 1330]].
|
|
@@ -407,7 +407,7 @@ function compute_segments(
|
|
|
407
407
|
* @returns {[number, number]} The new height and width of the image.
|
|
408
408
|
* @throws {Error} If the height or width is smaller than the factor.
|
|
409
409
|
*/
|
|
410
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
410
|
+
export function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
411
411
|
if (height < factor || width < factor) {
|
|
412
412
|
throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
|
|
413
413
|
} else if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
@@ -886,7 +886,7 @@ export class ImageProcessor extends Callable {
|
|
|
886
886
|
}
|
|
887
887
|
|
|
888
888
|
/**
|
|
889
|
-
* @typedef {
|
|
889
|
+
* @typedef {Object} PreprocessedImage
|
|
890
890
|
* @property {HeightWidth} original_size The original size of the image.
|
|
891
891
|
* @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
|
|
892
892
|
* @property {Tensor} pixel_values The pixel values of the preprocessed image.
|
|
@@ -105,7 +105,7 @@ export class ChatterboxModel extends ChatterboxPreTrainedModel {
|
|
|
105
105
|
if (!past_key_values || target_length !== 1) {
|
|
106
106
|
throw new Error('Incorrect state encountered during generation.');
|
|
107
107
|
}
|
|
108
|
-
const past_length =
|
|
108
|
+
const past_length = past_key_values.get_seq_length();
|
|
109
109
|
attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
|
|
110
110
|
}
|
|
111
111
|
}
|
|
@@ -8,7 +8,7 @@ import {
|
|
|
8
8
|
import { full } from '../../utils/tensor.js';
|
|
9
9
|
|
|
10
10
|
/**
|
|
11
|
-
* @typedef {
|
|
11
|
+
* @typedef {Object} DetrFeatureExtractorResultProps
|
|
12
12
|
* @property {import('../../utils/tensor.js').Tensor} pixel_mask
|
|
13
13
|
* @typedef {import('../../image_processors_utils.js').ImageProcessorResult & DetrFeatureExtractorResultProps} DetrFeatureExtractorResult
|
|
14
14
|
*/
|
|
@@ -4,6 +4,7 @@ export * from './chatterbox/feature_extraction_chatterbox.js';
|
|
|
4
4
|
export * from './clap/feature_extraction_clap.js';
|
|
5
5
|
export * from './dac/feature_extraction_dac.js';
|
|
6
6
|
export * from './gemma3n/feature_extraction_gemma3n.js';
|
|
7
|
+
export * from './granite_speech/feature_extraction_granite_speech.js';
|
|
7
8
|
export * from './moonshine/feature_extraction_moonshine.js';
|
|
8
9
|
export * from './parakeet/feature_extraction_parakeet.js';
|
|
9
10
|
export * from './pyannote/feature_extraction_pyannote.js';
|
|
@@ -12,6 +13,7 @@ export * from './snac/feature_extraction_snac.js';
|
|
|
12
13
|
export * from './speecht5/feature_extraction_speecht5.js';
|
|
13
14
|
export * from './wav2vec2/feature_extraction_wav2vec2.js';
|
|
14
15
|
export * from './wespeaker/feature_extraction_wespeaker.js';
|
|
16
|
+
export * from './voxtral_realtime/feature_extraction_voxtral_realtime.js';
|
|
15
17
|
export * from './whisper/feature_extraction_whisper.js';
|
|
16
18
|
|
|
17
19
|
export { FeatureExtractor } from '../feature_extraction_utils.js';
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js';
|
|
2
|
+
import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
|
|
3
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
4
|
+
|
|
5
|
+
export class GraniteSpeechFeatureExtractor extends FeatureExtractor {
|
|
6
|
+
constructor(config) {
|
|
7
|
+
super(config);
|
|
8
|
+
|
|
9
|
+
const { n_fft, win_length, n_mels, sample_rate } = config.melspec_kwargs;
|
|
10
|
+
|
|
11
|
+
// torchaudio uses HTK mel scale with no norm by default
|
|
12
|
+
this.mel_filters = mel_filter_bank(
|
|
13
|
+
Math.floor(1 + n_fft / 2), // num_frequency_bins = 257
|
|
14
|
+
n_mels, // 80
|
|
15
|
+
0, // min_frequency
|
|
16
|
+
sample_rate / 2, // max_frequency = 8000
|
|
17
|
+
sample_rate, // 16000
|
|
18
|
+
null, // norm (torchaudio default: no norm)
|
|
19
|
+
'htk', // mel_scale (torchaudio default)
|
|
20
|
+
);
|
|
21
|
+
|
|
22
|
+
// torchaudio center-pads the window when win_length < n_fft:
|
|
23
|
+
// pad_amount = (n_fft - win_length) // 2 on each side
|
|
24
|
+
const raw_window = window_function(win_length, 'hann');
|
|
25
|
+
this.window = new Float64Array(n_fft);
|
|
26
|
+
const pad = Math.floor((n_fft - win_length) / 2);
|
|
27
|
+
this.window.set(raw_window, pad);
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
|
|
32
|
+
* @param {Float32Array|Float64Array} audio The audio waveform.
|
|
33
|
+
* @returns {Promise<{input_features: Tensor}>}
|
|
34
|
+
*/
|
|
35
|
+
async _call(audio) {
|
|
36
|
+
validate_audio_inputs(audio, 'GraniteSpeechFeatureExtractor');
|
|
37
|
+
|
|
38
|
+
const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;
|
|
39
|
+
|
|
40
|
+
// Truncate to even number of frames for pair-stacking
|
|
41
|
+
const num_frames = 1 + Math.floor((audio.length - 1) / hop_length);
|
|
42
|
+
const max_num_frames = num_frames - (num_frames % 2);
|
|
43
|
+
|
|
44
|
+
const mel = await spectrogram(audio, this.window, n_fft, hop_length, {
|
|
45
|
+
power: 2.0,
|
|
46
|
+
mel_filters: this.mel_filters,
|
|
47
|
+
log_mel: 'log10_max_norm',
|
|
48
|
+
transpose: true, // [time, n_mels]
|
|
49
|
+
max_num_frames,
|
|
50
|
+
do_pad: false,
|
|
51
|
+
});
|
|
52
|
+
|
|
53
|
+
// Stack adjacent frame pairs: [time, n_mels] → [1, time/2, 2*n_mels]
|
|
54
|
+
const input_features = mel.view(-1, 2 * n_mels).unsqueeze_(0);
|
|
55
|
+
|
|
56
|
+
return { input_features };
|
|
57
|
+
}
|
|
58
|
+
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
|
|
2
|
+
import { AutoTokenizer } from '../auto/tokenization_auto.js';
|
|
3
|
+
import { Processor } from '../../processing_utils.js';
|
|
4
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
5
|
+
|
|
6
|
+
export class GraniteSpeechProcessor extends Processor {
|
|
7
|
+
static tokenizer_class = AutoTokenizer;
|
|
8
|
+
static feature_extractor_class = AutoFeatureExtractor;
|
|
9
|
+
static uses_processor_config = true;
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* Compute the number of audio tokens for a given raw audio length.
|
|
13
|
+
* @param {number} audioLength Raw audio sample count.
|
|
14
|
+
* @returns {number} Number of projector output tokens.
|
|
15
|
+
*/
|
|
16
|
+
_get_num_audio_features(audioLength) {
|
|
17
|
+
const { hop_length } = this.feature_extractor.config.melspec_kwargs;
|
|
18
|
+
const { projector_window_size, projector_downsample_rate } = this.feature_extractor.config;
|
|
19
|
+
const effective_window_size = Math.floor(projector_window_size / projector_downsample_rate);
|
|
20
|
+
const mel_length = Math.floor(audioLength / hop_length) + 1;
|
|
21
|
+
const encoder_length = Math.floor(mel_length / 2);
|
|
22
|
+
const nblocks = Math.ceil(encoder_length / projector_window_size);
|
|
23
|
+
return nblocks * effective_window_size;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* @param {string} text The text input to process.
|
|
28
|
+
* @param {Float32Array} audio The audio input to process.
|
|
29
|
+
*/
|
|
30
|
+
async _call(text, audio = null, kwargs = {}) {
|
|
31
|
+
if (Array.isArray(text)) {
|
|
32
|
+
throw new Error('Batched inputs are not supported yet.');
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
let audio_inputs = {};
|
|
36
|
+
if (audio) {
|
|
37
|
+
const { input_features } = await this.feature_extractor(audio);
|
|
38
|
+
audio_inputs['input_features'] = input_features;
|
|
39
|
+
|
|
40
|
+
// Compute audio embed sizes and mask in the processor
|
|
41
|
+
const audio_embed_size = this._get_num_audio_features(audio.length);
|
|
42
|
+
const mask_data = new Uint8Array(audio_embed_size).fill(1);
|
|
43
|
+
audio_inputs['input_features_mask'] = new Tensor('bool', mask_data, [1, audio_embed_size]);
|
|
44
|
+
|
|
45
|
+
const audio_token = this.config.audio_token ?? '<|audio|>';
|
|
46
|
+
if (!text.includes(audio_token)) {
|
|
47
|
+
throw new Error(`The input text does not contain the audio token ${audio_token}.`);
|
|
48
|
+
}
|
|
49
|
+
text = text.replaceAll(audio_token, audio_token.repeat(audio_embed_size));
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
const text_inputs = this.tokenizer(text, {
|
|
53
|
+
add_special_tokens: false,
|
|
54
|
+
...kwargs,
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
return {
|
|
58
|
+
...text_inputs,
|
|
59
|
+
...audio_inputs,
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
}
|
|
@@ -2,7 +2,7 @@ import { ImageProcessor } from '../../image_processors_utils.js';
|
|
|
2
2
|
import { ones } from '../../utils/tensor.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
|
-
* @typedef {
|
|
5
|
+
* @typedef {Object} GroundingDinoFeatureExtractorResultProps
|
|
6
6
|
* @property {import('../../utils/tensor.js').Tensor} pixel_mask
|
|
7
7
|
* @typedef {import('../../image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult
|
|
8
8
|
*/
|
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { sessionRun } from '../session.js';
|
|
1
|
+
import { LlavaForConditionalGeneration } from '../llava/modeling_llava.js';
|
|
3
2
|
|
|
4
|
-
|
|
3
|
+
/**
|
|
4
|
+
* The Idefics3 model which consists of a vision backbone and a language model.
|
|
5
|
+
*/
|
|
6
|
+
export class Idefics3ForConditionalGeneration extends LlavaForConditionalGeneration {
|
|
5
7
|
forward_params = [
|
|
6
8
|
'input_ids',
|
|
7
9
|
'attention_mask',
|
|
@@ -11,32 +13,3 @@ export class Idefics3PreTrainedModel extends PreTrainedModel {
|
|
|
11
13
|
'past_key_values',
|
|
12
14
|
];
|
|
13
15
|
}
|
|
14
|
-
|
|
15
|
-
/**
|
|
16
|
-
* The Idefics3 model which consists of a vision backbone and a language model.
|
|
17
|
-
*/
|
|
18
|
-
export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
|
|
19
|
-
async encode_image({ pixel_values, pixel_attention_mask }) {
|
|
20
|
-
const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, pixel_attention_mask }))
|
|
21
|
-
.image_features;
|
|
22
|
-
return features;
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
26
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
27
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
28
|
-
|
|
29
|
-
return default_merge_input_ids_with_image_features({
|
|
30
|
-
// @ts-ignore
|
|
31
|
-
image_token_id: this.config.image_token_id,
|
|
32
|
-
...kwargs,
|
|
33
|
-
image_features: reshaped_image_hidden_states,
|
|
34
|
-
});
|
|
35
|
-
}
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
/**
|
|
39
|
-
* The SmolVLM Model with a language modeling head.
|
|
40
|
-
* It is made up a SigLIP vision encoder, with a language modeling head on top.
|
|
41
|
-
*/
|
|
42
|
-
export class SmolVLMForConditionalGeneration extends Idefics3ForConditionalGeneration {}
|
|
@@ -14,6 +14,7 @@ export * from './grounding_dino/image_processing_grounding_dino.js';
|
|
|
14
14
|
export * from './idefics3/image_processing_idefics3.js';
|
|
15
15
|
export * from './janus/image_processing_janus.js';
|
|
16
16
|
export * from './jina_clip/image_processing_jina_clip.js';
|
|
17
|
+
export * from './lfm2_vl/image_processing_lfm2_vl.js';
|
|
17
18
|
export * from './llava_onevision/image_processing_llava_onevision.js';
|
|
18
19
|
export * from './mask2former/image_processing_mask2former.js';
|
|
19
20
|
export * from './maskformer/image_processing_maskformer.js';
|