@huggingface/transformers 3.3.3 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -3
- package/dist/ort-wasm-simd-threaded.jsep.mjs +124 -115
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.js +2480 -1457
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/{transformers.cjs → transformers.node.cjs} +1412 -2395
- package/dist/transformers.node.cjs.map +1 -0
- package/dist/transformers.node.min.cjs +2 -0
- package/dist/transformers.node.min.cjs.map +1 -0
- package/dist/transformers.node.min.mjs +2 -0
- package/dist/transformers.node.min.mjs.map +1 -0
- package/dist/{transformers.mjs → transformers.node.mjs} +1440 -2375
- package/dist/transformers.node.mjs.map +1 -0
- package/dist/transformers.web.js +35713 -0
- package/dist/transformers.web.js.map +1 -0
- package/dist/transformers.web.min.js +2 -0
- package/dist/transformers.web.min.js.map +1 -0
- package/package.json +6 -6
- package/src/backends/onnx.js +14 -15
- package/src/configs.js +4 -1
- package/src/env.js +1 -1
- package/src/generation/streamers.js +4 -3
- package/src/models/dac/feature_extraction_dac.js +3 -0
- package/src/models/encodec/feature_extraction_encodec.js +32 -0
- package/src/models/feature_extractors.js +2 -0
- package/src/models/idefics3/image_processing_idefics3.js +1 -1
- package/src/models/image_processors.js +1 -0
- package/src/models/processors.js +2 -0
- package/src/models/smolvlm/image_processing_smolvlm.js +2 -0
- package/src/models/smolvlm/processing_smolvlm.js +2 -0
- package/src/models/ultravox/processing_ultravox.js +54 -0
- package/src/models/whisper/common_whisper.js +7 -1
- package/src/models/whisper/feature_extraction_whisper.js +18 -10
- package/src/models.js +456 -76
- package/src/pipelines.js +111 -7
- package/src/tokenizers.js +42 -28
- package/src/transformers.js +1 -0
- package/src/utils/audio.js +2 -0
- package/src/utils/hub.js +140 -80
- package/src/utils/maths.js +1 -1
- package/src/utils/tensor.js +6 -3
- package/src/utils/video.js +128 -0
- package/types/backends/onnx.d.ts +2 -2
- package/types/backends/onnx.d.ts.map +1 -1
- package/types/configs.d.ts +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/generation/streamers.d.ts.map +1 -1
- package/types/models/dac/feature_extraction_dac.d.ts +4 -0
- package/types/models/dac/feature_extraction_dac.d.ts.map +1 -0
- package/types/models/encodec/feature_extraction_encodec.d.ts +13 -0
- package/types/models/encodec/feature_extraction_encodec.d.ts.map +1 -0
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/florence2/processing_florence2.d.ts +1 -1
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/processors.d.ts +2 -0
- package/types/models/smolvlm/image_processing_smolvlm.d.ts +2 -0
- package/types/models/smolvlm/image_processing_smolvlm.d.ts.map +1 -0
- package/types/models/smolvlm/processing_smolvlm.d.ts +2 -0
- package/types/models/smolvlm/processing_smolvlm.d.ts.map +1 -0
- package/types/models/ultravox/processing_ultravox.d.ts +16 -0
- package/types/models/ultravox/processing_ultravox.d.ts.map +1 -0
- package/types/models/whisper/common_whisper.d.ts.map +1 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts +3 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/models.d.ts +132 -4
- package/types/models.d.ts.map +1 -1
- package/types/pipelines.d.ts +50 -4
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/tsconfig.tsbuildinfo +1 -1
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/hub.d.ts +19 -7
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/maths.d.ts +2 -2
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +17 -18
- package/types/utils/tensor.d.ts.map +1 -1
- package/types/utils/video.d.ts +37 -0
- package/types/utils/video.d.ts.map +1 -0
- package/dist/transformers.cjs.map +0 -1
- package/dist/transformers.min.cjs +0 -2
- package/dist/transformers.min.cjs.map +0 -1
- package/dist/transformers.min.mjs +0 -2
- package/dist/transformers.min.mjs.map +0 -1
- package/dist/transformers.mjs.map +0 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@huggingface/transformers",
|
|
3
|
-
"version": "3.3.3",
|
|
3
|
+
"version": "3.4.0",
|
|
4
4
|
"description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
|
|
5
5
|
"main": "./src/transformers.js",
|
|
6
6
|
"types": "./types/transformers.d.ts",
|
|
@@ -9,16 +9,16 @@
|
|
|
9
9
|
"node": {
|
|
10
10
|
"import": {
|
|
11
11
|
"types": "./types/transformers.d.ts",
|
|
12
|
-
"default": "./dist/transformers.mjs"
|
|
12
|
+
"default": "./dist/transformers.node.mjs"
|
|
13
13
|
},
|
|
14
14
|
"require": {
|
|
15
15
|
"types": "./types/transformers.d.ts",
|
|
16
|
-
"default": "./dist/transformers.cjs"
|
|
16
|
+
"default": "./dist/transformers.node.cjs"
|
|
17
17
|
}
|
|
18
18
|
},
|
|
19
19
|
"default": {
|
|
20
20
|
"types": "./types/transformers.d.ts",
|
|
21
|
-
"default": "./dist/transformers.js"
|
|
21
|
+
"default": "./dist/transformers.web.js"
|
|
22
22
|
}
|
|
23
23
|
},
|
|
24
24
|
"scripts": {
|
|
@@ -57,7 +57,7 @@
|
|
|
57
57
|
"dependencies": {
|
|
58
58
|
"@huggingface/jinja": "^0.3.3",
|
|
59
59
|
"onnxruntime-node": "1.20.1",
|
|
60
|
-
"onnxruntime-web": "1.
|
|
60
|
+
"onnxruntime-web": "1.22.0-dev.20250306-ccf8fdd9ea",
|
|
61
61
|
"sharp": "^0.33.5"
|
|
62
62
|
},
|
|
63
63
|
"devDependencies": {
|
|
@@ -69,7 +69,7 @@
|
|
|
69
69
|
"jest-environment-node": "^30.0.0-alpha.6",
|
|
70
70
|
"jsdoc-to-markdown": "^9.1.1",
|
|
71
71
|
"prettier": "3.4.2",
|
|
72
|
-
"typescript": "^5.
|
|
72
|
+
"typescript": "^5.8.2",
|
|
73
73
|
"wavefile": "11.0.0",
|
|
74
74
|
"webpack": "^5.97.1",
|
|
75
75
|
"webpack-cli": "^5.1.4",
|
package/src/backends/onnx.js
CHANGED
|
@@ -57,8 +57,8 @@ let ONNX;
|
|
|
57
57
|
const ORT_SYMBOL = Symbol.for('onnxruntime');
|
|
58
58
|
|
|
59
59
|
if (ORT_SYMBOL in globalThis) {
|
|
60
|
-
|
|
61
|
-
|
|
60
|
+
// If the JS runtime exposes their own ONNX runtime, use it
|
|
61
|
+
ONNX = globalThis[ORT_SYMBOL];
|
|
62
62
|
|
|
63
63
|
} else if (apis.IS_NODE_ENV) {
|
|
64
64
|
ONNX = ONNX_NODE.default ?? ONNX_NODE;
|
|
@@ -141,19 +141,19 @@ let wasmInitPromise = null;
|
|
|
141
141
|
|
|
142
142
|
/**
|
|
143
143
|
* Create an ONNX inference session.
|
|
144
|
-
* @param {Uint8Array}
|
|
144
|
+
* @param {Uint8Array|string} buffer_or_path The ONNX model buffer or path.
|
|
145
145
|
* @param {import('onnxruntime-common').InferenceSession.SessionOptions} session_options ONNX inference session options.
|
|
146
146
|
* @param {Object} session_config ONNX inference session configuration.
|
|
147
147
|
* @returns {Promise<import('onnxruntime-common').InferenceSession & { config: Object}>} The ONNX inference session.
|
|
148
148
|
*/
|
|
149
|
-
export async function createInferenceSession(
|
|
149
|
+
export async function createInferenceSession(buffer_or_path, session_options, session_config) {
|
|
150
150
|
if (wasmInitPromise) {
|
|
151
151
|
// A previous session has already initialized the WASM runtime
|
|
152
152
|
// so we wait for it to resolve before creating this new session.
|
|
153
153
|
await wasmInitPromise;
|
|
154
154
|
}
|
|
155
155
|
|
|
156
|
-
const sessionPromise = InferenceSession.create(
|
|
156
|
+
const sessionPromise = InferenceSession.create(buffer_or_path, session_options);
|
|
157
157
|
wasmInitPromise ??= sessionPromise;
|
|
158
158
|
const session = await sessionPromise;
|
|
159
159
|
session.config = session_config;
|
|
@@ -175,11 +175,15 @@ const ONNX_ENV = ONNX?.env;
|
|
|
175
175
|
if (ONNX_ENV?.wasm) {
|
|
176
176
|
// Initialize wasm backend with suitable default settings.
|
|
177
177
|
|
|
178
|
-
// (Optional) Set path to wasm files. This
|
|
179
|
-
//
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
178
|
+
// (Optional) Set path to wasm files. This will override the default path search behavior of onnxruntime-web.
|
|
179
|
+
// By default, we only do this if we are not in a service worker and the wasmPaths are not already set.
|
|
180
|
+
if (
|
|
181
|
+
// @ts-ignore Cannot find name 'ServiceWorkerGlobalScope'.ts(2304)
|
|
182
|
+
!(typeof ServiceWorkerGlobalScope !== 'undefined' && self instanceof ServiceWorkerGlobalScope)
|
|
183
|
+
&& !ONNX_ENV.wasm.wasmPaths
|
|
184
|
+
) {
|
|
185
|
+
ONNX_ENV.wasm.wasmPaths = `https://cdn.jsdelivr.net/npm/@huggingface/transformers@${env.version}/dist/`;
|
|
186
|
+
}
|
|
183
187
|
|
|
184
188
|
// TODO: Add support for loading WASM files from cached buffer when we upgrade to onnxruntime-web@1.19.0
|
|
185
189
|
// https://github.com/microsoft/onnxruntime/pull/21534
|
|
@@ -187,11 +191,6 @@ if (ONNX_ENV?.wasm) {
|
|
|
187
191
|
// Users may wish to proxy the WASM backend to prevent the UI from freezing,
|
|
188
192
|
// However, this is not necessary when using WebGPU, so we default to false.
|
|
189
193
|
ONNX_ENV.wasm.proxy = false;
|
|
190
|
-
|
|
191
|
-
// https://developer.mozilla.org/en-US/docs/Web/API/crossOriginIsolated
|
|
192
|
-
if (typeof crossOriginIsolated === 'undefined' || !crossOriginIsolated) {
|
|
193
|
-
ONNX_ENV.wasm.numThreads = 1;
|
|
194
|
-
}
|
|
195
194
|
}
|
|
196
195
|
|
|
197
196
|
if (ONNX_ENV?.webgpu) {
|
package/src/configs.js
CHANGED
|
@@ -70,6 +70,8 @@ function getNormalizedConfig(config) {
|
|
|
70
70
|
case 'florence2':
|
|
71
71
|
case 'llava_onevision':
|
|
72
72
|
case 'idefics3':
|
|
73
|
+
case 'ultravox':
|
|
74
|
+
case 'smolvlm':
|
|
73
75
|
// @ts-expect-error TS2339
|
|
74
76
|
init_normalized_config = getNormalizedConfig(config.text_config);
|
|
75
77
|
break;
|
|
@@ -173,6 +175,7 @@ function getNormalizedConfig(config) {
|
|
|
173
175
|
case 'mbart':
|
|
174
176
|
case 'marian':
|
|
175
177
|
case 'whisper':
|
|
178
|
+
case 'lite-whisper':
|
|
176
179
|
case 'm2m_100':
|
|
177
180
|
case 'blenderbot':
|
|
178
181
|
case 'blenderbot-small':
|
|
@@ -405,5 +408,5 @@ export class AutoConfig {
|
|
|
405
408
|
* for more information.
|
|
406
409
|
* @property {import('./utils/devices.js').DeviceType} [device] The default device to use for the model.
|
|
407
410
|
* @property {import('./utils/dtypes.js').DataType|Record<string, import('./utils/dtypes.js').DataType>} [dtype] The default data type to use for the model.
|
|
408
|
-
* @property {
|
|
411
|
+
* @property {import('./utils/hub.js').ExternalData|Record<string, import('./utils/hub.js').ExternalData>} [use_external_data_format=false] Whether to load the model using the external data format (used for models >= 2GB in size).
|
|
409
412
|
*/
|
package/src/env.js
CHANGED
|
@@ -26,7 +26,7 @@ import fs from 'fs';
|
|
|
26
26
|
import path from 'path';
|
|
27
27
|
import url from 'url';
|
|
28
28
|
|
|
29
|
-
const VERSION = '3.3.3';
|
|
29
|
+
const VERSION = '3.4.0';
|
|
30
30
|
|
|
31
31
|
// Check if various APIs are available (depends on environment)
|
|
32
32
|
const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
|
|
@@ -72,9 +72,10 @@ export class TextStreamer extends BaseStreamer {
|
|
|
72
72
|
throw Error('TextStreamer only supports batch size of 1');
|
|
73
73
|
}
|
|
74
74
|
|
|
75
|
-
|
|
75
|
+
const is_prompt = this.next_tokens_are_prompt;
|
|
76
|
+
if (is_prompt) {
|
|
76
77
|
this.next_tokens_are_prompt = false;
|
|
77
|
-
return;
|
|
78
|
+
if (this.skip_prompt) return;
|
|
78
79
|
}
|
|
79
80
|
|
|
80
81
|
const tokens = value[0];
|
|
@@ -85,7 +86,7 @@ export class TextStreamer extends BaseStreamer {
|
|
|
85
86
|
const text = this.tokenizer.decode(this.token_cache, this.decode_kwargs);
|
|
86
87
|
|
|
87
88
|
let printable_text;
|
|
88
|
-
if (text.endsWith('\n')) {
|
|
89
|
+
if (is_prompt || text.endsWith('\n')) {
|
|
89
90
|
// After the symbol for a new line, we flush the cache.
|
|
90
91
|
printable_text = text.slice(this.print_len);
|
|
91
92
|
this.token_cache = [];
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
|
|
2
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
export class EncodecFeatureExtractor extends FeatureExtractor {
|
|
6
|
+
/**
|
|
7
|
+
* Asynchronously extracts input values from a given audio using the provided configuration.
|
|
8
|
+
* @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
|
|
9
|
+
* @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
|
|
10
|
+
*/
|
|
11
|
+
async _call(audio) {
|
|
12
|
+
validate_audio_inputs(audio, 'EncodecFeatureExtractor');
|
|
13
|
+
|
|
14
|
+
if (audio instanceof Float64Array) {
|
|
15
|
+
audio = new Float32Array(audio);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
const num_channels = this.config.feature_size;
|
|
19
|
+
if (audio.length % num_channels !== 0) {
|
|
20
|
+
throw new Error(`The length of the audio data must be a multiple of the number of channels (${num_channels}).`);
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
const shape = [
|
|
24
|
+
1, /* batch_size */
|
|
25
|
+
num_channels, /* num_channels */
|
|
26
|
+
audio.length / num_channels, /* num_samples */
|
|
27
|
+
];
|
|
28
|
+
return {
|
|
29
|
+
input_values: new Tensor('float32', audio, shape),
|
|
30
|
+
};
|
|
31
|
+
}
|
|
32
|
+
}
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
|
|
2
2
|
export * from './audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js';
|
|
3
|
+
export * from './encodec/feature_extraction_encodec.js';
|
|
3
4
|
export * from './clap/feature_extraction_clap.js';
|
|
5
|
+
export * from './dac/feature_extraction_dac.js';
|
|
4
6
|
export * from './moonshine/feature_extraction_moonshine.js';
|
|
5
7
|
export * from './pyannote/feature_extraction_pyannote.js';
|
|
6
8
|
export * from './seamless_m4t/feature_extraction_seamless_m4t.js';
|
|
@@ -147,7 +147,7 @@ export class Idefics3ImageProcessor extends ImageProcessor {
|
|
|
147
147
|
const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
|
|
148
148
|
const end_offset = (i + 1) * pixel_attention_mask_stride;
|
|
149
149
|
|
|
150
|
-
// @ts-
|
|
150
|
+
// @ts-ignore
|
|
151
151
|
pixel_attention_mask_data.fill(false, start_offset, end_offset);
|
|
152
152
|
}
|
|
153
153
|
}
|
|
@@ -32,6 +32,7 @@ export * from './rt_detr/image_processing_rt_detr.js'
|
|
|
32
32
|
export * from './sam/image_processing_sam.js'
|
|
33
33
|
export * from './segformer/image_processing_segformer.js'
|
|
34
34
|
export * from './siglip/image_processing_siglip.js'
|
|
35
|
+
export * from './smolvlm/image_processing_smolvlm.js'
|
|
35
36
|
export * from './swin2sr/image_processing_swin2sr.js'
|
|
36
37
|
export * from './vit/image_processing_vit.js'
|
|
37
38
|
export * from './vitmatte/image_processing_vitmatte.js'
|
package/src/models/processors.js
CHANGED
|
@@ -11,7 +11,9 @@ export * from './paligemma/processing_paligemma.js';
|
|
|
11
11
|
export * from './pyannote/processing_pyannote.js';
|
|
12
12
|
export * from './qwen2_vl/processing_qwen2_vl.js';
|
|
13
13
|
export * from './sam/processing_sam.js';
|
|
14
|
+
export * from './smolvlm/processing_smolvlm.js';
|
|
14
15
|
export * from './speecht5/processing_speecht5.js';
|
|
16
|
+
export * from './ultravox/processing_ultravox.js';
|
|
15
17
|
export * from './wav2vec2/processing_wav2vec2.js';
|
|
16
18
|
export * from './wav2vec2_with_lm/processing_wav2vec2_with_lm.js';
|
|
17
19
|
export * from './whisper/processing_whisper.js';
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js"
|
|
2
|
+
import { AutoTokenizer } from "../../tokenizers.js"
|
|
3
|
+
import { Processor } from "../../base/processing_utils.js"
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Represents a UltravoxProcessor that extracts features from an audio input.
|
|
7
|
+
*/
|
|
8
|
+
export class UltravoxProcessor extends Processor {
|
|
9
|
+
static tokenizer_class = AutoTokenizer
|
|
10
|
+
static feature_extractor_class = AutoFeatureExtractor
|
|
11
|
+
static uses_processor_config = true;
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* @param {string} text The text input to process.
|
|
15
|
+
* @param {Float32Array} audio The audio input to process.
|
|
16
|
+
*/
|
|
17
|
+
async _call(text, audio = null, kwargs = {}) {
|
|
18
|
+
// TODO: Support batched inputs
|
|
19
|
+
if (Array.isArray(text)) {
|
|
20
|
+
throw new Error("Batched inputs are not supported yet.");
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
let audio_inputs = {};
|
|
24
|
+
if (audio) {
|
|
25
|
+
const audio_len = audio.length;
|
|
26
|
+
const { input_features } = await this.feature_extractor(audio, {
|
|
27
|
+
...kwargs,
|
|
28
|
+
max_length: audio_len,
|
|
29
|
+
});
|
|
30
|
+
const nb_encoder_frames = Math.round(audio_len / this.config.encoder_ds_factor + 1e-4);
|
|
31
|
+
|
|
32
|
+
// NOTE: The python version appears to have an off-by-one error.
|
|
33
|
+
const audio_embed_frames = 1 + Math.ceil(nb_encoder_frames / this.config.stack_factor);
|
|
34
|
+
audio_inputs["audio_token_len"] = [audio_embed_frames];
|
|
35
|
+
audio_inputs["audio_values"] = input_features;
|
|
36
|
+
|
|
37
|
+
const image_token = this.config.audio_placeholder;
|
|
38
|
+
if (!text.includes(image_token)) {
|
|
39
|
+
throw new Error(`The input text does not contain the image token ${image_token}.`);
|
|
40
|
+
}
|
|
41
|
+
text = text.replaceAll(image_token, image_token.repeat(audio_embed_frames));
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
const text_inputs = this.tokenizer(text, {
|
|
45
|
+
add_special_tokens: false,
|
|
46
|
+
...kwargs,
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
return {
|
|
50
|
+
...text_inputs,
|
|
51
|
+
...audio_inputs,
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
}
|
|
@@ -135,6 +135,12 @@ export function whisper_language_to_code(language) {
|
|
|
135
135
|
if (language_code === undefined) {
|
|
136
136
|
// User provided something that is not a language name
|
|
137
137
|
|
|
138
|
+
// Perhaps the user passed the special token itself
|
|
139
|
+
const language_special_token = language.match(/^<\|([a-z]{2})\|>$/);
|
|
140
|
+
if (language_special_token) {
|
|
141
|
+
language = language_special_token[1];
|
|
142
|
+
}
|
|
143
|
+
|
|
138
144
|
if (WHISPER_LANGUAGE_MAPPING.has(language)) {
|
|
139
145
|
// User provided the language code directly (e.g., "en")
|
|
140
146
|
language_code = language;
|
|
@@ -144,7 +150,7 @@ export function whisper_language_to_code(language) {
|
|
|
144
150
|
const is_language_code = language.length === 2;
|
|
145
151
|
const langs = is_language_code ? WHISPER_LANGUAGE_MAPPING.keys() : WHISPER_LANGUAGE_MAPPING.values();
|
|
146
152
|
|
|
147
|
-
throw new Error(`Language "${language}" is not supported. Must be one of: ${JSON.stringify(langs)}`);
|
|
153
|
+
throw new Error(`Language "${language}" is not supported. Must be one of: ${JSON.stringify(Array.from(langs))}`);
|
|
148
154
|
}
|
|
149
155
|
}
|
|
150
156
|
return language_code;
|
|
@@ -39,7 +39,10 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
|
39
39
|
log_mel: 'log10',
|
|
40
40
|
|
|
41
41
|
// Custom
|
|
42
|
-
max_num_frames:
|
|
42
|
+
max_num_frames: Math.min(
|
|
43
|
+
Math.floor(waveform.length / this.config.hop_length),
|
|
44
|
+
this.config.nb_max_frames, // 3000
|
|
45
|
+
)
|
|
43
46
|
}
|
|
44
47
|
)
|
|
45
48
|
|
|
@@ -58,20 +61,25 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
|
|
|
58
61
|
* @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
|
|
59
62
|
* @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
|
|
60
63
|
*/
|
|
61
|
-
async _call(audio
|
|
64
|
+
async _call(audio, {
|
|
65
|
+
max_length = null,
|
|
66
|
+
} = {}) {
|
|
62
67
|
validate_audio_inputs(audio, 'WhisperFeatureExtractor');
|
|
63
68
|
|
|
64
69
|
let waveform;
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
70
|
+
const length = max_length ?? this.config.n_samples;
|
|
71
|
+
if (audio.length > length) {
|
|
72
|
+
if (audio.length > this.config.n_samples) {
|
|
73
|
+
console.warn(
|
|
74
|
+
"Attempting to extract features for audio longer than 30 seconds. " +
|
|
75
|
+
"If using a pipeline to extract transcript from a long audio clip, " +
|
|
76
|
+
"remember to specify `chunk_length_s` and/or `stride_length_s`."
|
|
77
|
+
);
|
|
78
|
+
}
|
|
79
|
+
waveform = audio.slice(0, length);
|
|
72
80
|
} else {
|
|
73
81
|
// pad with zeros
|
|
74
|
-
waveform = new Float32Array(
|
|
82
|
+
waveform = new Float32Array(length);
|
|
75
83
|
waveform.set(audio);
|
|
76
84
|
}
|
|
77
85
|
|