@huggingface/transformers 3.0.1 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -4
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.cjs +16607 -13472
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +16601 -13451
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +238 -52
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +229 -43
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +240 -54
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +16017 -12878
- package/dist/transformers.mjs.map +1 -1
- package/package.json +7 -7
- package/src/base/feature_extraction_utils.js +54 -0
- package/src/base/image_processors_utils.js +1089 -0
- package/src/base/processing_utils.js +145 -0
- package/src/configs.js +15 -3
- package/src/env.js +15 -4
- package/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +90 -0
- package/src/models/auto/feature_extraction_auto.js +41 -0
- package/src/models/auto/image_processing_auto.js +29 -0
- package/src/models/auto/processing_auto.js +100 -0
- package/src/models/beit/image_processing_beit.js +5 -0
- package/src/models/bit/image_processing_bit.js +5 -0
- package/src/models/chinese_clip/image_processing_chinese_clip.js +5 -0
- package/src/models/clap/feature_extraction_clap.js +159 -0
- package/src/models/clip/image_processing_clip.js +6 -0
- package/src/models/convnext/image_processing_convnext.js +45 -0
- package/src/models/deit/image_processing_deit.js +6 -0
- package/src/models/detr/image_processing_detr.js +52 -0
- package/src/models/donut/image_processing_donut.js +31 -0
- package/src/models/dpt/image_processing_dpt.js +6 -0
- package/src/models/efficientnet/image_processing_efficientnet.js +13 -0
- package/src/models/feature_extractors.js +12 -0
- package/src/models/florence2/processing_florence2.js +128 -0
- package/src/models/glpn/image_processing_glpn.js +5 -0
- package/src/models/image_processors.js +36 -0
- package/src/models/janus/image_processing_janus.js +26 -0
- package/src/models/janus/processing_janus.js +123 -0
- package/src/models/jina_clip/image_processing_jina_clip.js +26 -0
- package/src/models/jina_clip/processing_jina_clip.js +24 -0
- package/src/models/llava_onevision/image_processing_llava_onevision.js +5 -0
- package/src/models/mask2former/image_processing_mask2former.js +5 -0
- package/src/models/maskformer/image_processing_maskformer.js +18 -0
- package/src/models/mgp_str/processing_mgp_str.js +170 -0
- package/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +7 -0
- package/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +7 -0
- package/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +7 -0
- package/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +7 -0
- package/src/models/mobilevit/image_processing_mobilevit.js +6 -0
- package/src/models/nougat/image_processing_nougat.js +5 -0
- package/src/models/owlv2/image_processing_owlv2.js +5 -0
- package/src/models/owlvit/image_processing_owlvit.js +12 -0
- package/src/models/owlvit/processing_owlvit.js +7 -0
- package/src/models/processors.js +11 -0
- package/src/models/pvt/image_processing_pvt.js +5 -0
- package/src/models/pyannote/feature_extraction_pyannote.js +28 -0
- package/src/models/pyannote/processing_pyannote.js +71 -0
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +52 -0
- package/src/models/qwen2_vl/processing_qwen2_vl.js +52 -0
- package/src/models/rt_detr/image_processing_rt_detr.js +12 -0
- package/src/models/sam/image_processing_sam.js +242 -0
- package/src/models/sam/processing_sam.js +20 -0
- package/src/models/sapiens/image_processing_sapiens.js +13 -0
- package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +180 -0
- package/src/models/segformer/image_processing_segformer.js +13 -0
- package/src/models/siglip/image_processing_siglip.js +5 -0
- package/src/models/speecht5/feature_extraction_speecht5.js +4 -0
- package/src/models/speecht5/processing_speecht5.js +17 -0
- package/src/models/swin2sr/image_processing_swin2sr.js +24 -0
- package/src/models/vit/image_processing_vit.js +7 -0
- package/src/models/vitmatte/image_processing_vitmatte.js +50 -0
- package/src/models/vitpose/image_processing_vitpose.js +89 -0
- package/src/models/wav2vec2/feature_extraction_wav2vec2.js +44 -0
- package/src/models/wav2vec2/processing_wav2vec2.js +15 -0
- package/src/models/wespeaker/feature_extraction_wespeaker.js +100 -0
- package/src/models/whisper/feature_extraction_whisper.js +84 -0
- package/src/models/whisper/processing_whisper.js +21 -0
- package/src/models/yolos/image_processing_yolos.js +12 -0
- package/src/models.js +695 -32
- package/src/pipelines.js +8 -8
- package/src/tokenizers.js +5 -0
- package/src/transformers.js +15 -2
- package/src/utils/constants.js +8 -1
- package/src/utils/core.js +37 -9
- package/src/utils/hub.js +2 -1
- package/src/utils/image.js +68 -17
- package/src/utils/tensor.js +33 -1
- package/types/base/feature_extraction_utils.d.ts +41 -0
- package/types/base/feature_extraction_utils.d.ts.map +1 -0
- package/types/base/image_processors_utils.d.ts +323 -0
- package/types/base/image_processors_utils.d.ts.map +1 -0
- package/types/base/processing_utils.d.ts +80 -0
- package/types/base/processing_utils.d.ts.map +1 -0
- package/types/configs.d.ts +4 -1
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts.map +1 -1
- package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +25 -0
- package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/auto/feature_extraction_auto.d.ts +5 -0
- package/types/models/auto/feature_extraction_auto.d.ts.map +1 -0
- package/types/models/auto/image_processing_auto.d.ts +5 -0
- package/types/models/auto/image_processing_auto.d.ts.map +1 -0
- package/types/models/auto/processing_auto.d.ts +35 -0
- package/types/models/auto/processing_auto.d.ts.map +1 -0
- package/types/models/beit/image_processing_beit.d.ts +4 -0
- package/types/models/beit/image_processing_beit.d.ts.map +1 -0
- package/types/models/bit/image_processing_bit.d.ts +4 -0
- package/types/models/bit/image_processing_bit.d.ts.map +1 -0
- package/types/models/chinese_clip/image_processing_chinese_clip.d.ts +4 -0
- package/types/models/chinese_clip/image_processing_chinese_clip.d.ts.map +1 -0
- package/types/models/clap/feature_extraction_clap.d.ts +57 -0
- package/types/models/clap/feature_extraction_clap.d.ts.map +1 -0
- package/types/models/clip/image_processing_clip.d.ts +6 -0
- package/types/models/clip/image_processing_clip.d.ts.map +1 -0
- package/types/models/convnext/image_processing_convnext.d.ts +12 -0
- package/types/models/convnext/image_processing_convnext.d.ts.map +1 -0
- package/types/models/deit/image_processing_deit.d.ts +6 -0
- package/types/models/deit/image_processing_deit.d.ts.map +1 -0
- package/types/models/detr/image_processing_detr.d.ts +42 -0
- package/types/models/detr/image_processing_detr.d.ts.map +1 -0
- package/types/models/donut/image_processing_donut.d.ts +7 -0
- package/types/models/donut/image_processing_donut.d.ts.map +1 -0
- package/types/models/dpt/image_processing_dpt.d.ts +6 -0
- package/types/models/dpt/image_processing_dpt.d.ts.map +1 -0
- package/types/models/efficientnet/image_processing_efficientnet.d.ts +6 -0
- package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -0
- package/types/models/feature_extractors.d.ts +10 -0
- package/types/models/feature_extractors.d.ts.map +1 -0
- package/types/models/florence2/processing_florence2.d.ts +39 -0
- package/types/models/florence2/processing_florence2.d.ts.map +1 -0
- package/types/models/glpn/image_processing_glpn.d.ts +4 -0
- package/types/models/glpn/image_processing_glpn.d.ts.map +1 -0
- package/types/models/image_processors.d.ts +36 -0
- package/types/models/image_processors.d.ts.map +1 -0
- package/types/models/janus/image_processing_janus.d.ts +7 -0
- package/types/models/janus/image_processing_janus.d.ts.map +1 -0
- package/types/models/janus/processing_janus.d.ts +77 -0
- package/types/models/janus/processing_janus.d.ts.map +1 -0
- package/types/models/jina_clip/image_processing_jina_clip.d.ts +5 -0
- package/types/models/jina_clip/image_processing_jina_clip.d.ts.map +1 -0
- package/types/models/jina_clip/processing_jina_clip.d.ts +9 -0
- package/types/models/jina_clip/processing_jina_clip.d.ts.map +1 -0
- package/types/models/llava_onevision/image_processing_llava_onevision.d.ts +4 -0
- package/types/models/llava_onevision/image_processing_llava_onevision.d.ts.map +1 -0
- package/types/models/mask2former/image_processing_mask2former.d.ts +4 -0
- package/types/models/mask2former/image_processing_mask2former.d.ts.map +1 -0
- package/types/models/maskformer/image_processing_maskformer.d.ts +22 -0
- package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -0
- package/types/models/mgp_str/processing_mgp_str.d.ts +64 -0
- package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -0
- package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts +6 -0
- package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts.map +1 -0
- package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts +6 -0
- package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts.map +1 -0
- package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts +6 -0
- package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts.map +1 -0
- package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts +6 -0
- package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts.map +1 -0
- package/types/models/mobilevit/image_processing_mobilevit.d.ts +6 -0
- package/types/models/mobilevit/image_processing_mobilevit.d.ts.map +1 -0
- package/types/models/nougat/image_processing_nougat.d.ts +4 -0
- package/types/models/nougat/image_processing_nougat.d.ts.map +1 -0
- package/types/models/owlv2/image_processing_owlv2.d.ts +4 -0
- package/types/models/owlv2/image_processing_owlv2.d.ts.map +1 -0
- package/types/models/owlvit/image_processing_owlvit.d.ts +10 -0
- package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -0
- package/types/models/owlvit/processing_owlvit.d.ts +8 -0
- package/types/models/owlvit/processing_owlvit.d.ts.map +1 -0
- package/types/models/processors.d.ts +12 -0
- package/types/models/processors.d.ts.map +1 -0
- package/types/models/pvt/image_processing_pvt.d.ts +4 -0
- package/types/models/pvt/image_processing_pvt.d.ts.map +1 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts +13 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -0
- package/types/models/pyannote/processing_pyannote.d.ts +30 -0
- package/types/models/pyannote/processing_pyannote.d.ts.map +1 -0
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +11 -0
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +17 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -0
- package/types/models/rt_detr/image_processing_rt_detr.d.ts +8 -0
- package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -0
- package/types/models/sam/image_processing_sam.d.ts +103 -0
- package/types/models/sam/image_processing_sam.d.ts.map +1 -0
- package/types/models/sam/processing_sam.d.ts +9 -0
- package/types/models/sam/processing_sam.d.ts.map +1 -0
- package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +34 -0
- package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -0
- package/types/models/segformer/image_processing_segformer.d.ts +10 -0
- package/types/models/segformer/image_processing_segformer.d.ts.map +1 -0
- package/types/models/siglip/image_processing_siglip.d.ts +4 -0
- package/types/models/siglip/image_processing_siglip.d.ts.map +1 -0
- package/types/models/speecht5/feature_extraction_speecht5.d.ts +4 -0
- package/types/models/speecht5/feature_extraction_speecht5.d.ts.map +1 -0
- package/types/models/speecht5/processing_speecht5.d.ts +14 -0
- package/types/models/speecht5/processing_speecht5.d.ts.map +1 -0
- package/types/models/swin2sr/image_processing_swin2sr.d.ts +5 -0
- package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -0
- package/types/models/vit/image_processing_vit.d.ts +6 -0
- package/types/models/vit/image_processing_vit.d.ts.map +1 -0
- package/types/models/vitmatte/image_processing_vitmatte.d.ts +12 -0
- package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -0
- package/types/models/vitpose/image_processing_vitpose.d.ts +26 -0
- package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -0
- package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts +19 -0
- package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -0
- package/types/models/wav2vec2/processing_wav2vec2.d.ts +12 -0
- package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -0
- package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +23 -0
- package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts +21 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -0
- package/types/models/whisper/processing_whisper.d.ts +17 -0
- package/types/models/whisper/processing_whisper.d.ts.map +1 -0
- package/types/models/yolos/image_processing_yolos.d.ts +10 -0
- package/types/models/yolos/image_processing_yolos.d.ts.map +1 -0
- package/types/models.d.ts +152 -0
- package/types/models.d.ts.map +1 -1
- package/types/pipelines.d.ts +2 -3
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts +3 -0
- package/types/tokenizers.d.ts.map +1 -1
- package/types/transformers.d.ts +10 -1
- package/types/utils/constants.d.ts +6 -0
- package/types/utils/constants.d.ts.map +1 -1
- package/types/utils/core.d.ts +58 -3
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +10 -2
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +34 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/processors.js +0 -2655
- package/types/processors.d.ts +0 -924
- package/types/processors.d.ts.map +0 -1
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { FeatureExtractor, validate_audio_inputs } from "../../base/feature_extraction_utils.js";
|
|
2
|
+
import { Tensor } from "../../utils/tensor.js";
|
|
3
|
+
|
|
4
|
+
export class Wav2Vec2FeatureExtractor extends FeatureExtractor {

    /**
     * Normalizes the input to zero mean and unit variance.
     * @param {Float32Array} input_values The raw input values.
     * @returns {Float32Array} The normalized values.
     */
    _zero_mean_unit_var_norm(input_values) {
        // TODO support batch?
        let total = 0;
        for (const v of input_values) {
            total += v;
        }
        const mean = total / input_values.length;

        let squared_deviation = 0;
        for (const v of input_values) {
            squared_deviation += (v - mean) ** 2;
        }
        const variance = squared_deviation / input_values.length;

        // The small epsilon guards against division by zero for constant signals.
        return input_values.map((v) => (v - mean) / Math.sqrt(variance + 1e-7));
    }

    /**
     * Asynchronously extracts features from a given audio using the provided configuration.
     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
     * @returns {Promise<{ input_values: Tensor; attention_mask: Tensor }>} A Promise resolving to an object containing the extracted input features and attention mask as Tensors.
     */
    async _call(audio) {
        validate_audio_inputs(audio, 'Wav2Vec2FeatureExtractor');

        // Model inputs are single-precision; downcast if necessary.
        let input_values = audio instanceof Float64Array
            ? new Float32Array(audio)
            : audio;

        // zero-mean and unit-variance normalization
        if (this.config.do_normalize) {
            input_values = this._zero_mean_unit_var_norm(input_values);
        }

        // TODO: allow user to pass in attention mask
        const shape = [1, input_values.length];
        const attention_mask = new BigInt64Array(input_values.length).fill(1n);

        return {
            input_values: new Tensor('float32', input_values, shape),
            attention_mask: new Tensor('int64', attention_mask, shape),
        };
    }
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { Processor } from "../../base/processing_utils.js";
|
|
2
|
+
import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js";
|
|
3
|
+
|
|
4
|
+
export class Wav2Vec2ProcessorWithLM extends Processor {
    static feature_extractor_class = AutoFeatureExtractor

    /**
     * Extracts features from the given audio by delegating to the underlying
     * feature extractor.
     * @param {any} audio The audio input to extract features from.
     * @returns {Promise<any>} A Promise that resolves with the extracted features.
     */
    async _call(audio) {
        const features = await this.feature_extractor(audio);
        return features;
    }
}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
|
|
2
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
3
|
+
import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
export class WeSpeakerFeatureExtractor extends FeatureExtractor {

    /**
     * Builds the Kaldi-style mel filter bank and analysis window from the
     * provided configuration.
     * @param {Object} config The feature extractor configuration.
     */
    constructor(config) {
        super(config);

        const sampling_rate = this.config.sampling_rate;
        const mel_filters = mel_filter_bank(
            256, // num_frequency_bins
            this.config.num_mel_bins, // num_mel_filters
            20, // min_frequency
            Math.floor(sampling_rate / 2), // max_frequency
            sampling_rate, // sampling_rate
            null, // norm
            "kaldi", // mel_scale
            true, // triangularize_in_mel_space
        );

        // Do padding: append a trailing zero to every filter.
        for (const filter of mel_filters) {
            filter.push(0);
        }
        this.mel_filters = mel_filters;

        this.window = window_function(400, 'hamming', {
            periodic: false,
        })
        this.min_num_frames = this.config.min_num_frames;
    }

    /**
     * Computes the log-Mel spectrogram of the provided audio waveform.
     * @param {Float32Array|Float64Array} waveform The audio waveform to process.
     * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
     */
    async _extract_fbank_features(waveform) {
        // Kaldi compliance: scale into 16-bit signed-integer range.
        // 32768 == 2 ** 15
        const scaled = waveform.map((/** @type {number} */ x) => x * 32768)

        return spectrogram(
            scaled,
            this.window, // window
            400, // frame_length
            160, // hop_length
            {
                fft_length: 512,
                power: 2.0,
                center: false,
                preemphasis: 0.97,
                mel_filters: this.mel_filters,
                log_mel: 'log',
                mel_floor: 1.192092955078125e-07,
                remove_dc_offset: true,

                // Custom
                transpose: true,
                min_num_frames: this.min_num_frames,
            }
        )
    }

    /**
     * Asynchronously extracts features from a given audio using the provided configuration.
     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
     * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
     */
    async _call(audio) {
        validate_audio_inputs(audio, 'WeSpeakerFeatureExtractor');

        const features = (await this._extract_fbank_features(audio)).unsqueeze_(0);

        if (this.config.fbank_centering_span === null) {
            // center features with global average: subtract the per-feature
            // mean (over frames) from every frame, in place.
            const meanData = /** @type {Float32Array} */ (features.mean(1).data);
            const featuresData = /** @type {Float32Array} */ (features.data);
            const [batch_size, num_frames, feature_size] = features.dims;

            for (let b = 0; b < batch_size; ++b) {
                const batch_offset = b * num_frames * feature_size;
                const mean_offset = b * feature_size;
                for (let f = 0; f < num_frames; ++f) {
                    const frame_offset = batch_offset + f * feature_size;
                    for (let k = 0; k < feature_size; ++k) {
                        featuresData[frame_offset + k] -= meanData[mean_offset + k];
                    }
                }
            }
        }

        return {
            input_features: features
        };
    }
}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
|
|
2
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
3
|
+
import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
|
|
4
|
+
import { max } from '../../utils/maths.js';
|
|
5
|
+
|
|
6
|
+
export class WhisperFeatureExtractor extends FeatureExtractor {

    /**
     * Prepares the mel filter bank and Hann window used for spectrogram
     * computation.
     * @param {Object} config The feature extractor configuration.
     */
    constructor(config) {
        super(config);

        // Prefer given `mel_filters` from preprocessor_config.json, or calculate them if they don't exist.
        this.config.mel_filters ??= mel_filter_bank(
            Math.floor(1 + this.config.n_fft / 2), // num_frequency_bins
            this.config.feature_size, // num_mel_filters
            0.0, // min_frequency
            8000.0, // max_frequency
            this.config.sampling_rate, // sampling_rate
            "slaney", // norm
            "slaney", // mel_scale
        );

        this.window = window_function(this.config.n_fft, 'hann');
    }

    /**
     * Computes the log-Mel spectrogram of the provided audio waveform.
     * @param {Float32Array|Float64Array} waveform The audio waveform to process.
     * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
     */
    async _extract_fbank_features(waveform) {
        const features = await spectrogram(
            waveform,
            this.window, // window
            this.config.n_fft, // frame_length
            this.config.hop_length, // hop_length
            {
                power: 2.0,
                mel_filters: this.config.mel_filters,
                log_mel: 'log10',

                // Custom
                max_num_frames: this.config.nb_max_frames, // 3000
            }
        )

        // Dynamic-range compression: clamp to 8 dB below the peak, then rescale.
        const data = features.data;
        const maxValue = max(data)[0];
        const floorValue = maxValue - 8.0;

        for (let i = 0; i < data.length; ++i) {
            const clamped = data[i] < floorValue ? floorValue : data[i];
            data[i] = (clamped + 4.0) / 4.0;
        }

        return features;
    }

    /**
     * Asynchronously extracts features from a given audio using the provided configuration.
     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
     * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
     */
    async _call(audio) {
        validate_audio_inputs(audio, 'WhisperFeatureExtractor');

        const n_samples = this.config.n_samples;
        let waveform;
        if (audio.length > n_samples) {
            console.warn(
                "Attempting to extract features for audio longer than 30 seconds. " +
                "If using a pipeline to extract transcript from a long audio clip, " +
                "remember to specify `chunk_length_s` and/or `stride_length_s`."
            );
            // Truncate to the maximum supported number of samples.
            waveform = audio.slice(0, n_samples);
        } else {
            // pad with zeros up to the maximum supported number of samples
            waveform = new Float32Array(n_samples);
            waveform.set(audio);
        }

        const features = await this._extract_fbank_features(waveform);

        return {
            input_features: features.unsqueeze_(0)
        };
    }
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js"
|
|
2
|
+
import { AutoTokenizer } from "../../tokenizers.js"
|
|
3
|
+
import { Processor } from "../../base/processing_utils.js"
|
|
4
|
+
|
|
5
|
+
/**
 * Represents a WhisperProcessor that extracts features from an audio input.
 */
export class WhisperProcessor extends Processor {
    static tokenizer_class = AutoTokenizer
    static feature_extractor_class = AutoFeatureExtractor

    /**
     * Extracts features from the given audio by delegating to the underlying
     * feature extractor.
     * @param {any} audio The audio input to extract features from.
     * @returns {Promise<any>} A Promise that resolves with the extracted features.
     */
    async _call(audio) {
        const features = await this.feature_extractor(audio);
        return features;
    }
}
|
|
21
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import {
|
|
2
|
+
ImageProcessor,
|
|
3
|
+
post_process_object_detection,
|
|
4
|
+
} from "../../base/image_processors_utils.js";
|
|
5
|
+
|
|
6
|
+
export class YolosImageProcessor extends ImageProcessor {
    /**
     * Post-processes raw model outputs into object-detection results by
     * delegating to the shared `post_process_object_detection` helper.
     * @type {typeof post_process_object_detection}
     */
    post_process_object_detection(...outputs) {
        return post_process_object_detection(...outputs);
    }
}

// Alias kept for backwards compatibility with the legacy "feature extractor" naming.
export class YolosFeatureExtractor extends YolosImageProcessor { }
|