@huggingface/transformers 3.0.2 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -4
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.cjs +16235 -13145
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +16536 -13437
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +238 -52
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +229 -43
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +240 -54
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +15259 -12171
- package/dist/transformers.mjs.map +1 -1
- package/package.json +4 -4
- package/src/base/feature_extraction_utils.js +54 -0
- package/src/base/image_processors_utils.js +1089 -0
- package/src/base/processing_utils.js +145 -0
- package/src/configs.js +13 -3
- package/src/env.js +1 -1
- package/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +90 -0
- package/src/models/auto/feature_extraction_auto.js +41 -0
- package/src/models/auto/image_processing_auto.js +29 -0
- package/src/models/auto/processing_auto.js +100 -0
- package/src/models/beit/image_processing_beit.js +5 -0
- package/src/models/bit/image_processing_bit.js +5 -0
- package/src/models/chinese_clip/image_processing_chinese_clip.js +5 -0
- package/src/models/clap/feature_extraction_clap.js +159 -0
- package/src/models/clip/image_processing_clip.js +6 -0
- package/src/models/convnext/image_processing_convnext.js +45 -0
- package/src/models/deit/image_processing_deit.js +6 -0
- package/src/models/detr/image_processing_detr.js +52 -0
- package/src/models/donut/image_processing_donut.js +31 -0
- package/src/models/dpt/image_processing_dpt.js +6 -0
- package/src/models/efficientnet/image_processing_efficientnet.js +13 -0
- package/src/models/feature_extractors.js +12 -0
- package/src/models/florence2/processing_florence2.js +128 -0
- package/src/models/glpn/image_processing_glpn.js +5 -0
- package/src/models/image_processors.js +36 -0
- package/src/models/janus/image_processing_janus.js +26 -0
- package/src/models/janus/processing_janus.js +123 -0
- package/src/models/jina_clip/image_processing_jina_clip.js +26 -0
- package/src/models/jina_clip/processing_jina_clip.js +24 -0
- package/src/models/llava_onevision/image_processing_llava_onevision.js +5 -0
- package/src/models/mask2former/image_processing_mask2former.js +5 -0
- package/src/models/maskformer/image_processing_maskformer.js +18 -0
- package/src/models/mgp_str/processing_mgp_str.js +170 -0
- package/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +7 -0
- package/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +7 -0
- package/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +7 -0
- package/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +7 -0
- package/src/models/mobilevit/image_processing_mobilevit.js +6 -0
- package/src/models/nougat/image_processing_nougat.js +5 -0
- package/src/models/owlv2/image_processing_owlv2.js +5 -0
- package/src/models/owlvit/image_processing_owlvit.js +12 -0
- package/src/models/owlvit/processing_owlvit.js +7 -0
- package/src/models/processors.js +11 -0
- package/src/models/pvt/image_processing_pvt.js +5 -0
- package/src/models/pyannote/feature_extraction_pyannote.js +28 -0
- package/src/models/pyannote/processing_pyannote.js +71 -0
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +52 -0
- package/src/models/qwen2_vl/processing_qwen2_vl.js +52 -0
- package/src/models/rt_detr/image_processing_rt_detr.js +12 -0
- package/src/models/sam/image_processing_sam.js +242 -0
- package/src/models/sam/processing_sam.js +20 -0
- package/src/models/sapiens/image_processing_sapiens.js +13 -0
- package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +180 -0
- package/src/models/segformer/image_processing_segformer.js +13 -0
- package/src/models/siglip/image_processing_siglip.js +5 -0
- package/src/models/speecht5/feature_extraction_speecht5.js +4 -0
- package/src/models/speecht5/processing_speecht5.js +17 -0
- package/src/models/swin2sr/image_processing_swin2sr.js +24 -0
- package/src/models/vit/image_processing_vit.js +7 -0
- package/src/models/vitmatte/image_processing_vitmatte.js +50 -0
- package/src/models/vitpose/image_processing_vitpose.js +89 -0
- package/src/models/wav2vec2/feature_extraction_wav2vec2.js +44 -0
- package/src/models/wav2vec2/processing_wav2vec2.js +15 -0
- package/src/models/wespeaker/feature_extraction_wespeaker.js +100 -0
- package/src/models/whisper/feature_extraction_whisper.js +84 -0
- package/src/models/whisper/processing_whisper.js +21 -0
- package/src/models/yolos/image_processing_yolos.js +12 -0
- package/src/models.js +675 -32
- package/src/pipelines.js +8 -8
- package/src/tokenizers.js +5 -0
- package/src/transformers.js +15 -2
- package/src/utils/constants.js +8 -1
- package/src/utils/core.js +37 -9
- package/src/utils/hub.js +2 -1
- package/src/utils/image.js +68 -17
- package/src/utils/tensor.js +33 -1
- package/types/base/feature_extraction_utils.d.ts +41 -0
- package/types/base/feature_extraction_utils.d.ts.map +1 -0
- package/types/base/image_processors_utils.d.ts +323 -0
- package/types/base/image_processors_utils.d.ts.map +1 -0
- package/types/base/processing_utils.d.ts +80 -0
- package/types/base/processing_utils.d.ts.map +1 -0
- package/types/configs.d.ts +4 -1
- package/types/configs.d.ts.map +1 -1
- package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +25 -0
- package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/auto/feature_extraction_auto.d.ts +5 -0
- package/types/models/auto/feature_extraction_auto.d.ts.map +1 -0
- package/types/models/auto/image_processing_auto.d.ts +5 -0
- package/types/models/auto/image_processing_auto.d.ts.map +1 -0
- package/types/models/auto/processing_auto.d.ts +35 -0
- package/types/models/auto/processing_auto.d.ts.map +1 -0
- package/types/models/beit/image_processing_beit.d.ts +4 -0
- package/types/models/beit/image_processing_beit.d.ts.map +1 -0
- package/types/models/bit/image_processing_bit.d.ts +4 -0
- package/types/models/bit/image_processing_bit.d.ts.map +1 -0
- package/types/models/chinese_clip/image_processing_chinese_clip.d.ts +4 -0
- package/types/models/chinese_clip/image_processing_chinese_clip.d.ts.map +1 -0
- package/types/models/clap/feature_extraction_clap.d.ts +57 -0
- package/types/models/clap/feature_extraction_clap.d.ts.map +1 -0
- package/types/models/clip/image_processing_clip.d.ts +6 -0
- package/types/models/clip/image_processing_clip.d.ts.map +1 -0
- package/types/models/convnext/image_processing_convnext.d.ts +12 -0
- package/types/models/convnext/image_processing_convnext.d.ts.map +1 -0
- package/types/models/deit/image_processing_deit.d.ts +6 -0
- package/types/models/deit/image_processing_deit.d.ts.map +1 -0
- package/types/models/detr/image_processing_detr.d.ts +42 -0
- package/types/models/detr/image_processing_detr.d.ts.map +1 -0
- package/types/models/donut/image_processing_donut.d.ts +7 -0
- package/types/models/donut/image_processing_donut.d.ts.map +1 -0
- package/types/models/dpt/image_processing_dpt.d.ts +6 -0
- package/types/models/dpt/image_processing_dpt.d.ts.map +1 -0
- package/types/models/efficientnet/image_processing_efficientnet.d.ts +6 -0
- package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -0
- package/types/models/feature_extractors.d.ts +10 -0
- package/types/models/feature_extractors.d.ts.map +1 -0
- package/types/models/florence2/processing_florence2.d.ts +39 -0
- package/types/models/florence2/processing_florence2.d.ts.map +1 -0
- package/types/models/glpn/image_processing_glpn.d.ts +4 -0
- package/types/models/glpn/image_processing_glpn.d.ts.map +1 -0
- package/types/models/image_processors.d.ts +36 -0
- package/types/models/image_processors.d.ts.map +1 -0
- package/types/models/janus/image_processing_janus.d.ts +7 -0
- package/types/models/janus/image_processing_janus.d.ts.map +1 -0
- package/types/models/janus/processing_janus.d.ts +77 -0
- package/types/models/janus/processing_janus.d.ts.map +1 -0
- package/types/models/jina_clip/image_processing_jina_clip.d.ts +5 -0
- package/types/models/jina_clip/image_processing_jina_clip.d.ts.map +1 -0
- package/types/models/jina_clip/processing_jina_clip.d.ts +9 -0
- package/types/models/jina_clip/processing_jina_clip.d.ts.map +1 -0
- package/types/models/llava_onevision/image_processing_llava_onevision.d.ts +4 -0
- package/types/models/llava_onevision/image_processing_llava_onevision.d.ts.map +1 -0
- package/types/models/mask2former/image_processing_mask2former.d.ts +4 -0
- package/types/models/mask2former/image_processing_mask2former.d.ts.map +1 -0
- package/types/models/maskformer/image_processing_maskformer.d.ts +22 -0
- package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -0
- package/types/models/mgp_str/processing_mgp_str.d.ts +64 -0
- package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -0
- package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts +6 -0
- package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts.map +1 -0
- package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts +6 -0
- package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts.map +1 -0
- package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts +6 -0
- package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts.map +1 -0
- package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts +6 -0
- package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts.map +1 -0
- package/types/models/mobilevit/image_processing_mobilevit.d.ts +6 -0
- package/types/models/mobilevit/image_processing_mobilevit.d.ts.map +1 -0
- package/types/models/nougat/image_processing_nougat.d.ts +4 -0
- package/types/models/nougat/image_processing_nougat.d.ts.map +1 -0
- package/types/models/owlv2/image_processing_owlv2.d.ts +4 -0
- package/types/models/owlv2/image_processing_owlv2.d.ts.map +1 -0
- package/types/models/owlvit/image_processing_owlvit.d.ts +10 -0
- package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -0
- package/types/models/owlvit/processing_owlvit.d.ts +8 -0
- package/types/models/owlvit/processing_owlvit.d.ts.map +1 -0
- package/types/models/processors.d.ts +12 -0
- package/types/models/processors.d.ts.map +1 -0
- package/types/models/pvt/image_processing_pvt.d.ts +4 -0
- package/types/models/pvt/image_processing_pvt.d.ts.map +1 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts +13 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -0
- package/types/models/pyannote/processing_pyannote.d.ts +30 -0
- package/types/models/pyannote/processing_pyannote.d.ts.map +1 -0
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +11 -0
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +17 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -0
- package/types/models/rt_detr/image_processing_rt_detr.d.ts +8 -0
- package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -0
- package/types/models/sam/image_processing_sam.d.ts +103 -0
- package/types/models/sam/image_processing_sam.d.ts.map +1 -0
- package/types/models/sam/processing_sam.d.ts +9 -0
- package/types/models/sam/processing_sam.d.ts.map +1 -0
- package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +34 -0
- package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -0
- package/types/models/segformer/image_processing_segformer.d.ts +10 -0
- package/types/models/segformer/image_processing_segformer.d.ts.map +1 -0
- package/types/models/siglip/image_processing_siglip.d.ts +4 -0
- package/types/models/siglip/image_processing_siglip.d.ts.map +1 -0
- package/types/models/speecht5/feature_extraction_speecht5.d.ts +4 -0
- package/types/models/speecht5/feature_extraction_speecht5.d.ts.map +1 -0
- package/types/models/speecht5/processing_speecht5.d.ts +14 -0
- package/types/models/speecht5/processing_speecht5.d.ts.map +1 -0
- package/types/models/swin2sr/image_processing_swin2sr.d.ts +5 -0
- package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -0
- package/types/models/vit/image_processing_vit.d.ts +6 -0
- package/types/models/vit/image_processing_vit.d.ts.map +1 -0
- package/types/models/vitmatte/image_processing_vitmatte.d.ts +12 -0
- package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -0
- package/types/models/vitpose/image_processing_vitpose.d.ts +26 -0
- package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -0
- package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts +19 -0
- package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -0
- package/types/models/wav2vec2/processing_wav2vec2.d.ts +12 -0
- package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -0
- package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +23 -0
- package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts +21 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -0
- package/types/models/whisper/processing_whisper.d.ts +17 -0
- package/types/models/whisper/processing_whisper.d.ts.map +1 -0
- package/types/models/yolos/image_processing_yolos.d.ts +10 -0
- package/types/models/yolos/image_processing_yolos.d.ts.map +1 -0
- package/types/models.d.ts +140 -0
- package/types/models.d.ts.map +1 -1
- package/types/pipelines.d.ts +2 -3
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts +3 -0
- package/types/tokenizers.d.ts.map +1 -1
- package/types/transformers.d.ts +10 -1
- package/types/utils/constants.d.ts +6 -0
- package/types/utils/constants.d.ts.map +1 -1
- package/types/utils/core.d.ts +58 -3
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +10 -2
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +34 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/processors.js +0 -2655
- package/types/processors.d.ts +0 -924
- package/types/processors.d.ts.map +0 -1
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
|
|
2
|
+
/**
|
|
3
|
+
* @file Processors are used to prepare inputs (e.g., text, image or audio) for a model.
|
|
4
|
+
*
|
|
5
|
+
* **Example:** Using a `WhisperProcessor` to prepare an audio input for a model.
|
|
6
|
+
* ```javascript
|
|
7
|
+
* import { AutoProcessor, read_audio } from '@huggingface/transformers';
|
|
8
|
+
*
|
|
9
|
+
* const processor = await AutoProcessor.from_pretrained('openai/whisper-tiny.en');
|
|
10
|
+
* const audio = await read_audio('https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac', 16000);
|
|
11
|
+
* const { input_features } = await processor(audio);
|
|
12
|
+
* // Tensor {
|
|
13
|
+
* // data: Float32Array(240000) [0.4752984642982483, 0.5597258806228638, 0.56434166431427, ...],
|
|
14
|
+
* // dims: [1, 80, 3000],
|
|
15
|
+
* // type: 'float32',
|
|
16
|
+
* // size: 240000,
|
|
17
|
+
* // }
|
|
18
|
+
* ```
|
|
19
|
+
*
|
|
20
|
+
* @module processors
|
|
21
|
+
*/
|
|
22
|
+
import { PROCESSOR_NAME } from '../utils/constants.js';
|
|
23
|
+
import {
|
|
24
|
+
Callable,
|
|
25
|
+
} from '../utils/generic.js';
|
|
26
|
+
import { getModelJSON } from '../utils/hub.js';
|
|
27
|
+
|
|
28
|
+
/**
|
|
29
|
+
* @typedef {Object} ProcessorProperties Additional processor-specific properties.
|
|
30
|
+
* @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Represents a Processor that extracts features from an input.
|
|
36
|
+
*/
|
|
37
|
+
/**
 * Represents a Processor that extracts features from an input.
 *
 * A processor bundles up to three sub-components (an image processor, a
 * tokenizer, and a feature extractor); whichever of these exist are stored
 * in `this.components` and exposed through the getters below.
 */
export class Processor extends Callable {
    // Names of the static properties a subclass may define to declare which
    // component classes it is composed of (e.g., `image_processor_class`).
    static classes = [
        'image_processor_class',
        'tokenizer_class',
        'feature_extractor_class',
    ]
    // Whether `from_pretrained` should also fetch a processor config JSON.
    static uses_processor_config = false;

    /**
     * Creates a new Processor with the given components.
     * @param {Object} config The processor configuration.
     * @param {Record<string, Object>} components Mapping of component name to component instance.
     */
    constructor(config, components) {
        super();
        this.config = config;
        this.components = components;
    }

    /**
     * @returns {import('./image_processors_utils.js').ImageProcessor|undefined} The image processor of the processor, if it exists.
     */
    get image_processor() {
        return this.components.image_processor;
    }

    /**
     * @returns {import('../tokenizers.js').PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
     */
    get tokenizer() {
        return this.components.tokenizer;
    }

    /**
     * @returns {import('./feature_extraction_utils.js').FeatureExtractor|undefined} The feature extractor of the processor, if it exists.
     */
    get feature_extractor() {
        return this.components.feature_extractor;
    }

    /**
     * Delegates chat-template formatting to the underlying tokenizer.
     * @param {any} messages The conversation messages to format.
     * @param {Object} options Options forwarded to the tokenizer (tokenization defaults to off).
     * @returns {any} The formatted chat template.
     * @throws {Error} If the processor has no tokenizer.
     */
    apply_chat_template(messages, options = {}) {
        const tokenizer = this.tokenizer;
        if (!tokenizer) {
            throw new Error('Unable to apply chat template without a tokenizer.');
        }
        // Unlike the tokenizer default, do not tokenize unless explicitly requested.
        return tokenizer.apply_chat_template(messages, { tokenize: false, ...options });
    }

    /**
     * Delegates batch decoding to the underlying tokenizer.
     * @param {...any} args Arguments forwarded to `tokenizer.batch_decode`.
     * @returns {any} The decoded outputs.
     * @throws {Error} If the processor has no tokenizer.
     */
    batch_decode(...args) {
        const tokenizer = this.tokenizer;
        if (!tokenizer) {
            throw new Error('Unable to decode without a tokenizer.');
        }
        return tokenizer.batch_decode(...args);
    }


    /**
     * Calls the first available component (image processor, then feature
     * extractor, then tokenizer) with the given input.
     * @param {any} input The input to extract features from.
     * @param {...any} args Additional arguments.
     * @returns {Promise<any>} A Promise that resolves with the extracted features.
     * @throws {Error} If no component is available.
     */
    async _call(input, ...args) {
        const handler = this.image_processor || this.feature_extractor || this.tokenizer;
        if (!handler) {
            throw new Error('No image processor, feature extractor, or tokenizer found.');
        }
        return handler(input, ...args);
    }


    /**
     * Instantiate one of the processor classes of the library from a pretrained model.
     *
     * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object
     * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
     *
     * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
     * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
     *   Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
     *   user or organization name, like `dbmdz/bert-base-german-cased`.
     * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
     * @param {PretrainedProcessorOptions} options Additional options for loading the processor.
     *
     * @returns {Promise<Processor>} A new instance of the Processor class.
     */
    static async from_pretrained(pretrained_model_name_or_path, options) {
        // Kick off the (optional) config fetch without awaiting, so it
        // downloads concurrently with the components below.
        const configPromise = this.uses_processor_config
            ? getModelJSON(pretrained_model_name_or_path, PROCESSOR_NAME, true, options)
            : {};

        // Load every declared component class in parallel; entry keys drop
        // the trailing `_class` suffix (e.g., `tokenizer_class` -> `tokenizer`).
        const componentPromises = this.classes
            .filter((name) => name in this)
            .map(async (name) => [
                name.replace(/_class$/, ''),
                await this[name].from_pretrained(pretrained_model_name_or_path, options),
            ]);

        const config = await configPromise;
        const components = Object.fromEntries(await Promise.all(componentPromises));

        return new this(config, components);
    }
}
|
package/src/configs.js
CHANGED
|
@@ -36,6 +36,13 @@ import {
|
|
|
36
36
|
* @typedef {import('./utils/hub.js').PretrainedOptions} PretrainedOptions
|
|
37
37
|
*/
|
|
38
38
|
|
|
39
|
+
/**
|
|
40
|
+
* @typedef {import('./utils/core.js').ProgressCallback} ProgressCallback
|
|
41
|
+
*/
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* @typedef {import('./utils/core.js').ProgressInfo} ProgressInfo
|
|
45
|
+
*/
|
|
39
46
|
|
|
40
47
|
/**
|
|
41
48
|
* Loads a config from the specified path.
|
|
@@ -61,6 +68,7 @@ function getNormalizedConfig(config) {
|
|
|
61
68
|
case 'llava':
|
|
62
69
|
case 'paligemma':
|
|
63
70
|
case 'florence2':
|
|
71
|
+
case 'llava_onevision':
|
|
64
72
|
init_normalized_config = getNormalizedConfig(config.text_config);
|
|
65
73
|
break;
|
|
66
74
|
case 'moondream1':
|
|
@@ -69,6 +77,9 @@ function getNormalizedConfig(config) {
|
|
|
69
77
|
case 'musicgen':
|
|
70
78
|
init_normalized_config = getNormalizedConfig(config.decoder);
|
|
71
79
|
break;
|
|
80
|
+
case 'multi_modality':
|
|
81
|
+
init_normalized_config = getNormalizedConfig(config.language_config);
|
|
82
|
+
break;
|
|
72
83
|
|
|
73
84
|
// Decoder-only models
|
|
74
85
|
case 'gpt2':
|
|
@@ -98,6 +109,7 @@ function getNormalizedConfig(config) {
|
|
|
98
109
|
case 'mistral':
|
|
99
110
|
case 'starcoder2':
|
|
100
111
|
case 'qwen2':
|
|
112
|
+
case 'qwen2_vl':
|
|
101
113
|
mapping['num_heads'] = 'num_key_value_heads';
|
|
102
114
|
mapping['num_layers'] = 'num_hidden_layers';
|
|
103
115
|
mapping['hidden_size'] = 'hidden_size';
|
|
@@ -218,14 +230,12 @@ function getNormalizedConfig(config) {
|
|
|
218
230
|
*/
|
|
219
231
|
export function getKeyValueShapes(config, {
|
|
220
232
|
prefix = 'past_key_values',
|
|
233
|
+
batch_size=1,
|
|
221
234
|
} = {}) {
|
|
222
235
|
/** @type {Record<string, number[]>} */
|
|
223
236
|
const decoderFeeds = {};
|
|
224
237
|
const normalized_config = config.normalized_config;
|
|
225
238
|
|
|
226
|
-
// TODO support batches (i.e., batch_size > 1)
|
|
227
|
-
const batch_size = 1;
|
|
228
|
-
|
|
229
239
|
if (normalized_config.is_encoder_decoder && (
|
|
230
240
|
'num_encoder_heads' in normalized_config && 'num_decoder_heads' in normalized_config
|
|
231
241
|
)) {
|
package/src/env.js
CHANGED
package/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
|
|
2
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
3
|
+
import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
/**
 * Feature extractor for Audio Spectrogram Transformer (AST) models.
 * Converts a raw mono waveform into a normalized log-Mel spectrogram tensor.
 */
export class ASTFeatureExtractor extends FeatureExtractor {

    /**
     * @param {Object} config The feature extractor configuration
     * (provides `sampling_rate`, `num_mel_bins`, `max_length`, `mean`, `std`, `do_normalize`).
     */
    constructor(config) {
        super(config);

        const samplingRate = this.config.sampling_rate;
        const filters = mel_filter_bank(
            256, // num_frequency_bins
            this.config.num_mel_bins, // num_mel_filters
            20, // min_frequency
            Math.floor(samplingRate / 2), // max_frequency
            samplingRate, // sampling_rate
            null, // norm
            "kaldi", // mel_scale
            true, // triangularize_in_mel_space
        );

        // Pad each filter with a trailing zero entry.
        for (const filter of filters) {
            filter.push(0);
        }
        this.mel_filters = filters;

        // Non-periodic 400-sample Hann window.
        this.window = window_function(400, 'hann', { periodic: false });

        this.mean = this.config.mean;
        this.std = this.config.std;
    }

    /**
     * Computes the log-Mel spectrogram of the provided audio waveform.
     * @param {Float32Array|Float64Array} waveform The audio waveform to process.
     * @param {number} max_length The maximum number of frames to return.
     * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
     */
    async _extract_fbank_features(waveform, max_length) {
        // NOTE: We don't pad/truncate since that is passed in as `max_num_frames`
        return spectrogram(
            waveform,
            this.window, // window
            400, // frame_length
            160, // hop_length
            {
                fft_length: 512,
                power: 2.0,
                center: false,
                preemphasis: 0.97,
                mel_filters: this.mel_filters,
                log_mel: 'log',
                mel_floor: 1.192092955078125e-07,
                remove_dc_offset: true,

                // Custom
                max_num_frames: max_length,
                transpose: true,
            }
        )
    }


    /**
     * Asynchronously extracts features from a given audio using the provided configuration.
     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
     * @returns {Promise<{ input_values: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
     */
    async _call(audio) {
        validate_audio_inputs(audio, 'ASTFeatureExtractor');

        const features = await this._extract_fbank_features(audio, this.config.max_length);

        if (this.config.do_normalize) {
            // Normalize the input audio spectrogram to have mean=0, std=0.5
            const denominator = this.std * 2;
            const data = features.data;
            for (let i = 0; i < data.length; ++i) {
                data[i] = (data[i] - this.mean) / denominator;
            }
        }

        // Prepend a batch dimension (in place).
        return {
            input_values: features.unsqueeze_(0)
        };
    }
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
|
|
2
|
+
import { FEATURE_EXTRACTOR_NAME, GITHUB_ISSUE_URL } from '../../utils/constants.js';
|
|
3
|
+
import { getModelJSON } from '../../utils/hub.js';
|
|
4
|
+
import { FeatureExtractor } from '../../base/feature_extraction_utils.js';
|
|
5
|
+
import * as AllFeatureExtractors from '../feature_extractors.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Helper class used to instantiate pretrained feature extractors with the
 * `from_pretrained` function, selecting the concrete class from the
 * `feature_extractor_type` field of the downloaded preprocessor config.
 */
export class AutoFeatureExtractor {

    /**
     * Instantiate one of the feature extractor classes of the library from a pretrained model.
     *
     * The feature extractor class to instantiate is selected based on the `feature_extractor_type`
     * property of the config object (loaded from `pretrained_model_name_or_path`).
     *
     * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
     * - A string, the *model id* of a pretrained feature extractor hosted inside a model repo on huggingface.co.
     * - A path to a *directory* containing feature extractor files, e.g., `./my_model_directory/`.
     * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the feature extractor.
     * @throws {Error} If the config names a feature extractor type this library does not provide.
     *
     * @type {typeof FeatureExtractor.from_pretrained}
     */
    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
        const preprocessorConfig = await getModelJSON(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, true, options);

        // Look up the concrete feature extractor class by its declared type name.
        const key = preprocessorConfig.feature_extractor_type;
        const extractorClass = AllFeatureExtractors[key];
        if (!extractorClass) {
            throw new Error(`Unknown feature_extractor_type: '${key}'. Please report this at ${GITHUB_ISSUE_URL}.`);
        }

        return new extractorClass(preprocessorConfig);
    }
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
|
|
2
|
+
import { GITHUB_ISSUE_URL, IMAGE_PROCESSOR_NAME } from '../../utils/constants.js';
|
|
3
|
+
import { getModelJSON } from '../../utils/hub.js';
|
|
4
|
+
import { ImageProcessor } from '../../base/image_processors_utils.js';
|
|
5
|
+
import * as AllImageProcessors from '../image_processors.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Helper class used to instantiate pretrained image processors with the
 * `from_pretrained` function. The concrete class is chosen from the
 * `image_processor_type` field of the downloaded preprocessor config
 * (falling back to the legacy `feature_extractor_type` field), and finally
 * to the base `ImageProcessor` when no matching class is found.
 */
export class AutoImageProcessor {

    /** @type {typeof ImageProcessor.from_pretrained} */
    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
        const preprocessorConfig = await getModelJSON(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME, true, options);

        // Prefer the modern `image_processor_type`; fall back to the legacy field.
        const key = preprocessorConfig.image_processor_type ?? preprocessorConfig.feature_extractor_type;

        let processorClass = AllImageProcessors[key];
        if (!processorClass) {
            // Only log a warning if the class is not found and the key is set.
            if (key !== undefined) {
                console.warn(`Image processor type '${key}' not found, assuming base ImageProcessor. Please report this at ${GITHUB_ISSUE_URL}.`)
            }
            processorClass = ImageProcessor;
        }

        return new processorClass(preprocessorConfig);
    }
}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
import { IMAGE_PROCESSOR_NAME } from '../../utils/constants.js';
|
|
4
|
+
import { getModelJSON } from '../../utils/hub.js';
|
|
5
|
+
import { Processor } from '../../base/processing_utils.js';
|
|
6
|
+
|
|
7
|
+
import * as AllProcessors from '../processors.js';
|
|
8
|
+
import * as AllImageProcessors from '../image_processors.js';
|
|
9
|
+
import * as AllFeatureExtractors from '../feature_extractors.js';
|
|
10
|
+
|
|
11
|
+
/**
 * Helper class which is used to instantiate pretrained processors with the `from_pretrained` function.
 * The chosen processor class is determined by the type specified in the processor config.
 *
 * **Example:** Load a processor using `from_pretrained`.
 * ```javascript
 * let processor = await AutoProcessor.from_pretrained('openai/whisper-tiny.en');
 * ```
 *
 * **Example:** Run an image through a processor.
 * ```javascript
 * let processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16');
 * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg');
 * let image_inputs = await processor(image);
 * // {
 * //   "pixel_values": {
 * //     "dims": [ 1, 3, 224, 224 ],
 * //     "type": "float32",
 * //     "data": Float32Array [ -1.558687686920166, -1.558687686920166, -1.5440893173217773, ... ],
 * //     "size": 150528
 * //   },
 * //   "original_sizes": [
 * //     [ 533, 800 ]
 * //   ],
 * //   "reshaped_input_sizes": [
 * //     [ 224, 224 ]
 * //   ]
 * // }
 * ```
 */
export class AutoProcessor {

    /**
     * Instantiate one of the processor classes of the library from a pretrained model.
     *
     * Selection order: a `processor_class` named in the config wins and loading is delegated
     * to it; otherwise the `image_processor_type` and/or `feature_extractor_type` fields
     * (loaded from `pretrained_model_name_or_path`) determine which components are built.
     *
     * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
     * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
     *   Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
     *   user or organization name, like `dbmdz/bert-base-german-cased`.
     * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
     * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
     *
     * @returns {Promise<Processor>} A new instance of the Processor class.
     */

    /** @type {typeof Processor.from_pretrained} */
    static async from_pretrained(pretrained_model_name_or_path, options = {}) {
        // TODO: first check for processor.json
        const preprocessorConfig = await getModelJSON(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME, true, options);

        const { image_processor_type, feature_extractor_type, processor_class } = preprocessorConfig;

        // A dedicated processor class takes precedence: delegate loading entirely to it.
        if (processor_class && AllProcessors[processor_class]) {
            return AllProcessors[processor_class].from_pretrained(pretrained_model_name_or_path, options);
        }

        if (!image_processor_type && !feature_extractor_type) {
            throw new Error('No `image_processor_type` or `feature_extractor_type` found in the config.');
        }

        // Build the individual components referenced by the config.
        const components = {};

        if (image_processor_type) {
            const imageProcessorClass = AllImageProcessors[image_processor_type];
            if (!imageProcessorClass) {
                throw new Error(`Unknown image_processor_type: '${image_processor_type}'.`);
            }
            components.image_processor = new imageProcessorClass(preprocessorConfig);
        }

        if (feature_extractor_type) {
            // Handle legacy case where image processors were specified as feature extractors
            const legacyImageProcessorClass = AllImageProcessors[feature_extractor_type];
            if (legacyImageProcessorClass) {
                components.image_processor = new legacyImageProcessorClass(preprocessorConfig);
            } else {
                const featureExtractorClass = AllFeatureExtractors[feature_extractor_type];
                if (!featureExtractorClass) {
                    throw new Error(`Unknown feature_extractor_type: '${feature_extractor_type}'.`);
                }
                components.feature_extractor = new featureExtractorClass(preprocessorConfig);
            }
        }

        // No processor-level config of its own; wrap the components in a generic Processor.
        return new Processor({}, components);
    }
}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
|
|
2
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
3
|
+
import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
export class ClapFeatureExtractor extends FeatureExtractor {

    /**
     * Builds the two mel filter banks (torchaudio- and librosa-style) and the
     * Hann analysis window used for all spectrogram computations.
     * @param {Object} config The CLAP feature extractor configuration.
     */
    constructor(config) {
        super(config);

        // Mel filter bank matching torchaudio defaults ("htk" scale, no norm).
        // Reserved for the "fusion" truncation strategy (not yet implemented below).
        this.mel_filters = mel_filter_bank(
            this.config.nb_frequency_bins, // num_frequency_bins
            this.config.feature_size, // num_mel_filters
            this.config.frequency_min, // min_frequency
            this.config.frequency_max, // max_frequency
            this.config.sampling_rate, // sampling_rate
            null, // norm
            "htk", // mel_scale
        );

        // Mel filter bank matching librosa defaults ("slaney" scale and norm).
        // Used by all non-"fusion" truncation strategies.
        this.mel_filters_slaney = mel_filter_bank(
            this.config.nb_frequency_bins, // num_frequency_bins
            this.config.feature_size, // num_mel_filters
            this.config.frequency_min, // min_frequency
            this.config.frequency_max, // max_frequency
            this.config.sampling_rate, // sampling_rate
            "slaney", // norm
            "slaney", // mel_scale
        );

        this.window = window_function(this.config.fft_window_size, 'hann')

    }


    /**
     * Extracts the mel spectrogram and prepares it for the model based on the `truncation` and `padding` arguments.
     *
     * Four different paths are possible:
     *   - `truncation="fusion"` and the length of the waveform is greater than the max length: the mel spectrogram
     *     will be computed on the entire audio. 3 random crops and a downsampled version of the full mel spectrogram
     *     are then stacked together. They will later be used for `feature_fusion`.
     *   - `truncation="rand_trunc"` and the length of the waveform is smaller than the max length: the audio is
     *     padded based on `padding`.
     *   - `truncation="fusion"` and the length of the waveform is smaller than the max length: the audio is padded
     *     based on `padding`, and is repeated `4` times.
     *   - `truncation="rand_trunc"` and the length of the waveform is greater than the max length: the mel
     *     spectrogram will be computed on a random crop of the waveform.
     *
     * NOTE: the "fusion" paths are not yet implemented here and throw.
     *
     * @param {Float32Array|Float64Array} waveform The input waveform.
     * @param {number} max_length The maximum length of the waveform.
     * @param {string} truncation The truncation strategy to use.
     * @param {string} padding The padding strategy to use.
     * @returns {Promise<Tensor>} The log-mel spectrogram, with a leading singleton dimension added.
     * @throws {Error} If `truncation` is "fusion" (or any unimplemented strategy when truncation is required).
     * @private
     */
    async _get_input_mel(waveform, max_length, truncation, padding) {

        /** @type {Tensor} */
        let input_mel;
        const diff = waveform.length - max_length;
        if (diff > 0) {
            if (truncation === 'rand_trunc') {
                // Compute the spectrogram on a uniformly random crop of length `max_length`.
                const idx = Math.floor(Math.random() * (diff + 1));
                waveform = waveform.subarray(idx, idx + max_length);

                input_mel = await this._extract_fbank_features(waveform, this.mel_filters_slaney, this.config.nb_max_samples);
            } else {
                // TODO implement fusion strategy
                throw new Error(`Truncation strategy "${truncation}" not implemented`)
            }
        } else {
            if (diff < 0) {
                let padded = new Float64Array(max_length); // already padded with zeros
                padded.set(waveform);

                // NOTE: assumes waveform is non-empty; an empty waveform would make the
                // 'repeat' loop below never advance — upstream validation should prevent this.
                if (padding === 'repeat') {
                    // Tile the waveform until the buffer is full, clipping the final copy.
                    for (let i = waveform.length; i < max_length; i += waveform.length) {
                        padded.set(waveform.subarray(0, Math.min(waveform.length, max_length - i)), i);
                    }
                } else if (padding === 'repeatpad') {
                    // Repeat whole copies only; the remainder stays zero-padded.
                    for (let i = waveform.length; i < -diff; i += waveform.length) {
                        padded.set(waveform, i);
                    }
                }
                waveform = padded;
            }

            if (truncation === 'fusion') {
                throw new Error(`Truncation strategy "${truncation}" not implemented`)
            }

            input_mel = await this._extract_fbank_features(waveform, this.mel_filters_slaney, this.config.nb_max_samples);
        }

        return input_mel.unsqueeze_(0);
    }

    /**
     * Compute the log-mel spectrogram of the provided `waveform` using the Hann window.
     * In CLAP, two different filter banks are used depending on the truncation pattern:
     *  - `self.mel_filters`: they correspond to the default parameters of `torchaudio` which can be obtained from
     *    calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation`
     *    is set to `"fusion"`.
     *  - `self.mel_filters_slaney` : they correspond to the default parameters of `librosa` which used
     *    `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original
     *    implementation when the truncation mode is not `"fusion"`.
     *
     * @param {Float32Array|Float64Array} waveform The audio waveform to process.
     * @param {number[][]} mel_filters The mel filters to use.
     * @param {number} [max_length=null] The maximum number of frames to return.
     * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
     */
    async _extract_fbank_features(waveform, mel_filters, max_length = null) {
        // NOTE: We don't pad/truncate since that is passed in as `max_num_frames`
        return spectrogram(
            waveform,
            this.window, // window
            this.config.fft_window_size, // frame_length
            this.config.hop_length, // hop_length
            {
                power: 2.0,
                mel_filters,
                log_mel: 'dB',

                // Custom
                max_num_frames: max_length,
                do_pad: false,
                transpose: true,
            }
        )
    }


    /**
     * Asynchronously extracts features from a given audio using the provided configuration.
     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
     * @param {Object} [options] Optional parameters.
     * @param {number} [options.max_length=null] Maximum waveform length; defaults to `config.nb_max_samples`.
     * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
     */
    async _call(audio, {
        max_length = null,
    } = {}) {
        validate_audio_inputs(audio, 'ClapFeatureExtractor');

        // convert to mel spectrogram, truncate and pad if needed.
        const padded_inputs = await this._get_input_mel(
            audio,
            max_length ?? this.config.nb_max_samples,
            this.config.truncation,
            this.config.padding,
        );

        return {
            input_features: padded_inputs.unsqueeze_(0),
        }
    }
}
|