@huggingface/transformers 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239)
  1. package/README.md +14 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +16607 -13472
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +16601 -13451
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +238 -52
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +229 -43
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +240 -54
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +16017 -12878
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +7 -7
  16. package/src/base/feature_extraction_utils.js +54 -0
  17. package/src/base/image_processors_utils.js +1089 -0
  18. package/src/base/processing_utils.js +145 -0
  19. package/src/configs.js +15 -3
  20. package/src/env.js +15 -4
  21. package/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +90 -0
  22. package/src/models/auto/feature_extraction_auto.js +41 -0
  23. package/src/models/auto/image_processing_auto.js +29 -0
  24. package/src/models/auto/processing_auto.js +100 -0
  25. package/src/models/beit/image_processing_beit.js +5 -0
  26. package/src/models/bit/image_processing_bit.js +5 -0
  27. package/src/models/chinese_clip/image_processing_chinese_clip.js +5 -0
  28. package/src/models/clap/feature_extraction_clap.js +159 -0
  29. package/src/models/clip/image_processing_clip.js +6 -0
  30. package/src/models/convnext/image_processing_convnext.js +45 -0
  31. package/src/models/deit/image_processing_deit.js +6 -0
  32. package/src/models/detr/image_processing_detr.js +52 -0
  33. package/src/models/donut/image_processing_donut.js +31 -0
  34. package/src/models/dpt/image_processing_dpt.js +6 -0
  35. package/src/models/efficientnet/image_processing_efficientnet.js +13 -0
  36. package/src/models/feature_extractors.js +12 -0
  37. package/src/models/florence2/processing_florence2.js +128 -0
  38. package/src/models/glpn/image_processing_glpn.js +5 -0
  39. package/src/models/image_processors.js +36 -0
  40. package/src/models/janus/image_processing_janus.js +26 -0
  41. package/src/models/janus/processing_janus.js +123 -0
  42. package/src/models/jina_clip/image_processing_jina_clip.js +26 -0
  43. package/src/models/jina_clip/processing_jina_clip.js +24 -0
  44. package/src/models/llava_onevision/image_processing_llava_onevision.js +5 -0
  45. package/src/models/mask2former/image_processing_mask2former.js +5 -0
  46. package/src/models/maskformer/image_processing_maskformer.js +18 -0
  47. package/src/models/mgp_str/processing_mgp_str.js +170 -0
  48. package/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +7 -0
  49. package/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +7 -0
  50. package/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +7 -0
  51. package/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +7 -0
  52. package/src/models/mobilevit/image_processing_mobilevit.js +6 -0
  53. package/src/models/nougat/image_processing_nougat.js +5 -0
  54. package/src/models/owlv2/image_processing_owlv2.js +5 -0
  55. package/src/models/owlvit/image_processing_owlvit.js +12 -0
  56. package/src/models/owlvit/processing_owlvit.js +7 -0
  57. package/src/models/processors.js +11 -0
  58. package/src/models/pvt/image_processing_pvt.js +5 -0
  59. package/src/models/pyannote/feature_extraction_pyannote.js +28 -0
  60. package/src/models/pyannote/processing_pyannote.js +71 -0
  61. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +52 -0
  62. package/src/models/qwen2_vl/processing_qwen2_vl.js +52 -0
  63. package/src/models/rt_detr/image_processing_rt_detr.js +12 -0
  64. package/src/models/sam/image_processing_sam.js +242 -0
  65. package/src/models/sam/processing_sam.js +20 -0
  66. package/src/models/sapiens/image_processing_sapiens.js +13 -0
  67. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +180 -0
  68. package/src/models/segformer/image_processing_segformer.js +13 -0
  69. package/src/models/siglip/image_processing_siglip.js +5 -0
  70. package/src/models/speecht5/feature_extraction_speecht5.js +4 -0
  71. package/src/models/speecht5/processing_speecht5.js +17 -0
  72. package/src/models/swin2sr/image_processing_swin2sr.js +24 -0
  73. package/src/models/vit/image_processing_vit.js +7 -0
  74. package/src/models/vitmatte/image_processing_vitmatte.js +50 -0
  75. package/src/models/vitpose/image_processing_vitpose.js +89 -0
  76. package/src/models/wav2vec2/feature_extraction_wav2vec2.js +44 -0
  77. package/src/models/wav2vec2/processing_wav2vec2.js +15 -0
  78. package/src/models/wespeaker/feature_extraction_wespeaker.js +100 -0
  79. package/src/models/whisper/feature_extraction_whisper.js +84 -0
  80. package/src/models/whisper/processing_whisper.js +21 -0
  81. package/src/models/yolos/image_processing_yolos.js +12 -0
  82. package/src/models.js +695 -32
  83. package/src/pipelines.js +8 -8
  84. package/src/tokenizers.js +5 -0
  85. package/src/transformers.js +15 -2
  86. package/src/utils/constants.js +8 -1
  87. package/src/utils/core.js +37 -9
  88. package/src/utils/hub.js +2 -1
  89. package/src/utils/image.js +68 -17
  90. package/src/utils/tensor.js +33 -1
  91. package/types/base/feature_extraction_utils.d.ts +41 -0
  92. package/types/base/feature_extraction_utils.d.ts.map +1 -0
  93. package/types/base/image_processors_utils.d.ts +323 -0
  94. package/types/base/image_processors_utils.d.ts.map +1 -0
  95. package/types/base/processing_utils.d.ts +80 -0
  96. package/types/base/processing_utils.d.ts.map +1 -0
  97. package/types/configs.d.ts +4 -1
  98. package/types/configs.d.ts.map +1 -1
  99. package/types/env.d.ts.map +1 -1
  100. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +25 -0
  101. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -0
  102. package/types/models/auto/feature_extraction_auto.d.ts +5 -0
  103. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -0
  104. package/types/models/auto/image_processing_auto.d.ts +5 -0
  105. package/types/models/auto/image_processing_auto.d.ts.map +1 -0
  106. package/types/models/auto/processing_auto.d.ts +35 -0
  107. package/types/models/auto/processing_auto.d.ts.map +1 -0
  108. package/types/models/beit/image_processing_beit.d.ts +4 -0
  109. package/types/models/beit/image_processing_beit.d.ts.map +1 -0
  110. package/types/models/bit/image_processing_bit.d.ts +4 -0
  111. package/types/models/bit/image_processing_bit.d.ts.map +1 -0
  112. package/types/models/chinese_clip/image_processing_chinese_clip.d.ts +4 -0
  113. package/types/models/chinese_clip/image_processing_chinese_clip.d.ts.map +1 -0
  114. package/types/models/clap/feature_extraction_clap.d.ts +57 -0
  115. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -0
  116. package/types/models/clip/image_processing_clip.d.ts +6 -0
  117. package/types/models/clip/image_processing_clip.d.ts.map +1 -0
  118. package/types/models/convnext/image_processing_convnext.d.ts +12 -0
  119. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -0
  120. package/types/models/deit/image_processing_deit.d.ts +6 -0
  121. package/types/models/deit/image_processing_deit.d.ts.map +1 -0
  122. package/types/models/detr/image_processing_detr.d.ts +42 -0
  123. package/types/models/detr/image_processing_detr.d.ts.map +1 -0
  124. package/types/models/donut/image_processing_donut.d.ts +7 -0
  125. package/types/models/donut/image_processing_donut.d.ts.map +1 -0
  126. package/types/models/dpt/image_processing_dpt.d.ts +6 -0
  127. package/types/models/dpt/image_processing_dpt.d.ts.map +1 -0
  128. package/types/models/efficientnet/image_processing_efficientnet.d.ts +6 -0
  129. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -0
  130. package/types/models/feature_extractors.d.ts +10 -0
  131. package/types/models/feature_extractors.d.ts.map +1 -0
  132. package/types/models/florence2/processing_florence2.d.ts +39 -0
  133. package/types/models/florence2/processing_florence2.d.ts.map +1 -0
  134. package/types/models/glpn/image_processing_glpn.d.ts +4 -0
  135. package/types/models/glpn/image_processing_glpn.d.ts.map +1 -0
  136. package/types/models/image_processors.d.ts +36 -0
  137. package/types/models/image_processors.d.ts.map +1 -0
  138. package/types/models/janus/image_processing_janus.d.ts +7 -0
  139. package/types/models/janus/image_processing_janus.d.ts.map +1 -0
  140. package/types/models/janus/processing_janus.d.ts +77 -0
  141. package/types/models/janus/processing_janus.d.ts.map +1 -0
  142. package/types/models/jina_clip/image_processing_jina_clip.d.ts +5 -0
  143. package/types/models/jina_clip/image_processing_jina_clip.d.ts.map +1 -0
  144. package/types/models/jina_clip/processing_jina_clip.d.ts +9 -0
  145. package/types/models/jina_clip/processing_jina_clip.d.ts.map +1 -0
  146. package/types/models/llava_onevision/image_processing_llava_onevision.d.ts +4 -0
  147. package/types/models/llava_onevision/image_processing_llava_onevision.d.ts.map +1 -0
  148. package/types/models/mask2former/image_processing_mask2former.d.ts +4 -0
  149. package/types/models/mask2former/image_processing_mask2former.d.ts.map +1 -0
  150. package/types/models/maskformer/image_processing_maskformer.d.ts +22 -0
  151. package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -0
  152. package/types/models/mgp_str/processing_mgp_str.d.ts +64 -0
  153. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -0
  154. package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts +6 -0
  155. package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts.map +1 -0
  156. package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts +6 -0
  157. package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts.map +1 -0
  158. package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts +6 -0
  159. package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts.map +1 -0
  160. package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts +6 -0
  161. package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts.map +1 -0
  162. package/types/models/mobilevit/image_processing_mobilevit.d.ts +6 -0
  163. package/types/models/mobilevit/image_processing_mobilevit.d.ts.map +1 -0
  164. package/types/models/nougat/image_processing_nougat.d.ts +4 -0
  165. package/types/models/nougat/image_processing_nougat.d.ts.map +1 -0
  166. package/types/models/owlv2/image_processing_owlv2.d.ts +4 -0
  167. package/types/models/owlv2/image_processing_owlv2.d.ts.map +1 -0
  168. package/types/models/owlvit/image_processing_owlvit.d.ts +10 -0
  169. package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -0
  170. package/types/models/owlvit/processing_owlvit.d.ts +8 -0
  171. package/types/models/owlvit/processing_owlvit.d.ts.map +1 -0
  172. package/types/models/processors.d.ts +12 -0
  173. package/types/models/processors.d.ts.map +1 -0
  174. package/types/models/pvt/image_processing_pvt.d.ts +4 -0
  175. package/types/models/pvt/image_processing_pvt.d.ts.map +1 -0
  176. package/types/models/pyannote/feature_extraction_pyannote.d.ts +13 -0
  177. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -0
  178. package/types/models/pyannote/processing_pyannote.d.ts +30 -0
  179. package/types/models/pyannote/processing_pyannote.d.ts.map +1 -0
  180. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +11 -0
  181. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -0
  182. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +17 -0
  183. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -0
  184. package/types/models/rt_detr/image_processing_rt_detr.d.ts +8 -0
  185. package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -0
  186. package/types/models/sam/image_processing_sam.d.ts +103 -0
  187. package/types/models/sam/image_processing_sam.d.ts.map +1 -0
  188. package/types/models/sam/processing_sam.d.ts +9 -0
  189. package/types/models/sam/processing_sam.d.ts.map +1 -0
  190. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +34 -0
  191. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -0
  192. package/types/models/segformer/image_processing_segformer.d.ts +10 -0
  193. package/types/models/segformer/image_processing_segformer.d.ts.map +1 -0
  194. package/types/models/siglip/image_processing_siglip.d.ts +4 -0
  195. package/types/models/siglip/image_processing_siglip.d.ts.map +1 -0
  196. package/types/models/speecht5/feature_extraction_speecht5.d.ts +4 -0
  197. package/types/models/speecht5/feature_extraction_speecht5.d.ts.map +1 -0
  198. package/types/models/speecht5/processing_speecht5.d.ts +14 -0
  199. package/types/models/speecht5/processing_speecht5.d.ts.map +1 -0
  200. package/types/models/swin2sr/image_processing_swin2sr.d.ts +5 -0
  201. package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -0
  202. package/types/models/vit/image_processing_vit.d.ts +6 -0
  203. package/types/models/vit/image_processing_vit.d.ts.map +1 -0
  204. package/types/models/vitmatte/image_processing_vitmatte.d.ts +12 -0
  205. package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -0
  206. package/types/models/vitpose/image_processing_vitpose.d.ts +26 -0
  207. package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -0
  208. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts +19 -0
  209. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -0
  210. package/types/models/wav2vec2/processing_wav2vec2.d.ts +12 -0
  211. package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -0
  212. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +23 -0
  213. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -0
  214. package/types/models/whisper/feature_extraction_whisper.d.ts +21 -0
  215. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -0
  216. package/types/models/whisper/processing_whisper.d.ts +17 -0
  217. package/types/models/whisper/processing_whisper.d.ts.map +1 -0
  218. package/types/models/yolos/image_processing_yolos.d.ts +10 -0
  219. package/types/models/yolos/image_processing_yolos.d.ts.map +1 -0
  220. package/types/models.d.ts +152 -0
  221. package/types/models.d.ts.map +1 -1
  222. package/types/pipelines.d.ts +2 -3
  223. package/types/pipelines.d.ts.map +1 -1
  224. package/types/tokenizers.d.ts +3 -0
  225. package/types/tokenizers.d.ts.map +1 -1
  226. package/types/transformers.d.ts +10 -1
  227. package/types/utils/constants.d.ts +6 -0
  228. package/types/utils/constants.d.ts.map +1 -1
  229. package/types/utils/core.d.ts +58 -3
  230. package/types/utils/core.d.ts.map +1 -1
  231. package/types/utils/hub.d.ts +1 -1
  232. package/types/utils/hub.d.ts.map +1 -1
  233. package/types/utils/image.d.ts +10 -2
  234. package/types/utils/image.d.ts.map +1 -1
  235. package/types/utils/tensor.d.ts +34 -1
  236. package/types/utils/tensor.d.ts.map +1 -1
  237. package/src/processors.js +0 -2655
  238. package/types/processors.d.ts +0 -924
  239. package/types/processors.d.ts.map +0 -1
@@ -0,0 +1,44 @@
1
+ import { FeatureExtractor, validate_audio_inputs } from "../../base/feature_extraction_utils.js";
2
+ import { Tensor } from "../../utils/tensor.js";
3
+
4
+ export class Wav2Vec2FeatureExtractor extends FeatureExtractor {
5
+
6
+ /**
7
+ * @param {Float32Array} input_values
8
+ * @returns {Float32Array}
9
+ */
10
+ _zero_mean_unit_var_norm(input_values) {
11
+ // TODO support batch?
12
+ const sum = input_values.reduce((a, b) => a + b, 0);
13
+ const mean = sum / input_values.length;
14
+ const variance = input_values.reduce((a, b) => a + (b - mean) ** 2, 0) / input_values.length;
15
+ return input_values.map(x => (x - mean) / Math.sqrt(variance + 1e-7));
16
+ }
17
+
18
+ /**
19
+ * Asynchronously extracts features from a given audio using the provided configuration.
20
+ * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
21
+ * @returns {Promise<{ input_values: Tensor; attention_mask: Tensor }>} A Promise resolving to an object containing the extracted input features and attention mask as Tensors.
22
+ */
23
+ async _call(audio) {
24
+ validate_audio_inputs(audio, 'Wav2Vec2FeatureExtractor');
25
+
26
+ if (audio instanceof Float64Array) {
27
+ audio = new Float32Array(audio);
28
+ }
29
+
30
+ let input_values = audio;
31
+
32
+ // zero-mean and unit-variance normalization
33
+ if (this.config.do_normalize) {
34
+ input_values = this._zero_mean_unit_var_norm(input_values);
35
+ }
36
+
37
+ // TODO: allow user to pass in attention mask
38
+ const shape = [1, input_values.length];
39
+ return {
40
+ input_values: new Tensor('float32', input_values, shape),
41
+ attention_mask: new Tensor('int64', new BigInt64Array(input_values.length).fill(1n), shape)
42
+ };
43
+ }
44
+ }
@@ -0,0 +1,15 @@
1
+ import { Processor } from "../../base/processing_utils.js";
2
+ import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js";
3
+
4
+ export class Wav2Vec2ProcessorWithLM extends Processor {
5
+ static feature_extractor_class = AutoFeatureExtractor
6
+
7
+ /**
8
+ * Calls the feature_extractor function with the given audio input.
9
+ * @param {any} audio The audio input to extract features from.
10
+ * @returns {Promise<any>} A Promise that resolves with the extracted features.
11
+ */
12
+ async _call(audio) {
13
+ return await this.feature_extractor(audio)
14
+ }
15
+ }
@@ -0,0 +1,100 @@
1
+ import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
2
+ import { Tensor } from '../../utils/tensor.js';
3
+ import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
4
+
5
+
6
+ export class WeSpeakerFeatureExtractor extends FeatureExtractor {
7
+
8
+ constructor(config) {
9
+ super(config);
10
+
11
+ const sampling_rate = this.config.sampling_rate;
12
+ const mel_filters = mel_filter_bank(
13
+ 256, // num_frequency_bins
14
+ this.config.num_mel_bins, // num_mel_filters
15
+ 20, // min_frequency
16
+ Math.floor(sampling_rate / 2), // max_frequency
17
+ sampling_rate, // sampling_rate
18
+ null, // norm
19
+ "kaldi", // mel_scale
20
+ true, // triangularize_in_mel_space
21
+ );
22
+
23
+ // Do padding:
24
+ for (let i = 0; i < mel_filters.length; ++i) {
25
+ mel_filters[i].push(0);
26
+ }
27
+ this.mel_filters = mel_filters;
28
+
29
+ this.window = window_function(400, 'hamming', {
30
+ periodic: false,
31
+ })
32
+ this.min_num_frames = this.config.min_num_frames;
33
+ }
34
+
35
+ /**
36
+ * Computes the log-Mel spectrogram of the provided audio waveform.
37
+ * @param {Float32Array|Float64Array} waveform The audio waveform to process.
38
+ * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
39
+ */
40
+ async _extract_fbank_features(waveform) {
41
+ // Kaldi compliance: 16-bit signed integers
42
+ // 32768 == 2 ** 15
43
+ waveform = waveform.map((/** @type {number} */ x) => x * 32768)
44
+
45
+ return spectrogram(
46
+ waveform,
47
+ this.window, // window
48
+ 400, // frame_length
49
+ 160, // hop_length
50
+ {
51
+ fft_length: 512,
52
+ power: 2.0,
53
+ center: false,
54
+ preemphasis: 0.97,
55
+ mel_filters: this.mel_filters,
56
+ log_mel: 'log',
57
+ mel_floor: 1.192092955078125e-07,
58
+ remove_dc_offset: true,
59
+
60
+ // Custom
61
+ transpose: true,
62
+ min_num_frames: this.min_num_frames,
63
+ }
64
+ )
65
+ }
66
+
67
+
68
+ /**
69
+ * Asynchronously extracts features from a given audio using the provided configuration.
70
+ * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
71
+ * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
72
+ */
73
+ async _call(audio) {
74
+ validate_audio_inputs(audio, 'WeSpeakerFeatureExtractor');
75
+
76
+ const features = (await this._extract_fbank_features(audio)).unsqueeze_(0);
77
+
78
+ if (this.config.fbank_centering_span === null) {
79
+ // center features with global average
80
+ const meanData = /** @type {Float32Array} */ (features.mean(1).data);
81
+ const featuresData = /** @type {Float32Array} */(features.data);
82
+ const [batch_size, num_frames, feature_size] = features.dims;
83
+
84
+ for (let i = 0; i < batch_size; ++i) {
85
+ const offset1 = i * num_frames * feature_size;
86
+ const offset2 = i * feature_size;
87
+ for (let j = 0; j < num_frames; ++j) {
88
+ const offset3 = offset1 + j * feature_size;
89
+ for (let k = 0; k < feature_size; ++k) {
90
+ featuresData[offset3 + k] -= meanData[offset2 + k];
91
+ }
92
+ }
93
+ }
94
+ }
95
+
96
+ return {
97
+ input_features: features
98
+ };
99
+ }
100
+ }
@@ -0,0 +1,84 @@
1
+ import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
2
+ import { Tensor } from '../../utils/tensor.js';
3
+ import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
4
+ import { max } from '../../utils/maths.js';
5
+
6
+ export class WhisperFeatureExtractor extends FeatureExtractor {
7
+
8
+ constructor(config) {
9
+ super(config);
10
+
11
+ // Prefer given `mel_filters` from preprocessor_config.json, or calculate them if they don't exist.
12
+ this.config.mel_filters ??= mel_filter_bank(
13
+ Math.floor(1 + this.config.n_fft / 2), // num_frequency_bins
14
+ this.config.feature_size, // num_mel_filters
15
+ 0.0, // min_frequency
16
+ 8000.0, // max_frequency
17
+ this.config.sampling_rate, // sampling_rate
18
+ "slaney", // norm
19
+ "slaney", // mel_scale
20
+ );
21
+
22
+ this.window = window_function(this.config.n_fft, 'hann');
23
+ }
24
+
25
+ /**
26
+ * Computes the log-Mel spectrogram of the provided audio waveform.
27
+ * @param {Float32Array|Float64Array} waveform The audio waveform to process.
28
+ * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
29
+ */
30
+ async _extract_fbank_features(waveform) {
31
+ const features = await spectrogram(
32
+ waveform,
33
+ this.window, // window
34
+ this.config.n_fft, // frame_length
35
+ this.config.hop_length, // hop_length
36
+ {
37
+ power: 2.0,
38
+ mel_filters: this.config.mel_filters,
39
+ log_mel: 'log10',
40
+
41
+ // Custom
42
+ max_num_frames: this.config.nb_max_frames, // 3000
43
+ }
44
+ )
45
+
46
+ const data = features.data;
47
+ const maxValue = max(data)[0];
48
+
49
+ for (let i = 0; i < data.length; ++i) {
50
+ data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
51
+ }
52
+
53
+ return features;
54
+ }
55
+
56
+ /**
57
+ * Asynchronously extracts features from a given audio using the provided configuration.
58
+ * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
59
+ * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
60
+ */
61
+ async _call(audio) {
62
+ validate_audio_inputs(audio, 'WhisperFeatureExtractor');
63
+
64
+ let waveform;
65
+ if (audio.length > this.config.n_samples) {
66
+ console.warn(
67
+ "Attempting to extract features for audio longer than 30 seconds. " +
68
+ "If using a pipeline to extract transcript from a long audio clip, " +
69
+ "remember to specify `chunk_length_s` and/or `stride_length_s`."
70
+ );
71
+ waveform = audio.slice(0, this.config.n_samples);
72
+ } else {
73
+ // pad with zeros
74
+ waveform = new Float32Array(this.config.n_samples);
75
+ waveform.set(audio);
76
+ }
77
+
78
+ const features = await this._extract_fbank_features(waveform);
79
+
80
+ return {
81
+ input_features: features.unsqueeze_(0)
82
+ };
83
+ }
84
+ }
@@ -0,0 +1,21 @@
1
+ import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js"
2
+ import { AutoTokenizer } from "../../tokenizers.js"
3
+ import { Processor } from "../../base/processing_utils.js"
4
+
5
+ /**
6
+ * Represents a WhisperProcessor that extracts features from an audio input.
7
+ */
8
+ export class WhisperProcessor extends Processor {
9
+ static tokenizer_class = AutoTokenizer
10
+ static feature_extractor_class = AutoFeatureExtractor
11
+
12
+ /**
13
+ * Calls the feature_extractor function with the given audio input.
14
+ * @param {any} audio The audio input to extract features from.
15
+ * @returns {Promise<any>} A Promise that resolves with the extracted features.
16
+ */
17
+ async _call(audio) {
18
+ return await this.feature_extractor(audio);
19
+ }
20
+ }
21
+
@@ -0,0 +1,12 @@
1
+ import {
2
+ ImageProcessor,
3
+ post_process_object_detection,
4
+ } from "../../base/image_processors_utils.js";
5
+
6
+ export class YolosImageProcessor extends ImageProcessor {
7
+ /** @type {typeof post_process_object_detection} */
8
+ post_process_object_detection(...args) {
9
+ return post_process_object_detection(...args);
10
+ }
11
+ }
12
+ export class YolosFeatureExtractor extends YolosImageProcessor { }