@huggingface/transformers 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239)
  1. package/README.md +14 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +16607 -13472
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +16601 -13451
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +238 -52
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +229 -43
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +240 -54
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +16017 -12878
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +7 -7
  16. package/src/base/feature_extraction_utils.js +54 -0
  17. package/src/base/image_processors_utils.js +1089 -0
  18. package/src/base/processing_utils.js +145 -0
  19. package/src/configs.js +15 -3
  20. package/src/env.js +15 -4
  21. package/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +90 -0
  22. package/src/models/auto/feature_extraction_auto.js +41 -0
  23. package/src/models/auto/image_processing_auto.js +29 -0
  24. package/src/models/auto/processing_auto.js +100 -0
  25. package/src/models/beit/image_processing_beit.js +5 -0
  26. package/src/models/bit/image_processing_bit.js +5 -0
  27. package/src/models/chinese_clip/image_processing_chinese_clip.js +5 -0
  28. package/src/models/clap/feature_extraction_clap.js +159 -0
  29. package/src/models/clip/image_processing_clip.js +6 -0
  30. package/src/models/convnext/image_processing_convnext.js +45 -0
  31. package/src/models/deit/image_processing_deit.js +6 -0
  32. package/src/models/detr/image_processing_detr.js +52 -0
  33. package/src/models/donut/image_processing_donut.js +31 -0
  34. package/src/models/dpt/image_processing_dpt.js +6 -0
  35. package/src/models/efficientnet/image_processing_efficientnet.js +13 -0
  36. package/src/models/feature_extractors.js +12 -0
  37. package/src/models/florence2/processing_florence2.js +128 -0
  38. package/src/models/glpn/image_processing_glpn.js +5 -0
  39. package/src/models/image_processors.js +36 -0
  40. package/src/models/janus/image_processing_janus.js +26 -0
  41. package/src/models/janus/processing_janus.js +123 -0
  42. package/src/models/jina_clip/image_processing_jina_clip.js +26 -0
  43. package/src/models/jina_clip/processing_jina_clip.js +24 -0
  44. package/src/models/llava_onevision/image_processing_llava_onevision.js +5 -0
  45. package/src/models/mask2former/image_processing_mask2former.js +5 -0
  46. package/src/models/maskformer/image_processing_maskformer.js +18 -0
  47. package/src/models/mgp_str/processing_mgp_str.js +170 -0
  48. package/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +7 -0
  49. package/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +7 -0
  50. package/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +7 -0
  51. package/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +7 -0
  52. package/src/models/mobilevit/image_processing_mobilevit.js +6 -0
  53. package/src/models/nougat/image_processing_nougat.js +5 -0
  54. package/src/models/owlv2/image_processing_owlv2.js +5 -0
  55. package/src/models/owlvit/image_processing_owlvit.js +12 -0
  56. package/src/models/owlvit/processing_owlvit.js +7 -0
  57. package/src/models/processors.js +11 -0
  58. package/src/models/pvt/image_processing_pvt.js +5 -0
  59. package/src/models/pyannote/feature_extraction_pyannote.js +28 -0
  60. package/src/models/pyannote/processing_pyannote.js +71 -0
  61. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +52 -0
  62. package/src/models/qwen2_vl/processing_qwen2_vl.js +52 -0
  63. package/src/models/rt_detr/image_processing_rt_detr.js +12 -0
  64. package/src/models/sam/image_processing_sam.js +242 -0
  65. package/src/models/sam/processing_sam.js +20 -0
  66. package/src/models/sapiens/image_processing_sapiens.js +13 -0
  67. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +180 -0
  68. package/src/models/segformer/image_processing_segformer.js +13 -0
  69. package/src/models/siglip/image_processing_siglip.js +5 -0
  70. package/src/models/speecht5/feature_extraction_speecht5.js +4 -0
  71. package/src/models/speecht5/processing_speecht5.js +17 -0
  72. package/src/models/swin2sr/image_processing_swin2sr.js +24 -0
  73. package/src/models/vit/image_processing_vit.js +7 -0
  74. package/src/models/vitmatte/image_processing_vitmatte.js +50 -0
  75. package/src/models/vitpose/image_processing_vitpose.js +89 -0
  76. package/src/models/wav2vec2/feature_extraction_wav2vec2.js +44 -0
  77. package/src/models/wav2vec2/processing_wav2vec2.js +15 -0
  78. package/src/models/wespeaker/feature_extraction_wespeaker.js +100 -0
  79. package/src/models/whisper/feature_extraction_whisper.js +84 -0
  80. package/src/models/whisper/processing_whisper.js +21 -0
  81. package/src/models/yolos/image_processing_yolos.js +12 -0
  82. package/src/models.js +695 -32
  83. package/src/pipelines.js +8 -8
  84. package/src/tokenizers.js +5 -0
  85. package/src/transformers.js +15 -2
  86. package/src/utils/constants.js +8 -1
  87. package/src/utils/core.js +37 -9
  88. package/src/utils/hub.js +2 -1
  89. package/src/utils/image.js +68 -17
  90. package/src/utils/tensor.js +33 -1
  91. package/types/base/feature_extraction_utils.d.ts +41 -0
  92. package/types/base/feature_extraction_utils.d.ts.map +1 -0
  93. package/types/base/image_processors_utils.d.ts +323 -0
  94. package/types/base/image_processors_utils.d.ts.map +1 -0
  95. package/types/base/processing_utils.d.ts +80 -0
  96. package/types/base/processing_utils.d.ts.map +1 -0
  97. package/types/configs.d.ts +4 -1
  98. package/types/configs.d.ts.map +1 -1
  99. package/types/env.d.ts.map +1 -1
  100. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +25 -0
  101. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -0
  102. package/types/models/auto/feature_extraction_auto.d.ts +5 -0
  103. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -0
  104. package/types/models/auto/image_processing_auto.d.ts +5 -0
  105. package/types/models/auto/image_processing_auto.d.ts.map +1 -0
  106. package/types/models/auto/processing_auto.d.ts +35 -0
  107. package/types/models/auto/processing_auto.d.ts.map +1 -0
  108. package/types/models/beit/image_processing_beit.d.ts +4 -0
  109. package/types/models/beit/image_processing_beit.d.ts.map +1 -0
  110. package/types/models/bit/image_processing_bit.d.ts +4 -0
  111. package/types/models/bit/image_processing_bit.d.ts.map +1 -0
  112. package/types/models/chinese_clip/image_processing_chinese_clip.d.ts +4 -0
  113. package/types/models/chinese_clip/image_processing_chinese_clip.d.ts.map +1 -0
  114. package/types/models/clap/feature_extraction_clap.d.ts +57 -0
  115. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -0
  116. package/types/models/clip/image_processing_clip.d.ts +6 -0
  117. package/types/models/clip/image_processing_clip.d.ts.map +1 -0
  118. package/types/models/convnext/image_processing_convnext.d.ts +12 -0
  119. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -0
  120. package/types/models/deit/image_processing_deit.d.ts +6 -0
  121. package/types/models/deit/image_processing_deit.d.ts.map +1 -0
  122. package/types/models/detr/image_processing_detr.d.ts +42 -0
  123. package/types/models/detr/image_processing_detr.d.ts.map +1 -0
  124. package/types/models/donut/image_processing_donut.d.ts +7 -0
  125. package/types/models/donut/image_processing_donut.d.ts.map +1 -0
  126. package/types/models/dpt/image_processing_dpt.d.ts +6 -0
  127. package/types/models/dpt/image_processing_dpt.d.ts.map +1 -0
  128. package/types/models/efficientnet/image_processing_efficientnet.d.ts +6 -0
  129. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -0
  130. package/types/models/feature_extractors.d.ts +10 -0
  131. package/types/models/feature_extractors.d.ts.map +1 -0
  132. package/types/models/florence2/processing_florence2.d.ts +39 -0
  133. package/types/models/florence2/processing_florence2.d.ts.map +1 -0
  134. package/types/models/glpn/image_processing_glpn.d.ts +4 -0
  135. package/types/models/glpn/image_processing_glpn.d.ts.map +1 -0
  136. package/types/models/image_processors.d.ts +36 -0
  137. package/types/models/image_processors.d.ts.map +1 -0
  138. package/types/models/janus/image_processing_janus.d.ts +7 -0
  139. package/types/models/janus/image_processing_janus.d.ts.map +1 -0
  140. package/types/models/janus/processing_janus.d.ts +77 -0
  141. package/types/models/janus/processing_janus.d.ts.map +1 -0
  142. package/types/models/jina_clip/image_processing_jina_clip.d.ts +5 -0
  143. package/types/models/jina_clip/image_processing_jina_clip.d.ts.map +1 -0
  144. package/types/models/jina_clip/processing_jina_clip.d.ts +9 -0
  145. package/types/models/jina_clip/processing_jina_clip.d.ts.map +1 -0
  146. package/types/models/llava_onevision/image_processing_llava_onevision.d.ts +4 -0
  147. package/types/models/llava_onevision/image_processing_llava_onevision.d.ts.map +1 -0
  148. package/types/models/mask2former/image_processing_mask2former.d.ts +4 -0
  149. package/types/models/mask2former/image_processing_mask2former.d.ts.map +1 -0
  150. package/types/models/maskformer/image_processing_maskformer.d.ts +22 -0
  151. package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -0
  152. package/types/models/mgp_str/processing_mgp_str.d.ts +64 -0
  153. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -0
  154. package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts +6 -0
  155. package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts.map +1 -0
  156. package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts +6 -0
  157. package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts.map +1 -0
  158. package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts +6 -0
  159. package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts.map +1 -0
  160. package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts +6 -0
  161. package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts.map +1 -0
  162. package/types/models/mobilevit/image_processing_mobilevit.d.ts +6 -0
  163. package/types/models/mobilevit/image_processing_mobilevit.d.ts.map +1 -0
  164. package/types/models/nougat/image_processing_nougat.d.ts +4 -0
  165. package/types/models/nougat/image_processing_nougat.d.ts.map +1 -0
  166. package/types/models/owlv2/image_processing_owlv2.d.ts +4 -0
  167. package/types/models/owlv2/image_processing_owlv2.d.ts.map +1 -0
  168. package/types/models/owlvit/image_processing_owlvit.d.ts +10 -0
  169. package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -0
  170. package/types/models/owlvit/processing_owlvit.d.ts +8 -0
  171. package/types/models/owlvit/processing_owlvit.d.ts.map +1 -0
  172. package/types/models/processors.d.ts +12 -0
  173. package/types/models/processors.d.ts.map +1 -0
  174. package/types/models/pvt/image_processing_pvt.d.ts +4 -0
  175. package/types/models/pvt/image_processing_pvt.d.ts.map +1 -0
  176. package/types/models/pyannote/feature_extraction_pyannote.d.ts +13 -0
  177. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -0
  178. package/types/models/pyannote/processing_pyannote.d.ts +30 -0
  179. package/types/models/pyannote/processing_pyannote.d.ts.map +1 -0
  180. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +11 -0
  181. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -0
  182. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +17 -0
  183. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -0
  184. package/types/models/rt_detr/image_processing_rt_detr.d.ts +8 -0
  185. package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -0
  186. package/types/models/sam/image_processing_sam.d.ts +103 -0
  187. package/types/models/sam/image_processing_sam.d.ts.map +1 -0
  188. package/types/models/sam/processing_sam.d.ts +9 -0
  189. package/types/models/sam/processing_sam.d.ts.map +1 -0
  190. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +34 -0
  191. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -0
  192. package/types/models/segformer/image_processing_segformer.d.ts +10 -0
  193. package/types/models/segformer/image_processing_segformer.d.ts.map +1 -0
  194. package/types/models/siglip/image_processing_siglip.d.ts +4 -0
  195. package/types/models/siglip/image_processing_siglip.d.ts.map +1 -0
  196. package/types/models/speecht5/feature_extraction_speecht5.d.ts +4 -0
  197. package/types/models/speecht5/feature_extraction_speecht5.d.ts.map +1 -0
  198. package/types/models/speecht5/processing_speecht5.d.ts +14 -0
  199. package/types/models/speecht5/processing_speecht5.d.ts.map +1 -0
  200. package/types/models/swin2sr/image_processing_swin2sr.d.ts +5 -0
  201. package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -0
  202. package/types/models/vit/image_processing_vit.d.ts +6 -0
  203. package/types/models/vit/image_processing_vit.d.ts.map +1 -0
  204. package/types/models/vitmatte/image_processing_vitmatte.d.ts +12 -0
  205. package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -0
  206. package/types/models/vitpose/image_processing_vitpose.d.ts +26 -0
  207. package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -0
  208. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts +19 -0
  209. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -0
  210. package/types/models/wav2vec2/processing_wav2vec2.d.ts +12 -0
  211. package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -0
  212. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +23 -0
  213. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -0
  214. package/types/models/whisper/feature_extraction_whisper.d.ts +21 -0
  215. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -0
  216. package/types/models/whisper/processing_whisper.d.ts +17 -0
  217. package/types/models/whisper/processing_whisper.d.ts.map +1 -0
  218. package/types/models/yolos/image_processing_yolos.d.ts +10 -0
  219. package/types/models/yolos/image_processing_yolos.d.ts.map +1 -0
  220. package/types/models.d.ts +152 -0
  221. package/types/models.d.ts.map +1 -1
  222. package/types/pipelines.d.ts +2 -3
  223. package/types/pipelines.d.ts.map +1 -1
  224. package/types/tokenizers.d.ts +3 -0
  225. package/types/tokenizers.d.ts.map +1 -1
  226. package/types/transformers.d.ts +10 -1
  227. package/types/utils/constants.d.ts +6 -0
  228. package/types/utils/constants.d.ts.map +1 -1
  229. package/types/utils/core.d.ts +58 -3
  230. package/types/utils/core.d.ts.map +1 -1
  231. package/types/utils/hub.d.ts +1 -1
  232. package/types/utils/hub.d.ts.map +1 -1
  233. package/types/utils/image.d.ts +10 -2
  234. package/types/utils/image.d.ts.map +1 -1
  235. package/types/utils/tensor.d.ts +34 -1
  236. package/types/utils/tensor.d.ts.map +1 -1
  237. package/src/processors.js +0 -2655
  238. package/types/processors.d.ts +0 -924
  239. package/types/processors.d.ts.map +0 -1
@@ -1,924 +0,0 @@
1
- declare const FeatureExtractor_base: new () => {
2
- (...args: any[]): any;
3
- _call(...args: any[]): any;
4
- };
5
- /**
6
- * Base class for feature extractors.
7
- *
8
- * @extends Callable
9
- */
10
- export class FeatureExtractor extends FeatureExtractor_base {
11
- /**
12
- * Constructs a new FeatureExtractor instance.
13
- *
14
- * @param {Object} config The configuration for the feature extractor.
15
- */
16
- constructor(config: any);
17
- config: any;
18
- }
19
- /**
20
- * @typedef {object} ImageFeatureExtractorResult
21
- * @property {Tensor} pixel_values The pixel values of the batched preprocessed images.
22
- * @property {HeightWidth[]} original_sizes Array of two-dimensional tuples like [[480, 640]].
23
- * @property {HeightWidth[]} reshaped_input_sizes Array of two-dimensional tuples like [[1000, 1330]].
24
- */
25
- /**
26
- * Feature extractor for image models.
27
- *
28
- * @extends FeatureExtractor
29
- */
30
- export class ImageFeatureExtractor extends FeatureExtractor {
31
- /**
32
- * Constructs a new ImageFeatureExtractor instance.
33
- *
34
- * @param {Object} config The configuration for the feature extractor.
35
- * @param {number[]} config.image_mean The mean values for image normalization.
36
- * @param {number[]} config.image_std The standard deviation values for image normalization.
37
- * @param {boolean} config.do_rescale Whether to rescale the image pixel values to the [0,1] range.
38
- * @param {number} config.rescale_factor The factor to use for rescaling the image pixel values.
39
- * @param {boolean} config.do_normalize Whether to normalize the image pixel values.
40
- * @param {boolean} config.do_resize Whether to resize the image.
41
- * @param {number} config.resample What method to use for resampling.
42
- * @param {number|Object} config.size The size to resize the image to.
43
- * @param {boolean} [config.do_flip_channel_order=false] Whether to flip the color channels from RGB to BGR.
44
- * Can be overridden by the `do_flip_channel_order` parameter in the `preprocess` method.
45
- */
46
- constructor(config: {
47
- image_mean: number[];
48
- image_std: number[];
49
- do_rescale: boolean;
50
- rescale_factor: number;
51
- do_normalize: boolean;
52
- do_resize: boolean;
53
- resample: number;
54
- size: number | any;
55
- do_flip_channel_order?: boolean;
56
- });
57
- image_mean: any;
58
- image_std: any;
59
- resample: any;
60
- do_rescale: any;
61
- rescale_factor: any;
62
- do_normalize: any;
63
- do_resize: any;
64
- do_thumbnail: any;
65
- size: any;
66
- size_divisibility: any;
67
- do_center_crop: any;
68
- crop_size: any;
69
- do_convert_rgb: any;
70
- do_crop_margin: any;
71
- pad_size: any;
72
- do_pad: any;
73
- do_flip_channel_order: any;
74
- /**
75
- * Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any
76
- * corresponding dimension of the specified size.
77
- * @param {RawImage} image The image to be resized.
78
- * @param {{height:number, width:number}} size The size `{"height": h, "width": w}` to resize the image to.
79
- * @param {string | 0 | 1 | 2 | 3 | 4 | 5} [resample=2] The resampling filter to use.
80
- * @returns {Promise<RawImage>} The resized image.
81
- */
82
- thumbnail(image: RawImage, size: {
83
- height: number;
84
- width: number;
85
- }, resample?: string | 0 | 1 | 2 | 3 | 4 | 5): Promise<RawImage>;
86
- /**
87
- * Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the threshold).
88
- * @param {RawImage} image The image to be cropped.
89
- * @param {number} gray_threshold Value below which pixels are considered to be gray.
90
- * @returns {Promise<RawImage>} The cropped image.
91
- */
92
- crop_margin(image: RawImage, gray_threshold?: number): Promise<RawImage>;
93
- /**
94
- * Pad the image by a certain amount.
95
- * @param {Float32Array} pixelData The pixel data to pad.
96
- * @param {number[]} imgDims The dimensions of the image (height, width, channels).
97
- * @param {{width:number; height:number}|number} padSize The dimensions of the padded image.
98
- * @param {Object} options The options for padding.
99
- * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add.
100
- * @param {boolean} [options.center=false] Whether to center the image.
101
- * @param {number} [options.constant_values=0] The constant value to use for padding.
102
- * @returns {[Float32Array, number[]]} The padded pixel data and image dimensions.
103
- */
104
- pad_image(pixelData: Float32Array, imgDims: number[], padSize: {
105
- width: number;
106
- height: number;
107
- } | number, { mode, center, constant_values, }?: {
108
- mode?: 'constant' | 'symmetric';
109
- center?: boolean;
110
- constant_values?: number;
111
- }): [Float32Array, number[]];
112
- /**
113
- * Rescale the image' pixel values by `this.rescale_factor`.
114
- * @param {Float32Array} pixelData The pixel data to rescale.
115
- * @returns {void}
116
- */
117
- rescale(pixelData: Float32Array): void;
118
- /**
119
- * Find the target (width, height) dimension of the output image after
120
- * resizing given the input image and the desired size.
121
- * @param {RawImage} image The image to resize.
122
- * @param {any} size The size to use for resizing the image.
123
- * @returns {[number, number]} The target (width, height) dimension of the output image after resizing.
124
- */
125
- get_resize_output_image_size(image: RawImage, size: any): [number, number];
126
- /**
127
- * Resizes the image.
128
- * @param {RawImage} image The image to resize.
129
- * @returns {Promise<RawImage>} The resized image.
130
- */
131
- resize(image: RawImage): Promise<RawImage>;
132
- /**
133
- * @typedef {object} PreprocessedImage
134
- * @property {HeightWidth} original_size The original size of the image.
135
- * @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
136
- * @property {Tensor} pixel_values The pixel values of the preprocessed image.
137
- */
138
- /**
139
- * Preprocesses the given image.
140
- *
141
- * @param {RawImage} image The image to preprocess.
142
- * @param {Object} overrides The overrides for the preprocessing options.
143
- * @returns {Promise<PreprocessedImage>} The preprocessed image.
144
- */
145
- preprocess(image: RawImage, { do_normalize, do_pad, do_convert_rgb, do_convert_grayscale, do_flip_channel_order, }?: any): Promise<{
146
- /**
147
- * The original size of the image.
148
- */
149
- original_size: HeightWidth;
150
- /**
151
- * The reshaped input size of the image.
152
- */
153
- reshaped_input_size: HeightWidth;
154
- /**
155
- * The pixel values of the preprocessed image.
156
- */
157
- pixel_values: Tensor;
158
- }>;
159
- /**
160
- * Calls the feature extraction process on an array of images,
161
- * preprocesses each image, and concatenates the resulting
162
- * features into a single Tensor.
163
- * @param {RawImage[]} images The image(s) to extract features from.
164
- * @param {...any} args Additional arguments.
165
- * @returns {Promise<ImageFeatureExtractorResult>} An object containing the concatenated pixel values (and other metadata) of the preprocessed images.
166
- */
167
- _call(images: RawImage[], ...args: any[]): Promise<ImageFeatureExtractorResult>;
168
- }
169
- export class SapiensFeatureExtractor extends ImageFeatureExtractor {
170
- /**
171
- * Post-processes the outputs of the model (for semantic segmentation).
172
- * @param {*} outputs Raw outputs of the model.
173
- * @param {[number, number][]} [target_sizes=null] List of tuples corresponding to the requested final size
174
- * (height, width) of each prediction. If unset, predictions will not be resized.
175
- * @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps.
176
- */
177
- post_process_semantic_segmentation(outputs: any, target_sizes?: [number, number][]): {
178
- segmentation: Tensor;
179
- labels: number[];
180
- }[];
181
- }
182
- export class SegformerFeatureExtractor extends ImageFeatureExtractor {
183
- /**
184
- * Post-processes the outputs of the model (for semantic segmentation).
185
- * @param {*} outputs Raw outputs of the model.
186
- * @param {[number, number][]} [target_sizes=null] List of tuples corresponding to the requested final size
187
- * (height, width) of each prediction. If unset, predictions will not be resized.
188
- * @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps.
189
- */
190
- post_process_semantic_segmentation(outputs: any, target_sizes?: [number, number][]): {
191
- segmentation: Tensor;
192
- labels: number[];
193
- }[];
194
- }
195
- export class PvtImageProcessor extends ImageFeatureExtractor {
196
- }
197
- export class DPTFeatureExtractor extends ImageFeatureExtractor {
198
- }
199
- export class DPTImageProcessor extends DPTFeatureExtractor {
200
- }
201
- export class BitImageProcessor extends ImageFeatureExtractor {
202
- }
203
- export class GLPNFeatureExtractor extends ImageFeatureExtractor {
204
- }
205
- export class CLIPFeatureExtractor extends ImageFeatureExtractor {
206
- }
207
- export class CLIPImageProcessor extends CLIPFeatureExtractor {
208
- }
209
- export class ChineseCLIPFeatureExtractor extends ImageFeatureExtractor {
210
- }
211
- export class SiglipImageProcessor extends ImageFeatureExtractor {
212
- }
213
- export class ConvNextFeatureExtractor extends ImageFeatureExtractor {
214
- constructor(config: any);
215
- /**
216
- * Percentage of the image to crop. Only has an effect if this.size < 384.
217
- */
218
- crop_pct: any;
219
- resize(image: any): Promise<any>;
220
- }
221
- export class ConvNextImageProcessor extends ConvNextFeatureExtractor {
222
- }
223
- export class ViTFeatureExtractor extends ImageFeatureExtractor {
224
- }
225
- export class ViTImageProcessor extends ImageFeatureExtractor {
226
- }
227
- export class EfficientNetImageProcessor extends ImageFeatureExtractor {
228
- constructor(config: any);
229
- include_top: any;
230
- }
231
- export class MobileNetV1FeatureExtractor extends ImageFeatureExtractor {
232
- }
233
- export class MobileNetV2FeatureExtractor extends ImageFeatureExtractor {
234
- }
235
- export class MobileNetV3FeatureExtractor extends ImageFeatureExtractor {
236
- }
237
- export class MobileNetV4FeatureExtractor extends ImageFeatureExtractor {
238
- }
239
- export class MobileViTFeatureExtractor extends ImageFeatureExtractor {
240
- }
241
- export class MobileViTImageProcessor extends MobileViTFeatureExtractor {
242
- }
243
- export class OwlViTFeatureExtractor extends ImageFeatureExtractor {
244
- /**
245
- * Post-processes the outputs of the model (for object detection).
246
- * @param {Object} outputs The outputs of the model that must be post-processed
247
- * @param {Tensor} outputs.logits The logits
248
- * @param {Tensor} outputs.pred_boxes The predicted boxes.
249
- * @param {number} [threshold=0.5] The threshold to use for the scores.
250
- * @param {[number, number][]} [target_sizes=null] The sizes of the original images.
251
- * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed.
252
- * @return {Object[]} An array of objects containing the post-processed outputs.
253
- * @private
254
- */
255
- post_process_object_detection(outputs: {
256
- logits: Tensor;
257
- pred_boxes: Tensor;
258
- }, threshold?: number, target_sizes?: [number, number][], is_zero_shot?: boolean): any[];
259
- }
260
- export class Owlv2ImageProcessor extends OwlViTFeatureExtractor {
261
- }
262
- export class RTDetrImageProcessor extends ImageFeatureExtractor {
263
- /**
264
- * Post-processes the outputs of the model (for object detection).
265
- * @param {Object} outputs The outputs of the model that must be post-processed
266
- * @param {Tensor} outputs.logits The logits
267
- * @param {Tensor} outputs.pred_boxes The predicted boxes.
268
- * @param {number} [threshold=0.5] The threshold to use for the scores.
269
- * @param {[number, number][]} [target_sizes=null] The sizes of the original images.
270
- * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed.
271
- * @return {Object[]} An array of objects containing the post-processed outputs.
272
- * @private
273
- */
274
- post_process_object_detection(outputs: {
275
- logits: Tensor;
276
- pred_boxes: Tensor;
277
- }, threshold?: number, target_sizes?: [number, number][], is_zero_shot?: boolean): any[];
278
- }
279
- export class DeiTFeatureExtractor extends ImageFeatureExtractor {
280
- }
281
- export class BeitFeatureExtractor extends ImageFeatureExtractor {
282
- }
283
- export class DonutFeatureExtractor extends ImageFeatureExtractor {
284
- pad_image(pixelData: any, imgDims: any, padSize: any, options?: {}): [Float32Array, number[]];
285
- }
286
- export class DonutImageProcessor extends DonutFeatureExtractor {
287
- }
288
- export class NougatImageProcessor extends DonutFeatureExtractor {
289
- }
290
- /**
291
- * @typedef {object} DetrFeatureExtractorResultProps
292
- * @property {Tensor} pixel_mask
293
- * @typedef {ImageFeatureExtractorResult & DetrFeatureExtractorResultProps} DetrFeatureExtractorResult
294
- */
295
- /**
296
- * Detr Feature Extractor.
297
- *
298
- * @extends ImageFeatureExtractor
299
- */
300
- export class DetrFeatureExtractor extends ImageFeatureExtractor {
301
- /**
302
- * Calls the feature extraction process on an array of images, preprocesses
303
- * each image, and concatenates the resulting features into a single Tensor.
304
- * @param {RawImage[]} images The image(s) to extract features from.
305
- * @returns {Promise<DetrFeatureExtractorResult>} An object containing the concatenated pixel values of the preprocessed images.
306
- */
307
- _call(images: RawImage[]): Promise<DetrFeatureExtractorResult>;
308
- /**
309
- * Post-processes the outputs of the model (for object detection).
310
- * @param {Object} outputs The outputs of the model that must be post-processed
311
- * @param {Tensor} outputs.logits The logits
312
- * @param {Tensor} outputs.pred_boxes The predicted boxes.
313
- * @param {number} [threshold=0.5] The threshold to use for the scores.
314
- * @param {[number, number][]} [target_sizes=null] The sizes of the original images.
315
- * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed.
316
- * @return {Object[]} An array of objects containing the post-processed outputs.
317
- * @private
318
- */
319
- post_process_object_detection(outputs: {
320
- logits: Tensor;
321
- pred_boxes: Tensor;
322
- }, threshold?: number, target_sizes?: [number, number][], is_zero_shot?: boolean): any[];
323
- /**
324
- * Post-process the model output to generate the final panoptic segmentation.
325
- * @param {*} outputs The model output to post process
326
- * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks.
327
- * @param {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values.
328
- * @param {number} [overlap_mask_area_threshold=0.8] The overlap mask area threshold to merge or discard small disconnected parts within each binary instance mask.
329
- * @param {Set<number>} [label_ids_to_fuse=null] The labels in this state will have all their instances be fused together.
330
- * @param {[number, number][]} [target_sizes=null] The target sizes to resize the masks to.
331
- * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>}
332
- */
333
- post_process_panoptic_segmentation(outputs: any, threshold?: number, mask_threshold?: number, overlap_mask_area_threshold?: number, label_ids_to_fuse?: Set<number>, target_sizes?: [number, number][]): {
334
- segmentation: Tensor;
335
- segments_info: {
336
- id: number;
337
- label_id: number;
338
- score: number;
339
- }[];
340
- }[];
341
- post_process_instance_segmentation(): void;
342
- }
343
- export class MaskFormerFeatureExtractor extends ImageFeatureExtractor {
344
- /**
345
- * Post-process the model output to generate the final panoptic segmentation.
346
- * @param {*} outputs The model output to post process
347
- * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks.
348
- * @param {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values.
349
- * @param {number} [overlap_mask_area_threshold=0.8] The overlap mask area threshold to merge or discard small disconnected parts within each binary instance mask.
350
- * @param {Set<number>} [label_ids_to_fuse=null] The labels in this state will have all their instances be fused together.
351
- * @param {[number, number][]} [target_sizes=null] The target sizes to resize the masks to.
352
- * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>}
353
- */
354
- post_process_panoptic_segmentation(outputs: any, threshold?: number, mask_threshold?: number, overlap_mask_area_threshold?: number, label_ids_to_fuse?: Set<number>, target_sizes?: [number, number][]): {
355
- segmentation: Tensor;
356
- segments_info: {
357
- id: number;
358
- label_id: number;
359
- score: number;
360
- }[];
361
- }[];
362
- post_process_instance_segmentation(): void;
363
- }
364
- export class YolosFeatureExtractor extends ImageFeatureExtractor {
365
- /**
366
- * Post-processes the outputs of the model (for object detection).
367
- * @param {Object} outputs The outputs of the model that must be post-processed
368
- * @param {Tensor} outputs.logits The logits
369
- * @param {Tensor} outputs.pred_boxes The predicted boxes.
370
- * @param {number} [threshold=0.5] The threshold to use for the scores.
371
- * @param {[number, number][]} [target_sizes=null] The sizes of the original images.
372
- * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed.
373
- * @return {Object[]} An array of objects containing the post-processed outputs.
374
- * @private
375
- */
376
- post_process_object_detection(outputs: {
377
- logits: Tensor;
378
- pred_boxes: Tensor;
379
- }, threshold?: number, target_sizes?: [number, number][], is_zero_shot?: boolean): any[];
380
- }
381
- /**
382
- * @typedef {object} SamImageProcessorResult
383
- * @property {Tensor} pixel_values
384
- * @property {HeightWidth[]} original_sizes
385
- * @property {HeightWidth[]} reshaped_input_sizes
386
- * @property {Tensor} [input_points]
387
- * @property {Tensor} [input_labels]
388
- * @property {Tensor} [input_boxes]
389
- */
390
- export class SamImageProcessor extends ImageFeatureExtractor {
391
- /**
392
- *
393
- * @param {any} input_points
394
- * @param {HeightWidth[]} original_sizes
395
- * @param {HeightWidth[]} reshaped_input_sizes
396
- * @returns {Tensor}
397
- */
398
- reshape_input_points(input_points: any, original_sizes: HeightWidth[], reshaped_input_sizes: HeightWidth[], is_bounding_box?: boolean): Tensor;
399
- /**
400
- *
401
- * @param {any} input_labels
402
- * @param {Tensor} input_points
403
- * @returns {Tensor}
404
- */
405
- add_input_labels(input_labels: any, input_points: Tensor): Tensor;
406
- /**
407
- * @param {any[]} images The URL(s) of the image(s) to extract features from.
408
- * @param {Object} [options] Additional options for the processor.
409
- * @param {any} [options.input_points=null] A 3D or 4D array, representing the input points provided by the user.
410
- * - 3D: `[point_batch_size, nb_points_per_image, 2]`. In this case, `batch_size` is assumed to be 1.
411
- * - 4D: `[batch_size, point_batch_size, nb_points_per_image, 2]`.
412
- * @param {any} [options.input_labels=null] A 2D or 3D array, representing the input labels for the points, used by the prompt encoder to encode the prompt.
413
- * - 2D: `[point_batch_size, nb_points_per_image]`. In this case, `batch_size` is assumed to be 1.
414
- * - 3D: `[batch_size, point_batch_size, nb_points_per_image]`.
415
- * @param {number[][][]} [options.input_boxes=null] A 3D array of shape `(batch_size, num_boxes, 4)`, representing the input boxes provided by the user.
416
- * This is used by the prompt encoder to encode the prompt. Generally yields to much better generated masks.
417
- * The processor will generate a tensor, with each dimension corresponding respectively to the image batch size,
418
- * the number of boxes per image and the coordinates of the top left and botton right point of the box.
419
- * In the order (`x1`, `y1`, `x2`, `y2`):
420
- * - `x1`: the x coordinate of the top left point of the input box
421
- * - `y1`: the y coordinate of the top left point of the input box
422
- * - `x2`: the x coordinate of the bottom right point of the input box
423
- * - `y2`: the y coordinate of the bottom right point of the input box
424
- * @returns {Promise<SamImageProcessorResult>}
425
- */
426
- _call(images: any[], { input_points, input_labels, input_boxes }?: {
427
- input_points?: any;
428
- input_labels?: any;
429
- input_boxes?: number[][][];
430
- }): Promise<SamImageProcessorResult>;
431
- /**
432
- * Remove padding and upscale masks to the original image size.
433
- * @param {Tensor} masks Batched masks from the mask_decoder in (batch_size, num_channels, height, width) format.
434
- * @param {[number, number][]} original_sizes The original sizes of each image before it was resized to the model's expected input shape, in (height, width) format.
435
- * @param {[number, number][]} reshaped_input_sizes The size of each image as it is fed to the model, in (height, width) format. Used to remove padding.
436
- * @param {Object} options Optional parameters for post-processing.
437
- * @param {number} [options.mask_threshold] The threshold to use for binarizing the masks.
438
- * @param {boolean} [options.binarize] Whether to binarize the masks.
439
- * @param {Object} [options.pad_size] The target size the images were padded to before being passed to the model. If `null`, the target size is assumed to be the processor's `pad_size`.
440
- * @param {number} [options.pad_size.height] The height the images were padded to.
441
- * @param {number} [options.pad_size.width] The width the images were padded to.
442
- * @returns {Promise<Tensor[]>} Batched masks in batch_size, num_channels, height, width) format, where (height, width) is given by original_size.
443
- */
444
- post_process_masks(masks: Tensor, original_sizes: [number, number][], reshaped_input_sizes: [number, number][], { mask_threshold, binarize, pad_size, }?: {
445
- mask_threshold?: number;
446
- binarize?: boolean;
447
- pad_size?: {
448
- height?: number;
449
- width?: number;
450
- };
451
- }): Promise<Tensor[]>;
452
- /**
453
- * Generates a list of crop boxes of different sizes. Each layer has (2**i)**2 boxes for the ith layer.
454
- * @param {RawImage} image Input original image
455
- * @param {number} target_size Target size of the resized image
456
- * @param {Object} options Options for generating crop boxes
457
- * @param {number} [options.crop_n_layers] If >0, mask prediction will be run again on crops of the image.
458
- * Sets the number of layers to run, where each layer has 2**i_layer number of image crops.
459
- * @param {number} [options.overlap_ratio] Sets the degree to which crops overlap. In the first crop layer,
460
- * crops will overlap by this fraction of the image length. Later layers with more crops scale down this overlap.
461
- * @param {number} [options.points_per_crop] Number of points to sample from each crop.
462
- * @param {number} [options.crop_n_points_downscale_factor] The number of points-per-side sampled in layer n is
463
- * scaled down by crop_n_points_downscale_factor**n.
464
- * @returns {Object} An object containing the crop boxes, number of points per crop, cropped images, and input labels.
465
- */
466
- generate_crop_boxes(image: RawImage, target_size: number, { crop_n_layers, overlap_ratio, points_per_crop, crop_n_points_downscale_factor, }?: {
467
- crop_n_layers?: number;
468
- overlap_ratio?: number;
469
- points_per_crop?: number;
470
- crop_n_points_downscale_factor?: number;
471
- }): any;
472
- }
473
- export class Swin2SRImageProcessor extends ImageFeatureExtractor {
474
- pad_image(pixelData: any, imgDims: any, padSize: any, options?: {}): [Float32Array, number[]];
475
- }
476
- export class VitMatteImageProcessor extends ImageFeatureExtractor {
477
- /**
478
- * Calls the feature extraction process on an array of images, preprocesses
479
- * each image, and concatenates the resulting features into a single Tensor.
480
- * @param {RawImage[]} images The image(s) to extract features from.
481
- * @param {RawImage[]} trimaps The trimaps(s) to extract features from.
482
- * @returns {Promise<ImageFeatureExtractorResult>} An object containing the concatenated pixel values of the preprocessed images.
483
- */
484
- _call(images: RawImage[], trimaps: RawImage[]): Promise<ImageFeatureExtractorResult>;
485
- }
486
- export class WhisperFeatureExtractor extends FeatureExtractor {
487
- constructor(config: any);
488
- window: Float64Array;
489
- /**
490
- * Computes the log-Mel spectrogram of the provided audio waveform.
491
- * @param {Float32Array|Float64Array} waveform The audio waveform to process.
492
- * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
493
- */
494
- _extract_fbank_features(waveform: Float32Array | Float64Array): Promise<Tensor>;
495
- /**
496
- * Asynchronously extracts features from a given audio using the provided configuration.
497
- * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
498
- * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
499
- */
500
- _call(audio: Float32Array | Float64Array): Promise<{
501
- input_features: Tensor;
502
- }>;
503
- }
504
- export class Wav2Vec2FeatureExtractor extends FeatureExtractor {
505
- /**
506
- * @param {Float32Array} input_values
507
- * @returns {Float32Array}
508
- */
509
- _zero_mean_unit_var_norm(input_values: Float32Array): Float32Array;
510
- /**
511
- * Asynchronously extracts features from a given audio using the provided configuration.
512
- * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
513
- * @returns {Promise<{ input_values: Tensor; attention_mask: Tensor }>} A Promise resolving to an object containing the extracted input features and attention mask as Tensors.
514
- */
515
- _call(audio: Float32Array | Float64Array): Promise<{
516
- input_values: Tensor;
517
- attention_mask: Tensor;
518
- }>;
519
- }
520
- export class SeamlessM4TFeatureExtractor extends FeatureExtractor {
521
- constructor(config: any);
522
- mel_filters: number[][];
523
- window: Float64Array;
524
- /**
525
- * Computes the log-Mel spectrogram of the provided audio waveform.
526
- * @param {Float32Array|Float64Array} waveform The audio waveform to process.
527
- * @param {number} max_length The maximum number of frames to return.
528
- * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
529
- */
530
- _extract_fbank_features(waveform: Float32Array | Float64Array, max_length: number): Promise<Tensor>;
531
- /**
532
- * Asynchronously extracts features from a given audio using the provided configuration.
533
- * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
534
- * @param {Object} options Optional parameters for feature extraction.
535
- * @param {boolean} [options.padding=true] Whether to pad the sequence to a multiple of `pad_to_multiple_of`.
536
- * @param {number} [options.pad_to_multiple_of=2] The number to pad the sequence to a multiple of.
537
- * @param {boolean} [options.do_normalize_per_mel_bins=true] Whether or not to zero-mean unit-variance normalize the input per mel-channel.
538
- * @param {boolean} [options.return_attention_mask=true] Whether to return the attention mask.
539
- * @returns {Promise<{ input_features: Tensor, attention_mask?: Tensor }>} A Promise resolving to an object containing the extracted input features and attention masks as Tensors.
540
- */
541
- _call(audio: Float32Array | Float64Array, { padding, pad_to_multiple_of, do_normalize_per_mel_bins, return_attention_mask, }?: {
542
- padding?: boolean;
543
- pad_to_multiple_of?: number;
544
- do_normalize_per_mel_bins?: boolean;
545
- return_attention_mask?: boolean;
546
- }): Promise<{
547
- input_features: Tensor;
548
- attention_mask?: Tensor;
549
- }>;
550
- }
551
- export class ASTFeatureExtractor extends FeatureExtractor {
552
- constructor(config: any);
553
- mel_filters: number[][];
554
- window: Float64Array;
555
- mean: any;
556
- std: any;
557
- /**
558
- * Computes the log-Mel spectrogram of the provided audio waveform.
559
- * @param {Float32Array|Float64Array} waveform The audio waveform to process.
560
- * @param {number} max_length The maximum number of frames to return.
561
- * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
562
- */
563
- _extract_fbank_features(waveform: Float32Array | Float64Array, max_length: number): Promise<Tensor>;
564
- /**
565
- * Asynchronously extracts features from a given audio using the provided configuration.
566
- * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
567
- * @returns {Promise<{ input_values: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
568
- */
569
- _call(audio: Float32Array | Float64Array): Promise<{
570
- input_values: Tensor;
571
- }>;
572
- }
573
- export class ClapFeatureExtractor extends FeatureExtractor {
574
- constructor(config: any);
575
- mel_filters: number[][];
576
- mel_filters_slaney: number[][];
577
- window: Float64Array;
578
- /**
579
- * Extracts the mel spectrogram and prepares it for the mode based on the `truncation` and `padding` arguments.
580
- *
581
- * Four different path are possible:
582
- * - `truncation="fusion"` and the length of the waveform is greater than the max length: the mel spectrogram
583
- * will be computed on the entire audio. 3 random crops and a dowsampled version of the full mel spectrogram
584
- * are then stacked together. They will later be used for `feature_fusion`.
585
- * - `truncation="rand_trunc"` and the length of the waveform is smaller than the max length: the audio is
586
- * padded based on `padding`.
587
- * - `truncation="fusion"` and the length of the waveform is smaller than the max length: the audio is padded
588
- * based on `padding`, and is repeated `4` times.
589
- * - `truncation="rand_trunc"` and the length of the waveform is greater than the max length: the mel
590
- * spectrogram will be computed on a random crop of the waveform.
591
- *
592
- * @param {Float32Array|Float64Array} waveform The input waveform.
593
- * @param {number} max_length The maximum length of the waveform.
594
- * @param {string} truncation The truncation strategy to use.
595
- * @param {string} padding The padding strategy to use.
596
- * @returns {Promise<Tensor>} An object containing the mel spectrogram data as a Float32Array, its dimensions as an array of numbers, and a boolean indicating whether the waveform was longer than the max length.
597
- * @private
598
- */
599
- private _get_input_mel;
600
- /**
601
- * Compute the log-mel spectrogram of the provided `waveform` using the Hann window.
602
- * In CLAP, two different filter banks are used depending on the truncation pattern:
603
- * - `self.mel_filters`: they correspond to the default parameters of `torchaudio` which can be obtained from
604
- * calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation`
605
- * is set to `"fusion"`.
606
- * - `self.mel_filteres_slaney` : they correspond to the default parameters of `librosa` which used
607
- * `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original
608
- * implementation when the truncation mode is not `"fusion"`.
609
- *
610
- * @param {Float32Array|Float64Array} waveform The audio waveform to process.
611
- * @param {number[][]} mel_filters The mel filters to use.
612
- * @param {number} [max_length=null] The maximum number of frames to return.
613
- * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
614
- */
615
- _extract_fbank_features(waveform: Float32Array | Float64Array, mel_filters: number[][], max_length?: number): Promise<Tensor>;
616
- /**
617
- * Asynchronously extracts features from a given audio using the provided configuration.
618
- * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
619
- * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
620
- */
621
- _call(audio: Float32Array | Float64Array, { max_length, }?: {
622
- max_length?: any;
623
- }): Promise<{
624
- input_features: Tensor;
625
- }>;
626
- }
627
- export class PyAnnoteFeatureExtractor extends FeatureExtractor {
628
- /**
629
- * Asynchronously extracts features from a given audio using the provided configuration.
630
- * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
631
- * @returns {Promise<{ input_values: Tensor; }>} The extracted input features.
632
- */
633
- _call(audio: Float32Array | Float64Array): Promise<{
634
- input_values: Tensor;
635
- }>;
636
- /**
637
- * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
638
- * @param {number} samples The number of frames in the audio.
639
- * @returns {number} The number of frames in the audio.
640
- */
641
- samples_to_frames(samples: number): number;
642
- /**
643
- * Post-processes the speaker diarization logits output by the model.
644
- * @param {Tensor} logits The speaker diarization logits output by the model.
645
- * @param {number} num_samples Number of samples in the input audio.
646
- * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
647
- */
648
- post_process_speaker_diarization(logits: Tensor, num_samples: number): Array<Array<{
649
- id: number;
650
- start: number;
651
- end: number;
652
- confidence: number;
653
- }>>;
654
- }
655
- export class WeSpeakerFeatureExtractor extends FeatureExtractor {
656
- constructor(config: any);
657
- mel_filters: number[][];
658
- window: Float64Array;
659
- min_num_frames: any;
660
- /**
661
- * Computes the log-Mel spectrogram of the provided audio waveform.
662
- * @param {Float32Array|Float64Array} waveform The audio waveform to process.
663
- * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
664
- */
665
- _extract_fbank_features(waveform: Float32Array | Float64Array): Promise<Tensor>;
666
- /**
667
- * Asynchronously extracts features from a given audio using the provided configuration.
668
- * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
669
- * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
670
- */
671
- _call(audio: Float32Array | Float64Array): Promise<{
672
- input_features: Tensor;
673
- }>;
674
- }
675
- export class SpeechT5FeatureExtractor extends FeatureExtractor {
676
- }
677
- declare const Processor_base: new () => {
678
- (...args: any[]): any;
679
- _call(...args: any[]): any;
680
- };
681
- /**
682
- * Represents a Processor that extracts features from an input.
683
- * @extends Callable
684
- */
685
- export class Processor extends Processor_base {
686
- /**
687
- * Creates a new Processor with the given feature extractor.
688
- * @param {FeatureExtractor} feature_extractor The function used to extract features from the input.
689
- */
690
- constructor(feature_extractor: FeatureExtractor);
691
- feature_extractor: FeatureExtractor;
692
- /**
693
- * Calls the feature_extractor function with the given input.
694
- * @param {any} input The input to extract features from.
695
- * @param {...any} args Additional arguments.
696
- * @returns {Promise<any>} A Promise that resolves with the extracted features.
697
- */
698
- _call(input: any, ...args: any[]): Promise<any>;
699
- }
700
- export class SamProcessor extends Processor {
701
- /**
702
- * @borrows SamImageProcessor#_call as _call
703
- */
704
- _call(...args: any[]): Promise<any>;
705
- /**
706
- * @borrows SamImageProcessor#post_process_masks as post_process_masks
707
- */
708
- post_process_masks(...args: any[]): any;
709
- /**
710
- * @borrows SamImageProcessor#reshape_input_points as reshape_input_points
711
- */
712
- reshape_input_points(...args: any[]): any;
713
- }
714
- /**
715
- * Represents a WhisperProcessor that extracts features from an audio input.
716
- * @extends Processor
717
- */
718
- export class WhisperProcessor extends Processor {
719
- /**
720
- * Calls the feature_extractor function with the given audio input.
721
- * @param {any} audio The audio input to extract features from.
722
- * @returns {Promise<any>} A Promise that resolves with the extracted features.
723
- */
724
- _call(audio: any): Promise<any>;
725
- }
726
- export class Wav2Vec2ProcessorWithLM extends Processor {
727
- /**
728
- * Calls the feature_extractor function with the given audio input.
729
- * @param {any} audio The audio input to extract features from.
730
- * @returns {Promise<any>} A Promise that resolves with the extracted features.
731
- */
732
- _call(audio: any): Promise<any>;
733
- }
734
- export class PyAnnoteProcessor extends Processor {
735
- /**
736
- * Calls the feature_extractor function with the given audio input.
737
- * @param {any} audio The audio input to extract features from.
738
- * @returns {Promise<any>} A Promise that resolves with the extracted features.
739
- */
740
- _call(audio: any): Promise<any>;
741
- post_process_speaker_diarization(...args: any[]): any;
742
- }
743
- export class SpeechT5Processor extends Processor {
744
- /**
745
- * Calls the feature_extractor function with the given input.
746
- * @param {any} input The input to extract features from.
747
- * @returns {Promise<any>} A Promise that resolves with the extracted features.
748
- */
749
- _call(input: any): Promise<any>;
750
- }
751
- export class OwlViTProcessor extends Processor {
752
- }
753
- export class Florence2Processor extends Processor {
754
- constructor(feature_extractor: any);
755
- /** @type {Map<string, string>} */
756
- tasks_answer_post_processing_type: Map<string, string>;
757
- /** @type {Map<string, string>} */
758
- task_prompts_without_inputs: Map<string, string>;
759
- /** @type {Map<string, string>} */
760
- task_prompts_with_input: Map<string, string>;
761
- regexes: {
762
- quad_boxes: RegExp;
763
- bboxes: RegExp;
764
- };
765
- size_per_bin: number;
766
- /**
767
- * Helper function to construct prompts from input texts
768
- * @param {string|string[]} text
769
- * @returns {string[]}
770
- */
771
- construct_prompts(text: string | string[]): string[];
772
- /**
773
- * Post-process the output of the model to each of the task outputs.
774
- * @param {string} text The text to post-process.
775
- * @param {string} task The task to post-process the text for.
776
- * @param {[number, number]} image_size The size of the image. height x width.
777
- */
778
- post_process_generation(text: string, task: string, image_size: [number, number]): {
779
- [x: string]: string | {
780
- [x: string]: any[];
781
- labels: any[];
782
- };
783
- };
784
- }
785
- /**
786
- * Helper class which is used to instantiate pretrained processors with the `from_pretrained` function.
787
- * The chosen processor class is determined by the type specified in the processor config.
788
- *
789
- * **Example:** Load a processor using `from_pretrained`.
790
- * ```javascript
791
- * let processor = await AutoProcessor.from_pretrained('openai/whisper-tiny.en');
792
- * ```
793
- *
794
- * **Example:** Run an image through a processor.
795
- * ```javascript
796
- * let processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16');
797
- * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg');
798
- * let image_inputs = await processor(image);
799
- * // {
800
- * // "pixel_values": {
801
- * // "dims": [ 1, 3, 224, 224 ],
802
- * // "type": "float32",
803
- * // "data": Float32Array [ -1.558687686920166, -1.558687686920166, -1.5440893173217773, ... ],
804
- * // "size": 150528
805
- * // },
806
- * // "original_sizes": [
807
- * // [ 533, 800 ]
808
- * // ],
809
- * // "reshaped_input_sizes": [
810
- * // [ 224, 224 ]
811
- * // ]
812
- * // }
813
- * ```
814
- */
815
- export class AutoProcessor {
816
- static FEATURE_EXTRACTOR_CLASS_MAPPING: {
817
- ImageFeatureExtractor: typeof ImageFeatureExtractor;
818
- WhisperFeatureExtractor: typeof WhisperFeatureExtractor;
819
- ViTFeatureExtractor: typeof ViTFeatureExtractor;
820
- MobileViTFeatureExtractor: typeof MobileViTFeatureExtractor;
821
- MobileViTImageProcessor: typeof MobileViTImageProcessor;
822
- MobileNetV1FeatureExtractor: typeof MobileNetV1FeatureExtractor;
823
- MobileNetV2FeatureExtractor: typeof MobileNetV2FeatureExtractor;
824
- MobileNetV3FeatureExtractor: typeof MobileNetV3FeatureExtractor;
825
- MobileNetV4FeatureExtractor: typeof MobileNetV4FeatureExtractor;
826
- OwlViTFeatureExtractor: typeof OwlViTFeatureExtractor;
827
- Owlv2ImageProcessor: typeof Owlv2ImageProcessor;
828
- CLIPFeatureExtractor: typeof CLIPFeatureExtractor;
829
- CLIPImageProcessor: typeof CLIPImageProcessor;
830
- Florence2Processor: typeof Florence2Processor;
831
- ChineseCLIPFeatureExtractor: typeof ChineseCLIPFeatureExtractor;
832
- SiglipImageProcessor: typeof SiglipImageProcessor;
833
- ConvNextFeatureExtractor: typeof ConvNextFeatureExtractor;
834
- ConvNextImageProcessor: typeof ConvNextImageProcessor;
835
- SegformerFeatureExtractor: typeof SegformerFeatureExtractor;
836
- SapiensFeatureExtractor: typeof SapiensFeatureExtractor;
837
- BitImageProcessor: typeof BitImageProcessor;
838
- DPTImageProcessor: typeof DPTImageProcessor;
839
- DPTFeatureExtractor: typeof DPTFeatureExtractor;
840
- PvtImageProcessor: typeof PvtImageProcessor;
841
- GLPNFeatureExtractor: typeof GLPNFeatureExtractor;
842
- BeitFeatureExtractor: typeof BeitFeatureExtractor;
843
- DeiTFeatureExtractor: typeof DeiTFeatureExtractor;
844
- DetrFeatureExtractor: typeof DetrFeatureExtractor;
845
- RTDetrImageProcessor: typeof RTDetrImageProcessor;
846
- MaskFormerFeatureExtractor: typeof MaskFormerFeatureExtractor;
847
- YolosFeatureExtractor: typeof YolosFeatureExtractor;
848
- DonutFeatureExtractor: typeof DonutFeatureExtractor;
849
- DonutImageProcessor: typeof DonutImageProcessor;
850
- NougatImageProcessor: typeof NougatImageProcessor;
851
- EfficientNetImageProcessor: typeof EfficientNetImageProcessor;
852
- ViTImageProcessor: typeof ViTImageProcessor;
853
- VitMatteImageProcessor: typeof VitMatteImageProcessor;
854
- SamImageProcessor: typeof SamImageProcessor;
855
- Swin2SRImageProcessor: typeof Swin2SRImageProcessor;
856
- Wav2Vec2FeatureExtractor: typeof Wav2Vec2FeatureExtractor;
857
- SeamlessM4TFeatureExtractor: typeof SeamlessM4TFeatureExtractor;
858
- SpeechT5FeatureExtractor: typeof SpeechT5FeatureExtractor;
859
- ASTFeatureExtractor: typeof ASTFeatureExtractor;
860
- ClapFeatureExtractor: typeof ClapFeatureExtractor;
861
- PyAnnoteFeatureExtractor: typeof PyAnnoteFeatureExtractor;
862
- WeSpeakerFeatureExtractor: typeof WeSpeakerFeatureExtractor;
863
- };
864
- static PROCESSOR_CLASS_MAPPING: {
865
- WhisperProcessor: typeof WhisperProcessor;
866
- Wav2Vec2ProcessorWithLM: typeof Wav2Vec2ProcessorWithLM;
867
- PyAnnoteProcessor: typeof PyAnnoteProcessor;
868
- SamProcessor: typeof SamProcessor;
869
- SpeechT5Processor: typeof SpeechT5Processor;
870
- OwlViTProcessor: typeof OwlViTProcessor;
871
- Florence2Processor: typeof Florence2Processor;
872
- };
873
- /**
874
- * Instantiate one of the processor classes of the library from a pretrained model.
875
- *
876
- * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object
877
- * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
878
- *
879
- * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
880
- * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
881
- * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
882
- * user or organization name, like `dbmdz/bert-base-german-cased`.
883
- * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
884
- * @param {import('./utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
885
- *
886
- * @returns {Promise<Processor>} A new instance of the Processor class.
887
- */
888
- static from_pretrained(pretrained_model_name_or_path: string, { progress_callback, config, cache_dir, local_files_only, revision, }?: import('./utils/hub.js').PretrainedOptions): Promise<Processor>;
889
- }
890
- /**
891
- * Named tuple to indicate the order we are using is (height x width), even though
892
- * the Graphics’ industry standard is (width x height).
893
- */
894
- export type HeightWidth = [height: number, width: number];
895
- export type ImageFeatureExtractorResult = {
896
- /**
897
- * The pixel values of the batched preprocessed images.
898
- */
899
- pixel_values: Tensor;
900
- /**
901
- * Array of two-dimensional tuples like [[480, 640]].
902
- */
903
- original_sizes: HeightWidth[];
904
- /**
905
- * Array of two-dimensional tuples like [[1000, 1330]].
906
- */
907
- reshaped_input_sizes: HeightWidth[];
908
- };
909
- export type DetrFeatureExtractorResultProps = {
910
- pixel_mask: Tensor;
911
- };
912
- export type DetrFeatureExtractorResult = ImageFeatureExtractorResult & DetrFeatureExtractorResultProps;
913
- export type SamImageProcessorResult = {
914
- pixel_values: Tensor;
915
- original_sizes: HeightWidth[];
916
- reshaped_input_sizes: HeightWidth[];
917
- input_points?: Tensor;
918
- input_labels?: Tensor;
919
- input_boxes?: Tensor;
920
- };
921
- import { RawImage } from './utils/image.js';
922
- import { Tensor } from './utils/tensor.js';
923
- export {};
924
- //# sourceMappingURL=processors.d.ts.map