@huggingface/transformers 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239):
  1. package/README.md +14 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +16607 -13472
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +16601 -13451
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +238 -52
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +229 -43
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +240 -54
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +16017 -12878
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +7 -7
  16. package/src/base/feature_extraction_utils.js +54 -0
  17. package/src/base/image_processors_utils.js +1089 -0
  18. package/src/base/processing_utils.js +145 -0
  19. package/src/configs.js +15 -3
  20. package/src/env.js +15 -4
  21. package/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +90 -0
  22. package/src/models/auto/feature_extraction_auto.js +41 -0
  23. package/src/models/auto/image_processing_auto.js +29 -0
  24. package/src/models/auto/processing_auto.js +100 -0
  25. package/src/models/beit/image_processing_beit.js +5 -0
  26. package/src/models/bit/image_processing_bit.js +5 -0
  27. package/src/models/chinese_clip/image_processing_chinese_clip.js +5 -0
  28. package/src/models/clap/feature_extraction_clap.js +159 -0
  29. package/src/models/clip/image_processing_clip.js +6 -0
  30. package/src/models/convnext/image_processing_convnext.js +45 -0
  31. package/src/models/deit/image_processing_deit.js +6 -0
  32. package/src/models/detr/image_processing_detr.js +52 -0
  33. package/src/models/donut/image_processing_donut.js +31 -0
  34. package/src/models/dpt/image_processing_dpt.js +6 -0
  35. package/src/models/efficientnet/image_processing_efficientnet.js +13 -0
  36. package/src/models/feature_extractors.js +12 -0
  37. package/src/models/florence2/processing_florence2.js +128 -0
  38. package/src/models/glpn/image_processing_glpn.js +5 -0
  39. package/src/models/image_processors.js +36 -0
  40. package/src/models/janus/image_processing_janus.js +26 -0
  41. package/src/models/janus/processing_janus.js +123 -0
  42. package/src/models/jina_clip/image_processing_jina_clip.js +26 -0
  43. package/src/models/jina_clip/processing_jina_clip.js +24 -0
  44. package/src/models/llava_onevision/image_processing_llava_onevision.js +5 -0
  45. package/src/models/mask2former/image_processing_mask2former.js +5 -0
  46. package/src/models/maskformer/image_processing_maskformer.js +18 -0
  47. package/src/models/mgp_str/processing_mgp_str.js +170 -0
  48. package/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +7 -0
  49. package/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +7 -0
  50. package/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +7 -0
  51. package/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +7 -0
  52. package/src/models/mobilevit/image_processing_mobilevit.js +6 -0
  53. package/src/models/nougat/image_processing_nougat.js +5 -0
  54. package/src/models/owlv2/image_processing_owlv2.js +5 -0
  55. package/src/models/owlvit/image_processing_owlvit.js +12 -0
  56. package/src/models/owlvit/processing_owlvit.js +7 -0
  57. package/src/models/processors.js +11 -0
  58. package/src/models/pvt/image_processing_pvt.js +5 -0
  59. package/src/models/pyannote/feature_extraction_pyannote.js +28 -0
  60. package/src/models/pyannote/processing_pyannote.js +71 -0
  61. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +52 -0
  62. package/src/models/qwen2_vl/processing_qwen2_vl.js +52 -0
  63. package/src/models/rt_detr/image_processing_rt_detr.js +12 -0
  64. package/src/models/sam/image_processing_sam.js +242 -0
  65. package/src/models/sam/processing_sam.js +20 -0
  66. package/src/models/sapiens/image_processing_sapiens.js +13 -0
  67. package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +180 -0
  68. package/src/models/segformer/image_processing_segformer.js +13 -0
  69. package/src/models/siglip/image_processing_siglip.js +5 -0
  70. package/src/models/speecht5/feature_extraction_speecht5.js +4 -0
  71. package/src/models/speecht5/processing_speecht5.js +17 -0
  72. package/src/models/swin2sr/image_processing_swin2sr.js +24 -0
  73. package/src/models/vit/image_processing_vit.js +7 -0
  74. package/src/models/vitmatte/image_processing_vitmatte.js +50 -0
  75. package/src/models/vitpose/image_processing_vitpose.js +89 -0
  76. package/src/models/wav2vec2/feature_extraction_wav2vec2.js +44 -0
  77. package/src/models/wav2vec2/processing_wav2vec2.js +15 -0
  78. package/src/models/wespeaker/feature_extraction_wespeaker.js +100 -0
  79. package/src/models/whisper/feature_extraction_whisper.js +84 -0
  80. package/src/models/whisper/processing_whisper.js +21 -0
  81. package/src/models/yolos/image_processing_yolos.js +12 -0
  82. package/src/models.js +695 -32
  83. package/src/pipelines.js +8 -8
  84. package/src/tokenizers.js +5 -0
  85. package/src/transformers.js +15 -2
  86. package/src/utils/constants.js +8 -1
  87. package/src/utils/core.js +37 -9
  88. package/src/utils/hub.js +2 -1
  89. package/src/utils/image.js +68 -17
  90. package/src/utils/tensor.js +33 -1
  91. package/types/base/feature_extraction_utils.d.ts +41 -0
  92. package/types/base/feature_extraction_utils.d.ts.map +1 -0
  93. package/types/base/image_processors_utils.d.ts +323 -0
  94. package/types/base/image_processors_utils.d.ts.map +1 -0
  95. package/types/base/processing_utils.d.ts +80 -0
  96. package/types/base/processing_utils.d.ts.map +1 -0
  97. package/types/configs.d.ts +4 -1
  98. package/types/configs.d.ts.map +1 -1
  99. package/types/env.d.ts.map +1 -1
  100. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +25 -0
  101. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -0
  102. package/types/models/auto/feature_extraction_auto.d.ts +5 -0
  103. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -0
  104. package/types/models/auto/image_processing_auto.d.ts +5 -0
  105. package/types/models/auto/image_processing_auto.d.ts.map +1 -0
  106. package/types/models/auto/processing_auto.d.ts +35 -0
  107. package/types/models/auto/processing_auto.d.ts.map +1 -0
  108. package/types/models/beit/image_processing_beit.d.ts +4 -0
  109. package/types/models/beit/image_processing_beit.d.ts.map +1 -0
  110. package/types/models/bit/image_processing_bit.d.ts +4 -0
  111. package/types/models/bit/image_processing_bit.d.ts.map +1 -0
  112. package/types/models/chinese_clip/image_processing_chinese_clip.d.ts +4 -0
  113. package/types/models/chinese_clip/image_processing_chinese_clip.d.ts.map +1 -0
  114. package/types/models/clap/feature_extraction_clap.d.ts +57 -0
  115. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -0
  116. package/types/models/clip/image_processing_clip.d.ts +6 -0
  117. package/types/models/clip/image_processing_clip.d.ts.map +1 -0
  118. package/types/models/convnext/image_processing_convnext.d.ts +12 -0
  119. package/types/models/convnext/image_processing_convnext.d.ts.map +1 -0
  120. package/types/models/deit/image_processing_deit.d.ts +6 -0
  121. package/types/models/deit/image_processing_deit.d.ts.map +1 -0
  122. package/types/models/detr/image_processing_detr.d.ts +42 -0
  123. package/types/models/detr/image_processing_detr.d.ts.map +1 -0
  124. package/types/models/donut/image_processing_donut.d.ts +7 -0
  125. package/types/models/donut/image_processing_donut.d.ts.map +1 -0
  126. package/types/models/dpt/image_processing_dpt.d.ts +6 -0
  127. package/types/models/dpt/image_processing_dpt.d.ts.map +1 -0
  128. package/types/models/efficientnet/image_processing_efficientnet.d.ts +6 -0
  129. package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -0
  130. package/types/models/feature_extractors.d.ts +10 -0
  131. package/types/models/feature_extractors.d.ts.map +1 -0
  132. package/types/models/florence2/processing_florence2.d.ts +39 -0
  133. package/types/models/florence2/processing_florence2.d.ts.map +1 -0
  134. package/types/models/glpn/image_processing_glpn.d.ts +4 -0
  135. package/types/models/glpn/image_processing_glpn.d.ts.map +1 -0
  136. package/types/models/image_processors.d.ts +36 -0
  137. package/types/models/image_processors.d.ts.map +1 -0
  138. package/types/models/janus/image_processing_janus.d.ts +7 -0
  139. package/types/models/janus/image_processing_janus.d.ts.map +1 -0
  140. package/types/models/janus/processing_janus.d.ts +77 -0
  141. package/types/models/janus/processing_janus.d.ts.map +1 -0
  142. package/types/models/jina_clip/image_processing_jina_clip.d.ts +5 -0
  143. package/types/models/jina_clip/image_processing_jina_clip.d.ts.map +1 -0
  144. package/types/models/jina_clip/processing_jina_clip.d.ts +9 -0
  145. package/types/models/jina_clip/processing_jina_clip.d.ts.map +1 -0
  146. package/types/models/llava_onevision/image_processing_llava_onevision.d.ts +4 -0
  147. package/types/models/llava_onevision/image_processing_llava_onevision.d.ts.map +1 -0
  148. package/types/models/mask2former/image_processing_mask2former.d.ts +4 -0
  149. package/types/models/mask2former/image_processing_mask2former.d.ts.map +1 -0
  150. package/types/models/maskformer/image_processing_maskformer.d.ts +22 -0
  151. package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -0
  152. package/types/models/mgp_str/processing_mgp_str.d.ts +64 -0
  153. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -0
  154. package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts +6 -0
  155. package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts.map +1 -0
  156. package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts +6 -0
  157. package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts.map +1 -0
  158. package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts +6 -0
  159. package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts.map +1 -0
  160. package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts +6 -0
  161. package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts.map +1 -0
  162. package/types/models/mobilevit/image_processing_mobilevit.d.ts +6 -0
  163. package/types/models/mobilevit/image_processing_mobilevit.d.ts.map +1 -0
  164. package/types/models/nougat/image_processing_nougat.d.ts +4 -0
  165. package/types/models/nougat/image_processing_nougat.d.ts.map +1 -0
  166. package/types/models/owlv2/image_processing_owlv2.d.ts +4 -0
  167. package/types/models/owlv2/image_processing_owlv2.d.ts.map +1 -0
  168. package/types/models/owlvit/image_processing_owlvit.d.ts +10 -0
  169. package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -0
  170. package/types/models/owlvit/processing_owlvit.d.ts +8 -0
  171. package/types/models/owlvit/processing_owlvit.d.ts.map +1 -0
  172. package/types/models/processors.d.ts +12 -0
  173. package/types/models/processors.d.ts.map +1 -0
  174. package/types/models/pvt/image_processing_pvt.d.ts +4 -0
  175. package/types/models/pvt/image_processing_pvt.d.ts.map +1 -0
  176. package/types/models/pyannote/feature_extraction_pyannote.d.ts +13 -0
  177. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -0
  178. package/types/models/pyannote/processing_pyannote.d.ts +30 -0
  179. package/types/models/pyannote/processing_pyannote.d.ts.map +1 -0
  180. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +11 -0
  181. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -0
  182. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +17 -0
  183. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -0
  184. package/types/models/rt_detr/image_processing_rt_detr.d.ts +8 -0
  185. package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -0
  186. package/types/models/sam/image_processing_sam.d.ts +103 -0
  187. package/types/models/sam/image_processing_sam.d.ts.map +1 -0
  188. package/types/models/sam/processing_sam.d.ts +9 -0
  189. package/types/models/sam/processing_sam.d.ts.map +1 -0
  190. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +34 -0
  191. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -0
  192. package/types/models/segformer/image_processing_segformer.d.ts +10 -0
  193. package/types/models/segformer/image_processing_segformer.d.ts.map +1 -0
  194. package/types/models/siglip/image_processing_siglip.d.ts +4 -0
  195. package/types/models/siglip/image_processing_siglip.d.ts.map +1 -0
  196. package/types/models/speecht5/feature_extraction_speecht5.d.ts +4 -0
  197. package/types/models/speecht5/feature_extraction_speecht5.d.ts.map +1 -0
  198. package/types/models/speecht5/processing_speecht5.d.ts +14 -0
  199. package/types/models/speecht5/processing_speecht5.d.ts.map +1 -0
  200. package/types/models/swin2sr/image_processing_swin2sr.d.ts +5 -0
  201. package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -0
  202. package/types/models/vit/image_processing_vit.d.ts +6 -0
  203. package/types/models/vit/image_processing_vit.d.ts.map +1 -0
  204. package/types/models/vitmatte/image_processing_vitmatte.d.ts +12 -0
  205. package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -0
  206. package/types/models/vitpose/image_processing_vitpose.d.ts +26 -0
  207. package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -0
  208. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts +19 -0
  209. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -0
  210. package/types/models/wav2vec2/processing_wav2vec2.d.ts +12 -0
  211. package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -0
  212. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +23 -0
  213. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -0
  214. package/types/models/whisper/feature_extraction_whisper.d.ts +21 -0
  215. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -0
  216. package/types/models/whisper/processing_whisper.d.ts +17 -0
  217. package/types/models/whisper/processing_whisper.d.ts.map +1 -0
  218. package/types/models/yolos/image_processing_yolos.d.ts +10 -0
  219. package/types/models/yolos/image_processing_yolos.d.ts.map +1 -0
  220. package/types/models.d.ts +152 -0
  221. package/types/models.d.ts.map +1 -1
  222. package/types/pipelines.d.ts +2 -3
  223. package/types/pipelines.d.ts.map +1 -1
  224. package/types/tokenizers.d.ts +3 -0
  225. package/types/tokenizers.d.ts.map +1 -1
  226. package/types/transformers.d.ts +10 -1
  227. package/types/utils/constants.d.ts +6 -0
  228. package/types/utils/constants.d.ts.map +1 -1
  229. package/types/utils/core.d.ts +58 -3
  230. package/types/utils/core.d.ts.map +1 -1
  231. package/types/utils/hub.d.ts +1 -1
  232. package/types/utils/hub.d.ts.map +1 -1
  233. package/types/utils/image.d.ts +10 -2
  234. package/types/utils/image.d.ts.map +1 -1
  235. package/types/utils/tensor.d.ts +34 -1
  236. package/types/utils/tensor.d.ts.map +1 -1
  237. package/src/processors.js +0 -2655
  238. package/types/processors.d.ts +0 -924
  239. package/types/processors.d.ts.map +0 -1
@@ -0,0 +1,145 @@
1
+
2
+ /**
3
+ * @file Processors are used to prepare inputs (e.g., text, image or audio) for a model.
4
+ *
5
+ * **Example:** Using a `WhisperProcessor` to prepare an audio input for a model.
6
+ * ```javascript
7
+ * import { AutoProcessor, read_audio } from '@huggingface/transformers';
8
+ *
9
+ * const processor = await AutoProcessor.from_pretrained('openai/whisper-tiny.en');
10
+ * const audio = await read_audio('https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac', 16000);
11
+ * const { input_features } = await processor(audio);
12
+ * // Tensor {
13
+ * // data: Float32Array(240000) [0.4752984642982483, 0.5597258806228638, 0.56434166431427, ...],
14
+ * // dims: [1, 80, 3000],
15
+ * // type: 'float32',
16
+ * // size: 240000,
17
+ * // }
18
+ * ```
19
+ *
20
+ * @module processors
21
+ */
22
+ import { PROCESSOR_NAME } from '../utils/constants.js';
23
+ import {
24
+ Callable,
25
+ } from '../utils/generic.js';
26
+ import { getModelJSON } from '../utils/hub.js';
27
+
28
+ /**
29
+ * @typedef {Object} ProcessorProperties Additional processor-specific properties.
30
+ * @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
31
+ */
32
+
33
+
34
+ /**
35
+ * Represents a Processor that extracts features from an input.
36
+ */
37
+ export class Processor extends Callable {
38
+ static classes = [
39
+ 'image_processor_class',
40
+ 'tokenizer_class',
41
+ 'feature_extractor_class',
42
+ ]
43
+ static uses_processor_config = false;
44
+
45
+ /**
46
+ * Creates a new Processor with the given components
47
+ * @param {Object} config
48
+ * @param {Record<string, Object>} components
49
+ */
50
+ constructor(config, components) {
51
+ super();
52
+ this.config = config;
53
+ this.components = components;
54
+ }
55
+
56
+ /**
57
+ * @returns {import('./image_processors_utils.js').ImageProcessor|undefined} The image processor of the processor, if it exists.
58
+ */
59
+ get image_processor() {
60
+ return this.components.image_processor;
61
+ }
62
+
63
+ /**
64
+ * @returns {import('../tokenizers.js').PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
65
+ */
66
+ get tokenizer() {
67
+ return this.components.tokenizer;
68
+ }
69
+
70
+ /**
71
+ * @returns {import('./feature_extraction_utils.js').FeatureExtractor|undefined} The feature extractor of the processor, if it exists.
72
+ */
73
+ get feature_extractor() {
74
+ return this.components.feature_extractor;
75
+ }
76
+
77
+ apply_chat_template(messages, options = {}) {
78
+ if (!this.tokenizer) {
79
+ throw new Error('Unable to apply chat template without a tokenizer.');
80
+ }
81
+ return this.tokenizer.apply_chat_template(messages, {
82
+ tokenize: false, // default to false
83
+ ...options,
84
+ });
85
+ }
86
+
87
+ batch_decode(...args) {
88
+ if (!this.tokenizer) {
89
+ throw new Error('Unable to decode without a tokenizer.');
90
+ }
91
+ return this.tokenizer.batch_decode(...args);
92
+ }
93
+
94
+
95
+ /**
96
+ * Calls the feature_extractor function with the given input.
97
+ * @param {any} input The input to extract features from.
98
+ * @param {...any} args Additional arguments.
99
+ * @returns {Promise<any>} A Promise that resolves with the extracted features.
100
+ */
101
+ async _call(input, ...args) {
102
+ for (const item of [this.image_processor, this.feature_extractor, this.tokenizer]) {
103
+ if (item) {
104
+ return item(input, ...args);
105
+ }
106
+ }
107
+ throw new Error('No image processor, feature extractor, or tokenizer found.');
108
+ }
109
+
110
+
111
+ /**
112
+ * Instantiate one of the processor classes of the library from a pretrained model.
113
+ *
114
+ * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object
115
+ * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
116
+ *
117
+ * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
118
+ * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
119
+ * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
120
+ * user or organization name, like `dbmdz/bert-base-german-cased`.
121
+ * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
122
+ * @param {PretrainedProcessorOptions} options Additional options for loading the processor.
123
+ *
124
+ * @returns {Promise<Processor>} A new instance of the Processor class.
125
+ */
126
+ static async from_pretrained(pretrained_model_name_or_path, options) {
127
+
128
+ const [config, components] = await Promise.all([
129
+ // TODO:
130
+ this.uses_processor_config
131
+ ? getModelJSON(pretrained_model_name_or_path, PROCESSOR_NAME, true, options)
132
+ : {},
133
+ Promise.all(
134
+ this.classes
135
+ .filter((cls) => cls in this)
136
+ .map(async (cls) => {
137
+ const component = await this[cls].from_pretrained(pretrained_model_name_or_path, options);
138
+ return [cls.replace(/_class$/, ''), component];
139
+ })
140
+ ).then(Object.fromEntries)
141
+ ]);
142
+
143
+ return new this(config, components);
144
+ }
145
+ }
package/src/configs.js CHANGED
@@ -36,6 +36,13 @@ import {
36
36
  * @typedef {import('./utils/hub.js').PretrainedOptions} PretrainedOptions
37
37
  */
38
38
 
39
+ /**
40
+ * @typedef {import('./utils/core.js').ProgressCallback} ProgressCallback
41
+ */
42
+
43
+ /**
44
+ * @typedef {import('./utils/core.js').ProgressInfo} ProgressInfo
45
+ */
39
46
 
40
47
  /**
41
48
  * Loads a config from the specified path.
@@ -61,6 +68,7 @@ function getNormalizedConfig(config) {
61
68
  case 'llava':
62
69
  case 'paligemma':
63
70
  case 'florence2':
71
+ case 'llava_onevision':
64
72
  init_normalized_config = getNormalizedConfig(config.text_config);
65
73
  break;
66
74
  case 'moondream1':
@@ -69,6 +77,9 @@ function getNormalizedConfig(config) {
69
77
  case 'musicgen':
70
78
  init_normalized_config = getNormalizedConfig(config.decoder);
71
79
  break;
80
+ case 'multi_modality':
81
+ init_normalized_config = getNormalizedConfig(config.language_config);
82
+ break;
72
83
 
73
84
  // Decoder-only models
74
85
  case 'gpt2':
@@ -91,11 +102,14 @@ function getNormalizedConfig(config) {
91
102
  mapping['hidden_size'] = 'hidden_size';
92
103
  break;
93
104
  case 'llama':
105
+ case 'olmo':
106
+ case 'mobilellm':
94
107
  case 'granite':
95
108
  case 'cohere':
96
109
  case 'mistral':
97
110
  case 'starcoder2':
98
111
  case 'qwen2':
112
+ case 'qwen2_vl':
99
113
  mapping['num_heads'] = 'num_key_value_heads';
100
114
  mapping['num_layers'] = 'num_hidden_layers';
101
115
  mapping['hidden_size'] = 'hidden_size';
@@ -216,14 +230,12 @@ function getNormalizedConfig(config) {
216
230
  */
217
231
  export function getKeyValueShapes(config, {
218
232
  prefix = 'past_key_values',
233
+ batch_size=1,
219
234
  } = {}) {
220
235
  /** @type {Record<string, number[]>} */
221
236
  const decoderFeeds = {};
222
237
  const normalized_config = config.normalized_config;
223
238
 
224
- // TODO support batches (i.e., batch_size > 1)
225
- const batch_size = 1;
226
-
227
239
  if (normalized_config.is_encoder_decoder && (
228
240
  'num_encoder_heads' in normalized_config && 'num_decoder_heads' in normalized_config
229
241
  )) {
package/src/env.js CHANGED
@@ -26,7 +26,7 @@ import fs from 'fs';
26
26
  import path from 'path';
27
27
  import url from 'url';
28
28
 
29
- const VERSION = '3.0.1';
29
+ const VERSION = '3.1.0';
30
30
 
31
31
  // Check if various APIs are available (depends on environment)
32
32
  const IS_BROWSER_ENV = typeof self !== 'undefined';
@@ -73,9 +73,20 @@ export const apis = Object.freeze({
73
73
  });
74
74
 
75
75
  const RUNNING_LOCALLY = IS_FS_AVAILABLE && IS_PATH_AVAILABLE;
76
- const dirname__ = RUNNING_LOCALLY
77
- ? path.dirname(path.dirname(url.fileURLToPath(import.meta.url)))
78
- : './';
76
+
77
+ let dirname__ = './';
78
+ if (RUNNING_LOCALLY) {
79
+ // NOTE: We wrap `import.meta` in a call to `Object` to prevent Webpack from trying to bundle it in CommonJS.
80
+ // Although we get the warning: "Accessing import.meta directly is unsupported (only property access or destructuring is supported)",
81
+ // it is safe to ignore since the bundled value (`{}`) isn't used for CommonJS environments (we use __dirname instead).
82
+ const _import_meta_url = Object(import.meta).url;
83
+
84
+ if (_import_meta_url) {
85
+ dirname__ = path.dirname(path.dirname(url.fileURLToPath(_import_meta_url))) // ESM
86
+ } else if (typeof __dirname !== 'undefined') {
87
+ dirname__ = path.dirname(__dirname) // CommonJS
88
+ }
89
+ }
79
90
 
80
91
  // Only used for environments with access to file system
81
92
  const DEFAULT_CACHE_DIR = RUNNING_LOCALLY
@@ -0,0 +1,90 @@
1
+ import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
2
+ import { Tensor } from '../../utils/tensor.js';
3
+ import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
4
+
5
+
6
/**
 * Feature extractor for the Audio Spectrogram Transformer (AST) model.
 * Converts raw audio waveforms into (optionally normalized) log-Mel spectrograms.
 */
export class ASTFeatureExtractor extends FeatureExtractor {

    constructor(config) {
        super(config);

        const sr = this.config.sampling_rate;
        const filters = mel_filter_bank(
            256,                      // num_frequency_bins
            this.config.num_mel_bins, // num_mel_filters
            20,                       // min_frequency
            Math.floor(sr / 2),       // max_frequency
            sr,                       // sampling_rate
            null,                     // norm
            "kaldi",                  // mel_scale
            true,                     // triangularize_in_mel_space
        );

        // Pad each Mel filter with a trailing zero.
        for (const filter of filters) {
            filter.push(0);
        }
        this.mel_filters = filters;

        this.window = window_function(400, 'hann', {
            periodic: false,
        })

        // Normalization statistics from the preprocessor config.
        this.mean = this.config.mean;
        this.std = this.config.std;
    }

    /**
     * Computes the log-Mel spectrogram of the provided audio waveform.
     * @param {Float32Array|Float64Array} waveform The audio waveform to process.
     * @param {number} max_length The maximum number of frames to return.
     * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
     */
    async _extract_fbank_features(waveform, max_length) {
        // NOTE: We don't pad/truncate since that is passed in as `max_num_frames`
        const options = {
            fft_length: 512,
            power: 2.0,
            center: false,
            preemphasis: 0.97,
            mel_filters: this.mel_filters,
            log_mel: 'log',
            mel_floor: 1.192092955078125e-07,
            remove_dc_offset: true,

            // Custom
            max_num_frames: max_length,
            transpose: true,
        };
        return spectrogram(
            waveform,
            this.window, // window
            400,         // frame_length
            160,         // hop_length
            options,
        )
    }

    /**
     * Asynchronously extracts features from a given audio using the provided configuration.
     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
     * @returns {Promise<{ input_values: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor.
     */
    async _call(audio) {
        validate_audio_inputs(audio, 'ASTFeatureExtractor');

        const features = await this._extract_fbank_features(audio, this.config.max_length);

        if (this.config.do_normalize) {
            // Normalize the input audio spectrogram to have mean=0, std=0.5
            const denom = this.std * 2;
            const data = features.data;
            for (let i = 0; i < data.length; ++i) {
                data[i] = (data[i] - this.mean) / denom;
            }
        }

        // Add a leading batch dimension (in-place).
        return {
            input_values: features.unsqueeze_(0)
        };
    }
}
@@ -0,0 +1,41 @@
1
+
2
+ import { FEATURE_EXTRACTOR_NAME, GITHUB_ISSUE_URL } from '../../utils/constants.js';
3
+ import { getModelJSON } from '../../utils/hub.js';
4
+ import { FeatureExtractor } from '../../base/feature_extraction_utils.js';
5
+ import * as AllFeatureExtractors from '../feature_extractors.js';
6
+
7
export class AutoFeatureExtractor {

    /**
     * Instantiate one of the feature extractor classes of the library from a pretrained model.
     *
     * The feature extractor class to instantiate is selected based on the `feature_extractor_type` property of
     * the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
     *
     * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
     * - A string, the *model id* of a pretrained feature extractor hosted inside a model repo on huggingface.co.
     *   Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
     *   user or organization name, like `dbmdz/bert-base-german-cased`.
     * - A path to a *directory* containing feature extractor files, e.g., `./my_model_directory/`.
     * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the feature extractor.
     *
     * @returns {Promise<FeatureExtractor>} A new instance of the selected feature extractor class.
     * @throws {Error} If `feature_extractor_type` does not name a known feature extractor class.
     */
    /** @type {typeof FeatureExtractor.from_pretrained} */
    static async from_pretrained(pretrained_model_name_or_path, options={}) {

        const preprocessorConfig = await getModelJSON(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, true, options);

        // Determine feature extractor class
        const key = preprocessorConfig.feature_extractor_type;
        const feature_extractor_class = AllFeatureExtractors[key];

        if (!feature_extractor_class) {
            throw new Error(`Unknown feature_extractor_type: '${key}'. Please report this at ${GITHUB_ISSUE_URL}.`);
        }

        // Instantiate feature extractor
        return new feature_extractor_class(preprocessorConfig);
    }
}
@@ -0,0 +1,29 @@
1
+
2
+ import { GITHUB_ISSUE_URL, IMAGE_PROCESSOR_NAME } from '../../utils/constants.js';
3
+ import { getModelJSON } from '../../utils/hub.js';
4
+ import { ImageProcessor } from '../../base/image_processors_utils.js';
5
+ import * as AllImageProcessors from '../image_processors.js';
6
+
7
/**
 * Helper class used to load the correct image processor for a pretrained model,
 * based on the `image_processor_type` (or legacy `feature_extractor_type`)
 * declared in its preprocessor config.
 */
export class AutoImageProcessor {

    /** @type {typeof ImageProcessor.from_pretrained} */
    static async from_pretrained(pretrained_model_name_or_path, options={}) {

        const preprocessorConfig = await getModelJSON(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME, true, options);

        // Determine image processor class. `feature_extractor_type` is the
        // legacy key used by older preprocessor configs.
        const key = preprocessorConfig.image_processor_type ?? preprocessorConfig.feature_extractor_type;

        const cls = AllImageProcessors[key];
        if (cls) {
            // Instantiate image processor
            return new cls(preprocessorConfig);
        }

        if (key !== undefined) {
            // Only log a warning if the class is not found and the key is set.
            console.warn(`Image processor type '${key}' not found, assuming base ImageProcessor. Please report this at ${GITHUB_ISSUE_URL}.`)
        }
        // Fall back to the generic base image processor.
        return new ImageProcessor(preprocessorConfig);
    }
}
@@ -0,0 +1,100 @@
1
+
2
+
3
+ import { IMAGE_PROCESSOR_NAME } from '../../utils/constants.js';
4
+ import { getModelJSON } from '../../utils/hub.js';
5
+ import { Processor } from '../../base/processing_utils.js';
6
+
7
+ import * as AllProcessors from '../processors.js';
8
+ import * as AllImageProcessors from '../image_processors.js';
9
+ import * as AllFeatureExtractors from '../feature_extractors.js';
10
+
11
/**
 * Helper class which is used to instantiate pretrained processors with the `from_pretrained` function.
 * The chosen processor class is determined by the type specified in the processor config.
 *
 * **Example:** Load a processor using `from_pretrained`.
 * ```javascript
 * let processor = await AutoProcessor.from_pretrained('openai/whisper-tiny.en');
 * ```
 *
 * **Example:** Run an image through a processor.
 * ```javascript
 * let processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16');
 * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg');
 * let image_inputs = await processor(image);
 * // {
 * //   "pixel_values": {
 * //     "dims": [ 1, 3, 224, 224 ],
 * //     "type": "float32",
 * //     "data": Float32Array [ -1.558687686920166, -1.558687686920166, -1.5440893173217773, ... ],
 * //     "size": 150528
 * //   },
 * //   "original_sizes": [
 * //     [ 533, 800 ]
 * //   ],
 * //   "reshaped_input_sizes": [
 * //     [ 224, 224 ]
 * //   ]
 * // }
 * ```
 */
export class AutoProcessor {

    /**
     * Instantiate one of the processor classes of the library from a pretrained model.
     *
     * The processor class to instantiate is selected based on the `image_processor_type`
     * (or the legacy `feature_extractor_type`) property of the config object, either passed
     * as an argument or loaded from `pretrained_model_name_or_path` when possible.
     *
     * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
     *  - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
     *    Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
     *    user or organization name, like `dbmdz/bert-base-german-cased`.
     *  - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
     * @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
     *
     * @returns {Promise<Processor>} A new instance of the Processor class.
     */

    /** @type {typeof Processor.from_pretrained} */
    static async from_pretrained(pretrained_model_name_or_path, options = {}) {

        // TODO: first check for processor.json
        const config = await getModelJSON(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME, true, options);

        const { image_processor_type, feature_extractor_type, processor_class } = config;

        // A dedicated processor class takes precedence over component-level types.
        if (processor_class && AllProcessors[processor_class]) {
            return AllProcessors[processor_class].from_pretrained(pretrained_model_name_or_path, options);
        }

        if (!image_processor_type && !feature_extractor_type) {
            throw new Error('No `image_processor_type` or `feature_extractor_type` found in the config.');
        }

        const components = {};

        if (image_processor_type) {
            const imageProcessorClass = AllImageProcessors[image_processor_type];
            if (!imageProcessorClass) {
                throw new Error(`Unknown image_processor_type: '${image_processor_type}'.`);
            }
            components.image_processor = new imageProcessorClass(config);
        }

        if (feature_extractor_type) {
            // Handle legacy case where image processors were specified as feature extractors
            const legacyImageProcessorClass = AllImageProcessors[feature_extractor_type];
            if (legacyImageProcessorClass) {
                components.image_processor = new legacyImageProcessorClass(config);
            } else {
                const featureExtractorClass = AllFeatureExtractors[feature_extractor_type];
                if (!featureExtractorClass) {
                    throw new Error(`Unknown feature_extractor_type: '${feature_extractor_type}'.`);
                }
                components.feature_extractor = new featureExtractorClass(config);
            }
        }

        // Wrap the resolved components in a generic Processor with an empty config.
        return new Processor({}, components);
    }
}
@@ -0,0 +1,5 @@
1
+ import {
2
+ ImageProcessor,
3
+ } from "../../base/image_processors_utils.js";
4
+
5
// BEiT image processor: inherits all behavior from the base `ImageProcessor`.
// NOTE(review): the "FeatureExtractor" name appears to match the legacy
// `feature_extractor_type` config key — confirm before renaming.
export class BeitFeatureExtractor extends ImageProcessor { }
@@ -0,0 +1,5 @@
1
+ import {
2
+ ImageProcessor,
3
+ } from "../../base/image_processors_utils.js";
4
+
5
// BiT image processor: inherits all behavior from the base `ImageProcessor`;
// model-specific settings come solely from the preprocessor config it is constructed with.
export class BitImageProcessor extends ImageProcessor { }
@@ -0,0 +1,5 @@
1
+ import {
2
+ ImageProcessor,
3
+ } from "../../base/image_processors_utils.js";
4
+
5
// Chinese-CLIP image processor: inherits all behavior from the base `ImageProcessor`.
// NOTE(review): the "FeatureExtractor" name appears to match the legacy
// `feature_extractor_type` config key — confirm before renaming.
export class ChineseCLIPFeatureExtractor extends ImageProcessor { }