@huggingface/transformers 4.0.0-next.3 → 4.0.0-next.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. package/README.md +10 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +28 -28
  3. package/dist/transformers.js +3109 -2099
  4. package/dist/transformers.min.js +17 -19
  5. package/dist/transformers.node.cjs +3100 -2060
  6. package/dist/transformers.node.min.cjs +19 -21
  7. package/dist/transformers.node.min.mjs +19 -21
  8. package/dist/transformers.node.mjs +3085 -2060
  9. package/dist/transformers.web.js +1312 -276
  10. package/dist/transformers.web.min.js +15 -15
  11. package/package.json +4 -4
  12. package/src/backends/onnx.js +66 -10
  13. package/src/backends/utils/cacheWasm.js +9 -6
  14. package/src/configs.js +52 -3
  15. package/src/env.js +66 -7
  16. package/src/generation/logits_sampler.js +3 -15
  17. package/src/image_processors_utils.js +2 -6
  18. package/src/models/afmoe/modeling_afmoe.js +5 -0
  19. package/src/models/auto/image_processing_auto.js +2 -1
  20. package/src/models/auto/modeling_auto.js +2 -1
  21. package/src/models/auto/tokenization_auto.js +2 -1
  22. package/src/models/clap/feature_extraction_clap.js +2 -1
  23. package/src/models/cohere2/modeling_cohere2.js +5 -0
  24. package/src/models/marian/tokenization_marian.js +3 -2
  25. package/src/models/modeling_utils.js +14 -4
  26. package/src/models/models.js +6 -0
  27. package/src/models/paligemma/processing_paligemma.js +3 -2
  28. package/src/models/processors.js +2 -0
  29. package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -0
  30. package/src/models/qwen2_5_vl/processing_qwen2_5_vl.js +3 -0
  31. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +54 -0
  32. package/src/models/qwen2_vl/modeling_qwen2_vl.js +8 -2
  33. package/src/models/qwen3_5/modeling_qwen3_5.js +3 -0
  34. package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +3 -0
  35. package/src/models/qwen3_vl/modeling_qwen3_vl.js +3 -0
  36. package/src/models/qwen3_vl/processing_qwen3_vl.js +3 -0
  37. package/src/models/registry.js +9 -1
  38. package/src/models/session.js +16 -50
  39. package/src/models/whisper/feature_extraction_whisper.js +2 -1
  40. package/src/models/whisper/modeling_whisper.js +6 -5
  41. package/src/models/xlm/tokenization_xlm.js +2 -1
  42. package/src/pipelines/automatic-speech-recognition.js +3 -2
  43. package/src/pipelines/index.js +395 -0
  44. package/src/pipelines/text-generation.js +4 -0
  45. package/src/pipelines/text-to-audio.js +4 -2
  46. package/src/pipelines/zero-shot-classification.js +3 -2
  47. package/src/pipelines.js +104 -356
  48. package/src/tokenization_utils.js +42 -21
  49. package/src/transformers.js +8 -1
  50. package/src/utils/audio.js +2 -1
  51. package/src/utils/cache.js +4 -1
  52. package/src/utils/core.js +23 -1
  53. package/src/utils/devices.js +22 -0
  54. package/src/utils/dtypes.js +55 -0
  55. package/src/utils/hub/files.js +17 -2
  56. package/src/utils/hub/utils.js +10 -4
  57. package/src/utils/hub.js +57 -17
  58. package/src/utils/image.js +2 -1
  59. package/src/utils/logger.js +67 -0
  60. package/src/utils/model-loader.js +35 -17
  61. package/src/utils/model_registry/ModelRegistry.js +299 -0
  62. package/src/utils/model_registry/clear_cache.js +128 -0
  63. package/src/utils/model_registry/get_file_metadata.js +149 -0
  64. package/src/utils/model_registry/get_files.js +42 -0
  65. package/src/utils/model_registry/get_model_files.js +182 -0
  66. package/src/utils/model_registry/get_pipeline_files.js +53 -0
  67. package/src/utils/model_registry/get_processor_files.js +20 -0
  68. package/src/utils/model_registry/get_tokenizer_files.js +21 -0
  69. package/src/utils/model_registry/is_cached.js +92 -0
  70. package/src/utils/random.js +225 -0
  71. package/src/utils/tensor.js +8 -21
  72. package/src/utils/video.js +2 -2
  73. package/types/backends/onnx.d.ts.map +1 -1
  74. package/types/backends/utils/cacheWasm.d.ts.map +1 -1
  75. package/types/configs.d.ts.map +1 -1
  76. package/types/env.d.ts +42 -24
  77. package/types/env.d.ts.map +1 -1
  78. package/types/generation/logits_sampler.d.ts +2 -2
  79. package/types/generation/logits_sampler.d.ts.map +1 -1
  80. package/types/image_processors_utils.d.ts.map +1 -1
  81. package/types/models/afmoe/modeling_afmoe.d.ts +8 -0
  82. package/types/models/afmoe/modeling_afmoe.d.ts.map +1 -0
  83. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  84. package/types/models/auto/modeling_auto.d.ts.map +1 -1
  85. package/types/models/auto/tokenization_auto.d.ts.map +1 -1
  86. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
  87. package/types/models/cohere2/modeling_cohere2.d.ts +8 -0
  88. package/types/models/cohere2/modeling_cohere2.d.ts.map +1 -0
  89. package/types/models/marian/tokenization_marian.d.ts.map +1 -1
  90. package/types/models/modeling_utils.d.ts.map +1 -1
  91. package/types/models/models.d.ts +6 -0
  92. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
  93. package/types/models/processors.d.ts +2 -0
  94. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +4 -0
  95. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -0
  96. package/types/models/qwen2_5_vl/processing_qwen2_5_vl.d.ts +4 -0
  97. package/types/models/qwen2_5_vl/processing_qwen2_5_vl.d.ts.map +1 -0
  98. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +3 -0
  99. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
  100. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +1 -0
  101. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  102. package/types/models/qwen3_5/modeling_qwen3_5.d.ts +4 -0
  103. package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -0
  104. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +4 -0
  105. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -0
  106. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +4 -0
  107. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -0
  108. package/types/models/qwen3_vl/processing_qwen3_vl.d.ts +4 -0
  109. package/types/models/qwen3_vl/processing_qwen3_vl.d.ts.map +1 -0
  110. package/types/models/registry.d.ts.map +1 -1
  111. package/types/models/session.d.ts.map +1 -1
  112. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  113. package/types/models/whisper/modeling_whisper.d.ts.map +1 -1
  114. package/types/models/xlm/tokenization_xlm.d.ts.map +1 -1
  115. package/types/pipelines/automatic-speech-recognition.d.ts.map +1 -1
  116. package/types/pipelines/index.d.ts +299 -0
  117. package/types/pipelines/index.d.ts.map +1 -0
  118. package/types/pipelines/text-generation.d.ts +5 -1
  119. package/types/pipelines/text-generation.d.ts.map +1 -1
  120. package/types/pipelines/text-to-audio.d.ts.map +1 -1
  121. package/types/pipelines/zero-shot-classification.d.ts.map +1 -1
  122. package/types/pipelines.d.ts +50 -291
  123. package/types/pipelines.d.ts.map +1 -1
  124. package/types/tokenization_utils.d.ts +44 -26
  125. package/types/tokenization_utils.d.ts.map +1 -1
  126. package/types/transformers.d.ts +6 -1
  127. package/types/transformers.d.ts.map +1 -1
  128. package/types/utils/audio.d.ts.map +1 -1
  129. package/types/utils/cache.d.ts +6 -0
  130. package/types/utils/cache.d.ts.map +1 -1
  131. package/types/utils/core.d.ts +59 -2
  132. package/types/utils/core.d.ts.map +1 -1
  133. package/types/utils/devices.d.ts +15 -0
  134. package/types/utils/devices.d.ts.map +1 -1
  135. package/types/utils/dtypes.d.ts +16 -0
  136. package/types/utils/dtypes.d.ts.map +1 -1
  137. package/types/utils/hub/files.d.ts +6 -0
  138. package/types/utils/hub/files.d.ts.map +1 -1
  139. package/types/utils/hub/utils.d.ts +2 -1
  140. package/types/utils/hub/utils.d.ts.map +1 -1
  141. package/types/utils/hub.d.ts +29 -0
  142. package/types/utils/hub.d.ts.map +1 -1
  143. package/types/utils/image.d.ts.map +1 -1
  144. package/types/utils/logger.d.ts +28 -0
  145. package/types/utils/logger.d.ts.map +1 -0
  146. package/types/utils/model-loader.d.ts +15 -0
  147. package/types/utils/model-loader.d.ts.map +1 -1
  148. package/types/utils/model_registry/ModelRegistry.d.ts +211 -0
  149. package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -0
  150. package/types/utils/model_registry/clear_cache.d.ts +74 -0
  151. package/types/utils/model_registry/clear_cache.d.ts.map +1 -0
  152. package/types/utils/model_registry/get_file_metadata.d.ts +20 -0
  153. package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -0
  154. package/types/utils/model_registry/get_files.d.ts +23 -0
  155. package/types/utils/model_registry/get_files.d.ts.map +1 -0
  156. package/types/utils/model_registry/get_model_files.d.ts +22 -0
  157. package/types/utils/model_registry/get_model_files.d.ts.map +1 -0
  158. package/types/utils/model_registry/get_pipeline_files.d.ts +21 -0
  159. package/types/utils/model_registry/get_pipeline_files.d.ts.map +1 -0
  160. package/types/utils/model_registry/get_processor_files.d.ts +9 -0
  161. package/types/utils/model_registry/get_processor_files.d.ts.map +1 -0
  162. package/types/utils/model_registry/get_tokenizer_files.d.ts +9 -0
  163. package/types/utils/model_registry/get_tokenizer_files.d.ts.map +1 -0
  164. package/types/utils/model_registry/is_cached.d.ts +62 -0
  165. package/types/utils/model_registry/is_cached.d.ts.map +1 -0
  166. package/types/utils/random.d.ts +86 -0
  167. package/types/utils/random.d.ts.map +1 -0
  168. package/types/utils/tensor.d.ts.map +1 -1
package/src/pipelines.js CHANGED
@@ -13,362 +13,40 @@
13
13
  * @module pipelines
14
14
  */
15
15
 
16
- import { AutoTokenizer } from './models/auto/tokenization_auto.js';
17
- import { AutoProcessor } from './models/auto/processing_auto.js';
18
- import {
19
- AutoModel,
20
- AutoModelForSequenceClassification,
21
- AutoModelForAudioClassification,
22
- AutoModelForTokenClassification,
23
- AutoModelForQuestionAnswering,
24
- AutoModelForMaskedLM,
25
- AutoModelForSeq2SeqLM,
26
- AutoModelForSpeechSeq2Seq,
27
- AutoModelForTextToWaveform,
28
- AutoModelForTextToSpectrogram,
29
- AutoModelForCTC,
30
- AutoModelForCausalLM,
31
- AutoModelForVision2Seq,
32
- AutoModelForImageClassification,
33
- AutoModelForImageSegmentation,
34
- AutoModelForSemanticSegmentation,
35
- AutoModelForUniversalSegmentation,
36
- AutoModelForObjectDetection,
37
- AutoModelForZeroShotObjectDetection,
38
- AutoModelForDocumentQuestionAnswering,
39
- AutoModelForImageToImage,
40
- AutoModelForDepthEstimation,
41
- AutoModelForImageFeatureExtraction,
42
- } from './models/auto/modeling_auto.js';
43
-
44
16
  import { dispatchCallback } from './utils/core.js';
17
+ import { logger } from './utils/logger.js';
45
18
 
46
- import { TextClassificationPipeline } from './pipelines/text-classification.js';
47
- import { TokenClassificationPipeline } from './pipelines/token-classification.js';
48
- import { QuestionAnsweringPipeline } from './pipelines/question-answering.js';
49
- import { FillMaskPipeline } from './pipelines/fill-mask.js';
50
- import { SummarizationPipeline } from './pipelines/summarization.js';
51
- import { TranslationPipeline } from './pipelines/translation.js';
52
- import { Text2TextGenerationPipeline } from './pipelines/text2text-generation.js';
53
- import { TextGenerationPipeline } from './pipelines/text-generation.js';
54
- import { ZeroShotClassificationPipeline } from './pipelines/zero-shot-classification.js';
55
- import { AudioClassificationPipeline } from './pipelines/audio-classification.js';
56
- import { ZeroShotAudioClassificationPipeline } from './pipelines/zero-shot-audio-classification.js';
57
- import { AutomaticSpeechRecognitionPipeline } from './pipelines/automatic-speech-recognition.js';
58
- import { TextToAudioPipeline } from './pipelines/text-to-audio.js';
59
- import { ImageToTextPipeline } from './pipelines/image-to-text.js';
60
- import { ImageClassificationPipeline } from './pipelines/image-classification.js';
61
- import { ImageSegmentationPipeline } from './pipelines/image-segmentation.js';
62
- import { BackgroundRemovalPipeline } from './pipelines/background-removal.js';
63
- import { ZeroShotImageClassificationPipeline } from './pipelines/zero-shot-image-classification.js';
64
- import { ObjectDetectionPipeline } from './pipelines/object-detection.js';
65
- import { ZeroShotObjectDetectionPipeline } from './pipelines/zero-shot-object-detection.js';
66
- import { DocumentQuestionAnsweringPipeline } from './pipelines/document-question-answering.js';
67
- import { ImageToImagePipeline } from './pipelines/image-to-image.js';
68
- import { DepthEstimationPipeline } from './pipelines/depth-estimation.js';
69
- import { FeatureExtractionPipeline } from './pipelines/feature-extraction.js';
70
- import { ImageFeatureExtractionPipeline } from './pipelines/image-feature-extraction.js';
71
-
72
- const SUPPORTED_TASKS = Object.freeze({
73
- 'text-classification': {
74
- tokenizer: AutoTokenizer,
75
- pipeline: TextClassificationPipeline,
76
- model: AutoModelForSequenceClassification,
77
- default: {
78
- // TODO: replace with original
79
- // "model": "distilbert-base-uncased-finetuned-sst-2-english",
80
- model: 'Xenova/distilbert-base-uncased-finetuned-sst-2-english',
81
- },
82
- type: 'text',
83
- },
84
- 'token-classification': {
85
- tokenizer: AutoTokenizer,
86
- pipeline: TokenClassificationPipeline,
87
- model: AutoModelForTokenClassification,
88
- default: {
89
- // TODO: replace with original
90
- // "model": "Davlan/bert-base-multilingual-cased-ner-hrl",
91
- model: 'Xenova/bert-base-multilingual-cased-ner-hrl',
92
- },
93
- type: 'text',
94
- },
95
- 'question-answering': {
96
- tokenizer: AutoTokenizer,
97
- pipeline: QuestionAnsweringPipeline,
98
- model: AutoModelForQuestionAnswering,
99
- default: {
100
- // TODO: replace with original
101
- // "model": "distilbert-base-cased-distilled-squad",
102
- model: 'Xenova/distilbert-base-cased-distilled-squad',
103
- },
104
- type: 'text',
105
- },
106
-
107
- 'fill-mask': {
108
- tokenizer: AutoTokenizer,
109
- pipeline: FillMaskPipeline,
110
- model: AutoModelForMaskedLM,
111
- default: {
112
- model: 'onnx-community/ettin-encoder-32m-ONNX',
113
- dtype: 'fp32',
114
- },
115
- type: 'text',
116
- },
117
- summarization: {
118
- tokenizer: AutoTokenizer,
119
- pipeline: SummarizationPipeline,
120
- model: AutoModelForSeq2SeqLM,
121
- default: {
122
- // TODO: replace with original
123
- // "model": "sshleifer/distilbart-cnn-6-6",
124
- model: 'Xenova/distilbart-cnn-6-6',
125
- },
126
- type: 'text',
127
- },
128
- translation: {
129
- tokenizer: AutoTokenizer,
130
- pipeline: TranslationPipeline,
131
- model: AutoModelForSeq2SeqLM,
132
- default: {
133
- // TODO: replace with original
134
- // "model": "t5-small",
135
- model: 'Xenova/t5-small',
136
- },
137
- type: 'text',
138
- },
139
- 'text2text-generation': {
140
- tokenizer: AutoTokenizer,
141
- pipeline: Text2TextGenerationPipeline,
142
- model: AutoModelForSeq2SeqLM,
143
- default: {
144
- // TODO: replace with original
145
- // "model": "google/flan-t5-small",
146
- model: 'Xenova/flan-t5-small',
147
- },
148
- type: 'text',
149
- },
150
- 'text-generation': {
151
- tokenizer: AutoTokenizer,
152
- pipeline: TextGenerationPipeline,
153
- model: AutoModelForCausalLM,
154
- default: {
155
- model: 'onnx-community/Qwen3-0.6B-ONNX',
156
- dtype: 'q4',
157
- },
158
- type: 'text',
159
- },
160
- 'zero-shot-classification': {
161
- tokenizer: AutoTokenizer,
162
- pipeline: ZeroShotClassificationPipeline,
163
- model: AutoModelForSequenceClassification,
164
- default: {
165
- // TODO: replace with original
166
- // "model": "typeform/distilbert-base-uncased-mnli",
167
- model: 'Xenova/distilbert-base-uncased-mnli',
168
- },
169
- type: 'text',
170
- },
171
- 'audio-classification': {
172
- pipeline: AudioClassificationPipeline,
173
- model: AutoModelForAudioClassification,
174
- processor: AutoProcessor,
175
- default: {
176
- // TODO: replace with original
177
- // "model": "superb/wav2vec2-base-superb-ks",
178
- model: 'Xenova/wav2vec2-base-superb-ks',
179
- },
180
- type: 'audio',
181
- },
182
- 'zero-shot-audio-classification': {
183
- tokenizer: AutoTokenizer,
184
- pipeline: ZeroShotAudioClassificationPipeline,
185
- model: AutoModel,
186
- processor: AutoProcessor,
187
- default: {
188
- // TODO: replace with original
189
- // "model": "laion/clap-htsat-fused",
190
- model: 'Xenova/clap-htsat-unfused',
191
- },
192
- type: 'multimodal',
193
- },
194
- 'automatic-speech-recognition': {
195
- tokenizer: AutoTokenizer,
196
- pipeline: AutomaticSpeechRecognitionPipeline,
197
- model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
198
- processor: AutoProcessor,
199
- default: {
200
- // TODO: replace with original
201
- // "model": "openai/whisper-tiny.en",
202
- model: 'Xenova/whisper-tiny.en',
203
- },
204
- type: 'multimodal',
205
- },
206
- 'text-to-audio': {
207
- tokenizer: AutoTokenizer,
208
- pipeline: TextToAudioPipeline,
209
- model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
210
- processor: [AutoProcessor, /* Some don't use a processor */ null],
211
- default: {
212
- model: 'onnx-community/Supertonic-TTS-ONNX',
213
- dtype: 'fp32',
214
- },
215
- type: 'text',
216
- },
217
- 'image-to-text': {
218
- tokenizer: AutoTokenizer,
219
- pipeline: ImageToTextPipeline,
220
- model: AutoModelForVision2Seq,
221
- processor: AutoProcessor,
222
- default: {
223
- // TODO: replace with original
224
- // "model": "nlpconnect/vit-gpt2-image-captioning",
225
- model: 'Xenova/vit-gpt2-image-captioning',
226
- },
227
- type: 'multimodal',
228
- },
229
-
230
- 'image-classification': {
231
- // no tokenizer
232
- pipeline: ImageClassificationPipeline,
233
- model: AutoModelForImageClassification,
234
- processor: AutoProcessor,
235
- default: {
236
- // TODO: replace with original
237
- // "model": "google/vit-base-patch16-224",
238
- model: 'Xenova/vit-base-patch16-224',
239
- },
240
- type: 'multimodal',
241
- },
242
-
243
- 'image-segmentation': {
244
- // no tokenizer
245
- pipeline: ImageSegmentationPipeline,
246
- model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
247
- processor: AutoProcessor,
248
- default: {
249
- // TODO: replace with original
250
- // "model": "facebook/detr-resnet-50-panoptic",
251
- model: 'Xenova/detr-resnet-50-panoptic',
252
- },
253
- type: 'multimodal',
254
- },
255
- 'background-removal': {
256
- // no tokenizer
257
- pipeline: BackgroundRemovalPipeline,
258
- model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
259
- processor: AutoProcessor,
260
- default: {
261
- model: 'Xenova/modnet',
262
- },
263
- type: 'image',
264
- },
265
-
266
- 'zero-shot-image-classification': {
267
- tokenizer: AutoTokenizer,
268
- pipeline: ZeroShotImageClassificationPipeline,
269
- model: AutoModel,
270
- processor: AutoProcessor,
271
- default: {
272
- // TODO: replace with original
273
- // "model": "openai/clip-vit-base-patch32",
274
- model: 'Xenova/clip-vit-base-patch32',
275
- },
276
- type: 'multimodal',
277
- },
278
-
279
- 'object-detection': {
280
- // no tokenizer
281
- pipeline: ObjectDetectionPipeline,
282
- model: AutoModelForObjectDetection,
283
- processor: AutoProcessor,
284
- default: {
285
- // TODO: replace with original
286
- // "model": "facebook/detr-resnet-50",
287
- model: 'Xenova/detr-resnet-50',
288
- },
289
- type: 'multimodal',
290
- },
291
- 'zero-shot-object-detection': {
292
- tokenizer: AutoTokenizer,
293
- pipeline: ZeroShotObjectDetectionPipeline,
294
- model: AutoModelForZeroShotObjectDetection,
295
- processor: AutoProcessor,
296
- default: {
297
- // TODO: replace with original
298
- // "model": "google/owlvit-base-patch32",
299
- model: 'Xenova/owlvit-base-patch32',
300
- },
301
- type: 'multimodal',
302
- },
303
- 'document-question-answering': {
304
- tokenizer: AutoTokenizer,
305
- pipeline: DocumentQuestionAnsweringPipeline,
306
- model: AutoModelForDocumentQuestionAnswering,
307
- processor: AutoProcessor,
308
- default: {
309
- // TODO: replace with original
310
- // "model": "naver-clova-ix/donut-base-finetuned-docvqa",
311
- model: 'Xenova/donut-base-finetuned-docvqa',
312
- },
313
- type: 'multimodal',
314
- },
315
- 'image-to-image': {
316
- // no tokenizer
317
- pipeline: ImageToImagePipeline,
318
- model: AutoModelForImageToImage,
319
- processor: AutoProcessor,
320
- default: {
321
- // TODO: replace with original
322
- // "model": "caidas/swin2SR-classical-sr-x2-64",
323
- model: 'Xenova/swin2SR-classical-sr-x2-64',
324
- },
325
- type: 'image',
326
- },
327
- 'depth-estimation': {
328
- // no tokenizer
329
- pipeline: DepthEstimationPipeline,
330
- model: AutoModelForDepthEstimation,
331
- processor: AutoProcessor,
332
- default: {
333
- model: 'onnx-community/depth-anything-v2-small',
334
- },
335
- type: 'image',
336
- },
337
-
338
- // This task serves as a useful interface for dealing with sentence-transformers (https://huggingface.co/sentence-transformers).
339
- 'feature-extraction': {
340
- tokenizer: AutoTokenizer,
341
- pipeline: FeatureExtractionPipeline,
342
- model: AutoModel,
343
- default: {
344
- model: 'onnx-community/all-MiniLM-L6-v2-ONNX',
345
- dtype: 'fp32',
346
- },
347
- type: 'text',
348
- },
349
- 'image-feature-extraction': {
350
- processor: AutoProcessor,
351
- pipeline: ImageFeatureExtractionPipeline,
352
- model: [AutoModelForImageFeatureExtraction, AutoModel],
353
- default: {
354
- model: 'onnx-community/dinov3-vits16-pretrain-lvd1689m-ONNX',
355
- dtype: 'fp32',
356
- },
357
- type: 'image',
358
- },
359
- });
360
-
361
- // TODO: Add types for TASK_ALIASES
362
- const TASK_ALIASES = Object.freeze({
363
- 'sentiment-analysis': 'text-classification',
364
- ner: 'token-classification',
365
- // "vqa": "visual-question-answering", // TODO: Add
366
- asr: 'automatic-speech-recognition',
367
- 'text-to-speech': 'text-to-audio',
368
-
369
- // Add for backwards compatibility
370
- embeddings: 'feature-extraction',
371
- });
19
+ import {
20
+ SUPPORTED_TASKS,
21
+ TASK_ALIASES,
22
+ TextClassificationPipeline,
23
+ TokenClassificationPipeline,
24
+ QuestionAnsweringPipeline,
25
+ FillMaskPipeline,
26
+ SummarizationPipeline,
27
+ TranslationPipeline,
28
+ Text2TextGenerationPipeline,
29
+ TextGenerationPipeline,
30
+ ZeroShotClassificationPipeline,
31
+ AudioClassificationPipeline,
32
+ ZeroShotAudioClassificationPipeline,
33
+ AutomaticSpeechRecognitionPipeline,
34
+ TextToAudioPipeline,
35
+ ImageToTextPipeline,
36
+ ImageClassificationPipeline,
37
+ ImageSegmentationPipeline,
38
+ BackgroundRemovalPipeline,
39
+ ZeroShotImageClassificationPipeline,
40
+ ObjectDetectionPipeline,
41
+ ZeroShotObjectDetectionPipeline,
42
+ DocumentQuestionAnsweringPipeline,
43
+ ImageToImagePipeline,
44
+ DepthEstimationPipeline,
45
+ FeatureExtractionPipeline,
46
+ ImageFeatureExtractionPipeline,
47
+ } from './pipelines/index.js';
48
+ import { get_pipeline_files } from './utils/model_registry/get_pipeline_files.js';
49
+ import { get_file_metadata } from './utils/model_registry/get_file_metadata.js';
372
50
 
373
51
  /**
374
52
  * @typedef {keyof typeof SUPPORTED_TASKS} TaskType
@@ -443,14 +121,57 @@ export async function pipeline(
443
121
  // Use model if specified, otherwise, use default
444
122
  if (!model) {
445
123
  model = pipelineInfo.default.model;
446
- console.log(`No model specified. Using default model: "${model}".`);
124
+ logger.info(`No model specified. Using default model: "${model}".`);
447
125
  if (!dtype && pipelineInfo.default.dtype) {
448
126
  dtype = pipelineInfo.default.dtype;
449
127
  }
450
128
  }
451
129
 
130
+ /** @type {import('./utils/core.js').FilesLoadingMap} */
131
+ let files_loading = {};
132
+ if (progress_callback) {
133
+ const expected_files = await get_pipeline_files(task, model, {
134
+ device,
135
+ dtype,
136
+ });
137
+ /** @type {Array<{exists: boolean, size?: number, contentType?: string, fromCache?: boolean}>} */
138
+ const metadata = await Promise.all(expected_files.map(async (file) => get_file_metadata(model, file)));
139
+ metadata.forEach((m, i) => {
140
+ if (m.exists) {
141
+ files_loading[expected_files[i]] = {
142
+ loaded: 0,
143
+ total: m.size ?? 0,
144
+ };
145
+ }
146
+ });
147
+ }
148
+
452
149
  const pretrainedOptions = {
453
- progress_callback,
150
+ progress_callback: progress_callback
151
+ ? /** @param {import('./utils/core.js').ProgressInfo} info */
152
+ (info) => {
153
+ if (info.status === 'progress') {
154
+ files_loading[info.file] = {
155
+ loaded: info.loaded,
156
+ total: info.total,
157
+ };
158
+
159
+ const loaded = Object.values(files_loading).reduce((acc, curr) => acc + curr.loaded, 0);
160
+ const total = Object.values(files_loading).reduce((acc, curr) => acc + curr.total, 0);
161
+ const progress = total > 0 ? (loaded / total) * 100 : 0;
162
+
163
+ progress_callback({
164
+ status: 'progress_total',
165
+ name: info.name,
166
+ progress,
167
+ loaded,
168
+ total,
169
+ files: structuredClone(files_loading),
170
+ });
171
+ }
172
+ progress_callback(info);
173
+ }
174
+ : undefined,
454
175
  config,
455
176
  cache_dir,
456
177
  local_files_only,
@@ -574,3 +295,30 @@ export {
574
295
  FeatureExtractionPipeline,
575
296
  ImageFeatureExtractionPipeline,
576
297
  };
298
+
299
+ // Export pipeline output types
300
+ /**
301
+ * @typedef {import('./pipelines/fill-mask.js').FillMaskOutput} FillMaskOutput
302
+ * @typedef {import('./pipelines/text-classification.js').TextClassificationOutput} TextClassificationOutput
303
+ * @typedef {import('./pipelines/token-classification.js').TokenClassificationOutput} TokenClassificationOutput
304
+ * @typedef {import('./pipelines/question-answering.js').QuestionAnsweringOutput} QuestionAnsweringOutput
305
+ * @typedef {import('./pipelines/summarization.js').SummarizationOutput} SummarizationOutput
306
+ * @typedef {import('./pipelines/translation.js').TranslationOutput} TranslationOutput
307
+ * @typedef {import('./pipelines/text2text-generation.js').Text2TextGenerationOutput} Text2TextGenerationOutput
308
+ * @typedef {import('./pipelines/text-generation.js').TextGenerationOutput} TextGenerationOutput
309
+ * @typedef {import('./pipelines/text-generation.js').TextGenerationStringOutput} TextGenerationStringOutput
310
+ * @typedef {import('./pipelines/text-generation.js').TextGenerationChatOutput} TextGenerationChatOutput
311
+ * @typedef {import('./pipelines/zero-shot-classification.js').ZeroShotClassificationOutput} ZeroShotClassificationOutput
312
+ * @typedef {import('./pipelines/audio-classification.js').AudioClassificationOutput} AudioClassificationOutput
313
+ * @typedef {import('./pipelines/zero-shot-audio-classification.js').ZeroShotAudioClassificationOutput} ZeroShotAudioClassificationOutput
314
+ * @typedef {import('./pipelines/automatic-speech-recognition.js').AutomaticSpeechRecognitionOutput} AutomaticSpeechRecognitionOutput
315
+ * @typedef {import('./pipelines/text-to-audio.js').TextToAudioOutput} TextToAudioOutput
316
+ * @typedef {import('./pipelines/image-classification.js').ImageClassificationOutput} ImageClassificationOutput
317
+ * @typedef {import('./pipelines/image-segmentation.js').ImageSegmentationOutput} ImageSegmentationOutput
318
+ * @typedef {import('./pipelines/image-to-text.js').ImageToTextOutput} ImageToTextOutput
319
+ * @typedef {import('./pipelines/object-detection.js').ObjectDetectionOutput} ObjectDetectionOutput
320
+ * @typedef {import('./pipelines/zero-shot-object-detection.js').ZeroShotObjectDetectionOutput} ZeroShotObjectDetectionOutput
321
+ * @typedef {import('./pipelines/zero-shot-image-classification.js').ZeroShotImageClassificationOutput} ZeroShotImageClassificationOutput
322
+ * @typedef {import('./pipelines/document-question-answering.js').DocumentQuestionAnsweringOutput} DocumentQuestionAnsweringOutput
323
+ * @typedef {import('./pipelines/depth-estimation.js').DepthEstimationOutput} DepthEstimationOutput
324
+ */
@@ -12,6 +12,8 @@ import { isIntegralNumber, mergeArrays } from './utils/core.js';
12
12
  import { getModelJSON } from './utils/hub.js';
13
13
  import { max } from './utils/maths.js';
14
14
  import { Tensor } from './utils/tensor.js';
15
+ import { logger } from './utils/logger.js';
16
+ import { get_tokenizer_files } from './utils/model_registry/get_tokenizer_files.js';
15
17
 
16
18
  /**
17
19
  * @typedef {import('./utils/hub.js').PretrainedOptions} PretrainedTokenizerOptions
@@ -24,11 +26,10 @@ import { Tensor } from './utils/tensor.js';
24
26
  * @returns {Promise<any[]>} A promise that resolves with information about the loaded tokenizer.
25
27
  */
26
28
  export async function loadTokenizer(pretrained_model_name_or_path, options) {
27
- const info = await Promise.all([
28
- getModelJSON(pretrained_model_name_or_path, 'tokenizer.json', true, options),
29
- getModelJSON(pretrained_model_name_or_path, 'tokenizer_config.json', true, options),
30
- ]);
31
- return info;
29
+ const tokenizerFiles = await get_tokenizer_files(pretrained_model_name_or_path);
30
+ return await Promise.all(
31
+ tokenizerFiles.map((file) => getModelJSON(pretrained_model_name_or_path, file, true, options)),
32
+ );
32
33
  }
33
34
 
34
35
  /**
@@ -64,10 +65,30 @@ const SPECIAL_TOKEN_ATTRIBUTES = [
64
65
  // additional_special_tokens (TODO)
65
66
  ];
66
67
 
68
+ /**
69
+ * @typedef {{ type: 'text', text: string, [key: string]: any }} TextContent
70
+ * @property {'text'} type The type of content (must be 'text').
71
+ * @property {string} text The text content.
72
+ */
73
+
74
+ /**
75
+ * @typedef {{ type: 'image', image?: string | import('./utils/image.js').RawImage, [key: string]: any }} ImageContent
76
+ * @property {'image'} type The type of content (must be 'image').
77
+ * @property {string | import('./utils/image.js').RawImage} [image] Optional URL or instance of the image.
78
+ *
79
+ * Note: This works for SmolVLM. Qwen2VL and Idefics3 have different implementations.
80
+ */
81
+
82
+ /**
83
+ * @typedef {TextContent | ImageContent | { type: string & {}, [key: string]: any }} MessageContent
84
+ * Base type for message content. This is a discriminated union that can be extended with additional content types.
85
+ * Example: `@typedef {TextContent | ImageContent | AudioContent} MessageContent`
86
+ */
87
+
67
88
  /**
68
89
  * @typedef {Object} Message
69
- * @property {string} role The role of the message (e.g., "user" or "assistant" or "system").
70
- * @property {string} content The content of the message.
90
+ * @property {'user' | 'assistant' | 'system' | (string & {})} role The role of the message.
91
+ * @property {string | MessageContent[]} content The content of the message. Can be a simple string or an array of content objects.
71
92
  */
72
93
 
73
94
  /**
@@ -276,10 +297,10 @@ export class PreTrainedTokenizer extends Callable {
276
297
  * @param {string|string[]} [options.text_pair=null] Optional second sequence to be encoded. If set, must be the same type as text.
277
298
  * @param {boolean|'max_length'} [options.padding=false] Whether to pad the input sequences.
278
299
  * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model.
279
- * @param {boolean} [options.truncation=null] Whether to truncate the input sequences.
280
- * @param {number} [options.max_length=null] Maximum length of the returned list and optionally padding length.
300
+ * @param {boolean|null} [options.truncation=null] Whether to truncate the input sequences.
301
+ * @param {number|null} [options.max_length=null] Maximum length of the returned list and optionally padding length.
281
302
  * @param {boolean} [options.return_tensor=true] Whether to return the results as Tensors or arrays.
282
- * @param {boolean} [options.return_token_type_ids=null] Whether to return the token type ids.
303
+ * @param {boolean|null} [options.return_token_type_ids=null] Whether to return the token type ids.
283
304
  * @returns {BatchEncoding} Object to be passed to the model.
284
305
  */
285
306
  _call(
@@ -339,13 +360,13 @@ export class PreTrainedTokenizer extends Callable {
339
360
  max_length = this.model_max_length;
340
361
  } else if (truncation === null) {
341
362
  if (padding === true) {
342
- console.warn(
363
+ logger.warn(
343
364
  '`max_length` is ignored when `padding: true` and there is no truncation strategy. ' +
344
365
  "To pad to max length, use `padding: 'max_length'`.",
345
366
  );
346
367
  max_length = this.model_max_length;
347
368
  } else if (padding === false) {
348
- console.warn(
369
+ logger.warn(
349
370
  'Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation: true` to explicitly truncate examples to max length.',
350
371
  );
351
372
  truncation = true;
@@ -455,9 +476,9 @@ export class PreTrainedTokenizer extends Callable {
455
476
  *
456
477
  * @param {string} text The text to encode.
457
478
  * @param {Object} options An optional object containing the following properties:
458
- * @param {string} [options.text_pair=null] The optional second text to encode.
479
+ * @param {string|null} [options.text_pair=null] The optional second text to encode.
459
480
  * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model.
460
- * @param {boolean} [options.return_token_type_ids=null] Whether to return token_type_ids.
481
+ * @param {boolean|null} [options.return_token_type_ids=null] Whether to return token_type_ids.
461
482
  * @returns {{input_ids: number[], attention_mask: number[], token_type_ids?: number[]}} An object containing the encoded text.
462
483
  * @private
463
484
  */
@@ -478,7 +499,7 @@ export class PreTrainedTokenizer extends Callable {
478
499
  * Converts a string into a sequence of tokens.
479
500
  * @param {string} text The sequence to be encoded.
480
501
  * @param {Object} options An optional object containing the following properties:
481
- * @param {string} [options.pair] A second sequence to be encoded with the first.
502
+ * @param {string|null} [options.pair] A second sequence to be encoded with the first.
482
503
  * @param {boolean} [options.add_special_tokens=false] Whether or not to add the special tokens associated with the corresponding model.
483
504
  * @returns {string[]} The list of tokens.
484
505
  */
@@ -491,9 +512,9 @@ export class PreTrainedTokenizer extends Callable {
491
512
  *
492
513
  * @param {string} text The text to encode.
493
514
  * @param {Object} options An optional object containing the following properties:
494
- * @param {string} [options.text_pair=null] The optional second text to encode.
515
+ * @param {string|null} [options.text_pair=null] The optional second text to encode.
495
516
  * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model.
496
- * @param {boolean} [options.return_token_type_ids=null] Whether to return token_type_ids.
517
+ * @param {boolean|null} [options.return_token_type_ids=null] Whether to return token_type_ids.
497
518
  * @returns {number[]} An array of token IDs representing the encoded text(s).
498
519
  */
499
520
  encode(text, { text_pair = null, add_special_tokens = true, return_token_type_ids = null } = {}) {
@@ -545,7 +566,7 @@ export class PreTrainedTokenizer extends Callable {
545
566
  * @param {number[]|bigint[]} token_ids List of token ids to decode
546
567
  * @param {Object} decode_args Optional arguments for decoding
547
568
  * @param {boolean} [decode_args.skip_special_tokens=false] Whether to skip special tokens during decoding
548
- * @param {boolean} [decode_args.clean_up_tokenization_spaces=null] Whether to clean up tokenization spaces during decoding.
569
+ * @param {boolean|null} [decode_args.clean_up_tokenization_spaces=null] Whether to clean up tokenization spaces during decoding.
549
570
  * If null, the value is set to `this.decoder.cleanup` if it exists, falling back to `this.clean_up_tokenization_spaces` if it exists, falling back to `true`.
550
571
  * @returns {string} The decoded string
551
572
  */
@@ -562,7 +583,7 @@ export class PreTrainedTokenizer extends Callable {
562
583
  * template for better generation tracking.
563
584
  *
564
585
  * @param {Object} options An optional object containing the following properties:
565
- * @param {string} [options.chat_template=null]
586
+ * @param {string|null} [options.chat_template=null]
566
587
  * A Jinja template or the name of a template to use for this conversion.
567
588
  * It is usually not necessary to pass anything to this argument,
568
589
  * as the model's template will be used by default.
@@ -642,7 +663,7 @@ export class PreTrainedTokenizer extends Callable {
642
663
  * @param {Message[]} conversation A list of message objects with `"role"` and `"content"` keys,
643
664
  * representing the chat history so far.
644
665
  * @param {Object} options An optional object containing the following properties:
645
- * @param {string} [options.chat_template=null] A Jinja template to use for this conversion. If
666
+ * @param {string|null} [options.chat_template=null] A Jinja template to use for this conversion. If
646
667
  * this is not passed, the model's chat template will be used instead.
647
668
  * @param {Object[]} [options.tools=null]
648
669
  * A list of tools (callable functions) that will be accessible to the model. If the template does not
@@ -663,7 +684,7 @@ export class PreTrainedTokenizer extends Callable {
663
684
  * @param {boolean} [options.tokenize=true] Whether to tokenize the output. If false, the output will be a string.
664
685
  * @param {boolean} [options.padding=false] Whether to pad sequences to the maximum length. Has no effect if tokenize is false.
665
686
  * @param {boolean} [options.truncation=false] Whether to truncate sequences to the maximum length. Has no effect if tokenize is false.
666
- * @param {number} [options.max_length=null] Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is false.
687
+ * @param {number|null} [options.max_length=null] Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is false.
667
688
  * If not specified, the tokenizer's `max_length` attribute will be used as a default.
668
689
  * @param {boolean} [options.return_tensor=true] Whether to return the output as a Tensor or an Array. Has no effect if tokenize is false.
669
690
  * @param {boolean} [options.return_dict=true] Whether to return a dictionary with named outputs. Has no effect if tokenize is false.
@@ -13,7 +13,7 @@
13
13
  */
14
14
 
15
15
  // Environment variables
16
- export { env } from './env.js';
16
+ export { env, LogLevel } from './env.js';
17
17
 
18
18
  // Pipelines
19
19
  export * from './pipelines.js';
@@ -51,12 +51,19 @@ export { load_image, RawImage } from './utils/image.js';
51
51
  export { load_video, RawVideo, RawVideoFrame } from './utils/video.js';
52
52
  export * from './utils/tensor.js';
53
53
  export { softmax, log_softmax, dot, cos_sim } from './utils/maths.js';
54
+ export { random } from './utils/random.js';
55
+
56
+ // Cache and file management
57
+ export { ModelRegistry } from './utils/model_registry/ModelRegistry.js';
54
58
 
55
59
  // Expose common types used across the library for developers to access
56
60
  /**
57
61
  * @typedef {import('./utils/hub.js').PretrainedModelOptions} PretrainedModelOptions
58
62
  * @typedef {import('./processing_utils.js').PretrainedProcessorOptions} PretrainedProcessorOptions
63
+ * @typedef {import('./tokenization_utils.js').Message} Message
59
64
  * @typedef {import('./tokenization_utils.js').PretrainedTokenizerOptions} PretrainedTokenizerOptions
60
65
  * @typedef {import('./utils/dtypes.js').DataType} DataType
61
66
  * @typedef {import('./utils/devices.js').DeviceType} DeviceType
67
+ * @typedef {import('./utils/core.js').ProgressCallback} ProgressCallback
68
+ * @typedef {import('./utils/core.js').ProgressInfo} ProgressInfo
62
69
  */