parakeet.js 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. package/.gitmodules +3 -0
  2. package/README.md +240 -239
  3. package/examples/hf-spaces-demo/README.md +6 -9
  4. package/examples/hf-spaces-demo/package.json +1 -1
  5. package/examples/hf-spaces-demo/src/App.js +307 -316
  6. package/examples/react-demo/package.json +19 -19
  7. package/examples/react-demo/src/App.jsx +324 -326
  8. package/examples/react-demo-dev/src/App.jsx +23 -24
  9. package/package.json +1 -1
  10. package/publish.ps1 +65 -0
  11. package/src/hub.js +235 -241
  12. package/src/parakeet.js +15 -8
  13. package/src/preprocessor.js +75 -68
  14. package/docs/parakeet-transformers-js/.gitattributes +0 -2
  15. package/docs/parakeet-transformers-js/.prettierignore +0 -8
  16. package/docs/parakeet-transformers-js/.prettierrc +0 -10
  17. package/docs/parakeet-transformers-js/.tmp_features.json +0 -1
  18. package/docs/parakeet-transformers-js/LICENSE +0 -202
  19. package/docs/parakeet-transformers-js/README.md +0 -448
  20. package/docs/parakeet-transformers-js/assets/nemo128.onnx +0 -0
  21. package/docs/parakeet-transformers-js/assets/nemo80.onnx +0 -0
  22. package/docs/parakeet-transformers-js/debug_test.js +0 -84
  23. package/docs/parakeet-transformers-js/dev/inspect_decoder.cjs +0 -9
  24. package/docs/parakeet-transformers-js/dev/inspect_joiner.cjs +0 -9
  25. package/docs/parakeet-transformers-js/dev/js_step_by_step.js +0 -249
  26. package/docs/parakeet-transformers-js/dev/parakeet_cli.js +0 -91
  27. package/docs/parakeet-transformers-js/jest.config.mjs +0 -194
  28. package/docs/parakeet-transformers-js/js_preprocessing.json +0 -225
  29. package/docs/parakeet-transformers-js/js_step_by_step.json +0 -837
  30. package/docs/parakeet-transformers-js/js_step_by_step_v2.json +0 -450
  31. package/docs/parakeet-transformers-js/js_step_by_step_v3.json +0 -450
  32. package/docs/parakeet-transformers-js/js_steps.json +0 -821
  33. package/docs/parakeet-transformers-js/package-lock.json +0 -12251
  34. package/docs/parakeet-transformers-js/package.json +0 -96
  35. package/docs/parakeet-transformers-js/src/audio_features.js +0 -178
  36. package/docs/parakeet-transformers-js/src/backends/onnx.js +0 -210
  37. package/docs/parakeet-transformers-js/src/base/feature_extraction_utils.js +0 -54
  38. package/docs/parakeet-transformers-js/src/base/image_processors_utils.js +0 -1105
  39. package/docs/parakeet-transformers-js/src/base/processing_utils.js +0 -173
  40. package/docs/parakeet-transformers-js/src/configs.js +0 -455
  41. package/docs/parakeet-transformers-js/src/env.js +0 -167
  42. package/docs/parakeet-transformers-js/src/generation/configuration_utils.js +0 -388
  43. package/docs/parakeet-transformers-js/src/generation/logits_process.js +0 -727
  44. package/docs/parakeet-transformers-js/src/generation/logits_sampler.js +0 -204
  45. package/docs/parakeet-transformers-js/src/generation/parameters.js +0 -35
  46. package/docs/parakeet-transformers-js/src/generation/stopping_criteria.js +0 -156
  47. package/docs/parakeet-transformers-js/src/generation/streamers.js +0 -225
  48. package/docs/parakeet-transformers-js/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +0 -85
  49. package/docs/parakeet-transformers-js/src/models/auto/feature_extraction_auto.js +0 -25
  50. package/docs/parakeet-transformers-js/src/models/auto/image_processing_auto.js +0 -29
  51. package/docs/parakeet-transformers-js/src/models/auto/processing_auto.js +0 -85
  52. package/docs/parakeet-transformers-js/src/models/beit/image_processing_beit.js +0 -5
  53. package/docs/parakeet-transformers-js/src/models/bit/image_processing_bit.js +0 -5
  54. package/docs/parakeet-transformers-js/src/models/chinese_clip/image_processing_chinese_clip.js +0 -5
  55. package/docs/parakeet-transformers-js/src/models/clap/feature_extraction_clap.js +0 -159
  56. package/docs/parakeet-transformers-js/src/models/clip/image_processing_clip.js +0 -6
  57. package/docs/parakeet-transformers-js/src/models/convnext/image_processing_convnext.js +0 -46
  58. package/docs/parakeet-transformers-js/src/models/dac/feature_extraction_dac.js +0 -3
  59. package/docs/parakeet-transformers-js/src/models/deit/image_processing_deit.js +0 -6
  60. package/docs/parakeet-transformers-js/src/models/detr/image_processing_detr.js +0 -52
  61. package/docs/parakeet-transformers-js/src/models/donut/image_processing_donut.js +0 -31
  62. package/docs/parakeet-transformers-js/src/models/dpt/image_processing_dpt.js +0 -6
  63. package/docs/parakeet-transformers-js/src/models/efficientnet/image_processing_efficientnet.js +0 -14
  64. package/docs/parakeet-transformers-js/src/models/encodec/feature_extraction_encodec.js +0 -32
  65. package/docs/parakeet-transformers-js/src/models/feature_extractors.js +0 -17
  66. package/docs/parakeet-transformers-js/src/models/florence2/processing_florence2.js +0 -131
  67. package/docs/parakeet-transformers-js/src/models/gemma3n/feature_extraction_gemma3n.js +0 -97
  68. package/docs/parakeet-transformers-js/src/models/gemma3n/processing_gemma3n.js +0 -74
  69. package/docs/parakeet-transformers-js/src/models/glpn/image_processing_glpn.js +0 -5
  70. package/docs/parakeet-transformers-js/src/models/grounding_dino/image_processing_grounding_dino.js +0 -29
  71. package/docs/parakeet-transformers-js/src/models/grounding_dino/processing_grounding_dino.js +0 -101
  72. package/docs/parakeet-transformers-js/src/models/idefics3/image_processing_idefics3.js +0 -232
  73. package/docs/parakeet-transformers-js/src/models/idefics3/processing_idefics3.js +0 -136
  74. package/docs/parakeet-transformers-js/src/models/image_processors.js +0 -40
  75. package/docs/parakeet-transformers-js/src/models/janus/image_processing_janus.js +0 -27
  76. package/docs/parakeet-transformers-js/src/models/janus/processing_janus.js +0 -123
  77. package/docs/parakeet-transformers-js/src/models/jina_clip/image_processing_jina_clip.js +0 -26
  78. package/docs/parakeet-transformers-js/src/models/jina_clip/processing_jina_clip.js +0 -24
  79. package/docs/parakeet-transformers-js/src/models/llava/processing_llava.js +0 -44
  80. package/docs/parakeet-transformers-js/src/models/llava_onevision/image_processing_llava_onevision.js +0 -5
  81. package/docs/parakeet-transformers-js/src/models/mask2former/image_processing_mask2former.js +0 -5
  82. package/docs/parakeet-transformers-js/src/models/maskformer/image_processing_maskformer.js +0 -18
  83. package/docs/parakeet-transformers-js/src/models/mgp_str/processing_mgp_str.js +0 -172
  84. package/docs/parakeet-transformers-js/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +0 -7
  85. package/docs/parakeet-transformers-js/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +0 -7
  86. package/docs/parakeet-transformers-js/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +0 -7
  87. package/docs/parakeet-transformers-js/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +0 -7
  88. package/docs/parakeet-transformers-js/src/models/mobilevit/image_processing_mobilevit.js +0 -6
  89. package/docs/parakeet-transformers-js/src/models/moonshine/feature_extraction_moonshine.js +0 -26
  90. package/docs/parakeet-transformers-js/src/models/moonshine/processing_moonshine.js +0 -20
  91. package/docs/parakeet-transformers-js/src/models/nougat/image_processing_nougat.js +0 -5
  92. package/docs/parakeet-transformers-js/src/models/owlv2/image_processing_owlv2.js +0 -5
  93. package/docs/parakeet-transformers-js/src/models/owlvit/image_processing_owlvit.js +0 -12
  94. package/docs/parakeet-transformers-js/src/models/owlvit/processing_owlvit.js +0 -7
  95. package/docs/parakeet-transformers-js/src/models/paligemma/processing_paligemma.js +0 -83
  96. package/docs/parakeet-transformers-js/src/models/parakeet/feature_extraction_parakeet.js +0 -3
  97. package/docs/parakeet-transformers-js/src/models/parakeet/modeling_parakeet.js +0 -3
  98. package/docs/parakeet-transformers-js/src/models/parakeet/processing_parakeet.js +0 -3
  99. package/docs/parakeet-transformers-js/src/models/parakeet/tokenization_parakeet.js +0 -3
  100. package/docs/parakeet-transformers-js/src/models/phi3_v/image_processing_phi3_v.js +0 -163
  101. package/docs/parakeet-transformers-js/src/models/phi3_v/processing_phi3_v.js +0 -53
  102. package/docs/parakeet-transformers-js/src/models/processors.js +0 -22
  103. package/docs/parakeet-transformers-js/src/models/pvt/image_processing_pvt.js +0 -5
  104. package/docs/parakeet-transformers-js/src/models/pyannote/feature_extraction_pyannote.js +0 -85
  105. package/docs/parakeet-transformers-js/src/models/pyannote/processing_pyannote.js +0 -24
  106. package/docs/parakeet-transformers-js/src/models/qwen2_vl/image_processing_qwen2_vl.js +0 -52
  107. package/docs/parakeet-transformers-js/src/models/qwen2_vl/processing_qwen2_vl.js +0 -53
  108. package/docs/parakeet-transformers-js/src/models/rt_detr/image_processing_rt_detr.js +0 -12
  109. package/docs/parakeet-transformers-js/src/models/sam/image_processing_sam.js +0 -242
  110. package/docs/parakeet-transformers-js/src/models/sam/processing_sam.js +0 -20
  111. package/docs/parakeet-transformers-js/src/models/sapiens/image_processing_sapiens.js +0 -13
  112. package/docs/parakeet-transformers-js/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +0 -175
  113. package/docs/parakeet-transformers-js/src/models/segformer/image_processing_segformer.js +0 -13
  114. package/docs/parakeet-transformers-js/src/models/siglip/image_processing_siglip.js +0 -5
  115. package/docs/parakeet-transformers-js/src/models/smolvlm/image_processing_smolvlm.js +0 -2
  116. package/docs/parakeet-transformers-js/src/models/smolvlm/processing_smolvlm.js +0 -2
  117. package/docs/parakeet-transformers-js/src/models/snac/feature_extraction_snac.js +0 -3
  118. package/docs/parakeet-transformers-js/src/models/speecht5/feature_extraction_speecht5.js +0 -4
  119. package/docs/parakeet-transformers-js/src/models/speecht5/processing_speecht5.js +0 -17
  120. package/docs/parakeet-transformers-js/src/models/swin2sr/image_processing_swin2sr.js +0 -24
  121. package/docs/parakeet-transformers-js/src/models/ultravox/processing_ultravox.js +0 -54
  122. package/docs/parakeet-transformers-js/src/models/vit/image_processing_vit.js +0 -7
  123. package/docs/parakeet-transformers-js/src/models/vitmatte/image_processing_vitmatte.js +0 -50
  124. package/docs/parakeet-transformers-js/src/models/vitpose/image_processing_vitpose.js +0 -89
  125. package/docs/parakeet-transformers-js/src/models/wav2vec2/feature_extraction_wav2vec2.js +0 -44
  126. package/docs/parakeet-transformers-js/src/models/wav2vec2/processing_wav2vec2.js +0 -17
  127. package/docs/parakeet-transformers-js/src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js +0 -17
  128. package/docs/parakeet-transformers-js/src/models/wespeaker/feature_extraction_wespeaker.js +0 -95
  129. package/docs/parakeet-transformers-js/src/models/whisper/common_whisper.js +0 -157
  130. package/docs/parakeet-transformers-js/src/models/whisper/feature_extraction_whisper.js +0 -92
  131. package/docs/parakeet-transformers-js/src/models/whisper/generation_whisper.js +0 -89
  132. package/docs/parakeet-transformers-js/src/models/whisper/processing_whisper.js +0 -21
  133. package/docs/parakeet-transformers-js/src/models/yolos/image_processing_yolos.js +0 -12
  134. package/docs/parakeet-transformers-js/src/models.js +0 -8644
  135. package/docs/parakeet-transformers-js/src/ops/registry.js +0 -133
  136. package/docs/parakeet-transformers-js/src/ort_env.js +0 -8
  137. package/docs/parakeet-transformers-js/src/parakeet.js +0 -792
  138. package/docs/parakeet-transformers-js/src/pipelines.js +0 -3540
  139. package/docs/parakeet-transformers-js/src/processors.js +0 -16
  140. package/docs/parakeet-transformers-js/src/tokenizers.js +0 -4432
  141. package/docs/parakeet-transformers-js/src/transformers.js +0 -50
  142. package/docs/parakeet-transformers-js/src/utils/audio.js +0 -893
  143. package/docs/parakeet-transformers-js/src/utils/constants.js +0 -9
  144. package/docs/parakeet-transformers-js/src/utils/core.js +0 -259
  145. package/docs/parakeet-transformers-js/src/utils/data-structures.js +0 -574
  146. package/docs/parakeet-transformers-js/src/utils/devices.js +0 -22
  147. package/docs/parakeet-transformers-js/src/utils/dtypes.js +0 -63
  148. package/docs/parakeet-transformers-js/src/utils/generic.js +0 -35
  149. package/docs/parakeet-transformers-js/src/utils/hub.js +0 -780
  150. package/docs/parakeet-transformers-js/src/utils/image.js +0 -834
  151. package/docs/parakeet-transformers-js/src/utils/maths.js +0 -1061
  152. package/docs/parakeet-transformers-js/src/utils/tensor.js +0 -1539
  153. package/docs/parakeet-transformers-js/src/utils/video.js +0 -128
  154. package/docs/parakeet-transformers-js/test/decoder.test.js +0 -114
  155. package/docs/parakeet-transformers-js/test/encoder.test.js +0 -108
  156. package/docs/parakeet-transformers-js/test/preprocessor.test.js +0 -85
  157. package/docs/parakeet-transformers-js/test/tokenizer.test.js +0 -24
  158. package/docs/parakeet-transformers-js/test/transcribe.js +0 -89
  159. package/docs/parakeet-transformers-js/tsconfig.json +0 -21
  160. package/docs/parakeet-transformers-js/webpack.config.js +0 -223
@@ -1,3540 +0,0 @@
1
- /**
2
- * @file Pipelines provide a high-level, easy to use, API for running machine learning models.
3
- *
4
- * **Example:** Instantiate pipeline using the `pipeline` function.
5
- * ```javascript
6
- * import { pipeline } from '@huggingface/transformers';
7
- *
8
- * const classifier = await pipeline('sentiment-analysis');
9
- * const output = await classifier('I love transformers!');
10
- * // [{'label': 'POSITIVE', 'score': 0.999817686}]
11
- * ```
12
- *
13
- * @module pipelines
14
- */
15
-
16
- import {
17
- AutoTokenizer,
18
- PreTrainedTokenizer,
19
- } from './tokenizers.js';
20
- import {
21
- AutoModel,
22
- AutoModelForSequenceClassification,
23
- AutoModelForAudioClassification,
24
- AutoModelForTokenClassification,
25
- AutoModelForQuestionAnswering,
26
- AutoModelForMaskedLM,
27
- AutoModelForSeq2SeqLM,
28
- AutoModelForSpeechSeq2Seq,
29
- AutoModelForTextToWaveform,
30
- AutoModelForTextToSpectrogram,
31
- AutoModelForCTC,
32
- AutoModelForCausalLM,
33
- AutoModelForVision2Seq,
34
- AutoModelForImageClassification,
35
- AutoModelForImageSegmentation,
36
- AutoModelForSemanticSegmentation,
37
- AutoModelForUniversalSegmentation,
38
- AutoModelForObjectDetection,
39
- AutoModelForZeroShotObjectDetection,
40
- AutoModelForDocumentQuestionAnswering,
41
- AutoModelForImageToImage,
42
- AutoModelForDepthEstimation,
43
- AutoModelForImageFeatureExtraction,
44
- PreTrainedModel,
45
- } from './models.js';
46
- import {
47
- AutoProcessor,
48
- } from './models/auto/processing_auto.js';
49
- import {
50
- Processor,
51
- } from './base/processing_utils.js';
52
-
53
- import {
54
- Callable,
55
- } from './utils/generic.js';
56
-
57
- import {
58
- dispatchCallback,
59
- product,
60
- } from './utils/core.js';
61
- import {
62
- softmax,
63
- max,
64
- round,
65
- } from './utils/maths.js';
66
- import {
67
- read_audio,
68
- RawAudio
69
- } from './utils/audio.js';
70
- import {
71
- Tensor,
72
- mean_pooling,
73
- interpolate_4d,
74
- quantize_embeddings,
75
- topk,
76
- } from './utils/tensor.js';
77
- import { RawImage } from './utils/image.js';
78
-
79
-
80
- /**
81
- * @typedef {string | RawImage | URL | Blob | HTMLCanvasElement | OffscreenCanvas} ImageInput
82
- * @typedef {ImageInput|ImageInput[]} ImagePipelineInputs
83
- */
84
-
85
- /**
86
- * Prepare images for further tasks.
87
- * @param {ImagePipelineInputs} images images to prepare.
88
- * @returns {Promise<RawImage[]>} returns processed images.
89
- * @private
90
- */
91
- async function prepareImages(images) {
92
- if (!Array.isArray(images)) {
93
- images = [images];
94
- }
95
-
96
- // Possibly convert any non-images to images
97
- return await Promise.all(images.map(x => RawImage.read(x)));
98
- }
99
-
100
- /**
101
- * @typedef {string | URL | Float32Array | Float64Array} AudioInput
102
- * @typedef {AudioInput|AudioInput[]} AudioPipelineInputs
103
- */
104
-
105
- /**
106
- * Prepare audios for further tasks.
107
- * @param {AudioPipelineInputs} audios audios to prepare.
108
- * @param {number} sampling_rate sampling rate of the audios.
109
- * @returns {Promise<Float32Array[]>} The preprocessed audio data.
110
- * @private
111
- */
112
- async function prepareAudios(audios, sampling_rate) {
113
- if (!Array.isArray(audios)) {
114
- audios = [audios];
115
- }
116
-
117
- return await Promise.all(audios.map(x => {
118
- if (typeof x === 'string' || x instanceof URL) {
119
- return read_audio(x, sampling_rate);
120
- } else if (x instanceof Float64Array) {
121
- return new Float32Array(x);
122
- }
123
- return x;
124
- }));
125
- }
126
-
127
- /**
128
- * @typedef {Object} BoundingBox
129
- * @property {number} xmin The minimum x coordinate of the bounding box.
130
- * @property {number} ymin The minimum y coordinate of the bounding box.
131
- * @property {number} xmax The maximum x coordinate of the bounding box.
132
- * @property {number} ymax The maximum y coordinate of the bounding box.
133
- */
134
-
135
- /**
136
- * Helper function to convert list [xmin, xmax, ymin, ymax] into object { "xmin": xmin, ... }
137
- * @param {number[]} box The bounding box as a list.
138
- * @param {boolean} asInteger Whether to cast to integers.
139
- * @returns {BoundingBox} The bounding box as an object.
140
- * @private
141
- */
142
- function get_bounding_box(box, asInteger) {
143
- if (asInteger) {
144
- box = box.map(x => x | 0);
145
- }
146
- const [xmin, ymin, xmax, ymax] = box;
147
-
148
- return { xmin, ymin, xmax, ymax };
149
- }
150
-
151
-
152
- /**
153
- * @callback DisposeType Disposes the item.
154
- * @returns {Promise<void>} A promise that resolves when the item has been disposed.
155
- *
156
- * @typedef {Object} Disposable
157
- * @property {DisposeType} dispose A promise that resolves when the pipeline has been disposed.
158
- */
159
-
160
- /**
161
- * The Pipeline class is the class from which all pipelines inherit.
162
- * Refer to this class for methods shared across different pipelines.
163
- */
164
- export class Pipeline extends Callable {
165
- /**
166
- * Create a new Pipeline.
167
- * @param {Object} options An object containing the following properties:
168
- * @param {string} [options.task] The task of the pipeline. Useful for specifying subtasks.
169
- * @param {PreTrainedModel} [options.model] The model used by the pipeline.
170
- * @param {PreTrainedTokenizer} [options.tokenizer=null] The tokenizer used by the pipeline (if any).
171
- * @param {Processor} [options.processor=null] The processor used by the pipeline (if any).
172
- */
173
- constructor({ task, model, tokenizer = null, processor = null }) {
174
- super();
175
- this.task = task;
176
- this.model = model;
177
- this.tokenizer = tokenizer;
178
- this.processor = processor;
179
- }
180
-
181
- /** @type {DisposeType} */
182
- async dispose() {
183
- await this.model.dispose();
184
- }
185
- }
186
-
187
- /**
188
- * @typedef {Object} ModelTokenizerConstructorArgs
189
- * @property {string} task The task of the pipeline. Useful for specifying subtasks.
190
- * @property {PreTrainedModel} model The model used by the pipeline.
191
- * @property {PreTrainedTokenizer} tokenizer The tokenizer used by the pipeline.
192
- *
193
- * @typedef {ModelTokenizerConstructorArgs} TextPipelineConstructorArgs An object used to instantiate a text-based pipeline.
194
- */
195
-
196
- /**
197
- * @typedef {Object} ModelProcessorConstructorArgs
198
- * @property {string} task The task of the pipeline. Useful for specifying subtasks.
199
- * @property {PreTrainedModel} model The model used by the pipeline.
200
- * @property {Processor} processor The processor used by the pipeline.
201
- *
202
- * @typedef {ModelProcessorConstructorArgs} AudioPipelineConstructorArgs An object used to instantiate an audio-based pipeline.
203
- * @typedef {ModelProcessorConstructorArgs} ImagePipelineConstructorArgs An object used to instantiate an image-based pipeline.
204
- */
205
-
206
-
207
- /**
208
- * @typedef {Object} ModelTokenizerProcessorConstructorArgs
209
- * @property {string} task The task of the pipeline. Useful for specifying subtasks.
210
- * @property {PreTrainedModel} model The model used by the pipeline.
211
- * @property {PreTrainedTokenizer} tokenizer The tokenizer used by the pipeline.
212
- * @property {Processor} processor The processor used by the pipeline.
213
- *
214
- * @typedef {ModelTokenizerProcessorConstructorArgs} TextAudioPipelineConstructorArgs An object used to instantiate a text- and audio-based pipeline.
215
- * @typedef {ModelTokenizerProcessorConstructorArgs} TextImagePipelineConstructorArgs An object used to instantiate a text- and image-based pipeline.
216
- */
217
-
218
- /**
219
- * @typedef {Object} TextClassificationSingle
220
- * @property {string} label The label predicted.
221
- * @property {number} score The corresponding probability.
222
- * @typedef {TextClassificationSingle[]} TextClassificationOutput
223
- *
224
- * @typedef {Object} TextClassificationPipelineOptions Parameters specific to text classification pipelines.
225
- * @property {number} [top_k=1] The number of top predictions to be returned.
226
- *
227
- * @callback TextClassificationPipelineCallback Classify the text(s) given as inputs.
228
- * @param {string|string[]} texts The input text(s) to be classified.
229
- * @param {TextClassificationPipelineOptions} [options] The options to use for text classification.
230
- * @returns {Promise<TextClassificationOutput|TextClassificationOutput[]>} An array or object containing the predicted labels and scores.
231
- *
232
- * @typedef {TextPipelineConstructorArgs & TextClassificationPipelineCallback & Disposable} TextClassificationPipelineType
233
- */
234
-
235
- /**
236
- * Text classification pipeline using any `ModelForSequenceClassification`.
237
- *
238
- * **Example:** Sentiment-analysis w/ `Xenova/distilbert-base-uncased-finetuned-sst-2-english`.
239
- * ```javascript
240
- * const classifier = await pipeline('sentiment-analysis', 'Xenova/distilbert-base-uncased-finetuned-sst-2-english');
241
- * const output = await classifier('I love transformers!');
242
- * // [{ label: 'POSITIVE', score: 0.999788761138916 }]
243
- * ```
244
- *
245
- * **Example:** Multilingual sentiment-analysis w/ `Xenova/bert-base-multilingual-uncased-sentiment` (and return top 5 classes).
246
- * ```javascript
247
- * const classifier = await pipeline('sentiment-analysis', 'Xenova/bert-base-multilingual-uncased-sentiment');
248
- * const output = await classifier('Le meilleur film de tous les temps.', { top_k: 5 });
249
- * // [
250
- * // { label: '5 stars', score: 0.9610759615898132 },
251
- * // { label: '4 stars', score: 0.03323351591825485 },
252
- * // { label: '3 stars', score: 0.0036155181005597115 },
253
- * // { label: '1 star', score: 0.0011325967498123646 },
254
- * // { label: '2 stars', score: 0.0009423971059732139 }
255
- * // ]
256
- * ```
257
- *
258
- * **Example:** Toxic comment classification w/ `Xenova/toxic-bert` (and return all classes).
259
- * ```javascript
260
- * const classifier = await pipeline('text-classification', 'Xenova/toxic-bert');
261
- * const output = await classifier('I hate you!', { top_k: null });
262
- * // [
263
- * // { label: 'toxic', score: 0.9593140482902527 },
264
- * // { label: 'insult', score: 0.16187334060668945 },
265
- * // { label: 'obscene', score: 0.03452680632472038 },
266
- * // { label: 'identity_hate', score: 0.0223250575363636 },
267
- * // { label: 'threat', score: 0.019197041168808937 },
268
- * // { label: 'severe_toxic', score: 0.005651099607348442 }
269
- * // ]
270
- * ```
271
- */
272
- export class TextClassificationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TextClassificationPipelineType} */ (Pipeline)) {
273
-
274
- /**
275
- * Create a new TextClassificationPipeline.
276
- * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline.
277
- */
278
- constructor(options) {
279
- super(options);
280
- }
281
-
282
- /** @type {TextClassificationPipelineCallback} */
283
- async _call(texts, {
284
- top_k = 1
285
- } = {}) {
286
-
287
- // Run tokenization
288
- const model_inputs = this.tokenizer(texts, {
289
- padding: true,
290
- truncation: true,
291
- });
292
-
293
- // Run model
294
- const outputs = await this.model(model_inputs)
295
-
296
- // TODO: Use softmax tensor function
297
- const function_to_apply =
298
- // @ts-expect-error TS2339
299
- this.model.config.problem_type === 'multi_label_classification'
300
- ? batch => batch.sigmoid()
301
- : batch => new Tensor(
302
- 'float32',
303
- softmax(batch.data),
304
- batch.dims,
305
- ); // single_label_classification (default)
306
-
307
- // @ts-expect-error TS2339
308
- const id2label = this.model.config.id2label;
309
-
310
- const toReturn = [];
311
- for (const batch of outputs.logits) {
312
- const output = function_to_apply(batch);
313
-
314
- const scores = await topk(output, top_k);
315
-
316
- const values = scores[0].tolist();
317
- const indices = scores[1].tolist();
318
- const vals = indices.map((x, i) => ({
319
- label: id2label ? id2label[x] : `LABEL_${x}`,
320
- score: values[i],
321
- }));
322
- if (top_k === 1) {
323
- toReturn.push(...vals);
324
- } else {
325
- toReturn.push(vals);
326
- }
327
- }
328
-
329
- return Array.isArray(texts) || top_k === 1 ? /** @type {TextClassificationOutput} */ (toReturn) : /** @type {TextClassificationOutput[]} */ (toReturn)[0];
330
- }
331
- }
332
-
333
- /**
334
- * @typedef {Object} TokenClassificationSingle
335
- * @property {string} word The token/word classified. This is obtained by decoding the selected tokens.
336
- * @property {number} score The corresponding probability for `entity`.
337
- * @property {string} entity The entity predicted for that token/word.
338
- * @property {number} index The index of the corresponding token in the sentence.
339
- * @property {number} [start] The index of the start of the corresponding entity in the sentence.
340
- * @property {number} [end] The index of the end of the corresponding entity in the sentence.
341
- * @typedef {TokenClassificationSingle[]} TokenClassificationOutput
342
- *
343
- * @typedef {Object} TokenClassificationPipelineOptions Parameters specific to token classification pipelines.
344
- * @property {string[]} [ignore_labels] A list of labels to ignore.
345
- *
346
- * @callback TokenClassificationPipelineCallback Classify each token of the text(s) given as inputs.
347
- * @param {string|string[]} texts One or several texts (or one list of texts) for token classification.
348
- * @param {TokenClassificationPipelineOptions} [options] The options to use for token classification.
349
- * @returns {Promise<TokenClassificationOutput|TokenClassificationOutput[]>} The result.
350
- *
351
- * @typedef {TextPipelineConstructorArgs & TokenClassificationPipelineCallback & Disposable} TokenClassificationPipelineType
352
- */
353
-
354
- /**
355
- * Named Entity Recognition pipeline using any `ModelForTokenClassification`.
356
- *
357
- * **Example:** Perform named entity recognition with `Xenova/bert-base-NER`.
358
- * ```javascript
359
- * const classifier = await pipeline('token-classification', 'Xenova/bert-base-NER');
360
- * const output = await classifier('My name is Sarah and I live in London');
361
- * // [
362
- * // { entity: 'B-PER', score: 0.9980202913284302, index: 4, word: 'Sarah' },
363
- * // { entity: 'B-LOC', score: 0.9994474053382874, index: 9, word: 'London' }
364
- * // ]
365
- * ```
366
- *
367
- * **Example:** Perform named entity recognition with `Xenova/bert-base-NER` (and return all labels).
368
- * ```javascript
369
- * const classifier = await pipeline('token-classification', 'Xenova/bert-base-NER');
370
- * const output = await classifier('Sarah lives in the United States of America', { ignore_labels: [] });
371
- * // [
372
- * // { entity: 'B-PER', score: 0.9966587424278259, index: 1, word: 'Sarah' },
373
- * // { entity: 'O', score: 0.9987385869026184, index: 2, word: 'lives' },
374
- * // { entity: 'O', score: 0.9990072846412659, index: 3, word: 'in' },
375
- * // { entity: 'O', score: 0.9988298416137695, index: 4, word: 'the' },
376
- * // { entity: 'B-LOC', score: 0.9995510578155518, index: 5, word: 'United' },
377
- * // { entity: 'I-LOC', score: 0.9990395307540894, index: 6, word: 'States' },
378
- * // { entity: 'I-LOC', score: 0.9986724853515625, index: 7, word: 'of' },
379
- * // { entity: 'I-LOC', score: 0.9975294470787048, index: 8, word: 'America' }
380
- * // ]
381
- * ```
382
- */
383
- export class TokenClassificationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TokenClassificationPipelineType} */ (Pipeline)) {
384
-
385
- /**
386
- * Create a new TokenClassificationPipeline.
387
- * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline.
388
- */
389
- constructor(options) {
390
- super(options);
391
- }
392
-
393
- /** @type {TokenClassificationPipelineCallback} */
394
- async _call(texts, {
395
- ignore_labels = ['O'],
396
- } = {}) {
397
-
398
- const isBatched = Array.isArray(texts);
399
-
400
- // Run tokenization
401
- const model_inputs = this.tokenizer(isBatched ? texts : [texts], {
402
- padding: true,
403
- truncation: true,
404
- });
405
-
406
- // Run model
407
- const outputs = await this.model(model_inputs)
408
-
409
- const logits = outputs.logits;
410
- // @ts-expect-error TS2339
411
- const id2label = this.model.config.id2label;
412
-
413
- const toReturn = [];
414
- for (let i = 0; i < logits.dims[0]; ++i) {
415
- const ids = model_inputs.input_ids[i];
416
- const batch = logits[i];
417
-
418
- // List of tokens that aren't ignored
419
- const tokens = [];
420
- for (let j = 0; j < batch.dims[0]; ++j) {
421
- const tokenData = batch[j];
422
- const topScoreIndex = max(tokenData.data)[1];
423
-
424
- const entity = id2label ? id2label[topScoreIndex] : `LABEL_${topScoreIndex}`;
425
- if (ignore_labels.includes(entity)) {
426
- // We predicted a token that should be ignored. So, we skip it.
427
- continue;
428
- }
429
-
430
- // TODO add option to keep special tokens?
431
- const word = this.tokenizer.decode([ids[j].item()], { skip_special_tokens: true });
432
- if (word === '') {
433
- // Was a special token. So, we skip it.
434
- continue;
435
- }
436
-
437
- const scores = softmax(tokenData.data);
438
-
439
- tokens.push({
440
- entity: entity,
441
- score: scores[topScoreIndex],
442
- index: j,
443
- word: word,
444
-
445
- // TODO: Add support for start and end
446
- // start: null,
447
- // end: null,
448
- });
449
- }
450
- toReturn.push(tokens);
451
- }
452
- return isBatched ? toReturn : toReturn[0];
453
- }
454
- }
455
-
456
- /**
457
- * @typedef {Object} QuestionAnsweringOutput
458
- * @property {number} score The probability associated to the answer.
459
- * @property {number} [start] The character start index of the answer (in the tokenized version of the input).
460
- * @property {number} [end] The character end index of the answer (in the tokenized version of the input).
461
- * @property {string} answer The answer to the question.
462
- *
463
- * @typedef {Object} QuestionAnsweringPipelineOptions Parameters specific to question answering pipelines.
464
- * @property {number} [top_k=1] The number of top answer predictions to be returned.
465
- *
466
- * @callback QuestionAnsweringPipelineCallback Answer the question(s) given as inputs by using the context(s).
467
- * @param {string|string[]} question One or several question(s) (must be used in conjunction with the `context` argument).
468
- * @param {string|string[]} context One or several context(s) associated with the question(s) (must be used in conjunction with the `question` argument).
469
- * @param {QuestionAnsweringPipelineOptions} [options] The options to use for question answering.
470
- * @returns {Promise<QuestionAnsweringOutput|QuestionAnsweringOutput[]>} An array or object containing the predicted answers and scores.
471
- *
472
- * @typedef {TextPipelineConstructorArgs & QuestionAnsweringPipelineCallback & Disposable} QuestionAnsweringPipelineType
473
- */
474
-
475
- /**
476
- * Question Answering pipeline using any `ModelForQuestionAnswering`.
477
- *
478
- * **Example:** Run question answering with `Xenova/distilbert-base-uncased-distilled-squad`.
479
- * ```javascript
480
- * const answerer = await pipeline('question-answering', 'Xenova/distilbert-base-uncased-distilled-squad');
481
- * const question = 'Who was Jim Henson?';
482
- * const context = 'Jim Henson was a nice puppet.';
483
- * const output = await answerer(question, context);
484
- * // {
485
- * // answer: "a nice puppet",
486
- * // score: 0.5768911502526741
487
- * // }
488
- * ```
489
- */
490
- export class QuestionAnsweringPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => QuestionAnsweringPipelineType} */ (Pipeline)) {
491
-
492
- /**
493
- * Create a new QuestionAnsweringPipeline.
494
- * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline.
495
- */
496
- constructor(options) {
497
- super(options);
498
- }
499
-
500
- /** @type {QuestionAnsweringPipelineCallback} */
501
- async _call(question, context, {
502
- top_k = 1
503
- } = {}) {
504
-
505
- // Run tokenization
506
- const inputs = this.tokenizer(question, {
507
- text_pair: context,
508
- padding: true,
509
- truncation: true,
510
- });
511
-
512
- const { start_logits, end_logits } = await this.model(inputs);
513
- const input_ids = inputs.input_ids.tolist();
514
- const attention_mask = inputs.attention_mask.tolist();
515
-
516
- // TODO: add support for `return_special_tokens_mask`
517
- const special_tokens = this.tokenizer.all_special_ids;
518
-
519
- /** @type {QuestionAnsweringOutput[]} */
520
- const toReturn = [];
521
- for (let j = 0; j < start_logits.dims[0]; ++j) {
522
- const ids = input_ids[j];
523
- const sepIndex = ids.findIndex(x =>
524
- // We use == to match bigint with number
525
- // @ts-ignore
526
- x == this.tokenizer.sep_token_id
527
- );
528
-
529
-
530
- const valid_mask = attention_mask[j].map((y, ix) => (
531
- y == 1
532
- && (
533
- ix === 0 // is cls_token
534
- || (
535
- ix > sepIndex
536
- && special_tokens.findIndex(x => x == ids[ix]) === -1 // token is not a special token (special_tokens_mask == 0)
537
- )
538
- )
539
- ));
540
-
541
- const start = start_logits[j].tolist();
542
- const end = end_logits[j].tolist();
543
-
544
- // Now, we mask out values that can't be in the answer
545
- // NOTE: We keep the cls_token unmasked (some models use it to indicate unanswerable questions)
546
- for (let i = 1; i < start.length; ++i) {
547
- if (
548
- attention_mask[j] == 0 // is part of padding
549
- || i <= sepIndex // is before the sep_token
550
- || special_tokens.findIndex(x => x == ids[i]) !== -1 // Is a special token
551
- ) {
552
- // Make sure non-context indexes in the tensor cannot contribute to the softmax
553
- start[i] = -Infinity;
554
- end[i] = -Infinity;
555
- }
556
- }
557
-
558
- // Normalize logits and spans to retrieve the answer
559
- const start_scores = softmax(start).map((x, i) => [x, i]);
560
- const end_scores = softmax(end).map((x, i) => [x, i]);
561
-
562
- // Mask CLS
563
- start_scores[0][0] = 0;
564
- end_scores[0][0] = 0;
565
-
566
- // Generate all valid spans and select best ones
567
- const options = product(start_scores, end_scores)
568
- .filter(x => x[0][1] <= x[1][1])
569
- .map(x => [x[0][1], x[1][1], x[0][0] * x[1][0]])
570
- .sort((a, b) => b[2] - a[2]);
571
-
572
- for (let k = 0; k < Math.min(options.length, top_k); ++k) {
573
- const [start, end, score] = options[k];
574
-
575
- const answer_tokens = ids.slice(start, end + 1)
576
-
577
- const answer = this.tokenizer.decode(answer_tokens, {
578
- skip_special_tokens: true,
579
- });
580
-
581
- // TODO add start and end?
582
- // NOTE: HF returns character index
583
- toReturn.push({
584
- answer, score
585
- });
586
- }
587
- }
588
-
589
- // Mimic HF's return type based on top_k
590
- return (top_k === 1) ? toReturn[0] : toReturn;
591
- }
592
- }
593
-
594
-
595
- /**
596
- * @typedef {Object} FillMaskSingle
597
- * @property {string} sequence The corresponding input with the mask token prediction.
598
- * @property {number} score The corresponding probability.
599
- * @property {number} token The predicted token id (to replace the masked one).
600
- * @property {string} token_str The predicted token (to replace the masked one).
601
- * @typedef {FillMaskSingle[]} FillMaskOutput
602
- *
603
- * @typedef {Object} FillMaskPipelineOptions Parameters specific to fill mask pipelines.
604
- * @property {number} [top_k=5] When passed, overrides the number of predictions to return.
605
- *
606
- * @callback FillMaskPipelineCallback Fill the masked token in the text(s) given as inputs.
607
- * @param {string|string[]} texts One or several texts (or one list of prompts) with masked tokens.
608
- * @param {FillMaskPipelineOptions} [options] The options to use for masked language modelling.
609
- * @returns {Promise<FillMaskOutput|FillMaskOutput[]>} An array of objects containing the score, predicted token, predicted token string,
610
- * and the sequence with the predicted token filled in, or an array of such arrays (one for each input text).
611
- * If only one input text is given, the output will be an array of objects.
612
- * @throws {Error} When the mask token is not found in the input text.
613
- *
614
- * @typedef {TextPipelineConstructorArgs & FillMaskPipelineCallback & Disposable} FillMaskPipelineType
615
- */
616
-
617
- /**
618
- * Masked language modeling prediction pipeline using any `ModelWithLMHead`.
619
- *
620
- * **Example:** Perform masked language modelling (a.k.a. "fill-mask") with `Xenova/bert-base-uncased`.
621
- * ```javascript
622
- * const unmasker = await pipeline('fill-mask', 'Xenova/bert-base-cased');
623
- * const output = await unmasker('The goal of life is [MASK].');
624
- * // [
625
- * // { token_str: 'survival', score: 0.06137419492006302, token: 8115, sequence: 'The goal of life is survival.' },
626
- * // { token_str: 'love', score: 0.03902450203895569, token: 1567, sequence: 'The goal of life is love.' },
627
- * // { token_str: 'happiness', score: 0.03253183513879776, token: 9266, sequence: 'The goal of life is happiness.' },
628
- * // { token_str: 'freedom', score: 0.018736306577920914, token: 4438, sequence: 'The goal of life is freedom.' },
629
- * // { token_str: 'life', score: 0.01859794743359089, token: 1297, sequence: 'The goal of life is life.' }
630
- * // ]
631
- * ```
632
- *
633
- * **Example:** Perform masked language modelling (a.k.a. "fill-mask") with `Xenova/bert-base-cased` (and return top result).
634
- * ```javascript
635
- * const unmasker = await pipeline('fill-mask', 'Xenova/bert-base-cased');
636
- * const output = await unmasker('The Milky Way is a [MASK] galaxy.', { top_k: 1 });
637
- * // [{ token_str: 'spiral', score: 0.6299987435340881, token: 14061, sequence: 'The Milky Way is a spiral galaxy.' }]
638
- * ```
639
- */
640
- export class FillMaskPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => FillMaskPipelineType} */ (Pipeline)) {
641
-
642
- /**
643
- * Create a new FillMaskPipeline.
644
- * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline.
645
- */
646
- constructor(options) {
647
- super(options);
648
- }
649
-
650
- /** @type {FillMaskPipelineCallback} */
651
- async _call(texts, {
652
- top_k = 5
653
- } = {}) {
654
-
655
- // Run tokenization
656
- const model_inputs = this.tokenizer(texts, {
657
- padding: true,
658
- truncation: true,
659
- });
660
-
661
- // Run model
662
- const { logits } = await this.model(model_inputs)
663
-
664
- const toReturn = [];
665
-
666
- /** @type {bigint[][]} */
667
- const input_ids = model_inputs.input_ids.tolist();
668
- for (let i = 0; i < input_ids.length; ++i) {
669
- const ids = input_ids[i];
670
- const mask_token_index = ids.findIndex(x =>
671
- // We use == to match bigint with number
672
- // @ts-ignore
673
- x == this.tokenizer.mask_token_id
674
- );
675
- if (mask_token_index === -1) {
676
- throw Error(`Mask token (${this.tokenizer.mask_token}) not found in text.`)
677
- }
678
- const itemLogits = logits[i][mask_token_index];
679
-
680
- const scores = await topk(new Tensor(
681
- 'float32',
682
- softmax(itemLogits.data),
683
- itemLogits.dims,
684
- ), top_k);
685
- const values = scores[0].tolist();
686
- const indices = scores[1].tolist();
687
-
688
- toReturn.push(indices.map((x, i) => {
689
- const sequence = ids.slice();
690
- sequence[mask_token_index] = x;
691
-
692
- return {
693
- score: values[i],
694
- token: Number(x),
695
- token_str: this.tokenizer.decode([x]),
696
- sequence: this.tokenizer.decode(sequence, { skip_special_tokens: true }),
697
- }
698
- }));
699
- }
700
- return Array.isArray(texts) ? toReturn : toReturn[0];
701
- }
702
- }
703
-
704
-
705
- /**
706
- * @typedef {Object} Text2TextGenerationSingle
707
- * @property {string} generated_text The generated text.
708
- * @typedef {Text2TextGenerationSingle[]} Text2TextGenerationOutput
709
- *
710
- * @callback Text2TextGenerationPipelineCallback Generate the output text(s) using text(s) given as inputs.
711
- * @param {string|string[]} texts Input text for the encoder.
712
- * @param {Partial<import('./generation/configuration_utils.js').GenerationConfig>} [options] Additional keyword arguments to pass along to the generate method of the model.
713
- * @returns {Promise<Text2TextGenerationOutput|Text2TextGenerationOutput[]>}
714
- *
715
- * @typedef {TextPipelineConstructorArgs & Text2TextGenerationPipelineCallback & Disposable} Text2TextGenerationPipelineType
716
- */
717
-
718
- /**
719
- * Text2TextGenerationPipeline class for generating text using a model that performs text-to-text generation tasks.
720
- *
721
- * **Example:** Text-to-text generation w/ `Xenova/LaMini-Flan-T5-783M`.
722
- * ```javascript
723
- * const generator = await pipeline('text2text-generation', 'Xenova/LaMini-Flan-T5-783M');
724
- * const output = await generator('how can I become more healthy?', {
725
- * max_new_tokens: 100,
726
- * });
727
- * // [{ generated_text: "To become more healthy, you can: 1. Eat a balanced diet with plenty of fruits, vegetables, whole grains, lean proteins, and healthy fats. 2. Stay hydrated by drinking plenty of water. 3. Get enough sleep and manage stress levels. 4. Avoid smoking and excessive alcohol consumption. 5. Regularly exercise and maintain a healthy weight. 6. Practice good hygiene and sanitation. 7. Seek medical attention if you experience any health issues." }]
728
- * ```
729
- */
730
- export class Text2TextGenerationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => Text2TextGenerationPipelineType} */ (Pipeline)) {
731
- /** @type {'generated_text'} */
732
- _key = 'generated_text';
733
-
734
- /**
735
- * Create a new Text2TextGenerationPipeline.
736
- * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline.
737
- */
738
- constructor(options) {
739
- super(options);
740
- }
741
-
742
- /** @type {Text2TextGenerationPipelineCallback} */
743
- async _call(texts, generate_kwargs = {}) {
744
- if (!Array.isArray(texts)) {
745
- texts = [texts];
746
- }
747
-
748
-
749
- // Add global prefix, if present
750
- // @ts-expect-error TS2339
751
- if (this.model.config.prefix) {
752
- // @ts-expect-error TS2339
753
- texts = texts.map(x => this.model.config.prefix + x)
754
- }
755
-
756
- // Handle task specific params:
757
- // @ts-expect-error TS2339
758
- const task_specific_params = this.model.config.task_specific_params
759
- if (task_specific_params && task_specific_params[this.task]) {
760
- // Add prefixes, if present
761
- if (task_specific_params[this.task].prefix) {
762
- texts = texts.map(x => task_specific_params[this.task].prefix + x)
763
- }
764
-
765
- // TODO update generation config
766
- }
767
-
768
- const tokenizer = this.tokenizer;
769
- const tokenizer_options = {
770
- padding: true,
771
- truncation: true,
772
- }
773
- let inputs;
774
- if (this instanceof TranslationPipeline && '_build_translation_inputs' in tokenizer) {
775
- // TODO: move to Translation pipeline?
776
- // Currently put here to avoid code duplication
777
- // @ts-ignore
778
- inputs = tokenizer._build_translation_inputs(texts, tokenizer_options, generate_kwargs);
779
-
780
- } else {
781
- inputs = tokenizer(texts, tokenizer_options);
782
- }
783
-
784
- const outputTokenIds = await this.model.generate({ ...inputs, ...generate_kwargs });
785
- return tokenizer.batch_decode(/** @type {Tensor} */(outputTokenIds), {
786
- skip_special_tokens: true,
787
- }).map(text => ({ [this._key]: text }));
788
- }
789
- }
790
-
791
-
792
- /**
793
- * @typedef {Object} SummarizationSingle
794
- * @property {string} summary_text The summary text.
795
- * @typedef {SummarizationSingle[]} SummarizationOutput
796
- *
797
- * @callback SummarizationPipelineCallback Summarize the text(s) given as inputs.
798
- * @param {string|string[]} texts One or several articles (or one list of articles) to summarize.
799
- * @param {import('./generation/configuration_utils.js').GenerationConfig} [options] Additional keyword arguments to pass along to the generate method of the model.
800
- * @returns {Promise<SummarizationOutput|SummarizationOutput[]>}
801
- *
802
- * @typedef {TextPipelineConstructorArgs & SummarizationPipelineCallback & Disposable} SummarizationPipelineType
803
- */
804
-
805
- /**
806
- * A pipeline for summarization tasks, inheriting from Text2TextGenerationPipeline.
807
- *
808
- * **Example:** Summarization w/ `Xenova/distilbart-cnn-6-6`.
809
- * ```javascript
810
- * const generator = await pipeline('summarization', 'Xenova/distilbart-cnn-6-6');
811
- * const text = 'The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, ' +
812
- * 'and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. ' +
813
- * 'During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest ' +
814
- * 'man-made structure in the world, a title it held for 41 years until the Chrysler Building in New ' +
815
- * 'York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to ' +
816
- * 'the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the ' +
817
- * 'Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second ' +
818
- * 'tallest free-standing structure in France after the Millau Viaduct.';
819
- * const output = await generator(text, {
820
- * max_new_tokens: 100,
821
- * });
822
- * // [{ summary_text: ' The Eiffel Tower is about the same height as an 81-storey building and the tallest structure in Paris. It is the second tallest free-standing structure in France after the Millau Viaduct.' }]
823
- * ```
824
- */
825
- export class SummarizationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => SummarizationPipelineType} */ (/** @type {any} */ (Text2TextGenerationPipeline))) {
826
- /** @type {'summary_text'} */
827
- _key = 'summary_text';
828
-
829
- /**
830
- * Create a new SummarizationPipeline.
831
- * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline.
832
- */
833
- constructor(options) {
834
- super(options);
835
- }
836
- }
837
-
838
-
839
- /**
840
- * @typedef {Object} TranslationSingle
841
- * @property {string} translation_text The translated text.
842
- * @typedef {TranslationSingle[]} TranslationOutput
843
- *
844
- * @callback TranslationPipelineCallback Translate the text(s) given as inputs.
845
- * @param {string|string[]} texts Texts to be translated.
846
- * @param {import('./generation/configuration_utils.js').GenerationConfig} [options] Additional keyword arguments to pass along to the generate method of the model.
847
- * @returns {Promise<TranslationOutput|TranslationOutput[]>}
848
- *
849
- * @typedef {TextPipelineConstructorArgs & TranslationPipelineCallback & Disposable} TranslationPipelineType
850
- */
851
-
852
- /**
853
- * Translates text from one language to another.
854
- *
855
- * **Example:** Multilingual translation w/ `Xenova/nllb-200-distilled-600M`.
856
- *
857
- * See [here](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200)
858
- * for the full list of languages and their corresponding codes.
859
- *
860
- * ```javascript
861
- * const translator = await pipeline('translation', 'Xenova/nllb-200-distilled-600M');
862
- * const output = await translator('जीवन एक चॉकलेट बॉक्स की तरह है।', {
863
- * src_lang: 'hin_Deva', // Hindi
864
- * tgt_lang: 'fra_Latn', // French
865
- * });
866
- * // [{ translation_text: 'La vie est comme une boîte à chocolat.' }]
867
- * ```
868
- *
869
- * **Example:** Multilingual translation w/ `Xenova/m2m100_418M`.
870
- *
871
- * See [here](https://huggingface.co/facebook/m2m100_418M#languages-covered)
872
- * for the full list of languages and their corresponding codes.
873
- *
874
- * ```javascript
875
- * const translator = await pipeline('translation', 'Xenova/m2m100_418M');
876
- * const output = await translator('生活就像一盒巧克力。', {
877
- * src_lang: 'zh', // Chinese
878
- * tgt_lang: 'en', // English
879
- * });
880
- * // [{ translation_text: 'Life is like a box of chocolate.' }]
881
- * ```
882
- *
883
- * **Example:** Multilingual translation w/ `Xenova/mbart-large-50-many-to-many-mmt`.
884
- *
885
- * See [here](https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt#languages-covered)
886
- * for the full list of languages and their corresponding codes.
887
- *
888
- * ```javascript
889
- * const translator = await pipeline('translation', 'Xenova/mbart-large-50-many-to-many-mmt');
890
- * const output = await translator('संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है', {
891
- * src_lang: 'hi_IN', // Hindi
892
- * tgt_lang: 'fr_XX', // French
893
- * });
894
- * // [{ translation_text: 'Le chef des Nations affirme qu 'il n 'y a military solution in Syria.' }]
895
- * ```
896
- */
897
- export class TranslationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TranslationPipelineType} */ (/** @type {any} */ (Text2TextGenerationPipeline))) {
898
- /** @type {'translation_text'} */
899
- _key = 'translation_text';
900
-
901
- /**
902
- * Create a new TranslationPipeline.
903
- * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline.
904
- */
905
- constructor(options) {
906
- super(options);
907
- }
908
- }
909
-
910
- function isChat(x) {
911
- return Array.isArray(x) && x.every(x => 'role' in x && 'content' in x);
912
- }
913
-
914
- /**
915
- * @typedef {import('./tokenizers.js').Message[]} Chat
916
- *
917
- * @typedef {Object} TextGenerationSingle
918
- * @property {string|Chat} generated_text The generated text.
919
- * @typedef {TextGenerationSingle[]} TextGenerationOutput
920
- *
921
- * @typedef {Object} TextGenerationSpecificParams Parameters specific to text-generation pipelines.
922
- * @property {boolean} [add_special_tokens] Whether or not to add special tokens when tokenizing the sequences.
923
- * @property {boolean} [return_full_text=true] If set to `false` only added text is returned, otherwise the full text is returned.
924
- * @typedef {import('./generation/configuration_utils.js').GenerationConfig & TextGenerationSpecificParams} TextGenerationConfig
925
- *
926
- * @callback TextGenerationPipelineCallback Complete the prompt(s) given as inputs.
927
- * @param {string|string[]|Chat|Chat[]} texts One or several prompts (or one list of prompts) to complete.
928
- * @param {Partial<TextGenerationConfig>} [options] Additional keyword arguments to pass along to the generate method of the model.
929
- * @returns {Promise<TextGenerationOutput|TextGenerationOutput[]>} An array or object containing the generated texts.
930
- *
931
- * @typedef {TextPipelineConstructorArgs & TextGenerationPipelineCallback & Disposable} TextGenerationPipelineType
932
- */
933
-
934
- /**
935
- * Language generation pipeline using any `ModelWithLMHead` or `ModelForCausalLM`.
936
- * This pipeline predicts the words that will follow a specified text prompt.
937
- * NOTE: For the full list of generation parameters, see [`GenerationConfig`](./utils/generation#module_utils/generation.GenerationConfig).
938
- *
939
- * **Example:** Text generation with `Xenova/distilgpt2` (default settings).
940
- * ```javascript
941
- * const generator = await pipeline('text-generation', 'Xenova/distilgpt2');
942
- * const text = 'I enjoy walking with my cute dog,';
943
- * const output = await generator(text);
944
- * // [{ generated_text: "I enjoy walking with my cute dog, and I love to play with the other dogs." }]
945
- * ```
946
- *
947
- * **Example:** Text generation with `Xenova/distilgpt2` (custom settings).
948
- * ```javascript
949
- * const generator = await pipeline('text-generation', 'Xenova/distilgpt2');
950
- * const text = 'Once upon a time, there was';
951
- * const output = await generator(text, {
952
- * temperature: 2,
953
- * max_new_tokens: 10,
954
- * repetition_penalty: 1.5,
955
- * no_repeat_ngram_size: 2,
956
- * num_beams: 2,
957
- * num_return_sequences: 2,
958
- * });
959
- * // [{
960
- * // "generated_text": "Once upon a time, there was an abundance of information about the history and activities that"
961
- * // }, {
962
- * // "generated_text": "Once upon a time, there was an abundance of information about the most important and influential"
963
- * // }]
964
- * ```
965
- *
966
- * **Example:** Run code generation with `Xenova/codegen-350M-mono`.
967
- * ```javascript
968
- * const generator = await pipeline('text-generation', 'Xenova/codegen-350M-mono');
969
- * const text = 'def fib(n):';
970
- * const output = await generator(text, {
971
- * max_new_tokens: 44,
972
- * });
973
- * // [{
974
- * // generated_text: 'def fib(n):\n' +
975
- * // ' if n == 0:\n' +
976
- * // ' return 0\n' +
977
- * // ' elif n == 1:\n' +
978
- * // ' return 1\n' +
979
- * // ' else:\n' +
980
- * // ' return fib(n-1) + fib(n-2)\n'
981
- * // }]
982
- * ```
983
- */
984
- export class TextGenerationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TextGenerationPipelineType} */ (Pipeline)) {
985
-
986
- /**
987
- * Create a new TextGenerationPipeline.
988
- * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline.
989
- */
990
- constructor(options) {
991
- super(options);
992
- }
993
-
994
- /** @type {TextGenerationPipelineCallback} */
995
- async _call(texts, generate_kwargs = {}) {
996
- let isBatched = false;
997
- let isChatInput = false;
998
-
999
- // Normalize inputs
1000
- /** @type {string[]} */
1001
- let inputs;
1002
- if (typeof texts === 'string') {
1003
- inputs = texts = [texts];
1004
- } else if (Array.isArray(texts) && texts.every(x => typeof x === 'string')) {
1005
- isBatched = true;
1006
- inputs = /** @type {string[]} */(texts);
1007
- } else {
1008
- if (isChat(texts)) {
1009
- texts = [/** @type {Chat} */(texts)];
1010
- } else if (Array.isArray(texts) && texts.every(isChat)) {
1011
- isBatched = true;
1012
- } else {
1013
- throw new Error('Input must be a string, an array of strings, a Chat, or an array of Chats');
1014
- }
1015
- isChatInput = true;
1016
-
1017
- // If the input is a chat, we need to apply the chat template
1018
- inputs = /** @type {string[]} */(/** @type {Chat[]} */ (texts).map(
1019
- x => this.tokenizer.apply_chat_template(x, {
1020
- tokenize: false,
1021
- add_generation_prompt: true,
1022
- })
1023
- ));
1024
- }
1025
-
1026
- // By default, do not add special tokens
1027
- const add_special_tokens = generate_kwargs.add_special_tokens ?? false;
1028
-
1029
- // By default, return full text
1030
- const return_full_text = isChatInput
1031
- ? false
1032
- : generate_kwargs.return_full_text ?? true;
1033
-
1034
- this.tokenizer.padding_side = 'left';
1035
- const text_inputs = this.tokenizer(inputs, {
1036
- add_special_tokens,
1037
- padding: true,
1038
- truncation: true,
1039
- });
1040
-
1041
- const outputTokenIds = /** @type {Tensor} */(await this.model.generate({
1042
- ...text_inputs,
1043
- ...generate_kwargs
1044
- }));
1045
-
1046
- const decoded = this.tokenizer.batch_decode(outputTokenIds, {
1047
- skip_special_tokens: true,
1048
- });
1049
-
1050
- let promptLengths;
1051
- if (!return_full_text && text_inputs.input_ids.dims.at(-1) > 0) {
1052
- promptLengths = this.tokenizer.batch_decode(text_inputs.input_ids, {
1053
- skip_special_tokens: true,
1054
- }).map(x => x.length);
1055
- }
1056
-
1057
- /** @type {TextGenerationOutput[]} */
1058
- const toReturn = Array.from({ length: texts.length }, _ => []);
1059
- for (let i = 0; i < decoded.length; ++i) {
1060
- const textIndex = Math.floor(i / outputTokenIds.dims[0] * texts.length);
1061
-
1062
- if (promptLengths) {
1063
- // Trim the decoded text to only include the generated part
1064
- decoded[i] = decoded[i].slice(promptLengths[textIndex]);
1065
- }
1066
- toReturn[textIndex].push({
1067
- generated_text: isChatInput
1068
- ? [
1069
- ...((/** @type {Chat[]} */(texts)[textIndex])),
1070
- { role: 'assistant', content: decoded[i] },
1071
- ]
1072
- : decoded[i]
1073
- });
1074
- }
1075
- return (!isBatched && toReturn.length === 1) ? toReturn[0] : toReturn;
1076
- }
1077
- }
1078
-
1079
- /**
1080
- * @typedef {Object} ZeroShotClassificationOutput
1081
- * @property {string} sequence The sequence for which this is the output.
1082
- * @property {string[]} labels The labels sorted by order of likelihood.
1083
- * @property {number[]} scores The probabilities for each of the labels.
1084
- *
1085
- * @typedef {Object} ZeroShotClassificationPipelineOptions Parameters specific to zero-shot classification pipelines.
1086
- * @property {string} [hypothesis_template="This example is {}."] The template used to turn each
1087
- * candidate label into an NLI-style hypothesis. The candidate label will replace the {} placeholder.
1088
- * @property {boolean} [multi_label=false] Whether or not multiple candidate labels can be true.
1089
- * If `false`, the scores are normalized such that the sum of the label likelihoods for each sequence
1090
- * is 1. If `true`, the labels are considered independent and probabilities are normalized for each
1091
- * candidate by doing a softmax of the entailment score vs. the contradiction score.
1092
- *
1093
- * @callback ZeroShotClassificationPipelineCallback Classify the sequence(s) given as inputs.
1094
- * @param {string|string[]} texts The sequence(s) to classify, will be truncated if the model input is too large.
1095
- * @param {string|string[]} candidate_labels The set of possible class labels to classify each sequence into.
1096
- * Can be a single label, a string of comma-separated labels, or a list of labels.
1097
- * @param {ZeroShotClassificationPipelineOptions} [options] The options to use for zero-shot classification.
1098
- * @returns {Promise<ZeroShotClassificationOutput|ZeroShotClassificationOutput[]>} An array or object containing the predicted labels and scores.
1099
- *
1100
- * @typedef {TextPipelineConstructorArgs & ZeroShotClassificationPipelineCallback & Disposable} ZeroShotClassificationPipelineType
1101
- */
1102
-
1103
- /**
1104
- * NLI-based zero-shot classification pipeline using a `ModelForSequenceClassification`
1105
- * trained on NLI (natural language inference) tasks. Equivalent of `text-classification`
1106
- * pipelines, but these models don't require a hardcoded number of potential classes, they
1107
- * can be chosen at runtime. It usually means it's slower but it is **much** more flexible.
1108
- *
1109
- * **Example:** Zero shot classification with `Xenova/mobilebert-uncased-mnli`.
1110
- * ```javascript
1111
- * const classifier = await pipeline('zero-shot-classification', 'Xenova/mobilebert-uncased-mnli');
1112
- * const text = 'Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.';
1113
- * const labels = [ 'mobile', 'billing', 'website', 'account access' ];
1114
- * const output = await classifier(text, labels);
1115
- * // {
1116
- * // sequence: 'Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.',
1117
- * // labels: [ 'mobile', 'website', 'billing', 'account access' ],
1118
- * // scores: [ 0.5562091040482018, 0.1843621307860853, 0.13942646639336376, 0.12000229877234923 ]
1119
- * // }
1120
- * ```
1121
- *
1122
- * **Example:** Zero shot classification with `Xenova/nli-deberta-v3-xsmall` (multi-label).
1123
- * ```javascript
1124
- * const classifier = await pipeline('zero-shot-classification', 'Xenova/nli-deberta-v3-xsmall');
1125
- * const text = 'I have a problem with my iphone that needs to be resolved asap!';
1126
- * const labels = [ 'urgent', 'not urgent', 'phone', 'tablet', 'computer' ];
1127
- * const output = await classifier(text, labels, { multi_label: true });
1128
- * // {
1129
- * // sequence: 'I have a problem with my iphone that needs to be resolved asap!',
1130
- * // labels: [ 'urgent', 'phone', 'computer', 'tablet', 'not urgent' ],
1131
- * // scores: [ 0.9958870956360275, 0.9923963400697035, 0.002333537946160235, 0.0015134138567598765, 0.0010699384208377163 ]
1132
- * // }
1133
- * ```
1134
- */
1135
- export class ZeroShotClassificationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => ZeroShotClassificationPipelineType} */ (Pipeline)) {
1136
- /**
1137
- * Create a new ZeroShotClassificationPipeline.
1138
- * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline.
1139
- */
1140
- constructor(options) {
1141
- super(options);
1142
-
1143
- // Use model config to get label2id mapping
1144
- this.label2id = Object.fromEntries(
1145
- Object.entries((/** @type {any} */(this).model).config.label2id).map(
1146
- ([k, v]) => [k.toLowerCase(), v]
1147
- )
1148
- );
1149
-
1150
- this.entailment_id = this.label2id['entailment'];
1151
- if (this.entailment_id === undefined) {
1152
- console.warn("Could not find 'entailment' in label2id mapping. Using 2 as entailment_id.");
1153
- this.entailment_id = 2;
1154
- }
1155
-
1156
- this.contradiction_id = this.label2id['contradiction'] ?? this.label2id['not_entailment'];
1157
- if (this.contradiction_id === undefined) {
1158
- console.warn("Could not find 'contradiction' in label2id mapping. Using 0 as contradiction_id.");
1159
- this.contradiction_id = 0;
1160
- }
1161
- }
1162
-
1163
- /** @type {ZeroShotClassificationPipelineCallback} */
1164
- async _call(texts, candidate_labels, {
1165
- hypothesis_template = "This example is {}.",
1166
- multi_label = false,
1167
- } = {}) {
1168
-
1169
- const isBatched = Array.isArray(texts);
1170
- if (!isBatched) {
1171
- texts = [/** @type {string} */ (texts)];
1172
- }
1173
- if (!Array.isArray(candidate_labels)) {
1174
- candidate_labels = [candidate_labels];
1175
- }
1176
-
1177
- // Insert labels into hypothesis template
1178
- const hypotheses = candidate_labels.map(
1179
- x => hypothesis_template.replace('{}', x)
1180
- );
1181
-
1182
- // How to perform the softmax over the logits:
1183
- // - true: softmax over the entailment vs. contradiction dim for each label independently
1184
- // - false: softmax the "entailment" logits over all candidate labels
1185
- const softmaxEach = multi_label || candidate_labels.length === 1;
1186
-
1187
- /** @type {ZeroShotClassificationOutput[]} */
1188
- const toReturn = [];
1189
- for (const premise of texts) {
1190
- const entails_logits = [];
1191
-
1192
- for (const hypothesis of hypotheses) {
1193
- const inputs = this.tokenizer(premise, {
1194
- text_pair: hypothesis,
1195
- padding: true,
1196
- truncation: true,
1197
- })
1198
- const outputs = await this.model(inputs)
1199
-
1200
- if (softmaxEach) {
1201
- entails_logits.push([
1202
- outputs.logits.data[this.contradiction_id],
1203
- outputs.logits.data[this.entailment_id]
1204
- ])
1205
- } else {
1206
- entails_logits.push(outputs.logits.data[this.entailment_id])
1207
- }
1208
- }
1209
-
1210
- /** @type {number[]} */
1211
- const scores = softmaxEach
1212
- ? entails_logits.map(x => softmax(x)[1])
1213
- : softmax(entails_logits);
1214
-
1215
- // Sort by scores (desc) and return scores with indices
1216
- const scores_sorted = scores
1217
- .map((x, i) => [x, i])
1218
- .sort((a, b) => (b[0] - a[0]));
1219
-
1220
- toReturn.push({
1221
- sequence: premise,
1222
- labels: scores_sorted.map(x => candidate_labels[x[1]]),
1223
- scores: scores_sorted.map(x => x[0]),
1224
- });
1225
- }
1226
- return isBatched ? toReturn : toReturn[0];
1227
- }
1228
- }
1229
-
1230
- /**
1231
- * @typedef {Object} FeatureExtractionPipelineOptions Parameters specific to feature extraction pipelines.
1232
- * @property {'none'|'mean'|'cls'|'first_token'|'eos'|'last_token'} [pooling="none"] The pooling method to use.
1233
- * @property {boolean} [normalize=false] Whether or not to normalize the embeddings in the last dimension.
1234
- * @property {boolean} [quantize=false] Whether or not to quantize the embeddings.
1235
- * @property {'binary'|'ubinary'} [precision='binary'] The precision to use for quantization.
1236
- *
1237
- * @callback FeatureExtractionPipelineCallback Extract the features of the input(s).
1238
- * @param {string|string[]} texts One or several texts (or one list of texts) to get the features of.
1239
- * @param {FeatureExtractionPipelineOptions} [options] The options to use for feature extraction.
1240
- * @returns {Promise<Tensor>} The features computed by the model.
1241
- *
1242
- * @typedef {TextPipelineConstructorArgs & FeatureExtractionPipelineCallback & Disposable} FeatureExtractionPipelineType
1243
- */
1244
-
1245
- /**
1246
- * Feature extraction pipeline using no model head. This pipeline extracts the hidden
1247
- * states from the base transformer, which can be used as features in downstream tasks.
1248
- *
1249
- * **Example:** Run feature extraction with `bert-base-uncased` (without pooling/normalization).
1250
- * ```javascript
1251
- * const extractor = await pipeline('feature-extraction', 'Xenova/bert-base-uncased', { revision: 'default' });
1252
- * const output = await extractor('This is a simple test.');
1253
- * // Tensor {
1254
- * // type: 'float32',
1255
- * // data: Float32Array [0.05939924716949463, 0.021655935794115067, ...],
1256
- * // dims: [1, 8, 768]
1257
- * // }
1258
- * ```
1259
- *
1260
- * **Example:** Run feature extraction with `bert-base-uncased` (with pooling/normalization).
1261
- * ```javascript
1262
- * const extractor = await pipeline('feature-extraction', 'Xenova/bert-base-uncased', { revision: 'default' });
1263
- * const output = await extractor('This is a simple test.', { pooling: 'mean', normalize: true });
1264
- * // Tensor {
1265
- * // type: 'float32',
1266
- * // data: Float32Array [0.03373778983950615, -0.010106077417731285, ...],
1267
- * // dims: [1, 768]
1268
- * // }
1269
- * ```
1270
- *
1271
- * **Example:** Calculating embeddings with `sentence-transformers` models.
1272
- * ```javascript
1273
- * const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
1274
- * const output = await extractor('This is a simple test.', { pooling: 'mean', normalize: true });
1275
- * // Tensor {
1276
- * // type: 'float32',
1277
- * // data: Float32Array [0.09094982594251633, -0.014774246141314507, ...],
1278
- * // dims: [1, 384]
1279
- * // }
1280
- * ```
1281
- * **Example:** Calculating binary embeddings with `sentence-transformers` models.
1282
- * ```javascript
1283
- * const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
1284
- * const output = await extractor('This is a simple test.', { pooling: 'mean', quantize: true, precision: 'binary' });
1285
- * // Tensor {
1286
- * // type: 'int8',
1287
- * // data: Int8Array [49, 108, 24, ...],
1288
- * // dims: [1, 48]
1289
- * // }
1290
- * ```
1291
- */
1292
- export class FeatureExtractionPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => FeatureExtractionPipelineType} */ (Pipeline)) {
1293
- /**
1294
- * Create a new FeatureExtractionPipeline.
1295
- * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline.
1296
- */
1297
- constructor(options) {
1298
- super(options);
1299
- }
1300
-
1301
- /** @type {FeatureExtractionPipelineCallback} */
1302
- async _call(texts, {
1303
- pooling = /** @type {'none'} */('none'),
1304
- normalize = false,
1305
- quantize = false,
1306
- precision = /** @type {'binary'} */('binary'),
1307
- } = {}) {
1308
-
1309
- // Run tokenization
1310
- const model_inputs = this.tokenizer(texts, {
1311
- padding: true,
1312
- truncation: true,
1313
- });
1314
-
1315
- // Run model
1316
- const outputs = await this.model(model_inputs)
1317
-
1318
- // TODO: Provide warning to the user that they might be using model which was not exported
1319
- // specifically for feature extraction
1320
- // console.log(this.model.config)
1321
- // console.log(outputs)
1322
-
1323
- /** @type {Tensor} */
1324
- let result = outputs.last_hidden_state ?? outputs.logits ?? outputs.token_embeddings;
1325
-
1326
- switch (pooling) {
1327
- case 'none':
1328
- // Skip pooling
1329
- break;
1330
- case 'mean':
1331
- result = mean_pooling(result, model_inputs.attention_mask);
1332
- break;
1333
- case 'first_token':
1334
- case 'cls':
1335
- result = result.slice(null, 0);
1336
- break;
1337
- case 'last_token':
1338
- case 'eos':
1339
- result = result.slice(null, -1);
1340
- break;
1341
- default:
1342
- throw Error(`Pooling method '${pooling}' not supported.`);
1343
- }
1344
-
1345
- if (normalize) {
1346
- result = result.normalize(2, -1);
1347
- }
1348
-
1349
- if (quantize) {
1350
- result = quantize_embeddings(result, precision);
1351
- }
1352
-
1353
- return result;
1354
- }
1355
- }
1356
-
1357
-
1358
- /**
1359
- * @typedef {Object} ImageFeatureExtractionPipelineOptions Parameters specific to image feature extraction pipelines.
1360
- * @property {boolean} [pool=null] Whether or not to return the pooled output. If set to `false`, the model will return the raw hidden states.
1361
- *
1362
- * @callback ImageFeatureExtractionPipelineCallback Extract the features of the input(s).
1363
- * @param {ImagePipelineInputs} images One or several images (or one list of images) to get the features of.
1364
- * @param {ImageFeatureExtractionPipelineOptions} [options] The options to use for image feature extraction.
1365
- * @returns {Promise<Tensor>} The image features computed by the model.
1366
- *
1367
- * @typedef {ImagePipelineConstructorArgs & ImageFeatureExtractionPipelineCallback & Disposable} ImageFeatureExtractionPipelineType
1368
- */
1369
-
1370
- /**
1371
- * Image feature extraction pipeline using no model head. This pipeline extracts the hidden
1372
- * states from the base transformer, which can be used as features in downstream tasks.
1373
- *
1374
- * **Example:** Perform image feature extraction with `Xenova/vit-base-patch16-224-in21k`.
1375
- * ```javascript
1376
- * const image_feature_extractor = await pipeline('image-feature-extraction', 'Xenova/vit-base-patch16-224-in21k');
1377
- * const url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';
1378
- * const features = await image_feature_extractor(url);
1379
- * // Tensor {
1380
- * // dims: [ 1, 197, 768 ],
1381
- * // type: 'float32',
1382
- * // data: Float32Array(151296) [ ... ],
1383
- * // size: 151296
1384
- * // }
1385
- * ```
1386
- *
1387
- * **Example:** Compute image embeddings with `Xenova/clip-vit-base-patch32`.
1388
- * ```javascript
1389
- * const image_feature_extractor = await pipeline('image-feature-extraction', 'Xenova/clip-vit-base-patch32');
1390
- * const url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';
1391
- * const features = await image_feature_extractor(url);
1392
- * // Tensor {
1393
- * // dims: [ 1, 512 ],
1394
- * // type: 'float32',
1395
- * // data: Float32Array(512) [ ... ],
1396
- * // size: 512
1397
- * // }
1398
- * ```
1399
- */
1400
- export class ImageFeatureExtractionPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageFeatureExtractionPipelineType} */ (Pipeline)) {
1401
- /**
1402
- * Create a new ImageFeatureExtractionPipeline.
1403
- * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
1404
- */
1405
- constructor(options) {
1406
- super(options);
1407
- }
1408
-
1409
- /** @type {ImageFeatureExtractionPipelineCallback} */
1410
- async _call(images, {
1411
- pool = null,
1412
- } = {}) {
1413
-
1414
- const preparedImages = await prepareImages(images);
1415
- const { pixel_values } = await this.processor(preparedImages);
1416
- const outputs = await this.model({ pixel_values });
1417
-
1418
- /** @type {Tensor} */
1419
- let result;
1420
- if (pool) {
1421
- if (!('pooler_output' in outputs)) {
1422
- throw Error(`No pooled output was returned. Make sure the model has a 'pooler' layer when using the 'pool' option.`);
1423
- }
1424
- result = outputs.pooler_output;
1425
-
1426
- } else {
1427
- result = outputs.last_hidden_state ?? outputs.logits ?? outputs.image_embeds;
1428
- }
1429
- return result;
1430
- }
1431
- }
1432
-
1433
- // TODO
1434
- // export class SentenceSimilarityPipeline extends Pipeline {
1435
- // }
1436
-
1437
- /**
1438
- * @typedef {Object} AudioClassificationSingle
1439
- * @property {string} label The label predicted.
1440
- * @property {number} score The corresponding probability.
1441
- * @typedef {AudioClassificationSingle[]} AudioClassificationOutput
1442
- *
1443
- * @typedef {Object} AudioClassificationPipelineOptions Parameters specific to audio classification pipelines.
1444
- * @property {number} [top_k=5] The number of top labels that will be returned by the pipeline.
1445
- * If the provided number is `null` or higher than the number of labels available in the model configuration,
1446
- * it will default to the number of labels.
1447
- *
1448
- * @callback AudioClassificationPipelineCallback Classify the sequence(s) given as inputs.
1449
- * @param {AudioPipelineInputs} audio The input audio file(s) to be classified. The input is either:
1450
- * - `string` or `URL` that is the filename/URL of the audio file, the file will be read at the processor's sampling rate
1451
- * to get the waveform using the [`AudioContext`](https://developer.mozilla.org/en-US/docs/Web/API/AudioContext) API.
1452
- * If `AudioContext` is not available, you should pass the raw waveform in as a Float32Array of shape `(n, )`.
1453
- * - `Float32Array` or `Float64Array` of shape `(n, )`, representing the raw audio at the correct sampling rate (no further check will be done).
1454
- * @param {AudioClassificationPipelineOptions} [options] The options to use for audio classification.
1455
- * @returns {Promise<AudioClassificationOutput|AudioClassificationOutput[]>} An array or object containing the predicted labels and scores.
1456
- *
1457
- * @typedef {AudioPipelineConstructorArgs & AudioClassificationPipelineCallback & Disposable} AudioClassificationPipelineType
1458
- */
1459
-
1460
- /**
1461
- * Audio classification pipeline using any `AutoModelForAudioClassification`.
1462
- * This pipeline predicts the class of a raw waveform or an audio file.
1463
- *
1464
- * **Example:** Perform audio classification with `Xenova/wav2vec2-large-xlsr-53-gender-recognition-librispeech`.
1465
- * ```javascript
1466
- * const classifier = await pipeline('audio-classification', 'Xenova/wav2vec2-large-xlsr-53-gender-recognition-librispeech');
1467
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
1468
- * const output = await classifier(url);
1469
- * // [
1470
- * // { label: 'male', score: 0.9981542229652405 },
1471
- * // { label: 'female', score: 0.001845747814513743 }
1472
- * // ]
1473
- * ```
1474
- *
1475
- * **Example:** Perform audio classification with `Xenova/ast-finetuned-audioset-10-10-0.4593` and return top 4 results.
1476
- * ```javascript
1477
- * const classifier = await pipeline('audio-classification', 'Xenova/ast-finetuned-audioset-10-10-0.4593');
1478
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cat_meow.wav';
1479
- * const output = await classifier(url, { top_k: 4 });
1480
- * // [
1481
- * // { label: 'Meow', score: 0.5617874264717102 },
1482
- * // { label: 'Cat', score: 0.22365376353263855 },
1483
- * // { label: 'Domestic animals, pets', score: 0.1141069084405899 },
1484
- * // { label: 'Animal', score: 0.08985692262649536 },
1485
- * // ]
1486
- * ```
1487
- */
1488
- export class AudioClassificationPipeline extends (/** @type {new (options: AudioPipelineConstructorArgs) => AudioClassificationPipelineType} */ (Pipeline)) {
1489
-
1490
- /**
1491
- * Create a new AudioClassificationPipeline.
1492
- * @param {AudioPipelineConstructorArgs} options An object used to instantiate the pipeline.
1493
- */
1494
- constructor(options) {
1495
- super(options);
1496
- }
1497
-
1498
- /** @type {AudioClassificationPipelineCallback} */
1499
- async _call(audio, {
1500
- top_k = 5
1501
- } = {}) {
1502
-
1503
- const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
1504
- const preparedAudios = await prepareAudios(audio, sampling_rate);
1505
-
1506
- // @ts-expect-error TS2339
1507
- const id2label = this.model.config.id2label;
1508
-
1509
- const toReturn = [];
1510
- for (const aud of preparedAudios) {
1511
- const inputs = await this.processor(aud);
1512
- const output = await this.model(inputs);
1513
- const logits = output.logits[0];
1514
-
1515
- const scores = await topk(new Tensor(
1516
- 'float32',
1517
- softmax(logits.data),
1518
- logits.dims,
1519
- ), top_k);
1520
-
1521
- const values = scores[0].tolist();
1522
- const indices = scores[1].tolist();
1523
-
1524
- const vals = indices.map((x, i) => ({
1525
- label: /** @type {string} */ (id2label ? id2label[x] : `LABEL_${x}`),
1526
- score: /** @type {number} */ (values[i]),
1527
- }));
1528
-
1529
- toReturn.push(vals);
1530
- };
1531
- return Array.isArray(audio) ? toReturn : toReturn[0];
1532
- }
1533
- }
1534
-
1535
- /**
1536
- * @typedef {Object} ZeroShotAudioClassificationOutput
1537
- * @property {string} label The label identified by the model. It is one of the suggested `candidate_label`.
1538
- * @property {number} score The score attributed by the model for that label (between 0 and 1).
1539
- *
1540
- * @typedef {Object} ZeroShotAudioClassificationPipelineOptions Parameters specific to zero-shot audio classification pipelines.
1541
- * @property {string} [hypothesis_template="This is a sound of {}."] The sentence used in conjunction with `candidate_labels`
1542
- * to attempt the audio classification by replacing the placeholder with the candidate_labels.
1543
- * Then likelihood is estimated by using `logits_per_audio`.
1544
- *
1545
- * @callback ZeroShotAudioClassificationPipelineCallback Classify the sequence(s) given as inputs.
1546
- * @param {AudioPipelineInputs} audio The input audio file(s) to be classified. The input is either:
1547
- * - `string` or `URL` that is the filename/URL of the audio file, the file will be read at the processor's sampling rate
1548
- * to get the waveform using the [`AudioContext`](https://developer.mozilla.org/en-US/docs/Web/API/AudioContext) API.
1549
- * If `AudioContext` is not available, you should pass the raw waveform in as a Float32Array of shape `(n, )`.
1550
- * - `Float32Array` or `Float64Array` of shape `(n, )`, representing the raw audio at the correct sampling rate (no further check will be done).
1551
- * @param {string[]} candidate_labels The candidate labels for this audio.
1552
- * @param {ZeroShotAudioClassificationPipelineOptions} [options] The options to use for zero-shot audio classification.
1553
- * @returns {Promise<ZeroShotAudioClassificationOutput[]|ZeroShotAudioClassificationOutput[][]>} An array of objects containing the predicted labels and scores.
1554
- *
1555
- * @typedef {TextAudioPipelineConstructorArgs & ZeroShotAudioClassificationPipelineCallback & Disposable} ZeroShotAudioClassificationPipelineType
1556
- */
1557
-
1558
- /**
1559
- * Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you
1560
- * provide an audio and a set of `candidate_labels`.
1561
- *
1562
- * **Example**: Perform zero-shot audio classification with `Xenova/clap-htsat-unfused`.
1563
- * ```javascript
1564
- * const classifier = await pipeline('zero-shot-audio-classification', 'Xenova/clap-htsat-unfused');
1565
- * const audio = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/dog_barking.wav';
1566
- * const candidate_labels = ['dog', 'vaccum cleaner'];
1567
- * const scores = await classifier(audio, candidate_labels);
1568
- * // [
1569
- * // { score: 0.9993992447853088, label: 'dog' },
1570
- * // { score: 0.0006007603369653225, label: 'vaccum cleaner' }
1571
- * // ]
1572
- * ```
1573
- */
1574
- export class ZeroShotAudioClassificationPipeline extends (/** @type {new (options: TextAudioPipelineConstructorArgs) => ZeroShotAudioClassificationPipelineType} */ (Pipeline)) {
1575
-
1576
- /**
1577
- * Create a new ZeroShotAudioClassificationPipeline.
1578
- * @param {TextAudioPipelineConstructorArgs} options An object used to instantiate the pipeline.
1579
- */
1580
- constructor(options) {
1581
- super(options);
1582
- }
1583
-
1584
- /** @type {ZeroShotAudioClassificationPipelineCallback} */
1585
- async _call(audio, candidate_labels, {
1586
- hypothesis_template = "This is a sound of {}."
1587
- } = {}) {
1588
-
1589
- const single = !Array.isArray(audio);
1590
- if (single) {
1591
- audio = [/** @type {AudioInput} */ (audio)];
1592
- }
1593
-
1594
- // Insert label into hypothesis template
1595
- const texts = candidate_labels.map(
1596
- x => hypothesis_template.replace('{}', x)
1597
- );
1598
-
1599
- // Run tokenization
1600
- const text_inputs = this.tokenizer(texts, {
1601
- padding: true,
1602
- truncation: true,
1603
- });
1604
-
1605
- const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
1606
- const preparedAudios = await prepareAudios(audio, sampling_rate);
1607
-
1608
- const toReturn = [];
1609
- for (const aud of preparedAudios) {
1610
- const audio_inputs = await this.processor(aud);
1611
-
1612
- // Run model with both text and audio inputs
1613
- const output = await this.model({ ...text_inputs, ...audio_inputs });
1614
-
1615
- // Compute softmax per audio
1616
- const probs = softmax(output.logits_per_audio.data);
1617
-
1618
- toReturn.push([...probs].map((x, i) => ({
1619
- score: x,
1620
- label: candidate_labels[i]
1621
- })));
1622
- }
1623
- return single ? toReturn[0] : toReturn;
1624
- }
1625
- }
1626
-
1627
- /**
1628
- * @typedef {Object} Chunk
1629
- * @property {[number, number]} timestamp The start and end timestamp of the chunk in seconds.
1630
- * @property {string} text The recognized text.
1631
- */
1632
-
1633
- /**
1634
- * @typedef {Object} AutomaticSpeechRecognitionOutput
1635
- * @property {string} text The recognized text.
1636
- * @property {Chunk[]} [chunks] When using `return_timestamps`, the `chunks` will become a list
1637
- * containing all the various text chunks identified by the model.
1638
- *
1639
- * @typedef {Object} AutomaticSpeechRecognitionSpecificParams Parameters specific to automatic-speech-recognition pipelines.
1640
- * @property {boolean|'word'} [return_timestamps] Whether to return timestamps or not. Default is `false`.
1641
- * @property {number} [chunk_length_s] The length of audio chunks to process in seconds. Default is 0 (no chunking).
1642
- * @property {number} [stride_length_s] The length of overlap between consecutive audio chunks in seconds. If not provided, defaults to `chunk_length_s / 6`.
1643
- * @property {boolean} [force_full_sequences] Whether to force outputting full sequences or not. Default is `false`.
1644
- * @property {string} [language] The source language. Default is `null`, meaning it should be auto-detected. Use this to potentially improve performance if the source language is known.
1645
- * @property {string} [task] The task to perform. Default is `null`, meaning it should be auto-detected.
1646
- * @property {number} [num_frames] The number of frames in the input audio.
1647
- * @typedef {import('./generation/configuration_utils.js').GenerationConfig & AutomaticSpeechRecognitionSpecificParams} AutomaticSpeechRecognitionConfig
1648
- *
1649
- * @callback AutomaticSpeechRecognitionPipelineCallback Transcribe the audio sequence(s) given as inputs to text.
1650
- * @param {AudioPipelineInputs} audio The input audio file(s) to be transcribed. The input is either:
1651
- * - `string` or `URL` that is the filename/URL of the audio file, the file will be read at the processor's sampling rate
1652
- * to get the waveform using the [`AudioContext`](https://developer.mozilla.org/en-US/docs/Web/API/AudioContext) API.
1653
- * If `AudioContext` is not available, you should pass the raw waveform in as a Float32Array of shape `(n, )`.
1654
- * - `Float32Array` or `Float64Array` of shape `(n, )`, representing the raw audio at the correct sampling rate (no further check will be done).
1655
- * @param {Partial<AutomaticSpeechRecognitionConfig>} [options] Additional keyword arguments to pass along to the generate method of the model.
1656
- * @returns {Promise<AutomaticSpeechRecognitionOutput|AutomaticSpeechRecognitionOutput[]>} An object containing the transcription text and optionally timestamps if `return_timestamps` is `true`.
1657
- *
1658
- * @typedef {TextAudioPipelineConstructorArgs & AutomaticSpeechRecognitionPipelineCallback & Disposable} AutomaticSpeechRecognitionPipelineType
1659
- */
1660
-
1661
- /**
1662
- * Pipeline that aims at extracting spoken text contained within some audio.
1663
- *
1664
- * **Example:** Transcribe English.
1665
- * ```javascript
1666
- * const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
1667
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
1668
- * const output = await transcriber(url);
1669
- * // { text: " And so my fellow Americans ask not what your country can do for you, ask what you can do for your country." }
1670
- * ```
1671
- *
1672
- * **Example:** Transcribe English w/ timestamps.
1673
- * ```javascript
1674
- * const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
1675
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
1676
- * const output = await transcriber(url, { return_timestamps: true });
1677
- * // {
1678
- * // text: " And so my fellow Americans ask not what your country can do for you, ask what you can do for your country."
1679
- * // chunks: [
1680
- * // { timestamp: [0, 8], text: " And so my fellow Americans ask not what your country can do for you" }
1681
- * // { timestamp: [8, 11], text: " ask what you can do for your country." }
1682
- * // ]
1683
- * // }
1684
- * ```
1685
- *
1686
- * **Example:** Transcribe English w/ word-level timestamps.
1687
- * ```javascript
1688
- * const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
1689
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
1690
- * const output = await transcriber(url, { return_timestamps: 'word' });
1691
- * // {
1692
- * // "text": " And so my fellow Americans ask not what your country can do for you ask what you can do for your country.",
1693
- * // "chunks": [
1694
- * // { "text": " And", "timestamp": [0, 0.78] },
1695
- * // { "text": " so", "timestamp": [0.78, 1.06] },
1696
- * // { "text": " my", "timestamp": [1.06, 1.46] },
1697
- * // ...
1698
- * // { "text": " for", "timestamp": [9.72, 9.92] },
1699
- * // { "text": " your", "timestamp": [9.92, 10.22] },
1700
- * // { "text": " country.", "timestamp": [10.22, 13.5] }
1701
- * // ]
1702
- * // }
1703
- * ```
1704
- *
1705
- * **Example:** Transcribe French.
1706
- * ```javascript
1707
- * const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-small');
1708
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/french-audio.mp3';
1709
- * const output = await transcriber(url, { language: 'french', task: 'transcribe' });
1710
- * // { text: " J'adore, j'aime, je n'aime pas, je déteste." }
1711
- * ```
1712
- *
1713
- * **Example:** Translate French to English.
1714
- * ```javascript
1715
- * const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-small');
1716
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/french-audio.mp3';
1717
- * const output = await transcriber(url, { language: 'french', task: 'translate' });
1718
- * // { text: " I love, I like, I don't like, I hate." }
1719
- * ```
1720
- *
1721
- * **Example:** Transcribe/translate audio longer than 30 seconds.
1722
- * ```javascript
1723
- * const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
1724
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/ted_60.wav';
1725
- * const output = await transcriber(url, { chunk_length_s: 30, stride_length_s: 5 });
1726
- * // { text: " So in college, I was a government major, which means [...] So I'd start off light and I'd bump it up" }
1727
- * ```
1728
- */
1729
- export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextAudioPipelineConstructorArgs) => AutomaticSpeechRecognitionPipelineType} */ (Pipeline)) {
1730
-
1731
- /**
1732
- * Create a new AutomaticSpeechRecognitionPipeline.
1733
- * @param {TextAudioPipelineConstructorArgs} options An object used to instantiate the pipeline.
1734
- */
1735
- constructor(options) {
1736
- super(options);
1737
- }
1738
-
1739
- /** @type {AutomaticSpeechRecognitionPipelineCallback} */
1740
- async _call(audio, kwargs = {}) {
1741
- switch (this.model.config.model_type) {
1742
- case 'whisper':
1743
- case 'lite-whisper':
1744
- return this._call_whisper(audio, kwargs)
1745
- case 'wav2vec2':
1746
- case 'wav2vec2-bert':
1747
- case 'unispeech':
1748
- case 'unispeech-sat':
1749
- case 'hubert':
1750
- return this._call_wav2vec2(audio, kwargs)
1751
- case 'moonshine':
1752
- return this._call_moonshine(audio, kwargs)
1753
- default:
1754
- throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)
1755
- }
1756
- }
1757
-
1758
- /**
1759
- * @type {AutomaticSpeechRecognitionPipelineCallback}
1760
- * @private
1761
- */
1762
- async _call_wav2vec2(audio, kwargs) {
1763
- // TODO use kwargs
1764
-
1765
- if (kwargs.language) {
1766
- console.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".');
1767
- }
1768
- if (kwargs.task) {
1769
- console.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".');
1770
- }
1771
-
1772
- const single = !Array.isArray(audio);
1773
- if (single) {
1774
- audio = [/** @type {AudioInput} */ (audio)];
1775
- }
1776
-
1777
- const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
1778
- const preparedAudios = await prepareAudios(audio, sampling_rate);
1779
-
1780
- const toReturn = [];
1781
- for (const aud of preparedAudios) {
1782
- const inputs = await this.processor(aud);
1783
- const output = await this.model(inputs);
1784
- const logits = output.logits[0];
1785
-
1786
- const predicted_ids = [];
1787
- for (const item of logits) {
1788
- predicted_ids.push(max(item.data)[1])
1789
- }
1790
- const predicted_sentences = this.tokenizer.decode(predicted_ids)
1791
- toReturn.push({ text: predicted_sentences })
1792
- }
1793
- return single ? toReturn[0] : toReturn;
1794
- }
1795
-
1796
- /**
1797
- * @type {AutomaticSpeechRecognitionPipelineCallback}
1798
- * @private
1799
- */
1800
- async _call_whisper(audio, kwargs) {
1801
- const return_timestamps = kwargs.return_timestamps ?? false;
1802
- const chunk_length_s = kwargs.chunk_length_s ?? 0;
1803
- const force_full_sequences = kwargs.force_full_sequences ?? false;
1804
- let stride_length_s = kwargs.stride_length_s ?? null;
1805
-
1806
- const generation_config = { ...kwargs }
1807
-
1808
- if (return_timestamps === 'word') {
1809
- generation_config['return_token_timestamps'] = true;
1810
- generation_config['return_timestamps'] = false; // Do not predict timestamp tokens
1811
- }
1812
-
1813
- const single = !Array.isArray(audio);
1814
- if (single) {
1815
- audio = [/** @type {AudioInput} */ (audio)];
1816
- }
1817
-
1818
- // @ts-expect-error TS2339
1819
- const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
1820
- const hop_length = this.processor.feature_extractor.config.hop_length;
1821
-
1822
- const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
1823
- const preparedAudios = await prepareAudios(audio, sampling_rate);
1824
-
1825
- const toReturn = [];
1826
- for (const aud of preparedAudios) {
1827
- /** @type {{stride: number[], input_features: Tensor, is_last: boolean, tokens?: bigint[], token_timestamps?: number[]}[]} */
1828
- let chunks = [];
1829
- if (chunk_length_s > 0) {
1830
- if (stride_length_s === null) {
1831
- stride_length_s = chunk_length_s / 6;
1832
- } else if (chunk_length_s <= stride_length_s) {
1833
- throw Error("`chunk_length_s` must be larger than `stride_length_s`.")
1834
- }
1835
-
1836
- // TODO support different stride_length_s (for left and right)
1837
-
1838
- const window = sampling_rate * chunk_length_s;
1839
- const stride = sampling_rate * stride_length_s;
1840
- const jump = window - 2 * stride;
1841
- let offset = 0;
1842
-
1843
- // Create subarrays of audio with overlaps
1844
- while (true) {
1845
- const offset_end = offset + window;
1846
- const subarr = aud.subarray(offset, offset_end);
1847
- const feature = await this.processor(subarr);
1848
-
1849
- const is_first = offset === 0;
1850
- const is_last = offset_end >= aud.length;
1851
- chunks.push({
1852
- stride: [
1853
- subarr.length,
1854
- is_first ? 0 : stride,
1855
- is_last ? 0 : stride
1856
- ],
1857
- input_features: feature.input_features,
1858
- is_last,
1859
- })
1860
- if (is_last) break;
1861
- offset += jump;
1862
- }
1863
-
1864
- } else {
1865
- chunks = [{
1866
- stride: [aud.length, 0, 0],
1867
- input_features: (await this.processor(aud)).input_features,
1868
- is_last: true
1869
- }]
1870
- }
1871
-
1872
- // Generate for each set of input features
1873
- for (const chunk of chunks) {
1874
- generation_config.num_frames = Math.floor(chunk.stride[0] / hop_length);
1875
-
1876
- // NOTE: doing sequentially for now
1877
- const data = await this.model.generate({
1878
- inputs: chunk.input_features,
1879
- ...generation_config
1880
- });
1881
-
1882
- // TODO: Right now we only get top beam
1883
- if (return_timestamps === 'word') {
1884
- // @ts-expect-error TS2339
1885
- chunk.tokens = data.sequences.tolist()[0];
1886
- // @ts-expect-error TS2339
1887
- chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
1888
- (/** @type {number} */ x) => round(x, 2)
1889
- );
1890
-
1891
- } else {
1892
- chunk.tokens = (/** @type {Tensor} */(data))[0].tolist();
1893
- }
1894
-
1895
- // convert stride to seconds
1896
- chunk.stride = chunk.stride.map(x => x / sampling_rate);
1897
- }
1898
-
1899
- // Merge text chunks
1900
- // @ts-ignore
1901
- const [full_text, optional] = this.tokenizer._decode_asr(chunks, {
1902
- time_precision, return_timestamps, force_full_sequences
1903
- });
1904
-
1905
- toReturn.push({ text: full_text, ...optional })
1906
- }
1907
- return single ? toReturn[0] : toReturn;
1908
- }
1909
-
1910
- /**
1911
- * @type {AutomaticSpeechRecognitionPipelineCallback}
1912
- * @private
1913
- */
1914
- async _call_moonshine(audio, kwargs) {
1915
- const single = !Array.isArray(audio);
1916
- if (single) {
1917
- audio = [/** @type {AudioInput} */ (audio)];
1918
- }
1919
- const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
1920
- const preparedAudios = await prepareAudios(audio, sampling_rate);
1921
- const toReturn = [];
1922
- for (const aud of preparedAudios) {
1923
- const inputs = await this.processor(aud);
1924
-
1925
- // According to the [paper](https://huggingface.co/papers/2410.15608):
1926
- // "We use greedy decoding, with a heuristic limit of 6 output tokens
1927
- // per second of audio to avoid repeated output sequences."
1928
- const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
1929
- const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
1930
-
1931
- const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
1932
- toReturn.push({ text });
1933
- }
1934
- return single ? toReturn[0] : toReturn;
1935
- }
1936
-
1937
- }
1938
-
1939
- /**
1940
- * @typedef {Object} ImageToTextSingle
1941
- * @property {string} generated_text The generated text.
1942
- * @typedef {ImageToTextSingle[]} ImageToTextOutput
1943
- *
1944
- * @callback ImageToTextPipelineCallback Assign labels to the image(s) passed as inputs.
1945
- * @param {ImagePipelineInputs} texts The images to be captioned.
1946
- * @param {Partial<import('./generation/configuration_utils.js').GenerationConfig>} [options] Additional keyword arguments to pass along to the generate method of the model.
1947
- * @returns {Promise<ImageToTextOutput|ImageToTextOutput[]>} An object (or array of objects) containing the generated text(s).
1948
- *
1949
- * @typedef {TextImagePipelineConstructorArgs & ImageToTextPipelineCallback & Disposable} ImageToTextPipelineType
1950
- */
1951
-
1952
- /**
1953
- * Image To Text pipeline using a `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.
1954
- *
1955
- * **Example:** Generate a caption for an image w/ `Xenova/vit-gpt2-image-captioning`.
1956
- * ```javascript
1957
- * const captioner = await pipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning');
1958
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cats.jpg';
1959
- * const output = await captioner(url);
1960
- * // [{ generated_text: 'a cat laying on a couch with another cat' }]
1961
- * ```
1962
- *
1963
- * **Example:** Optical Character Recognition (OCR) w/ `Xenova/trocr-small-handwritten`.
1964
- * ```javascript
1965
- * const captioner = await pipeline('image-to-text', 'Xenova/trocr-small-handwritten');
1966
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/handwriting.jpg';
1967
- * const output = await captioner(url);
1968
- * // [{ generated_text: 'Mr. Brown commented icily.' }]
1969
- * ```
1970
- */
1971
- export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ImageToTextPipelineType} */ (Pipeline)) {
1972
-
1973
- /**
1974
- * Create a new ImageToTextPipeline.
1975
- * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
1976
- */
1977
- constructor(options) {
1978
- super(options);
1979
- }
1980
-
1981
- /** @type {ImageToTextPipelineCallback} */
1982
- async _call(images, generate_kwargs = {}) {
1983
-
1984
- const isBatched = Array.isArray(images);
1985
- const preparedImages = await prepareImages(images);
1986
-
1987
- const { pixel_values } = await this.processor(preparedImages);
1988
-
1989
- const toReturn = [];
1990
- for (const batch of pixel_values) {
1991
- batch.dims = [1, ...batch.dims]
1992
- const output = await this.model.generate({ inputs: batch, ...generate_kwargs });
1993
- const decoded = this.tokenizer.batch_decode(/** @type {Tensor} */(output), {
1994
- skip_special_tokens: true,
1995
- }).map(x => ({ generated_text: x.trim() }))
1996
- toReturn.push(decoded);
1997
- }
1998
-
1999
- return isBatched ? toReturn : toReturn[0];
2000
- }
2001
- }
2002
-
2003
- /**
2004
- * @typedef {Object} ImageClassificationSingle
2005
- * @property {string} label The label identified by the model.
2006
- * @property {number} score The score attributed by the model for that label.
2007
- * @typedef {ImageClassificationSingle[]} ImageClassificationOutput
2008
- *
2009
- * @typedef {Object} ImageClassificationPipelineOptions Parameters specific to image classification pipelines.
2010
- * @property {number} [top_k=1] The number of top labels that will be returned by the pipeline.
2011
- *
2012
- * @callback ImageClassificationPipelineCallback Assign labels to the image(s) passed as inputs.
2013
- * @param {ImagePipelineInputs} images The input images(s) to be classified.
2014
- * @param {ImageClassificationPipelineOptions} [options] The options to use for image classification.
2015
- * @returns {Promise<ImageClassificationOutput|ImageClassificationOutput[]>} An array or object containing the predicted labels and scores.
2016
- *
2017
- * @typedef {ImagePipelineConstructorArgs & ImageClassificationPipelineCallback & Disposable} ImageClassificationPipelineType
2018
- */
2019
-
2020
- /**
2021
- * Image classification pipeline using any `AutoModelForImageClassification`.
2022
- * This pipeline predicts the class of an image.
2023
- *
2024
- * **Example:** Classify an image.
2025
- * ```javascript
2026
- * const classifier = await pipeline('image-classification', 'Xenova/vit-base-patch16-224');
2027
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/tiger.jpg';
2028
- * const output = await classifier(url);
2029
- * // [
2030
- * // { label: 'tiger, Panthera tigris', score: 0.632695734500885 },
2031
- * // ]
2032
- * ```
2033
- *
2034
- * **Example:** Classify an image and return top `n` classes.
2035
- * ```javascript
2036
- * const classifier = await pipeline('image-classification', 'Xenova/vit-base-patch16-224');
2037
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/tiger.jpg';
2038
- * const output = await classifier(url, { top_k: 3 });
2039
- * // [
2040
- * // { label: 'tiger, Panthera tigris', score: 0.632695734500885 },
2041
- * // { label: 'tiger cat', score: 0.3634825646877289 },
2042
- * // { label: 'lion, king of beasts, Panthera leo', score: 0.00045060308184474707 },
2043
- * // ]
2044
- * ```
2045
- *
2046
- * **Example:** Classify an image and return all classes.
2047
- * ```javascript
2048
- * const classifier = await pipeline('image-classification', 'Xenova/vit-base-patch16-224');
2049
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/tiger.jpg';
2050
- * const output = await classifier(url, { top_k: 0 });
2051
- * // [
2052
- * // { label: 'tiger, Panthera tigris', score: 0.632695734500885 },
2053
- * // { label: 'tiger cat', score: 0.3634825646877289 },
2054
- * // { label: 'lion, king of beasts, Panthera leo', score: 0.00045060308184474707 },
2055
- * // { label: 'jaguar, panther, Panthera onca, Felis onca', score: 0.00035465499968267977 },
2056
- * // ...
2057
- * // ]
2058
- * ```
2059
- */
2060
- export class ImageClassificationPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageClassificationPipelineType} */ (Pipeline)) {
2061
-
2062
- /**
2063
- * Create a new ImageClassificationPipeline.
2064
- * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
2065
- */
2066
- constructor(options) {
2067
- super(options);
2068
- }
2069
-
2070
- /** @type {ImageClassificationPipelineCallback} */
2071
- async _call(images, {
2072
- top_k = 5
2073
- } = {}) {
2074
-
2075
- const preparedImages = await prepareImages(images);
2076
-
2077
- const { pixel_values } = await this.processor(preparedImages);
2078
- const output = await this.model({ pixel_values });
2079
-
2080
- // @ts-expect-error TS2339
2081
- const id2label = this.model.config.id2label;
2082
-
2083
- /** @type {ImageClassificationOutput[]} */
2084
- const toReturn = [];
2085
- for (const batch of output.logits) {
2086
- const scores = await topk(new Tensor(
2087
- 'float32',
2088
- softmax(batch.data),
2089
- batch.dims,
2090
- ), top_k);
2091
-
2092
- const values = scores[0].tolist();
2093
- const indices = scores[1].tolist();
2094
-
2095
- const vals = indices.map((x, i) => ({
2096
- label: /** @type {string} */ (id2label ? id2label[x] : `LABEL_${x}`),
2097
- score: /** @type {number} */ (values[i]),
2098
- }));
2099
- toReturn.push(vals);
2100
- }
2101
-
2102
- return Array.isArray(images) ? toReturn : toReturn[0];
2103
- }
2104
-
2105
- }
2106
-
2107
- /**
2108
- * @typedef {Object} ImageSegmentationPipelineOutput
2109
- * @property {string|null} label The label of the segment.
2110
- * @property {number|null} score The score of the segment.
2111
- * @property {RawImage} mask The mask of the segment.
2112
- *
2113
- * @typedef {Object} ImageSegmentationPipelineOptions Parameters specific to image segmentation pipelines.
2114
- * @property {number} [threshold=0.5] Probability threshold to filter out predicted masks.
2115
- * @property {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values.
2116
- * @property {number} [overlap_mask_area_threshold=0.8] Mask overlap threshold to eliminate small, disconnected segments.
2117
- * @property {null|string} [subtask=null] Segmentation task to be performed. One of [`panoptic`, `instance`, and `semantic`],
2118
- * depending on model capabilities. If not set, the pipeline will attempt to resolve (in that order).
2119
- * @property {number[]} [label_ids_to_fuse=null] List of label ids to fuse. If not set, do not fuse any labels.
2120
- * @property {number[][]} [target_sizes=null] List of target sizes for the input images. If not set, use the original image sizes.
2121
- *
2122
- * @callback ImageSegmentationPipelineCallback Segment the input images.
2123
- * @param {ImagePipelineInputs} images The input images.
2124
- * @param {ImageSegmentationPipelineOptions} [options] The options to use for image segmentation.
2125
- * @returns {Promise<ImageSegmentationPipelineOutput[]>} The annotated segments.
2126
- *
2127
- * @typedef {ImagePipelineConstructorArgs & ImageSegmentationPipelineCallback & Disposable} ImageSegmentationPipelineType
2128
- */
2129
-
2130
- /**
2131
- * Image segmentation pipeline using any `AutoModelForXXXSegmentation`.
2132
- * This pipeline predicts masks of objects and their classes.
2133
- *
2134
- * **Example:** Perform image segmentation with `Xenova/detr-resnet-50-panoptic`.
2135
- * ```javascript
2136
- * const segmenter = await pipeline('image-segmentation', 'Xenova/detr-resnet-50-panoptic');
2137
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cats.jpg';
2138
- * const output = await segmenter(url);
2139
- * // [
2140
- * // { label: 'remote', score: 0.9984649419784546, mask: RawImage { ... } },
2141
- * // { label: 'cat', score: 0.9994316101074219, mask: RawImage { ... } }
2142
- * // ]
2143
- * ```
2144
- */
2145
- export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageSegmentationPipelineType} */ (Pipeline)) {
2146
- /**
2147
- * Create a new ImageSegmentationPipeline.
2148
- * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
2149
- */
2150
- constructor(options) {
2151
- super(options);
2152
-
2153
- this.subtasks_mapping = {
2154
- // Mapping of subtasks to their corresponding post-processing function names.
2155
- panoptic: 'post_process_panoptic_segmentation',
2156
- instance: 'post_process_instance_segmentation',
2157
- semantic: 'post_process_semantic_segmentation'
2158
- }
2159
- }
2160
-
2161
- /** @type {ImageSegmentationPipelineCallback} */
2162
- async _call(images, {
2163
- threshold = 0.5,
2164
- mask_threshold = 0.5,
2165
- overlap_mask_area_threshold = 0.8,
2166
- label_ids_to_fuse = null,
2167
- target_sizes = null,
2168
- subtask = null,
2169
- } = {}) {
2170
- const isBatched = Array.isArray(images);
2171
-
2172
- if (isBatched && images.length !== 1) {
2173
- throw Error("Image segmentation pipeline currently only supports a batch size of 1.");
2174
- }
2175
-
2176
- const preparedImages = await prepareImages(images);
2177
- const imageSizes = preparedImages.map(x => [x.height, x.width]);
2178
-
2179
- const inputs = await this.processor(preparedImages);
2180
-
2181
- const { inputNames, outputNames } = this.model.sessions['model'];
2182
- if (!inputNames.includes('pixel_values')) {
2183
- if (inputNames.length !== 1) {
2184
- throw Error(`Expected a single input name, but got ${inputNames.length} inputs: ${inputNames}.`);
2185
- }
2186
-
2187
- const newName = inputNames[0];
2188
- if (newName in inputs) {
2189
- throw Error(`Input name ${newName} already exists in the inputs.`);
2190
- }
2191
- // To ensure compatibility with certain background-removal models,
2192
- // we may need to perform a mapping of input to output names
2193
- inputs[newName] = inputs.pixel_values;
2194
- }
2195
-
2196
- const output = await this.model(inputs);
2197
-
2198
- let fn = null;
2199
- if (subtask !== null) {
2200
- fn = this.subtasks_mapping[subtask];
2201
- } else if (this.processor.image_processor) {
2202
- for (const [task, func] of Object.entries(this.subtasks_mapping)) {
2203
- if (func in this.processor.image_processor) {
2204
- fn = this.processor.image_processor[func].bind(this.processor.image_processor);
2205
- subtask = task;
2206
- break;
2207
- }
2208
- }
2209
- }
2210
-
2211
- // @ts-expect-error TS2339
2212
- const id2label = this.model.config.id2label;
2213
-
2214
- /** @type {ImageSegmentationPipelineOutput[]} */
2215
- const annotation = [];
2216
- if (!subtask) {
2217
- // We define an epsilon to safeguard against numerical/precision issues when detecting
2218
- // the normalization mode of the output (i.e., sigmoid already applied, or not).
2219
- // See https://github.com/microsoft/onnxruntime/issues/23943 for more information.
2220
- const epsilon = 1e-5;
2221
-
2222
- // Perform standard image segmentation
2223
- const result = output[outputNames[0]];
2224
- for (let i = 0; i < imageSizes.length; ++i) {
2225
- const size = imageSizes[i];
2226
- const item = result[i];
2227
- if (item.data.some(x => x < -epsilon || x > 1 + epsilon)) {
2228
- item.sigmoid_();
2229
- }
2230
- const mask = await RawImage.fromTensor(item.mul_(255).to('uint8')).resize(size[1], size[0]);
2231
- annotation.push({
2232
- label: null,
2233
- score: null,
2234
- mask
2235
- });
2236
- }
2237
- } else if (subtask === 'panoptic' || subtask === 'instance') {
2238
- const processed = fn(
2239
- output,
2240
- threshold,
2241
- mask_threshold,
2242
- overlap_mask_area_threshold,
2243
- label_ids_to_fuse,
2244
- target_sizes ?? imageSizes, // TODO FIX?
2245
- )[0];
2246
-
2247
- const segmentation = processed.segmentation;
2248
-
2249
- for (const segment of processed.segments_info) {
2250
- const maskData = new Uint8ClampedArray(segmentation.data.length);
2251
- for (let i = 0; i < segmentation.data.length; ++i) {
2252
- if (segmentation.data[i] === segment.id) {
2253
- maskData[i] = 255;
2254
- }
2255
- }
2256
-
2257
- const mask = new RawImage(maskData, segmentation.dims[1], segmentation.dims[0], 1)
2258
-
2259
- annotation.push({
2260
- score: segment.score,
2261
- label: id2label[segment.label_id],
2262
- mask: mask
2263
- })
2264
- }
2265
-
2266
- } else if (subtask === 'semantic') {
2267
- const { segmentation, labels } = fn(output, target_sizes ?? imageSizes)[0];
2268
-
2269
- for (const label of labels) {
2270
- const maskData = new Uint8ClampedArray(segmentation.data.length);
2271
- for (let i = 0; i < segmentation.data.length; ++i) {
2272
- if (segmentation.data[i] === label) {
2273
- maskData[i] = 255;
2274
- }
2275
- }
2276
-
2277
- const mask = new RawImage(maskData, segmentation.dims[1], segmentation.dims[0], 1);
2278
-
2279
- annotation.push({
2280
- score: null,
2281
- label: id2label[label],
2282
- mask: mask
2283
- });
2284
- }
2285
- } else {
2286
- throw Error(`Subtask ${subtask} not supported.`);
2287
- }
2288
-
2289
- return annotation;
2290
- }
2291
- }
2292
-
2293
-
2294
- /**
2295
- * @typedef {Object} BackgroundRemovalPipelineOptions Parameters specific to image segmentation pipelines.
2296
- *
2297
- * @callback BackgroundRemovalPipelineCallback Segment the input images.
2298
- * @param {ImagePipelineInputs} images The input images.
2299
- * @param {BackgroundRemovalPipelineOptions} [options] The options to use for image segmentation.
2300
- * @returns {Promise<RawImage[]>} The images with the background removed.
2301
- *
2302
- * @typedef {ImagePipelineConstructorArgs & BackgroundRemovalPipelineCallback & Disposable} BackgroundRemovalPipelineType
2303
- */
2304
-
2305
- /**
2306
- * Background removal pipeline using certain `AutoModelForXXXSegmentation`.
2307
- * This pipeline removes the backgrounds of images.
2308
- *
2309
- * **Example:** Perform background removal with `Xenova/modnet`.
2310
- * ```javascript
2311
- * const segmenter = await pipeline('background-removal', 'Xenova/modnet');
2312
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/portrait-of-woman_small.jpg';
2313
- * const output = await segmenter(url);
2314
- * // [
2315
- * // RawImage { data: Uint8ClampedArray(648000) [ ... ], width: 360, height: 450, channels: 4 }
2316
- * // ]
2317
- * ```
2318
- */
2319
- export class BackgroundRemovalPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => BackgroundRemovalPipelineType} */ (/** @type {any} */(ImageSegmentationPipeline))) {
2320
- /**
2321
- * Create a new BackgroundRemovalPipeline.
2322
- * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
2323
- */
2324
- constructor(options) {
2325
- super(options);
2326
- }
2327
-
2328
- /** @type {BackgroundRemovalPipelineCallback} */
2329
- async _call(images, options = {}) {
2330
- const isBatched = Array.isArray(images);
2331
-
2332
- if (isBatched && images.length !== 1) {
2333
- throw Error("Background removal pipeline currently only supports a batch size of 1.");
2334
- }
2335
-
2336
- const preparedImages = await prepareImages(images);
2337
-
2338
- // @ts-expect-error TS2339
2339
- const masks = await super._call(images, options);
2340
- const result = preparedImages.map((img, i) => {
2341
- const cloned = img.clone();
2342
- cloned.putAlpha(masks[i].mask);
2343
- return cloned;
2344
- });
2345
-
2346
- return result;
2347
- }
2348
- }
2349
-
2350
- /**
2351
- * @typedef {Object} ZeroShotImageClassificationOutput
2352
- * @property {string} label The label identified by the model. It is one of the suggested `candidate_label`.
2353
- * @property {number} score The score attributed by the model for that label (between 0 and 1).
2354
- *
2355
- * @typedef {Object} ZeroShotImageClassificationPipelineOptions Parameters specific to zero-shot image classification pipelines.
2356
- * @property {string} [hypothesis_template="This is a photo of {}"] The sentence used in conjunction with `candidate_labels`
2357
- * to attempt the image classification by replacing the placeholder with the candidate_labels.
2358
- * Then likelihood is estimated by using `logits_per_image`.
2359
- *
2360
- * @callback ZeroShotImageClassificationPipelineCallback Assign labels to the image(s) passed as inputs.
2361
- * @param {ImagePipelineInputs} images The input images.
2362
- * @param {string[]} candidate_labels The candidate labels for this image.
2363
- * @param {ZeroShotImageClassificationPipelineOptions} [options] The options to use for zero-shot image classification.
2364
- * @returns {Promise<ZeroShotImageClassificationOutput[]|ZeroShotImageClassificationOutput[][]>} An array of objects containing the predicted labels and scores.
2365
- *
2366
- * @typedef {TextImagePipelineConstructorArgs & ZeroShotImageClassificationPipelineCallback & Disposable} ZeroShotImageClassificationPipelineType
2367
- */
2368
-
2369
- /**
2370
- * Zero shot image classification pipeline. This pipeline predicts the class of
2371
- * an image when you provide an image and a set of `candidate_labels`.
2372
- *
2373
- * **Example:** Zero shot image classification w/ `Xenova/clip-vit-base-patch32`.
2374
- * ```javascript
2375
- * const classifier = await pipeline('zero-shot-image-classification', 'Xenova/clip-vit-base-patch32');
2376
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/tiger.jpg';
2377
- * const output = await classifier(url, ['tiger', 'horse', 'dog']);
2378
- * // [
2379
- * // { score: 0.9993917942047119, label: 'tiger' },
2380
- * // { score: 0.0003519294841680676, label: 'horse' },
2381
- * // { score: 0.0002562698791734874, label: 'dog' }
2382
- * // ]
2383
- * ```
2384
- */
2385
- export class ZeroShotImageClassificationPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ZeroShotImageClassificationPipelineType} */ (Pipeline)) {
2386
- /**
2387
- * Create a new ZeroShotImageClassificationPipeline.
2388
- * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
2389
- */
2390
- constructor(options) {
2391
- super(options);
2392
- }
2393
-
2394
- /** @type {ZeroShotImageClassificationPipelineCallback} */
2395
- async _call(images, candidate_labels, {
2396
- hypothesis_template = "This is a photo of {}"
2397
- } = {}) {
2398
-
2399
- const isBatched = Array.isArray(images);
2400
- const preparedImages = await prepareImages(images);
2401
-
2402
- // Insert label into hypothesis template
2403
- const texts = candidate_labels.map(
2404
- x => hypothesis_template.replace('{}', x)
2405
- );
2406
-
2407
- // Run tokenization
2408
- const text_inputs = this.tokenizer(texts, {
2409
- padding: this.model.config.model_type === 'siglip' ? 'max_length' : true,
2410
- truncation: true,
2411
- });
2412
-
2413
- // Run processor
2414
- const { pixel_values } = await this.processor(preparedImages);
2415
-
2416
- // Run model with both text and pixel inputs
2417
- const output = await this.model({ ...text_inputs, pixel_values });
2418
-
2419
- const function_to_apply =
2420
- this.model.config.model_type === 'siglip'
2421
- ? batch => batch.sigmoid().data
2422
- : batch => softmax(batch.data);
2423
-
2424
- // Compare each image with each candidate label
2425
- const toReturn = [];
2426
- for (const batch of output.logits_per_image) {
2427
- // Compute softmax per image
2428
- const probs = function_to_apply(batch);
2429
-
2430
- const result = [...probs].map((x, i) => ({
2431
- score: x,
2432
- label: candidate_labels[i]
2433
- }));
2434
- result.sort((a, b) => b.score - a.score); // sort by score in descending order
2435
- toReturn.push(result);
2436
- }
2437
-
2438
- return isBatched ? toReturn : toReturn[0];
2439
- }
2440
- }
2441
-
2442
-
2443
- /**
2444
- * @typedef {Object} ObjectDetectionPipelineSingle
2445
- * @property {string} label The class label identified by the model.
2446
- * @property {number} score The score attributed by the model for that label.
2447
- * @property {BoundingBox} box The bounding box of detected object in image's original size, or as a percentage if `percentage` is set to true.
2448
- * @typedef {ObjectDetectionPipelineSingle[]} ObjectDetectionPipelineOutput
2449
- *
2450
- * @typedef {Object} ObjectDetectionPipelineOptions Parameters specific to object detection pipelines.
2451
- * @property {number} [threshold=0.9] The threshold used to filter boxes by score.
2452
- * @property {boolean} [percentage=false] Whether to return the boxes coordinates in percentage (true) or in pixels (false).
2453
- *
2454
- * @callback ObjectDetectionPipelineCallback Detect objects (bounding boxes & classes) in the image(s) passed as inputs.
2455
- * @param {ImagePipelineInputs} images The input images.
2456
- * @param {ObjectDetectionPipelineOptions} [options] The options to use for object detection.
2457
- * @returns {Promise<ObjectDetectionPipelineOutput|ObjectDetectionPipelineOutput[]>} A list of objects or a list of list of objects.
2458
- *
2459
- * @typedef {ImagePipelineConstructorArgs & ObjectDetectionPipelineCallback & Disposable} ObjectDetectionPipelineType
2460
- */
2461
-
2462
- /**
2463
- * Object detection pipeline using any `AutoModelForObjectDetection`.
2464
- * This pipeline predicts bounding boxes of objects and their classes.
2465
- *
2466
- * **Example:** Run object-detection with `Xenova/detr-resnet-50`.
2467
- * ```javascript
2468
- * const detector = await pipeline('object-detection', 'Xenova/detr-resnet-50');
2469
- * const img = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cats.jpg';
2470
- * const output = await detector(img, { threshold: 0.9 });
2471
- * // [{
2472
- * // score: 0.9976370930671692,
2473
- * // label: "remote",
2474
- * // box: { xmin: 31, ymin: 68, xmax: 190, ymax: 118 }
2475
- * // },
2476
- * // ...
2477
- * // {
2478
- * // score: 0.9984092116355896,
2479
- * // label: "cat",
2480
- * // box: { xmin: 331, ymin: 19, xmax: 649, ymax: 371 }
2481
- * // }]
2482
- * ```
2483
- */
2484
- export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ObjectDetectionPipelineType} */ (Pipeline)) {
2485
-
2486
- /**
2487
- * Create a new ObjectDetectionPipeline.
2488
- * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
2489
- */
2490
- constructor(options) {
2491
- super(options);
2492
- }
2493
-
2494
- /** @type {ObjectDetectionPipelineCallback} */
2495
- async _call(images, {
2496
- threshold = 0.9,
2497
- percentage = false,
2498
- } = {}) {
2499
-
2500
- const isBatched = Array.isArray(images);
2501
-
2502
- if (isBatched && images.length !== 1) {
2503
- throw Error("Object detection pipeline currently only supports a batch size of 1.");
2504
- }
2505
- const preparedImages = await prepareImages(images);
2506
-
2507
- const imageSizes = percentage ? null : preparedImages.map(x => [x.height, x.width]);
2508
-
2509
- const { pixel_values, pixel_mask } = await this.processor(preparedImages);
2510
- const output = await this.model({ pixel_values, pixel_mask });
2511
-
2512
- // @ts-ignore
2513
- const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);
2514
-
2515
- // Add labels
2516
- // @ts-expect-error TS2339
2517
- const id2label = this.model.config.id2label;
2518
-
2519
- // Format output
2520
- /** @type {ObjectDetectionPipelineOutput[]} */
2521
- const result = processed.map(batch => (
2522
- batch.boxes.map((box, i) => ({
2523
- score: batch.scores[i],
2524
- label: id2label[batch.classes[i]],
2525
- box: get_bounding_box(box, !percentage),
2526
- }))
2527
- ))
2528
-
2529
- return isBatched ? result : result[0];
2530
- }
2531
- }
2532
-
2533
-
2534
- /**
2535
- * @typedef {Object} ZeroShotObjectDetectionOutput
2536
- * @property {string} label Text query corresponding to the found object.
2537
- * @property {number} score Score corresponding to the object (between 0 and 1).
2538
- * @property {BoundingBox} box Bounding box of the detected object in image's original size, or as a percentage if `percentage` is set to true.
2539
- *
2540
- * @typedef {Object} ZeroShotObjectDetectionPipelineOptions Parameters specific to zero-shot object detection pipelines.
2541
- * @property {number} [threshold=0.1] The probability necessary to make a prediction.
2542
- * @property {number} [top_k=null] The number of top predictions that will be returned by the pipeline.
2543
- * If the provided number is `null` or higher than the number of predictions available, it will default
2544
- * to the number of predictions.
2545
- * @property {boolean} [percentage=false] Whether to return the boxes coordinates in percentage (true) or in pixels (false).
2546
- *
2547
- * @callback ZeroShotObjectDetectionPipelineCallback Detect objects (bounding boxes & classes) in the image(s) passed as inputs.
2548
- * @param {ImagePipelineInputs} images The input images.
2549
- * @param {string[]} candidate_labels What the model should recognize in the image.
2550
- * @param {ZeroShotObjectDetectionPipelineOptions} [options] The options to use for zero-shot object detection.
2551
- * @returns {Promise<ZeroShotObjectDetectionOutput[]|ZeroShotObjectDetectionOutput[][]>} An array of objects containing the predicted labels, scores, and bounding boxes.
2552
- *
2553
- * @typedef {TextImagePipelineConstructorArgs & ZeroShotObjectDetectionPipelineCallback & Disposable} ZeroShotObjectDetectionPipelineType
2554
- */
2555
-
2556
- /**
2557
- * Zero-shot object detection pipeline. This pipeline predicts bounding boxes of
2558
- * objects when you provide an image and a set of `candidate_labels`.
2559
- *
2560
- * **Example:** Zero-shot object detection w/ `Xenova/owlvit-base-patch32`.
2561
- * ```javascript
2562
- * const detector = await pipeline('zero-shot-object-detection', 'Xenova/owlvit-base-patch32');
2563
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/astronaut.png';
2564
- * const candidate_labels = ['human face', 'rocket', 'helmet', 'american flag'];
2565
- * const output = await detector(url, candidate_labels);
2566
- * // [
2567
- * // {
2568
- * // score: 0.24392342567443848,
2569
- * // label: 'human face',
2570
- * // box: { xmin: 180, ymin: 67, xmax: 274, ymax: 175 }
2571
- * // },
2572
- * // {
2573
- * // score: 0.15129457414150238,
2574
- * // label: 'american flag',
2575
- * // box: { xmin: 0, ymin: 4, xmax: 106, ymax: 513 }
2576
- * // },
2577
- * // {
2578
- * // score: 0.13649864494800568,
2579
- * // label: 'helmet',
2580
- * // box: { xmin: 277, ymin: 337, xmax: 511, ymax: 511 }
2581
- * // },
2582
- * // {
2583
- * // score: 0.10262022167444229,
2584
- * // label: 'rocket',
2585
- * // box: { xmin: 352, ymin: -1, xmax: 463, ymax: 287 }
2586
- * // }
2587
- * // ]
2588
- * ```
2589
- *
2590
- * **Example:** Zero-shot object detection w/ `Xenova/owlvit-base-patch32` (returning top 4 matches and setting a threshold).
2591
- * ```javascript
2592
- * const detector = await pipeline('zero-shot-object-detection', 'Xenova/owlvit-base-patch32');
2593
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/beach.png';
2594
- * const candidate_labels = ['hat', 'book', 'sunglasses', 'camera'];
2595
- * const output = await detector(url, candidate_labels, { top_k: 4, threshold: 0.05 });
2596
- * // [
2597
- * // {
2598
- * // score: 0.1606510728597641,
2599
- * // label: 'sunglasses',
2600
- * // box: { xmin: 347, ymin: 229, xmax: 429, ymax: 264 }
2601
- * // },
2602
- * // {
2603
- * // score: 0.08935828506946564,
2604
- * // label: 'hat',
2605
- * // box: { xmin: 38, ymin: 174, xmax: 258, ymax: 364 }
2606
- * // },
2607
- * // {
2608
- * // score: 0.08530698716640472,
2609
- * // label: 'camera',
2610
- * // box: { xmin: 187, ymin: 350, xmax: 260, ymax: 411 }
2611
- * // },
2612
- * // {
2613
- * // score: 0.08349756896495819,
2614
- * // label: 'book',
2615
- * // box: { xmin: 261, ymin: 280, xmax: 494, ymax: 425 }
2616
- * // }
2617
- * // ]
2618
- * ```
2619
- */
2620
- export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ZeroShotObjectDetectionPipelineType} */ (Pipeline)) {
2621
-
2622
- /**
2623
- * Create a new ZeroShotObjectDetectionPipeline.
2624
- * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
2625
- */
2626
- constructor(options) {
2627
- super(options);
2628
- }
2629
-
2630
- /** @type {ZeroShotObjectDetectionPipelineCallback} */
2631
- async _call(images, candidate_labels, {
2632
- threshold = 0.1,
2633
- top_k = null,
2634
- percentage = false,
2635
- } = {}) {
2636
-
2637
- const isBatched = Array.isArray(images);
2638
- const preparedImages = await prepareImages(images);
2639
-
2640
- // Run tokenization
2641
- const text_inputs = this.tokenizer(candidate_labels, {
2642
- padding: true,
2643
- truncation: true,
2644
- });
2645
-
2646
- // Run processor
2647
- const model_inputs = await this.processor(preparedImages);
2648
-
2649
- // Since non-maximum suppression is performed for exporting, we need to
2650
- // process each image separately. For more information, see:
2651
- // https://github.com/huggingface/optimum/blob/e3b7efb1257c011db907ef40ab340e795cc5684c/optimum/exporters/onnx/model_configs.py#L1028-L1032
2652
- const toReturn = [];
2653
- for (let i = 0; i < preparedImages.length; ++i) {
2654
- const image = preparedImages[i];
2655
- const imageSize = percentage ? null : [[image.height, image.width]];
2656
- const pixel_values = model_inputs.pixel_values[i].unsqueeze_(0);
2657
-
2658
- // Run model with both text and pixel inputs
2659
- const output = await this.model({ ...text_inputs, pixel_values });
2660
-
2661
- let result;
2662
- if ('post_process_grounded_object_detection' in this.processor) {
2663
- // @ts-ignore
2664
- const processed = this.processor.post_process_grounded_object_detection(
2665
- output,
2666
- text_inputs.input_ids,
2667
- {
2668
- // TODO: support separate threshold values
2669
- box_threshold: threshold,
2670
- text_threshold: threshold,
2671
- target_sizes: imageSize,
2672
- },
2673
- )[0];
2674
- result = processed.boxes.map((box, i) => ({
2675
- score: processed.scores[i],
2676
- label: processed.labels[i],
2677
- box: get_bounding_box(box, !percentage),
2678
- }))
2679
- } else {
2680
- // @ts-ignore
2681
- const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSize, true)[0];
2682
- result = processed.boxes.map((box, i) => ({
2683
- score: processed.scores[i],
2684
- label: candidate_labels[processed.classes[i]],
2685
- box: get_bounding_box(box, !percentage),
2686
- }))
2687
- }
2688
- result.sort((a, b) => b.score - a.score);
2689
-
2690
- if (top_k !== null) {
2691
- result = result.slice(0, top_k);
2692
- }
2693
- toReturn.push(result)
2694
- }
2695
-
2696
- return isBatched ? toReturn : toReturn[0];
2697
- }
2698
- }
2699
-
2700
- /**
2701
- * @typedef {Object} DocumentQuestionAnsweringSingle
2702
- * @property {string} answer The generated text.
2703
- * @typedef {DocumentQuestionAnsweringSingle[]} DocumentQuestionAnsweringOutput
2704
- *
2705
- * @callback DocumentQuestionAnsweringPipelineCallback Answer the question given as input by using the document.
2706
- * @param {ImageInput} image The image of the document to use.
2707
- * @param {string} question A question to ask of the document.
2708
- * @param {Partial<import('./generation/configuration_utils.js').GenerationConfig>} [options] Additional keyword arguments to pass along to the generate method of the model.
2709
- * @returns {Promise<DocumentQuestionAnsweringOutput|DocumentQuestionAnsweringOutput[]>} An object (or array of objects) containing the answer(s).
2710
- *
2711
- * @typedef {TextImagePipelineConstructorArgs & DocumentQuestionAnsweringPipelineCallback & Disposable} DocumentQuestionAnsweringPipelineType
2712
- */
2713
-
2714
- /**
2715
- * Document Question Answering pipeline using any `AutoModelForDocumentQuestionAnswering`.
2716
- * The inputs/outputs are similar to the (extractive) question answering pipeline; however,
2717
- * the pipeline takes an image (and optional OCR'd words/boxes) as input instead of text context.
2718
- *
2719
- * **Example:** Answer questions about a document with `Xenova/donut-base-finetuned-docvqa`.
2720
- * ```javascript
2721
- * const qa_pipeline = await pipeline('document-question-answering', 'Xenova/donut-base-finetuned-docvqa');
2722
- * const image = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/invoice.png';
2723
- * const question = 'What is the invoice number?';
2724
- * const output = await qa_pipeline(image, question);
2725
- * // [{ answer: 'us-001' }]
2726
- * ```
2727
- */
2728
- export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => DocumentQuestionAnsweringPipelineType} */ (Pipeline)) {
2729
-
2730
- /**
2731
- * Create a new DocumentQuestionAnsweringPipeline.
2732
- * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
2733
- */
2734
- constructor(options) {
2735
- super(options);
2736
- }
2737
-
2738
- /** @type {DocumentQuestionAnsweringPipelineCallback} */
2739
- async _call(image, question, generate_kwargs = {}) {
2740
-
2741
- // NOTE: For now, we only support a batch size of 1
2742
-
2743
- // Preprocess image
2744
- const preparedImage = (await prepareImages(image))[0];
2745
- const { pixel_values } = await this.processor(preparedImage);
2746
-
2747
- // Run tokenization
2748
- const task_prompt = `<s_docvqa><s_question>${question}</s_question><s_answer>`;
2749
- const decoder_input_ids = this.tokenizer(task_prompt, {
2750
- add_special_tokens: false,
2751
- padding: true,
2752
- truncation: true,
2753
- }).input_ids;
2754
-
2755
- // Run model
2756
- const output = await this.model.generate({
2757
- inputs: pixel_values,
2758
- // @ts-expect-error TS2339
2759
- max_length: this.model.config.decoder.max_position_embeddings,
2760
- decoder_input_ids,
2761
- ...generate_kwargs,
2762
- });
2763
-
2764
- // Decode output
2765
- const decoded = this.tokenizer.batch_decode(/** @type {Tensor} */(output))[0];
2766
-
2767
- // Parse answer
2768
- const match = decoded.match(/<s_answer>(.*?)<\/s_answer>/);
2769
- let answer = null;
2770
- if (match && match.length >= 2) {
2771
- answer = match[1].trim();
2772
- }
2773
- return [{ answer }];
2774
- }
2775
- }
2776
-
2777
-
2778
- /**
2779
- * @typedef {Object} VocoderOptions
2780
- * @property {PreTrainedModel} [vocoder] The vocoder used by the pipeline (if the model uses one). If not provided, use the default HifiGan vocoder.
2781
- * @typedef {TextAudioPipelineConstructorArgs & VocoderOptions} TextToAudioPipelineConstructorArgs
2782
- */
2783
-
2784
- /**
2785
- * @typedef {Object} TextToAudioOutput
2786
- * @property {Float32Array} audio The generated audio waveform.
2787
- * @property {number} sampling_rate The sampling rate of the generated audio waveform.
2788
- *
2789
- * @typedef {Object} TextToAudioPipelineOptions Parameters specific to text-to-audio pipelines.
2790
- * @property {Tensor|Float32Array|string|URL} [speaker_embeddings=null] The speaker embeddings (if the model requires it).
2791
- *
2792
- * @callback TextToAudioPipelineCallback Generates speech/audio from the inputs.
2793
- * @param {string|string[]} texts The text(s) to generate.
2794
- * @param {TextToAudioPipelineOptions} options Parameters passed to the model generation/forward method.
2795
- * @returns {Promise<TextToAudioOutput>} An object containing the generated audio and sampling rate.
2796
- *
2797
- * @typedef {TextToAudioPipelineConstructorArgs & TextToAudioPipelineCallback & Disposable} TextToAudioPipelineType
2798
- */
2799
-
2800
- /**
2801
- * Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`.
2802
- * This pipeline generates an audio file from an input text and optional other conditional inputs.
2803
- *
2804
- * **Example:** Generate audio from text with `Xenova/speecht5_tts`.
2805
- * ```javascript
2806
- * const synthesizer = await pipeline('text-to-speech', 'Xenova/speecht5_tts', { quantized: false });
2807
- * const speaker_embeddings = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin';
2808
- * const out = await synthesizer('Hello, my dog is cute', { speaker_embeddings });
2809
- * // RawAudio {
2810
- * // audio: Float32Array(26112) [-0.00005657337896991521, 0.00020583874720614403, ...],
2811
- * // sampling_rate: 16000
2812
- * // }
2813
- * ```
2814
- *
2815
- * You can then save the audio to a .wav file with the `wavefile` package:
2816
- * ```javascript
2817
- * import wavefile from 'wavefile';
2818
- * import fs from 'fs';
2819
- *
2820
- * const wav = new wavefile.WaveFile();
2821
- * wav.fromScratch(1, out.sampling_rate, '32f', out.audio);
2822
- * fs.writeFileSync('out.wav', wav.toBuffer());
2823
- * ```
2824
- *
2825
- * **Example:** Multilingual speech generation with `Xenova/mms-tts-fra`. See [here](https://huggingface.co/models?pipeline_tag=text-to-speech&other=vits&sort=trending) for the full list of available languages (1107).
2826
- * ```javascript
2827
- * const synthesizer = await pipeline('text-to-speech', 'Xenova/mms-tts-fra');
2828
- * const out = await synthesizer('Bonjour');
2829
- * // RawAudio {
2830
- * // audio: Float32Array(23808) [-0.00037693005288019776, 0.0003325853613205254, ...],
2831
- * // sampling_rate: 16000
2832
- * // }
2833
- * ```
2834
- */
2835
- export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineConstructorArgs) => TextToAudioPipelineType} */ (Pipeline)) {
2836
- DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"
2837
-
2838
- /**
2839
- * Create a new TextToAudioPipeline.
2840
- * @param {TextToAudioPipelineConstructorArgs} options An object used to instantiate the pipeline.
2841
- */
2842
- constructor(options) {
2843
- super(options);
2844
-
2845
- // TODO: Find a better way for `pipeline` to set the default vocoder
2846
- this.vocoder = options.vocoder ?? null;
2847
- }
2848
-
2849
-
2850
- /** @type {TextToAudioPipelineCallback} */
2851
- async _call(text_inputs, {
2852
- speaker_embeddings = null,
2853
- } = {}) {
2854
-
2855
- // If this.processor is not set, we are using a `AutoModelForTextToWaveform` model
2856
- if (this.processor) {
2857
- return this._call_text_to_spectrogram(text_inputs, { speaker_embeddings });
2858
- } else {
2859
- return this._call_text_to_waveform(text_inputs);
2860
- }
2861
- }
2862
-
2863
- async _call_text_to_waveform(text_inputs) {
2864
-
2865
- // Run tokenization
2866
- const inputs = this.tokenizer(text_inputs, {
2867
- padding: true,
2868
- truncation: true,
2869
- });
2870
-
2871
- // Generate waveform
2872
- const { waveform } = await this.model(inputs);
2873
-
2874
- // @ts-expect-error TS2339
2875
- const sampling_rate = this.model.config.sampling_rate;
2876
- return new RawAudio(
2877
- waveform.data,
2878
- sampling_rate,
2879
- )
2880
- }
2881
-
2882
- async _call_text_to_spectrogram(text_inputs, { speaker_embeddings }) {
2883
-
2884
- // Load vocoder, if not provided
2885
- if (!this.vocoder) {
2886
- console.log('No vocoder specified, using default HifiGan vocoder.');
2887
- this.vocoder = await AutoModel.from_pretrained(this.DEFAULT_VOCODER_ID, { dtype: 'fp32' });
2888
- }
2889
-
2890
- // Load speaker embeddings as Float32Array from path/URL
2891
- if (typeof speaker_embeddings === 'string' || speaker_embeddings instanceof URL) {
2892
- // Load from URL with fetch
2893
- speaker_embeddings = new Float32Array(
2894
- await (await fetch(speaker_embeddings)).arrayBuffer()
2895
- );
2896
- }
2897
-
2898
- if (speaker_embeddings instanceof Float32Array) {
2899
- speaker_embeddings = new Tensor(
2900
- 'float32',
2901
- speaker_embeddings,
2902
- [1, speaker_embeddings.length]
2903
- )
2904
- } else if (!(speaker_embeddings instanceof Tensor)) {
2905
- throw new Error("Speaker embeddings must be a `Tensor`, `Float32Array`, `string`, or `URL`.")
2906
- }
2907
-
2908
- // Run tokenization
2909
- const { input_ids } = this.tokenizer(text_inputs, {
2910
- padding: true,
2911
- truncation: true,
2912
- });
2913
-
2914
- // NOTE: At this point, we are guaranteed that `speaker_embeddings` is a `Tensor`
2915
- // @ts-ignore
2916
- const { waveform } = await this.model.generate_speech(input_ids, speaker_embeddings, { vocoder: this.vocoder });
2917
-
2918
- const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
2919
- return new RawAudio(
2920
- waveform.data,
2921
- sampling_rate,
2922
- )
2923
- }
2924
- }
2925
-
2926
- /**
2927
- * @callback ImageToImagePipelineCallback Transform the image(s) passed as inputs.
2928
- * @param {ImagePipelineInputs} images The images to transform.
2929
- * @returns {Promise<RawImage|RawImage[]>} The transformed image or list of images.
2930
- *
2931
- * @typedef {ImagePipelineConstructorArgs & ImageToImagePipelineCallback & Disposable} ImageToImagePipelineType
2932
- */
2933
-
2934
- /**
2935
- * Image to Image pipeline using any `AutoModelForImageToImage`. This pipeline generates an image based on a previous image input.
2936
- *
2937
- * **Example:** Super-resolution w/ `Xenova/swin2SR-classical-sr-x2-64`
2938
- * ```javascript
2939
- * const upscaler = await pipeline('image-to-image', 'Xenova/swin2SR-classical-sr-x2-64');
2940
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/butterfly.jpg';
2941
- * const output = await upscaler(url);
2942
- * // RawImage {
2943
- * // data: Uint8Array(786432) [ 41, 31, 24, 43, ... ],
2944
- * // width: 512,
2945
- * // height: 512,
2946
- * // channels: 3
2947
- * // }
2948
- * ```
2949
- */
2950
- export class ImageToImagePipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageToImagePipelineType} */ (Pipeline)) {
2951
- /**
2952
- * Create a new ImageToImagePipeline.
2953
- * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
2954
- */
2955
- constructor(options) {
2956
- super(options);
2957
- }
2958
-
2959
- /** @type {ImageToImagePipelineCallback} */
2960
- async _call(images) {
2961
-
2962
- const preparedImages = await prepareImages(images);
2963
- const inputs = await this.processor(preparedImages);
2964
- const outputs = await this.model(inputs);
2965
-
2966
- /** @type {RawImage[]} */
2967
- const toReturn = [];
2968
- for (const batch of outputs.reconstruction) {
2969
- const output = batch.squeeze().clamp_(0, 1).mul_(255).round_().to('uint8');
2970
- toReturn.push(RawImage.fromTensor(output));
2971
- }
2972
-
2973
- return toReturn.length > 1 ? toReturn : toReturn[0];
2974
- }
2975
- }
2976
-
2977
- /**
2978
- * @typedef {Object} DepthEstimationPipelineOutput
2979
- * @property {Tensor} predicted_depth The raw depth map predicted by the model.
2980
- * @property {RawImage} depth The processed depth map as an image (with the same size as the input image).
2981
- *
2982
- * @callback DepthEstimationPipelineCallback Predicts the depth for the image(s) passed as inputs.
2983
- * @param {ImagePipelineInputs} images The images to compute depth for.
2984
- * @returns {Promise<DepthEstimationPipelineOutput|DepthEstimationPipelineOutput[]>} An image or a list of images containing result(s).
2985
- *
2986
- * @typedef {ImagePipelineConstructorArgs & DepthEstimationPipelineCallback & Disposable} DepthEstimationPipelineType
2987
- */
2988
-
2989
- /**
2990
- * Depth estimation pipeline using any `AutoModelForDepthEstimation`. This pipeline predicts the depth of an image.
2991
- *
2992
- * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`
2993
- * ```javascript
2994
- * const depth_estimator = await pipeline('depth-estimation', 'Xenova/dpt-hybrid-midas');
2995
- * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cats.jpg';
2996
- * const out = await depth_estimator(url);
2997
- * // {
2998
- * // predicted_depth: Tensor {
2999
- * // dims: [ 384, 384 ],
3000
- * // type: 'float32',
3001
- * // data: Float32Array(147456) [ 542.859130859375, 545.2833862304688, 546.1649169921875, ... ],
3002
- * // size: 147456
3003
- * // },
3004
- * // depth: RawImage {
3005
- * // data: Uint8Array(307200) [ 86, 86, 86, ... ],
3006
- * // width: 640,
3007
- * // height: 480,
3008
- * // channels: 1
3009
- * // }
3010
- * // }
3011
- * ```
3012
- */
3013
- export class DepthEstimationPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => DepthEstimationPipelineType} */ (Pipeline)) {
3014
- /**
3015
- * Create a new DepthEstimationPipeline.
3016
- * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
3017
- */
3018
- constructor(options) {
3019
- super(options);
3020
- }
3021
-
3022
- /** @type {DepthEstimationPipelineCallback} */
3023
- async _call(images) {
3024
-
3025
- const preparedImages = await prepareImages(images);
3026
-
3027
- const inputs = await this.processor(preparedImages);
3028
- const { predicted_depth } = await this.model(inputs);
3029
-
3030
- const toReturn = [];
3031
- for (let i = 0; i < preparedImages.length; ++i) {
3032
- const batch = predicted_depth[i];
3033
- const [height, width] = batch.dims.slice(-2);
3034
- const [new_width, new_height] = preparedImages[i].size;
3035
-
3036
- // Interpolate to original size
3037
- const prediction = (await interpolate_4d(batch.view(1, 1, height, width), {
3038
- size: [new_height, new_width],
3039
- mode: 'bilinear',
3040
- })).view(new_height, new_width);
3041
-
3042
- const minval = /** @type {number} */(prediction.min().item());
3043
- const maxval = /** @type {number} */(prediction.max().item());
3044
- const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
3045
- const depth = RawImage.fromTensor(formatted);
3046
- toReturn.push({
3047
- predicted_depth: prediction,
3048
- depth,
3049
- });
3050
- }
3051
-
3052
- return toReturn.length > 1 ? toReturn : toReturn[0];
3053
- }
3054
- }
3055
-
3056
- const SUPPORTED_TASKS = Object.freeze({
3057
- "text-classification": {
3058
- "tokenizer": AutoTokenizer,
3059
- "pipeline": TextClassificationPipeline,
3060
- "model": AutoModelForSequenceClassification,
3061
- "default": {
3062
- // TODO: replace with original
3063
- // "model": "distilbert-base-uncased-finetuned-sst-2-english",
3064
- "model": "Xenova/distilbert-base-uncased-finetuned-sst-2-english",
3065
- },
3066
- "type": "text",
3067
- },
3068
- "token-classification": {
3069
- "tokenizer": AutoTokenizer,
3070
- "pipeline": TokenClassificationPipeline,
3071
- "model": AutoModelForTokenClassification,
3072
- "default": {
3073
- // TODO: replace with original
3074
- // "model": "Davlan/bert-base-multilingual-cased-ner-hrl",
3075
- "model": "Xenova/bert-base-multilingual-cased-ner-hrl",
3076
- },
3077
- "type": "text",
3078
- },
3079
- "question-answering": {
3080
- "tokenizer": AutoTokenizer,
3081
- "pipeline": QuestionAnsweringPipeline,
3082
- "model": AutoModelForQuestionAnswering,
3083
- "default": {
3084
- // TODO: replace with original
3085
- // "model": "distilbert-base-cased-distilled-squad",
3086
- "model": "Xenova/distilbert-base-cased-distilled-squad",
3087
- },
3088
- "type": "text",
3089
- },
3090
-
3091
- "fill-mask": {
3092
- "tokenizer": AutoTokenizer,
3093
- "pipeline": FillMaskPipeline,
3094
- "model": AutoModelForMaskedLM,
3095
- "default": {
3096
- // TODO: replace with original
3097
- // "model": "bert-base-uncased",
3098
- "model": "Xenova/bert-base-uncased",
3099
- },
3100
- "type": "text",
3101
- },
3102
- "summarization": {
3103
- "tokenizer": AutoTokenizer,
3104
- "pipeline": SummarizationPipeline,
3105
- "model": AutoModelForSeq2SeqLM,
3106
- "default": {
3107
- // TODO: replace with original
3108
- // "model": "sshleifer/distilbart-cnn-6-6",
3109
- "model": "Xenova/distilbart-cnn-6-6",
3110
- },
3111
- "type": "text",
3112
- },
3113
- "translation": {
3114
- "tokenizer": AutoTokenizer,
3115
- "pipeline": TranslationPipeline,
3116
- "model": AutoModelForSeq2SeqLM,
3117
- "default": {
3118
- // TODO: replace with original
3119
- // "model": "t5-small",
3120
- "model": "Xenova/t5-small",
3121
- },
3122
- "type": "text",
3123
- },
3124
- "text2text-generation": {
3125
- "tokenizer": AutoTokenizer,
3126
- "pipeline": Text2TextGenerationPipeline,
3127
- "model": AutoModelForSeq2SeqLM,
3128
- "default": {
3129
- // TODO: replace with original
3130
- // "model": "google/flan-t5-small",
3131
- "model": "Xenova/flan-t5-small",
3132
- },
3133
- "type": "text",
3134
- },
3135
- "text-generation": {
3136
- "tokenizer": AutoTokenizer,
3137
- "pipeline": TextGenerationPipeline,
3138
- "model": AutoModelForCausalLM,
3139
- "default": {
3140
- // TODO: replace with original
3141
- // "model": "gpt2",
3142
- "model": "Xenova/gpt2",
3143
- },
3144
- "type": "text",
3145
- },
3146
- "zero-shot-classification": {
3147
- "tokenizer": AutoTokenizer,
3148
- "pipeline": ZeroShotClassificationPipeline,
3149
- "model": AutoModelForSequenceClassification,
3150
- "default": {
3151
- // TODO: replace with original
3152
- // "model": "typeform/distilbert-base-uncased-mnli",
3153
- "model": "Xenova/distilbert-base-uncased-mnli",
3154
- },
3155
- "type": "text",
3156
- },
3157
- "audio-classification": {
3158
- "pipeline": AudioClassificationPipeline,
3159
- "model": AutoModelForAudioClassification,
3160
- "processor": AutoProcessor,
3161
- "default": {
3162
- // TODO: replace with original
3163
- // "model": "superb/wav2vec2-base-superb-ks",
3164
- "model": "Xenova/wav2vec2-base-superb-ks",
3165
- },
3166
- "type": "audio",
3167
- },
3168
- "zero-shot-audio-classification": {
3169
- "tokenizer": AutoTokenizer,
3170
- "pipeline": ZeroShotAudioClassificationPipeline,
3171
- "model": AutoModel,
3172
- "processor": AutoProcessor,
3173
- "default": {
3174
- // TODO: replace with original
3175
- // "model": "laion/clap-htsat-fused",
3176
- "model": "Xenova/clap-htsat-unfused",
3177
- },
3178
- "type": "multimodal",
3179
- },
3180
- "automatic-speech-recognition": {
3181
- "tokenizer": AutoTokenizer,
3182
- "pipeline": AutomaticSpeechRecognitionPipeline,
3183
- "model": [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
3184
- "processor": AutoProcessor,
3185
- "default": {
3186
- // TODO: replace with original
3187
- // "model": "openai/whisper-tiny.en",
3188
- "model": "Xenova/whisper-tiny.en",
3189
- },
3190
- "type": "multimodal",
3191
- },
3192
- "text-to-audio": {
3193
- "tokenizer": AutoTokenizer,
3194
- "pipeline": TextToAudioPipeline,
3195
- "model": [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
3196
- "processor": [AutoProcessor, /* Some don't use a processor */ null],
3197
- "default": {
3198
- // TODO: replace with original
3199
- // "model": "microsoft/speecht5_tts",
3200
- "model": "Xenova/speecht5_tts",
3201
- },
3202
- "type": "text",
3203
- },
3204
- "image-to-text": {
3205
- "tokenizer": AutoTokenizer,
3206
- "pipeline": ImageToTextPipeline,
3207
- "model": AutoModelForVision2Seq,
3208
- "processor": AutoProcessor,
3209
- "default": {
3210
- // TODO: replace with original
3211
- // "model": "nlpconnect/vit-gpt2-image-captioning",
3212
- "model": "Xenova/vit-gpt2-image-captioning",
3213
- },
3214
- "type": "multimodal",
3215
- },
3216
-
3217
- "image-classification": {
3218
- // no tokenizer
3219
- "pipeline": ImageClassificationPipeline,
3220
- "model": AutoModelForImageClassification,
3221
- "processor": AutoProcessor,
3222
- "default": {
3223
- // TODO: replace with original
3224
- // "model": "google/vit-base-patch16-224",
3225
- "model": "Xenova/vit-base-patch16-224",
3226
- },
3227
- "type": "multimodal",
3228
- },
3229
-
3230
- "image-segmentation": {
3231
- // no tokenizer
3232
- "pipeline": ImageSegmentationPipeline,
3233
- "model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
3234
- "processor": AutoProcessor,
3235
- "default": {
3236
- // TODO: replace with original
3237
- // "model": "facebook/detr-resnet-50-panoptic",
3238
- "model": "Xenova/detr-resnet-50-panoptic",
3239
- },
3240
- "type": "multimodal",
3241
- },
3242
- "background-removal": {
3243
- // no tokenizer
3244
- "pipeline": BackgroundRemovalPipeline,
3245
- "model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
3246
- "processor": AutoProcessor,
3247
- "default": {
3248
- "model": "Xenova/modnet",
3249
- },
3250
- "type": "image",
3251
- },
3252
-
3253
- "zero-shot-image-classification": {
3254
- "tokenizer": AutoTokenizer,
3255
- "pipeline": ZeroShotImageClassificationPipeline,
3256
- "model": AutoModel,
3257
- "processor": AutoProcessor,
3258
- "default": {
3259
- // TODO: replace with original
3260
- // "model": "openai/clip-vit-base-patch32",
3261
- "model": "Xenova/clip-vit-base-patch32",
3262
- },
3263
- "type": "multimodal",
3264
- },
3265
-
3266
- "object-detection": {
3267
- // no tokenizer
3268
- "pipeline": ObjectDetectionPipeline,
3269
- "model": AutoModelForObjectDetection,
3270
- "processor": AutoProcessor,
3271
- "default": {
3272
- // TODO: replace with original
3273
- // "model": "facebook/detr-resnet-50",
3274
- "model": "Xenova/detr-resnet-50",
3275
- },
3276
- "type": "multimodal",
3277
- },
3278
- "zero-shot-object-detection": {
3279
- "tokenizer": AutoTokenizer,
3280
- "pipeline": ZeroShotObjectDetectionPipeline,
3281
- "model": AutoModelForZeroShotObjectDetection,
3282
- "processor": AutoProcessor,
3283
- "default": {
3284
- // TODO: replace with original
3285
- // "model": "google/owlvit-base-patch32",
3286
- "model": "Xenova/owlvit-base-patch32",
3287
- },
3288
- "type": "multimodal",
3289
- },
3290
- "document-question-answering": {
3291
- "tokenizer": AutoTokenizer,
3292
- "pipeline": DocumentQuestionAnsweringPipeline,
3293
- "model": AutoModelForDocumentQuestionAnswering,
3294
- "processor": AutoProcessor,
3295
- "default": {
3296
- // TODO: replace with original
3297
- // "model": "naver-clova-ix/donut-base-finetuned-docvqa",
3298
- "model": "Xenova/donut-base-finetuned-docvqa",
3299
- },
3300
- "type": "multimodal",
3301
- },
3302
- "image-to-image": {
3303
- // no tokenizer
3304
- "pipeline": ImageToImagePipeline,
3305
- "model": AutoModelForImageToImage,
3306
- "processor": AutoProcessor,
3307
- "default": {
3308
- // TODO: replace with original
3309
- // "model": "caidas/swin2SR-classical-sr-x2-64",
3310
- "model": "Xenova/swin2SR-classical-sr-x2-64",
3311
- },
3312
- "type": "image",
3313
- },
3314
- "depth-estimation": {
3315
- // no tokenizer
3316
- "pipeline": DepthEstimationPipeline,
3317
- "model": AutoModelForDepthEstimation,
3318
- "processor": AutoProcessor,
3319
- "default": {
3320
- // TODO: replace with original
3321
- // "model": "Intel/dpt-large",
3322
- "model": "Xenova/dpt-large",
3323
- },
3324
- "type": "image",
3325
- },
3326
-
3327
- // This task serves as a useful interface for dealing with sentence-transformers (https://huggingface.co/sentence-transformers).
3328
- "feature-extraction": {
3329
- "tokenizer": AutoTokenizer,
3330
- "pipeline": FeatureExtractionPipeline,
3331
- "model": AutoModel,
3332
- "default": {
3333
- // TODO: replace with original
3334
- // "model": "sentence-transformers/all-MiniLM-L6-v2",
3335
- "model": "Xenova/all-MiniLM-L6-v2",
3336
- },
3337
- "type": "text",
3338
- },
3339
- "image-feature-extraction": {
3340
- "processor": AutoProcessor,
3341
- "pipeline": ImageFeatureExtractionPipeline,
3342
- "model": [AutoModelForImageFeatureExtraction, AutoModel],
3343
- "default": {
3344
- // TODO: replace with original
3345
- // "model": "google/vit-base-patch16-224",
3346
- "model": "Xenova/vit-base-patch16-224-in21k",
3347
- },
3348
- "type": "image",
3349
- },
3350
- })
3351
-
3352
-
3353
- // TODO: Add types for TASK_ALIASES
3354
- const TASK_ALIASES = Object.freeze({
3355
- "sentiment-analysis": "text-classification",
3356
- "ner": "token-classification",
3357
- // "vqa": "visual-question-answering", // TODO: Add
3358
- "asr": "automatic-speech-recognition",
3359
- "text-to-speech": "text-to-audio",
3360
-
3361
- // Add for backwards compatibility
3362
- "embeddings": "feature-extraction",
3363
- });
3364
-
3365
- /**
3366
- * @typedef {keyof typeof SUPPORTED_TASKS} TaskType
3367
- * @typedef {keyof typeof TASK_ALIASES} AliasType
3368
- * @typedef {TaskType | AliasType} PipelineType All possible pipeline types.
3369
- * @typedef {{[K in TaskType]: InstanceType<typeof SUPPORTED_TASKS[K]["pipeline"]>}} SupportedTasks A mapping of pipeline names to their corresponding pipeline classes.
3370
- * @typedef {{[K in AliasType]: InstanceType<typeof SUPPORTED_TASKS[TASK_ALIASES[K]]["pipeline"]>}} AliasTasks A mapping from pipeline aliases to their corresponding pipeline classes.
3371
- * @typedef {SupportedTasks & AliasTasks} AllTasks A mapping from all pipeline names and aliases to their corresponding pipeline classes.
3372
- */
3373
-
3374
- /**
3375
- * Utility factory method to build a `Pipeline` object.
3376
- *
3377
- * @template {PipelineType} T The type of pipeline to return.
3378
- * @param {T} task The task defining which pipeline will be returned. Currently accepted tasks are:
3379
- * - `"audio-classification"`: will return a `AudioClassificationPipeline`.
3380
- * - `"automatic-speech-recognition"`: will return a `AutomaticSpeechRecognitionPipeline`.
3381
- * - `"depth-estimation"`: will return a `DepthEstimationPipeline`.
3382
- * - `"document-question-answering"`: will return a `DocumentQuestionAnsweringPipeline`.
3383
- * - `"feature-extraction"`: will return a `FeatureExtractionPipeline`.
3384
- * - `"fill-mask"`: will return a `FillMaskPipeline`.
3385
- * - `"image-classification"`: will return a `ImageClassificationPipeline`.
3386
- * - `"image-segmentation"`: will return a `ImageSegmentationPipeline`.
3387
- * - `"image-to-text"`: will return a `ImageToTextPipeline`.
3388
- * - `"object-detection"`: will return a `ObjectDetectionPipeline`.
3389
- * - `"question-answering"`: will return a `QuestionAnsweringPipeline`.
3390
- * - `"summarization"`: will return a `SummarizationPipeline`.
3391
- * - `"text2text-generation"`: will return a `Text2TextGenerationPipeline`.
3392
- * - `"text-classification"` (alias "sentiment-analysis" available): will return a `TextClassificationPipeline`.
3393
- * - `"text-generation"`: will return a `TextGenerationPipeline`.
3394
- * - `"token-classification"` (alias "ner" available): will return a `TokenClassificationPipeline`.
3395
- * - `"translation"`: will return a `TranslationPipeline`.
3396
- * - `"translation_xx_to_yy"`: will return a `TranslationPipeline`.
3397
- * - `"zero-shot-classification"`: will return a `ZeroShotClassificationPipeline`.
3398
- * - `"zero-shot-audio-classification"`: will return a `ZeroShotAudioClassificationPipeline`.
3399
- * - `"zero-shot-image-classification"`: will return a `ZeroShotImageClassificationPipeline`.
3400
- * - `"zero-shot-object-detection"`: will return a `ZeroShotObjectDetectionPipeline`.
3401
- * @param {string} [model=null] The name of the pre-trained model to use. If not specified, the default model for the task will be used.
3402
- * @param {import('./utils/hub.js').PretrainedModelOptions} [options] Optional parameters for the pipeline.
3403
- * @returns {Promise<AllTasks[T]>} A Pipeline object for the specified task.
3404
- * @throws {Error} If an unsupported pipeline is requested.
3405
- */
3406
/**
 * Utility factory method to build a [`Pipeline`] object for the given task.
 *
 * @param {string} task The task identifier (aliases such as "sentiment-analysis"
 *   and "ner" are resolved; "translation_xx_to_yy" maps onto "translation").
 * @param {string} [model=null] The name of the pre-trained model to use. If not
 *   specified, the default model for the task will be used.
 * @param {Object} [options] Optional parameters forwarded to `from_pretrained`.
 * @returns {Promise<Object>} A Pipeline object for the specified task.
 * @throws {Error} If an unsupported pipeline is requested.
 */
export async function pipeline(
    task,
    model = null,
    {
        progress_callback = null,
        config = null,
        cache_dir = null,
        local_files_only = false,
        revision = 'main',
        device = null,
        dtype = null,
        subfolder = 'onnx',
        use_external_data_format = null,
        model_file_name = null,
        session_options = {},
    } = {}
) {
    // Resolve task aliases (e.g. "sentiment-analysis" -> "text-classification").
    // @ts-ignore
    task = TASK_ALIASES[task] ?? task;

    // Look up the task descriptor. Splitting on '_' collapses the
    // "translation_xx_to_yy" family onto the single "translation" entry.
    const taskKey = task.split('_', 1)[0];
    const pipelineInfo = SUPPORTED_TASKS[taskKey];
    if (!pipelineInfo) {
        throw Error(`Unsupported pipeline: ${task}. Must be one of [${Object.keys(SUPPORTED_TASKS)}]`)
    }

    // Fall back to the task's default model when none was requested.
    if (!model) {
        model = pipelineInfo.default.model
        console.log(`No model specified. Using default model: "${model}".`);
    }

    // Bundle the pretrained-loading options once so the model, tokenizer,
    // and processor all receive the identical configuration.
    const pretrainedOptions = {
        progress_callback,
        config,
        cache_dir,
        local_files_only,
        revision,
        device,
        dtype,
        subfolder,
        use_external_data_format,
        model_file_name,
        session_options,
    }

    // Declare which components this task needs; entries may be null
    // (not applicable) or arrays of candidate classes.
    const classes = new Map([
        ['tokenizer', pipelineInfo.tokenizer],
        ['model', pipelineInfo.model],
        ['processor', pipelineInfo.processor],
    ]);

    // Load the model, tokenizer, and processor (where present) in parallel.
    const results = await loadItems(classes, model, pretrainedOptions);
    results.task = task;

    // Notify listeners that every component has finished loading.
    dispatchCallback(progress_callback, {
        'status': 'ready',
        'task': task,
        'model': model,
    });

    // Instantiate the concrete pipeline class for this task.
    const pipelineClass = pipelineInfo.pipeline;
    return new pipelineClass(results);
}
3474
-
3475
-
3476
/**
 * Helper function to get applicable model, tokenizer, or processor classes for a given model.
 *
 * Fixes over the previous version:
 * - Replaces the `new Promise(async (resolve, reject) => ...)` executor
 *   anti-pattern with a plain async helper, so rejections propagate reliably.
 * - An empty candidate array now rejects with a descriptive `Error` instead
 *   of rejecting with `undefined`.
 *
 * @param {Map<string, any>} mapping The mapping of names to classes, arrays of classes, or null.
 * @param {string} model The name of the model to load.
 * @param {import('./utils/hub.js').PretrainedOptions} pretrainedOptions The options to pass to the `from_pretrained` method.
 * @returns {Promise<Object>} An object mapping each loadable name to its loaded instance.
 * @private
 */
async function loadItems(mapping, model, pretrainedOptions) {

    const result = Object.create(null);

    /**
     * Try each candidate class in order, returning the first successful load.
     * Recoverable errors ("Unsupported model type" / "Could not locate file")
     * fall through to the next candidate; anything else rejects immediately.
     * A `null` candidate means the component is optional and absent.
     * @param {Array<any>} candidates
     * @returns {Promise<any>}
     */
    const loadFirstCompatible = async (candidates) => {
        let lastError;
        for (const cls of candidates) {
            if (cls === null) {
                // The relevant class was not found, but it is optional.
                return null;
            }
            try {
                return await cls.from_pretrained(model, pretrainedOptions);
            } catch (err) {
                const message = err?.message;
                if (message?.includes('Unsupported model type')
                    || message?.includes('Could not locate file')) {
                    // Recoverable: remember the error and try the next class.
                    lastError = err;
                } else {
                    // Unrecoverable: fail fast with the original error.
                    throw err;
                }
            }
        }
        // Every candidate failed (or the list was empty): surface the last
        // recoverable error, or a descriptive one if none was recorded.
        throw lastError ?? new Error(`Unable to load any class for model "${model}".`);
    };

    /**@type {Promise[]} */
    const promises = [];
    for (const [name, cls] of mapping.entries()) {
        if (!cls) continue;

        const promise = Array.isArray(cls)
            ? loadFirstCompatible(cls)
            : cls.from_pretrained(model, pretrainedOptions);

        result[name] = promise;
        promises.push(promise);
    }

    // Wait for all promises to resolve (in parallel)
    await Promise.all(promises);

    // Then replace each stored promise with its resolved value
    for (const [name, promise] of Object.entries(result)) {
        result[name] = await promise;
    }

    return result;
}