@huggingface/transformers 3.3.3 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/README.md +9 -3
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +124 -115
  3. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  4. package/dist/transformers.js +2480 -1457
  5. package/dist/transformers.js.map +1 -1
  6. package/dist/transformers.min.js +1 -1
  7. package/dist/transformers.min.js.map +1 -1
  8. package/dist/{transformers.cjs → transformers.node.cjs} +1412 -2395
  9. package/dist/transformers.node.cjs.map +1 -0
  10. package/dist/transformers.node.min.cjs +2 -0
  11. package/dist/transformers.node.min.cjs.map +1 -0
  12. package/dist/transformers.node.min.mjs +2 -0
  13. package/dist/transformers.node.min.mjs.map +1 -0
  14. package/dist/{transformers.mjs → transformers.node.mjs} +1440 -2375
  15. package/dist/transformers.node.mjs.map +1 -0
  16. package/dist/transformers.web.js +35713 -0
  17. package/dist/transformers.web.js.map +1 -0
  18. package/dist/transformers.web.min.js +2 -0
  19. package/dist/transformers.web.min.js.map +1 -0
  20. package/package.json +6 -6
  21. package/src/backends/onnx.js +14 -15
  22. package/src/configs.js +4 -1
  23. package/src/env.js +1 -1
  24. package/src/generation/streamers.js +4 -3
  25. package/src/models/dac/feature_extraction_dac.js +3 -0
  26. package/src/models/encodec/feature_extraction_encodec.js +32 -0
  27. package/src/models/feature_extractors.js +2 -0
  28. package/src/models/idefics3/image_processing_idefics3.js +1 -1
  29. package/src/models/image_processors.js +1 -0
  30. package/src/models/processors.js +2 -0
  31. package/src/models/smolvlm/image_processing_smolvlm.js +2 -0
  32. package/src/models/smolvlm/processing_smolvlm.js +2 -0
  33. package/src/models/ultravox/processing_ultravox.js +54 -0
  34. package/src/models/whisper/common_whisper.js +7 -1
  35. package/src/models/whisper/feature_extraction_whisper.js +18 -10
  36. package/src/models.js +456 -76
  37. package/src/pipelines.js +111 -7
  38. package/src/tokenizers.js +42 -28
  39. package/src/transformers.js +1 -0
  40. package/src/utils/audio.js +2 -0
  41. package/src/utils/hub.js +140 -80
  42. package/src/utils/maths.js +1 -1
  43. package/src/utils/tensor.js +6 -3
  44. package/src/utils/video.js +128 -0
  45. package/types/backends/onnx.d.ts +2 -2
  46. package/types/backends/onnx.d.ts.map +1 -1
  47. package/types/configs.d.ts +1 -1
  48. package/types/configs.d.ts.map +1 -1
  49. package/types/generation/streamers.d.ts.map +1 -1
  50. package/types/models/dac/feature_extraction_dac.d.ts +4 -0
  51. package/types/models/dac/feature_extraction_dac.d.ts.map +1 -0
  52. package/types/models/encodec/feature_extraction_encodec.d.ts +13 -0
  53. package/types/models/encodec/feature_extraction_encodec.d.ts.map +1 -0
  54. package/types/models/feature_extractors.d.ts +2 -0
  55. package/types/models/florence2/processing_florence2.d.ts +1 -1
  56. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  57. package/types/models/image_processors.d.ts +1 -0
  58. package/types/models/processors.d.ts +2 -0
  59. package/types/models/smolvlm/image_processing_smolvlm.d.ts +2 -0
  60. package/types/models/smolvlm/image_processing_smolvlm.d.ts.map +1 -0
  61. package/types/models/smolvlm/processing_smolvlm.d.ts +2 -0
  62. package/types/models/smolvlm/processing_smolvlm.d.ts.map +1 -0
  63. package/types/models/ultravox/processing_ultravox.d.ts +16 -0
  64. package/types/models/ultravox/processing_ultravox.d.ts.map +1 -0
  65. package/types/models/whisper/common_whisper.d.ts.map +1 -1
  66. package/types/models/whisper/feature_extraction_whisper.d.ts +3 -1
  67. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  68. package/types/models.d.ts +132 -4
  69. package/types/models.d.ts.map +1 -1
  70. package/types/pipelines.d.ts +50 -4
  71. package/types/pipelines.d.ts.map +1 -1
  72. package/types/tokenizers.d.ts.map +1 -1
  73. package/types/transformers.d.ts +1 -0
  74. package/types/tsconfig.tsbuildinfo +1 -1
  75. package/types/utils/audio.d.ts.map +1 -1
  76. package/types/utils/hub.d.ts +19 -7
  77. package/types/utils/hub.d.ts.map +1 -1
  78. package/types/utils/maths.d.ts +2 -2
  79. package/types/utils/maths.d.ts.map +1 -1
  80. package/types/utils/tensor.d.ts +17 -18
  81. package/types/utils/tensor.d.ts.map +1 -1
  82. package/types/utils/video.d.ts +37 -0
  83. package/types/utils/video.d.ts.map +1 -0
  84. package/dist/transformers.cjs.map +0 -1
  85. package/dist/transformers.min.cjs +0 -2
  86. package/dist/transformers.min.cjs.map +0 -1
  87. package/dist/transformers.min.mjs +0 -2
  88. package/dist/transformers.min.mjs.map +0 -1
  89. package/dist/transformers.mjs.map +0 -1
package/src/pipelines.js CHANGED
@@ -1730,6 +1730,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
1730
1730
  async _call(audio, kwargs = {}) {
1731
1731
  switch (this.model.config.model_type) {
1732
1732
  case 'whisper':
1733
+ case 'lite-whisper':
1733
1734
  return this._call_whisper(audio, kwargs)
1734
1735
  case 'wav2vec2':
1735
1736
  case 'wav2vec2-bert':
@@ -2095,7 +2096,7 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image
2095
2096
 
2096
2097
  /**
2097
2098
  * @typedef {Object} ImageSegmentationPipelineOutput
2098
- * @property {string} label The label of the segment.
2099
+ * @property {string|null} label The label of the segment.
2099
2100
  * @property {number|null} score The score of the segment.
2100
2101
  * @property {RawImage} mask The mask of the segment.
2101
2102
  *
@@ -2165,14 +2166,30 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi
2165
2166
  const preparedImages = await prepareImages(images);
2166
2167
  const imageSizes = preparedImages.map(x => [x.height, x.width]);
2167
2168
 
2168
- const { pixel_values, pixel_mask } = await this.processor(preparedImages);
2169
- const output = await this.model({ pixel_values, pixel_mask });
2169
+ const inputs = await this.processor(preparedImages);
2170
+
2171
+ const { inputNames, outputNames } = this.model.sessions['model'];
2172
+ if (!inputNames.includes('pixel_values')) {
2173
+ if (inputNames.length !== 1) {
2174
+ throw Error(`Expected a single input name, but got ${inputNames.length} inputs: ${inputNames}.`);
2175
+ }
2176
+
2177
+ const newName = inputNames[0];
2178
+ if (newName in inputs) {
2179
+ throw Error(`Input name ${newName} already exists in the inputs.`);
2180
+ }
2181
+ // To ensure compatibility with certain background-removal models,
2182
+ // we may need to perform a mapping of input to output names
2183
+ inputs[newName] = inputs.pixel_values;
2184
+ }
2185
+
2186
+ const output = await this.model(inputs);
2170
2187
 
2171
2188
  let fn = null;
2172
2189
  if (subtask !== null) {
2173
2190
  fn = this.subtasks_mapping[subtask];
2174
- } else {
2175
- for (let [task, func] of Object.entries(this.subtasks_mapping)) {
2191
+ } else if (this.processor.image_processor) {
2192
+ for (const [task, func] of Object.entries(this.subtasks_mapping)) {
2176
2193
  if (func in this.processor.image_processor) {
2177
2194
  fn = this.processor.image_processor[func].bind(this.processor.image_processor);
2178
2195
  subtask = task;
@@ -2186,7 +2203,23 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi
2186
2203
 
2187
2204
  /** @type {ImageSegmentationPipelineOutput[]} */
2188
2205
  const annotation = [];
2189
- if (subtask === 'panoptic' || subtask === 'instance') {
2206
+ if (!subtask) {
2207
+ // Perform standard image segmentation
2208
+ const result = output[outputNames[0]];
2209
+ for (let i = 0; i < imageSizes.length; ++i) {
2210
+ const size = imageSizes[i];
2211
+ const item = result[i];
2212
+ if (item.data.some(x => x < 0 || x > 1)) {
2213
+ item.sigmoid_();
2214
+ }
2215
+ const mask = await RawImage.fromTensor(item.mul_(255).to('uint8')).resize(size[1], size[0]);
2216
+ annotation.push({
2217
+ label: null,
2218
+ score: null,
2219
+ mask
2220
+ });
2221
+ }
2222
+ } else if (subtask === 'panoptic' || subtask === 'instance') {
2190
2223
  const processed = fn(
2191
2224
  output,
2192
2225
  threshold,
@@ -2242,6 +2275,63 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi
2242
2275
  }
2243
2276
  }
2244
2277
 
2278
+
2279
+ /**
2280
+ * @typedef {Object} BackgroundRemovalPipelineOptions Parameters specific to background removal pipelines.
2281
+ *
2282
+ * @callback BackgroundRemovalPipelineCallback Segment the input images.
2283
+ * @param {ImagePipelineInputs} images The input images.
2284
+ * @param {BackgroundRemovalPipelineOptions} [options] The options to use for background removal.
2285
+ * @returns {Promise<RawImage[]>} The images with the background removed.
2286
+ *
2287
+ * @typedef {ImagePipelineConstructorArgs & BackgroundRemovalPipelineCallback & Disposable} BackgroundRemovalPipelineType
2288
+ */
2289
+
2290
+ /**
2291
+ * Background removal pipeline using certain `AutoModelForXXXSegmentation`.
2292
+ * This pipeline removes the backgrounds of images.
2293
+ *
2294
+ * **Example:** Perform background removal with `Xenova/modnet`.
2295
+ * ```javascript
2296
+ * const segmenter = await pipeline('background-removal', 'Xenova/modnet');
2297
+ * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/portrait-of-woman_small.jpg';
2298
+ * const output = await segmenter(url);
2299
+ * // [
2300
+ * // RawImage { data: Uint8ClampedArray(648000) [ ... ], width: 360, height: 450, channels: 4 }
2301
+ * // ]
2302
+ * ```
2303
+ */
2304
+ export class BackgroundRemovalPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageSegmentationPipelineType} */ (ImageSegmentationPipeline)) {
2305
+ /**
2306
+ * Create a new BackgroundRemovalPipeline.
2307
+ * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
2308
+ */
2309
+ constructor(options) {
2310
+ super(options);
2311
+ }
2312
+
2313
+ /** @type {BackgroundRemovalPipelineCallback} */
2314
+ async _call(images, options = {}) {
2315
+ const isBatched = Array.isArray(images);
2316
+
2317
+ if (isBatched && images.length !== 1) {
2318
+ throw Error("Background removal pipeline currently only supports a batch size of 1.");
2319
+ }
2320
+
2321
+ const preparedImages = await prepareImages(images);
2322
+
2323
+ // @ts-expect-error TS2339
2324
+ const masks = await super._call(images, options);
2325
+ const result = preparedImages.map((img, i) => {
2326
+ const cloned = img.clone();
2327
+ cloned.putAlpha(masks[i].mask);
2328
+ return cloned;
2329
+ });
2330
+
2331
+ return result;
2332
+ }
2333
+ }
2334
+
2245
2335
  /**
2246
2336
  * @typedef {Object} ZeroShotImageClassificationOutput
2247
2337
  * @property {string} label The label identified by the model. It is one of the suggested `candidate_label`.
@@ -2554,7 +2644,7 @@ export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: T
2554
2644
  const output = await this.model({ ...text_inputs, pixel_values });
2555
2645
 
2556
2646
  let result;
2557
- if('post_process_grounded_object_detection' in this.processor) {
2647
+ if ('post_process_grounded_object_detection' in this.processor) {
2558
2648
  // @ts-ignore
2559
2649
  const processed = this.processor.post_process_grounded_object_detection(
2560
2650
  output,
@@ -3134,6 +3224,16 @@ const SUPPORTED_TASKS = Object.freeze({
3134
3224
  },
3135
3225
  "type": "multimodal",
3136
3226
  },
3227
+ "background-removal": {
3228
+ // no tokenizer
3229
+ "pipeline": BackgroundRemovalPipeline,
3230
+ "model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
3231
+ "processor": AutoProcessor,
3232
+ "default": {
3233
+ "model": "Xenova/modnet",
3234
+ },
3235
+ "type": "image",
3236
+ },
3137
3237
 
3138
3238
  "zero-shot-image-classification": {
3139
3239
  "tokenizer": AutoTokenizer,
@@ -3299,6 +3399,8 @@ export async function pipeline(
3299
3399
  revision = 'main',
3300
3400
  device = null,
3301
3401
  dtype = null,
3402
+ subfolder = 'onnx',
3403
+ use_external_data_format = null,
3302
3404
  model_file_name = null,
3303
3405
  session_options = {},
3304
3406
  } = {}
@@ -3329,6 +3431,8 @@ export async function pipeline(
3329
3431
  revision,
3330
3432
  device,
3331
3433
  dtype,
3434
+ subfolder,
3435
+ use_external_data_format,
3332
3436
  model_file_name,
3333
3437
  session_options,
3334
3438
  }
package/src/tokenizers.js CHANGED
@@ -995,6 +995,8 @@ class Normalizer extends Callable {
995
995
  return new Replace(config);
996
996
  case 'NFC':
997
997
  return new NFC(config);
998
+ case 'NFD':
999
+ return new NFD(config);
998
1000
  case 'NFKC':
999
1001
  return new NFKC(config);
1000
1002
  case 'NFKD':
@@ -1053,50 +1055,62 @@ class Replace extends Normalizer {
1053
1055
  }
1054
1056
 
1055
1057
  /**
1056
- * A normalizer that applies Unicode normalization form C (NFC) to the input text.
1058
+ * A normalizer that applies Unicode normalization to the input text.
1057
1059
  * @extends Normalizer
1060
+ * @abstract
1058
1061
  */
1059
- class NFC extends Normalizer {
1062
+ class UnicodeNormalizer extends Normalizer {
1063
+ /**
1064
+ * @type {string} The Unicode normalization form to apply.
1065
+ * Should be one of: 'NFC', 'NFD', 'NFKC', or 'NFKD'.
1066
+ */
1067
+ form = undefined;
1068
+
1060
1069
  /**
1061
- * Normalize the input text by applying Unicode normalization form C (NFC).
1070
+ * Normalize the input text by applying Unicode normalization.
1062
1071
  * @param {string} text The input text to be normalized.
1063
1072
  * @returns {string} The normalized text.
1064
1073
  */
1065
1074
  normalize(text) {
1066
- text = text.normalize('NFC')
1075
+ text = text.normalize(this.form)
1067
1076
  return text;
1068
1077
  }
1069
1078
  }
1070
1079
 
1071
1080
  /**
1072
- * NFKC Normalizer.
1073
- * @extends Normalizer
1081
+ * A normalizer that applies Unicode normalization form C (NFC) to the input text.
1082
+ * Canonical Decomposition, followed by Canonical Composition.
1083
+ * @extends UnicodeNormalizer
1074
1084
  */
1075
- class NFKC extends Normalizer {
1076
- /**
1077
- * Normalize text using NFKC normalization.
1078
- * @param {string} text The text to be normalized.
1079
- * @returns {string} The normalized text.
1080
- */
1081
- normalize(text) {
1082
- text = text.normalize('NFKC')
1083
- return text;
1084
- }
1085
+ class NFC extends UnicodeNormalizer {
1086
+ form = 'NFC';
1085
1087
  }
1088
+
1086
1089
  /**
1087
- * NFKD Normalizer.
1088
- * @extends Normalizer
1090
+ * A normalizer that applies Unicode normalization form D (NFD) to the input text.
1091
+ * Canonical Decomposition.
1092
+ * @extends UnicodeNormalizer
1089
1093
  */
1090
- class NFKD extends Normalizer {
1091
- /**
1092
- * Normalize text using NFKD normalization.
1093
- * @param {string} text The text to be normalized.
1094
- * @returns {string} The normalized text.
1095
- */
1096
- normalize(text) {
1097
- text = text.normalize('NFKD')
1098
- return text;
1099
- }
1094
+ class NFD extends UnicodeNormalizer {
1095
+ form = 'NFD';
1096
+ }
1097
+
1098
+ /**
1099
+ * A normalizer that applies Unicode normalization form KC (NFKC) to the input text.
1100
+ * Compatibility Decomposition, followed by Canonical Composition.
1101
+ * @extends UnicodeNormalizer
1102
+ */
1103
+ class NFKC extends UnicodeNormalizer {
1104
+ form = 'NFKC';
1105
+ }
1106
+
1107
+ /**
1108
+ * A normalizer that applies Unicode normalization form KD (NFKD) to the input text.
1109
+ * Compatibility Decomposition.
1110
+ * @extends UnicodeNormalizer
1111
+ */
1112
+ class NFKD extends UnicodeNormalizer {
1113
+ form = 'NFKD';
1100
1114
  }
1101
1115
 
1102
1116
  /**
@@ -20,6 +20,7 @@ export * from './configs.js';
20
20
 
21
21
  export * from './utils/audio.js';
22
22
  export * from './utils/image.js';
23
+ export * from './utils/video.js';
23
24
  export * from './utils/tensor.js';
24
25
  export * from './utils/maths.js';
25
26
 
@@ -150,6 +150,7 @@ function hertz_to_mel(freq, mel_scale = "htk") {
150
150
  throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');
151
151
  }
152
152
 
153
+ // @ts-expect-error ts(2322)
153
154
  return typeof freq === 'number' ? fn(freq) : freq.map(x => fn(x));
154
155
  }
155
156
 
@@ -173,6 +174,7 @@ function mel_to_hertz(mels, mel_scale = "htk") {
173
174
  throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');
174
175
  }
175
176
 
177
+ // @ts-expect-error ts(2322)
176
178
  return typeof mels === 'number' ? fn(mels) : mels.map(x => fn(x));
177
179
  }
178
180