@huggingface/transformers 3.3.3 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -3
- package/dist/ort-wasm-simd-threaded.jsep.mjs +124 -115
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.js +2480 -1457
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/{transformers.cjs → transformers.node.cjs} +1412 -2395
- package/dist/transformers.node.cjs.map +1 -0
- package/dist/transformers.node.min.cjs +2 -0
- package/dist/transformers.node.min.cjs.map +1 -0
- package/dist/transformers.node.min.mjs +2 -0
- package/dist/transformers.node.min.mjs.map +1 -0
- package/dist/{transformers.mjs → transformers.node.mjs} +1440 -2375
- package/dist/transformers.node.mjs.map +1 -0
- package/dist/transformers.web.js +35713 -0
- package/dist/transformers.web.js.map +1 -0
- package/dist/transformers.web.min.js +2 -0
- package/dist/transformers.web.min.js.map +1 -0
- package/package.json +6 -6
- package/src/backends/onnx.js +14 -15
- package/src/configs.js +4 -1
- package/src/env.js +1 -1
- package/src/generation/streamers.js +4 -3
- package/src/models/dac/feature_extraction_dac.js +3 -0
- package/src/models/encodec/feature_extraction_encodec.js +32 -0
- package/src/models/feature_extractors.js +2 -0
- package/src/models/idefics3/image_processing_idefics3.js +1 -1
- package/src/models/image_processors.js +1 -0
- package/src/models/processors.js +2 -0
- package/src/models/smolvlm/image_processing_smolvlm.js +2 -0
- package/src/models/smolvlm/processing_smolvlm.js +2 -0
- package/src/models/ultravox/processing_ultravox.js +54 -0
- package/src/models/whisper/common_whisper.js +7 -1
- package/src/models/whisper/feature_extraction_whisper.js +18 -10
- package/src/models.js +456 -76
- package/src/pipelines.js +111 -7
- package/src/tokenizers.js +42 -28
- package/src/transformers.js +1 -0
- package/src/utils/audio.js +2 -0
- package/src/utils/hub.js +140 -80
- package/src/utils/maths.js +1 -1
- package/src/utils/tensor.js +6 -3
- package/src/utils/video.js +128 -0
- package/types/backends/onnx.d.ts +2 -2
- package/types/backends/onnx.d.ts.map +1 -1
- package/types/configs.d.ts +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/generation/streamers.d.ts.map +1 -1
- package/types/models/dac/feature_extraction_dac.d.ts +4 -0
- package/types/models/dac/feature_extraction_dac.d.ts.map +1 -0
- package/types/models/encodec/feature_extraction_encodec.d.ts +13 -0
- package/types/models/encodec/feature_extraction_encodec.d.ts.map +1 -0
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/florence2/processing_florence2.d.ts +1 -1
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/processors.d.ts +2 -0
- package/types/models/smolvlm/image_processing_smolvlm.d.ts +2 -0
- package/types/models/smolvlm/image_processing_smolvlm.d.ts.map +1 -0
- package/types/models/smolvlm/processing_smolvlm.d.ts +2 -0
- package/types/models/smolvlm/processing_smolvlm.d.ts.map +1 -0
- package/types/models/ultravox/processing_ultravox.d.ts +16 -0
- package/types/models/ultravox/processing_ultravox.d.ts.map +1 -0
- package/types/models/whisper/common_whisper.d.ts.map +1 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts +3 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/models.d.ts +132 -4
- package/types/models.d.ts.map +1 -1
- package/types/pipelines.d.ts +50 -4
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/tsconfig.tsbuildinfo +1 -1
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/hub.d.ts +19 -7
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/maths.d.ts +2 -2
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +17 -18
- package/types/utils/tensor.d.ts.map +1 -1
- package/types/utils/video.d.ts +37 -0
- package/types/utils/video.d.ts.map +1 -0
- package/dist/transformers.cjs.map +0 -1
- package/dist/transformers.min.cjs +0 -2
- package/dist/transformers.min.cjs.map +0 -1
- package/dist/transformers.min.mjs +0 -2
- package/dist/transformers.min.mjs.map +0 -1
- package/dist/transformers.mjs.map +0 -1
package/src/pipelines.js
CHANGED
|
@@ -1730,6 +1730,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
|
|
|
1730
1730
|
async _call(audio, kwargs = {}) {
|
|
1731
1731
|
switch (this.model.config.model_type) {
|
|
1732
1732
|
case 'whisper':
|
|
1733
|
+
case 'lite-whisper':
|
|
1733
1734
|
return this._call_whisper(audio, kwargs)
|
|
1734
1735
|
case 'wav2vec2':
|
|
1735
1736
|
case 'wav2vec2-bert':
|
|
@@ -2095,7 +2096,7 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image
|
|
|
2095
2096
|
|
|
2096
2097
|
/**
|
|
2097
2098
|
* @typedef {Object} ImageSegmentationPipelineOutput
|
|
2098
|
-
* @property {string} label The label of the segment.
|
|
2099
|
+
* @property {string|null} label The label of the segment.
|
|
2099
2100
|
* @property {number|null} score The score of the segment.
|
|
2100
2101
|
* @property {RawImage} mask The mask of the segment.
|
|
2101
2102
|
*
|
|
@@ -2165,14 +2166,30 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi
|
|
|
2165
2166
|
const preparedImages = await prepareImages(images);
|
|
2166
2167
|
const imageSizes = preparedImages.map(x => [x.height, x.width]);
|
|
2167
2168
|
|
|
2168
|
-
const
|
|
2169
|
-
|
|
2169
|
+
const inputs = await this.processor(preparedImages);
|
|
2170
|
+
|
|
2171
|
+
const { inputNames, outputNames } = this.model.sessions['model'];
|
|
2172
|
+
if (!inputNames.includes('pixel_values')) {
|
|
2173
|
+
if (inputNames.length !== 1) {
|
|
2174
|
+
throw Error(`Expected a single input name, but got ${inputNames.length} inputs: ${inputNames}.`);
|
|
2175
|
+
}
|
|
2176
|
+
|
|
2177
|
+
const newName = inputNames[0];
|
|
2178
|
+
if (newName in inputs) {
|
|
2179
|
+
throw Error(`Input name ${newName} already exists in the inputs.`);
|
|
2180
|
+
}
|
|
2181
|
+
// To ensure compatibility with certain background-removal models,
|
|
2182
|
+
// we may need to perform a mapping of input to output names
|
|
2183
|
+
inputs[newName] = inputs.pixel_values;
|
|
2184
|
+
}
|
|
2185
|
+
|
|
2186
|
+
const output = await this.model(inputs);
|
|
2170
2187
|
|
|
2171
2188
|
let fn = null;
|
|
2172
2189
|
if (subtask !== null) {
|
|
2173
2190
|
fn = this.subtasks_mapping[subtask];
|
|
2174
|
-
} else {
|
|
2175
|
-
for (
|
|
2191
|
+
} else if (this.processor.image_processor) {
|
|
2192
|
+
for (const [task, func] of Object.entries(this.subtasks_mapping)) {
|
|
2176
2193
|
if (func in this.processor.image_processor) {
|
|
2177
2194
|
fn = this.processor.image_processor[func].bind(this.processor.image_processor);
|
|
2178
2195
|
subtask = task;
|
|
@@ -2186,7 +2203,23 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi
|
|
|
2186
2203
|
|
|
2187
2204
|
/** @type {ImageSegmentationPipelineOutput[]} */
|
|
2188
2205
|
const annotation = [];
|
|
2189
|
-
if (subtask
|
|
2206
|
+
if (!subtask) {
|
|
2207
|
+
// Perform standard image segmentation
|
|
2208
|
+
const result = output[outputNames[0]];
|
|
2209
|
+
for (let i = 0; i < imageSizes.length; ++i) {
|
|
2210
|
+
const size = imageSizes[i];
|
|
2211
|
+
const item = result[i];
|
|
2212
|
+
if (item.data.some(x => x < 0 || x > 1)) {
|
|
2213
|
+
item.sigmoid_();
|
|
2214
|
+
}
|
|
2215
|
+
const mask = await RawImage.fromTensor(item.mul_(255).to('uint8')).resize(size[1], size[0]);
|
|
2216
|
+
annotation.push({
|
|
2217
|
+
label: null,
|
|
2218
|
+
score: null,
|
|
2219
|
+
mask
|
|
2220
|
+
});
|
|
2221
|
+
}
|
|
2222
|
+
} else if (subtask === 'panoptic' || subtask === 'instance') {
|
|
2190
2223
|
const processed = fn(
|
|
2191
2224
|
output,
|
|
2192
2225
|
threshold,
|
|
@@ -2242,6 +2275,63 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi
|
|
|
2242
2275
|
}
|
|
2243
2276
|
}
|
|
2244
2277
|
|
|
2278
|
+
|
|
2279
|
+
/**
|
|
2280
|
+
* @typedef {Object} BackgroundRemovalPipelineOptions Parameters specific to image segmentation pipelines.
|
|
2281
|
+
*
|
|
2282
|
+
* @callback BackgroundRemovalPipelineCallback Segment the input images.
|
|
2283
|
+
* @param {ImagePipelineInputs} images The input images.
|
|
2284
|
+
* @param {BackgroundRemovalPipelineOptions} [options] The options to use for image segmentation.
|
|
2285
|
+
* @returns {Promise<RawImage[]>} The images with the background removed.
|
|
2286
|
+
*
|
|
2287
|
+
* @typedef {ImagePipelineConstructorArgs & BackgroundRemovalPipelineCallback & Disposable} BackgroundRemovalPipelineType
|
|
2288
|
+
*/
|
|
2289
|
+
|
|
2290
|
+
/**
|
|
2291
|
+
* Background removal pipeline using certain `AutoModelForXXXSegmentation`.
|
|
2292
|
+
* This pipeline removes the backgrounds of images.
|
|
2293
|
+
*
|
|
2294
|
+
* **Example:** Perform background removal with `Xenova/modnet`.
|
|
2295
|
+
* ```javascript
|
|
2296
|
+
* const segmenter = await pipeline('background-removal', 'Xenova/modnet');
|
|
2297
|
+
* const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/portrait-of-woman_small.jpg';
|
|
2298
|
+
* const output = await segmenter(url);
|
|
2299
|
+
* // [
|
|
2300
|
+
* // RawImage { data: Uint8ClampedArray(648000) [ ... ], width: 360, height: 450, channels: 4 }
|
|
2301
|
+
* // ]
|
|
2302
|
+
* ```
|
|
2303
|
+
*/
|
|
2304
|
+
export class BackgroundRemovalPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageSegmentationPipelineType} */ (ImageSegmentationPipeline)) {
|
|
2305
|
+
/**
|
|
2306
|
+
* Create a new BackgroundRemovalPipeline.
|
|
2307
|
+
* @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
|
|
2308
|
+
*/
|
|
2309
|
+
constructor(options) {
|
|
2310
|
+
super(options);
|
|
2311
|
+
}
|
|
2312
|
+
|
|
2313
|
+
/** @type {BackgroundRemovalPipelineCallback} */
|
|
2314
|
+
async _call(images, options = {}) {
|
|
2315
|
+
const isBatched = Array.isArray(images);
|
|
2316
|
+
|
|
2317
|
+
if (isBatched && images.length !== 1) {
|
|
2318
|
+
throw Error("Background removal pipeline currently only supports a batch size of 1.");
|
|
2319
|
+
}
|
|
2320
|
+
|
|
2321
|
+
const preparedImages = await prepareImages(images);
|
|
2322
|
+
|
|
2323
|
+
// @ts-expect-error TS2339
|
|
2324
|
+
const masks = await super._call(images, options);
|
|
2325
|
+
const result = preparedImages.map((img, i) => {
|
|
2326
|
+
const cloned = img.clone();
|
|
2327
|
+
cloned.putAlpha(masks[i].mask);
|
|
2328
|
+
return cloned;
|
|
2329
|
+
});
|
|
2330
|
+
|
|
2331
|
+
return result;
|
|
2332
|
+
}
|
|
2333
|
+
}
|
|
2334
|
+
|
|
2245
2335
|
/**
|
|
2246
2336
|
* @typedef {Object} ZeroShotImageClassificationOutput
|
|
2247
2337
|
* @property {string} label The label identified by the model. It is one of the suggested `candidate_label`.
|
|
@@ -2554,7 +2644,7 @@ export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: T
|
|
|
2554
2644
|
const output = await this.model({ ...text_inputs, pixel_values });
|
|
2555
2645
|
|
|
2556
2646
|
let result;
|
|
2557
|
-
if('post_process_grounded_object_detection' in this.processor) {
|
|
2647
|
+
if ('post_process_grounded_object_detection' in this.processor) {
|
|
2558
2648
|
// @ts-ignore
|
|
2559
2649
|
const processed = this.processor.post_process_grounded_object_detection(
|
|
2560
2650
|
output,
|
|
@@ -3134,6 +3224,16 @@ const SUPPORTED_TASKS = Object.freeze({
|
|
|
3134
3224
|
},
|
|
3135
3225
|
"type": "multimodal",
|
|
3136
3226
|
},
|
|
3227
|
+
"background-removal": {
|
|
3228
|
+
// no tokenizer
|
|
3229
|
+
"pipeline": BackgroundRemovalPipeline,
|
|
3230
|
+
"model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
|
|
3231
|
+
"processor": AutoProcessor,
|
|
3232
|
+
"default": {
|
|
3233
|
+
"model": "Xenova/modnet",
|
|
3234
|
+
},
|
|
3235
|
+
"type": "image",
|
|
3236
|
+
},
|
|
3137
3237
|
|
|
3138
3238
|
"zero-shot-image-classification": {
|
|
3139
3239
|
"tokenizer": AutoTokenizer,
|
|
@@ -3299,6 +3399,8 @@ export async function pipeline(
|
|
|
3299
3399
|
revision = 'main',
|
|
3300
3400
|
device = null,
|
|
3301
3401
|
dtype = null,
|
|
3402
|
+
subfolder = 'onnx',
|
|
3403
|
+
use_external_data_format = null,
|
|
3302
3404
|
model_file_name = null,
|
|
3303
3405
|
session_options = {},
|
|
3304
3406
|
} = {}
|
|
@@ -3329,6 +3431,8 @@ export async function pipeline(
|
|
|
3329
3431
|
revision,
|
|
3330
3432
|
device,
|
|
3331
3433
|
dtype,
|
|
3434
|
+
subfolder,
|
|
3435
|
+
use_external_data_format,
|
|
3332
3436
|
model_file_name,
|
|
3333
3437
|
session_options,
|
|
3334
3438
|
}
|
package/src/tokenizers.js
CHANGED
|
@@ -995,6 +995,8 @@ class Normalizer extends Callable {
|
|
|
995
995
|
return new Replace(config);
|
|
996
996
|
case 'NFC':
|
|
997
997
|
return new NFC(config);
|
|
998
|
+
case 'NFD':
|
|
999
|
+
return new NFD(config);
|
|
998
1000
|
case 'NFKC':
|
|
999
1001
|
return new NFKC(config);
|
|
1000
1002
|
case 'NFKD':
|
|
@@ -1053,50 +1055,62 @@ class Replace extends Normalizer {
|
|
|
1053
1055
|
}
|
|
1054
1056
|
|
|
1055
1057
|
/**
|
|
1056
|
-
* A normalizer that applies Unicode normalization
|
|
1058
|
+
* A normalizer that applies Unicode normalization to the input text.
|
|
1057
1059
|
* @extends Normalizer
|
|
1060
|
+
* @abstract
|
|
1058
1061
|
*/
|
|
1059
|
-
class
|
|
1062
|
+
class UnicodeNormalizer extends Normalizer {
|
|
1063
|
+
/**
|
|
1064
|
+
* @type {string} The Unicode normalization form to apply.
|
|
1065
|
+
* Should be one of: 'NFC', 'NFD', 'NFKC', or 'NFKD'.
|
|
1066
|
+
*/
|
|
1067
|
+
form = undefined;
|
|
1068
|
+
|
|
1060
1069
|
/**
|
|
1061
|
-
* Normalize the input text by applying Unicode normalization
|
|
1070
|
+
* Normalize the input text by applying Unicode normalization.
|
|
1062
1071
|
* @param {string} text The input text to be normalized.
|
|
1063
1072
|
* @returns {string} The normalized text.
|
|
1064
1073
|
*/
|
|
1065
1074
|
normalize(text) {
|
|
1066
|
-
text = text.normalize(
|
|
1075
|
+
text = text.normalize(this.form)
|
|
1067
1076
|
return text;
|
|
1068
1077
|
}
|
|
1069
1078
|
}
|
|
1070
1079
|
|
|
1071
1080
|
/**
|
|
1072
|
-
*
|
|
1073
|
-
*
|
|
1081
|
+
* A normalizer that applies Unicode normalization form C (NFC) to the input text.
|
|
1082
|
+
* Canonical Decomposition, followed by Canonical Composition.
|
|
1083
|
+
* @extends UnicodeNormalizer
|
|
1074
1084
|
*/
|
|
1075
|
-
class
|
|
1076
|
-
|
|
1077
|
-
* Normalize text using NFKC normalization.
|
|
1078
|
-
* @param {string} text The text to be normalized.
|
|
1079
|
-
* @returns {string} The normalized text.
|
|
1080
|
-
*/
|
|
1081
|
-
normalize(text) {
|
|
1082
|
-
text = text.normalize('NFKC')
|
|
1083
|
-
return text;
|
|
1084
|
-
}
|
|
1085
|
+
class NFC extends UnicodeNormalizer {
|
|
1086
|
+
form = 'NFC';
|
|
1085
1087
|
}
|
|
1088
|
+
|
|
1086
1089
|
/**
|
|
1087
|
-
*
|
|
1088
|
-
*
|
|
1090
|
+
* A normalizer that applies Unicode normalization form D (NFD) to the input text.
|
|
1091
|
+
* Canonical Decomposition.
|
|
1092
|
+
* @extends UnicodeNormalizer
|
|
1089
1093
|
*/
|
|
1090
|
-
class
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1094
|
+
class NFD extends UnicodeNormalizer {
|
|
1095
|
+
form = 'NFD';
|
|
1096
|
+
}
|
|
1097
|
+
|
|
1098
|
+
/**
|
|
1099
|
+
* A normalizer that applies Unicode normalization form KC (NFKC) to the input text.
|
|
1100
|
+
* Compatibility Decomposition, followed by Canonical Composition.
|
|
1101
|
+
* @extends UnicodeNormalizer
|
|
1102
|
+
*/
|
|
1103
|
+
class NFKC extends UnicodeNormalizer {
|
|
1104
|
+
form = 'NFKC';
|
|
1105
|
+
}
|
|
1106
|
+
|
|
1107
|
+
/**
|
|
1108
|
+
* A normalizer that applies Unicode normalization form KD (NFKD) to the input text.
|
|
1109
|
+
* Compatibility Decomposition.
|
|
1110
|
+
* @extends UnicodeNormalizer
|
|
1111
|
+
*/
|
|
1112
|
+
class NFKD extends UnicodeNormalizer {
|
|
1113
|
+
form = 'NFKD';
|
|
1100
1114
|
}
|
|
1101
1115
|
|
|
1102
1116
|
/**
|
package/src/transformers.js
CHANGED
package/src/utils/audio.js
CHANGED
|
@@ -150,6 +150,7 @@ function hertz_to_mel(freq, mel_scale = "htk") {
|
|
|
150
150
|
throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');
|
|
151
151
|
}
|
|
152
152
|
|
|
153
|
+
// @ts-expect-error ts(2322)
|
|
153
154
|
return typeof freq === 'number' ? fn(freq) : freq.map(x => fn(x));
|
|
154
155
|
}
|
|
155
156
|
|
|
@@ -173,6 +174,7 @@ function mel_to_hertz(mels, mel_scale = "htk") {
|
|
|
173
174
|
throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');
|
|
174
175
|
}
|
|
175
176
|
|
|
177
|
+
// @ts-expect-error ts(2322)
|
|
176
178
|
return typeof mels === 'number' ? fn(mels) : mels.map(x => fn(x));
|
|
177
179
|
}
|
|
178
180
|
|