@huggingface/transformers 3.2.3 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -3
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/ort.bundle.min.mjs +2776 -0
- package/dist/transformers.cjs +792 -330
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +1150 -656
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -1
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -1
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +798 -331
- package/dist/transformers.mjs.map +1 -1
- package/package.json +3 -3
- package/src/base/feature_extraction_utils.js +9 -9
- package/src/base/image_processors_utils.js +12 -1
- package/src/base/processing_utils.js +24 -3
- package/src/configs.js +5 -0
- package/src/env.js +1 -2
- package/src/generation/streamers.js +5 -2
- package/src/models/auto/feature_extraction_auto.js +0 -16
- package/src/models/auto/processing_auto.js +0 -16
- package/src/models/convnext/image_processing_convnext.js +1 -0
- package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
- package/src/models/florence2/processing_florence2.js +3 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +29 -0
- package/src/models/grounding_dino/processing_grounding_dino.js +101 -0
- package/src/models/idefics3/image_processing_idefics3.js +2 -0
- package/src/models/image_processors.js +1 -0
- package/src/models/janus/image_processing_janus.js +1 -0
- package/src/models/mgp_str/processing_mgp_str.js +2 -0
- package/src/models/paligemma/processing_paligemma.js +1 -0
- package/src/models/phi3_v/processing_phi3_v.js +1 -1
- package/src/models/processors.js +3 -2
- package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
- package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
- package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
- package/src/models/whisper/feature_extraction_whisper.js +1 -1
- package/src/models.js +72 -20
- package/src/ops/registry.js +10 -0
- package/src/pipelines.js +73 -23
- package/src/tokenizers.js +4 -7
- package/src/utils/audio.js +113 -1
- package/src/utils/core.js +26 -0
- package/src/utils/dtypes.js +2 -0
- package/src/utils/hub.js +1 -1
- package/src/utils/image.js +5 -18
- package/src/utils/maths.js +8 -6
- package/src/utils/tensor.js +134 -114
- package/types/base/feature_extraction_utils.d.ts +7 -7
- package/types/base/image_processors_utils.d.ts +7 -0
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/base/processing_utils.d.ts +25 -19
- package/types/base/processing_utils.d.ts.map +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/generation/parameters.d.ts +1 -1
- package/types/generation/streamers.d.ts +3 -1
- package/types/generation/streamers.d.ts.map +1 -1
- package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/auto/processing_auto.d.ts.map +1 -1
- package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
- package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +20 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts.map +1 -0
- package/types/models/grounding_dino/processing_grounding_dino.d.ts +27 -0
- package/types/models/grounding_dino/processing_grounding_dino.d.ts.map +1 -0
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/janus/image_processing_janus.d.ts.map +1 -1
- package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
- package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
- package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
- package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
- package/types/models/processors.d.ts +3 -2
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
- package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
- package/types/models/whisper/generation_whisper.d.ts +1 -1
- package/types/models/whisper/generation_whisper.d.ts.map +1 -1
- package/types/models.d.ts +40 -17
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +1 -0
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +7 -12
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/tsconfig.tsbuildinfo +1 -0
- package/types/utils/audio.d.ts +25 -0
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/core.d.ts +6 -0
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +3 -2
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/maths.d.ts +8 -6
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +22 -6
- package/types/utils/tensor.d.ts.map +1 -1
package/src/models.js
CHANGED
|
@@ -270,8 +270,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
270
270
|
} else if (session_options.externalData !== undefined) {
|
|
271
271
|
externalDataPromises = session_options.externalData.map(async (ext) => {
|
|
272
272
|
// if the external data is a string, fetch the file and replace the string with its content
|
|
273
|
+
// @ts-expect-error TS2339
|
|
273
274
|
if (typeof ext.data === "string") {
|
|
275
|
+
// @ts-expect-error TS2339
|
|
274
276
|
const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options);
|
|
277
|
+
// @ts-expect-error TS2698
|
|
275
278
|
return { ...ext, data: ext_buffer };
|
|
276
279
|
}
|
|
277
280
|
return ext;
|
|
@@ -529,14 +532,23 @@ async function encoderForward(self, model_inputs) {
|
|
|
529
532
|
encoderFeeds.inputs_embeds = await self.encode_text({ input_ids: model_inputs.input_ids });
|
|
530
533
|
}
|
|
531
534
|
if (session.inputNames.includes('token_type_ids') && !encoderFeeds.token_type_ids) {
|
|
535
|
+
if (!encoderFeeds.input_ids) {
|
|
536
|
+
throw new Error('Both `input_ids` and `token_type_ids` are missing in the model inputs.');
|
|
537
|
+
}
|
|
532
538
|
// Assign default `token_type_ids` (all zeroes) to the `encoderFeeds` if the model expects it,
|
|
533
539
|
// but they weren't created by the tokenizer.
|
|
534
|
-
encoderFeeds.token_type_ids =
|
|
535
|
-
'int64',
|
|
536
|
-
new BigInt64Array(encoderFeeds.input_ids.data.length),
|
|
537
|
-
encoderFeeds.input_ids.dims
|
|
538
|
-
)
|
|
540
|
+
encoderFeeds.token_type_ids = zeros_like(encoderFeeds.input_ids);
|
|
539
541
|
}
|
|
542
|
+
if (session.inputNames.includes('pixel_mask') && !encoderFeeds.pixel_mask) {
|
|
543
|
+
if (!encoderFeeds.pixel_values) {
|
|
544
|
+
throw new Error('Both `pixel_values` and `pixel_mask` are missing in the model inputs.');
|
|
545
|
+
}
|
|
546
|
+
// Assign default `pixel_mask` (all ones) to the `encoderFeeds` if the model expects it,
|
|
547
|
+
// but they weren't created by the processor.
|
|
548
|
+
const dims = encoderFeeds.pixel_values.dims;
|
|
549
|
+
encoderFeeds.pixel_mask = ones([dims[0], dims[2], dims[3]]);
|
|
550
|
+
}
|
|
551
|
+
|
|
540
552
|
return await sessionRun(session, encoderFeeds);
|
|
541
553
|
}
|
|
542
554
|
|
|
@@ -1519,6 +1531,7 @@ export class PreTrainedModel extends Callable {
|
|
|
1519
1531
|
if (this.config.model_type === 'musicgen') {
|
|
1520
1532
|
// Custom logic (TODO: move to Musicgen class)
|
|
1521
1533
|
decoder_input_ids = Array.from({
|
|
1534
|
+
// @ts-expect-error TS2339
|
|
1522
1535
|
length: batch_size * this.config.decoder.num_codebooks
|
|
1523
1536
|
}, () => [decoder_start_token_id]);
|
|
1524
1537
|
|
|
@@ -1848,11 +1861,13 @@ export class PreTrainedModel extends Callable {
|
|
|
1848
1861
|
async encode_image({ pixel_values }) {
|
|
1849
1862
|
// image_inputs === { pixel_values }
|
|
1850
1863
|
const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
|
|
1864
|
+
// @ts-expect-error TS2339
|
|
1851
1865
|
if (!this.config.num_image_tokens) {
|
|
1852
1866
|
console.warn(
|
|
1853
1867
|
'The number of image tokens was not set in the model configuration. ' +
|
|
1854
1868
|
`Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
|
|
1855
1869
|
)
|
|
1870
|
+
// @ts-expect-error TS2339
|
|
1856
1871
|
this.config.num_image_tokens = features.dims[1];
|
|
1857
1872
|
}
|
|
1858
1873
|
return features;
|
|
@@ -3280,6 +3295,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
3280
3295
|
|
|
3281
3296
|
if (generation_config.return_token_timestamps) {
|
|
3282
3297
|
outputs["token_timestamps"] = this._extract_token_timestamps(
|
|
3298
|
+
// @ts-expect-error TS2345
|
|
3283
3299
|
outputs,
|
|
3284
3300
|
generation_config.alignment_heads,
|
|
3285
3301
|
generation_config.num_frames,
|
|
@@ -3315,6 +3331,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
3315
3331
|
);
|
|
3316
3332
|
}
|
|
3317
3333
|
|
|
3334
|
+
// @ts-expect-error TS2339
|
|
3318
3335
|
let median_filter_width = this.config.median_filter_width;
|
|
3319
3336
|
if (median_filter_width === undefined) {
|
|
3320
3337
|
console.warn("Model config has no `median_filter_width`, using default value of 7.")
|
|
@@ -3325,6 +3342,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
3325
3342
|
const batch = generate_outputs.cross_attentions;
|
|
3326
3343
|
// Create a list with `decoder_layers` elements, each a tensor of shape
|
|
3327
3344
|
// (batch size, attention_heads, output length, input length).
|
|
3345
|
+
// @ts-expect-error TS2339
|
|
3328
3346
|
const cross_attentions = Array.from({ length: this.config.decoder_layers },
|
|
3329
3347
|
// Concatenate the cross attentions for each layer across sequence length dimension.
|
|
3330
3348
|
(_, i) => cat(batch.map(x => x[i]), 2)
|
|
@@ -3468,6 +3486,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
|
|
|
3468
3486
|
attention_mask,
|
|
3469
3487
|
}) {
|
|
3470
3488
|
|
|
3489
|
+
// @ts-expect-error TS2339
|
|
3471
3490
|
const image_token_index = this.config.image_token_index;
|
|
3472
3491
|
|
|
3473
3492
|
const idsList = input_ids.tolist();
|
|
@@ -4453,6 +4472,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
4453
4472
|
const image_nums = vision_tokens.filter(x => x == image_token_id).length;
|
|
4454
4473
|
const video_nums = vision_tokens.filter(x => x == video_token_id).length;
|
|
4455
4474
|
|
|
4475
|
+
/** @type {number[][]} */
|
|
4456
4476
|
let llm_pos_ids_list = [];
|
|
4457
4477
|
let st = 0;
|
|
4458
4478
|
let remain_images = image_nums;
|
|
@@ -4522,6 +4542,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
4522
4542
|
// NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
|
|
4523
4543
|
// meaning to perform concatenation along dim=1, we can do the following:
|
|
4524
4544
|
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
4545
|
+
/** @type {number[]} */
|
|
4525
4546
|
const llm_positions = new Array(num_items);
|
|
4526
4547
|
let index = 0;
|
|
4527
4548
|
for (let x = 0; x < 3; ++x) {
|
|
@@ -4562,9 +4583,10 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
4562
4583
|
{ length: 3 * data.length },
|
|
4563
4584
|
(_, i) => data[i % data.length]
|
|
4564
4585
|
);
|
|
4586
|
+
/** @type {bigint[]} */
|
|
4565
4587
|
const mrope_position_deltas = Array.from(
|
|
4566
4588
|
{ length: dims[0] },
|
|
4567
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] +
|
|
4589
|
+
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
4568
4590
|
);
|
|
4569
4591
|
|
|
4570
4592
|
return [
|
|
@@ -5135,7 +5157,7 @@ export class DPTModel extends DPTPreTrainedModel { }
|
|
|
5135
5157
|
*
|
|
5136
5158
|
* **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
|
|
5137
5159
|
* ```javascript
|
|
5138
|
-
* import { DPTForDepthEstimation, AutoProcessor, RawImage,
|
|
5160
|
+
* import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
|
|
5139
5161
|
*
|
|
5140
5162
|
* // Load model and processor
|
|
5141
5163
|
* const model_id = 'Xenova/dpt-hybrid-midas';
|
|
@@ -5144,7 +5166,7 @@ export class DPTModel extends DPTPreTrainedModel { }
|
|
|
5144
5166
|
*
|
|
5145
5167
|
* // Load image from URL
|
|
5146
5168
|
* const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
|
|
5147
|
-
* const image = await RawImage.
|
|
5169
|
+
* const image = await RawImage.read(url);
|
|
5148
5170
|
*
|
|
5149
5171
|
* // Prepare image for the model
|
|
5150
5172
|
* const inputs = await processor(image);
|
|
@@ -5153,10 +5175,15 @@ export class DPTModel extends DPTPreTrainedModel { }
|
|
|
5153
5175
|
* const { predicted_depth } = await model(inputs);
|
|
5154
5176
|
*
|
|
5155
5177
|
* // Interpolate to original size
|
|
5156
|
-
* const prediction =
|
|
5178
|
+
* const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
|
|
5179
|
+
* size: image.size.reverse(),
|
|
5180
|
+
* mode: 'bilinear',
|
|
5181
|
+
* })).squeeze(1);
|
|
5157
5182
|
*
|
|
5158
5183
|
* // Visualize the prediction
|
|
5159
|
-
* const
|
|
5184
|
+
* const min = prediction.min().item();
|
|
5185
|
+
* const max = prediction.max().item();
|
|
5186
|
+
* const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
|
|
5160
5187
|
* const depth = RawImage.fromTensor(formatted);
|
|
5161
5188
|
* // RawImage {
|
|
5162
5189
|
* // data: Uint8Array(307200) [ 85, 85, 84, ... ],
|
|
@@ -5206,11 +5233,7 @@ export class GLPNPreTrainedModel extends PreTrainedModel { }
|
|
|
5206
5233
|
export class GLPNModel extends GLPNPreTrainedModel { }
|
|
5207
5234
|
|
|
5208
5235
|
/**
|
|
5209
|
-
*
|
|
5210
|
-
*
|
|
5211
|
-
* **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
|
|
5212
|
-
* ```javascript
|
|
5213
|
-
* import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
|
|
5236
|
+
* import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
|
|
5214
5237
|
*
|
|
5215
5238
|
* // Load model and processor
|
|
5216
5239
|
* const model_id = 'Xenova/glpn-kitti';
|
|
@@ -5219,7 +5242,7 @@ export class GLPNModel extends GLPNPreTrainedModel { }
|
|
|
5219
5242
|
*
|
|
5220
5243
|
* // Load image from URL
|
|
5221
5244
|
* const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
|
|
5222
|
-
* const image = await RawImage.
|
|
5245
|
+
* const image = await RawImage.read(url);
|
|
5223
5246
|
*
|
|
5224
5247
|
* // Prepare image for the model
|
|
5225
5248
|
* const inputs = await processor(image);
|
|
@@ -5228,13 +5251,18 @@ export class GLPNModel extends GLPNPreTrainedModel { }
|
|
|
5228
5251
|
* const { predicted_depth } = await model(inputs);
|
|
5229
5252
|
*
|
|
5230
5253
|
* // Interpolate to original size
|
|
5231
|
-
* const prediction =
|
|
5254
|
+
* const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
|
|
5255
|
+
* size: image.size.reverse(),
|
|
5256
|
+
* mode: 'bilinear',
|
|
5257
|
+
* })).squeeze(1);
|
|
5232
5258
|
*
|
|
5233
5259
|
* // Visualize the prediction
|
|
5234
|
-
* const
|
|
5260
|
+
* const min = prediction.min().item();
|
|
5261
|
+
* const max = prediction.max().item();
|
|
5262
|
+
* const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
|
|
5235
5263
|
* const depth = RawImage.fromTensor(formatted);
|
|
5236
5264
|
* // RawImage {
|
|
5237
|
-
* // data: Uint8Array(307200) [
|
|
5265
|
+
* // data: Uint8Array(307200) [ 85, 85, 84, ... ],
|
|
5238
5266
|
* // width: 640,
|
|
5239
5267
|
* // height: 480,
|
|
5240
5268
|
* // channels: 1
|
|
@@ -5409,6 +5437,8 @@ export class Dinov2WithRegistersForImageClassification extends Dinov2WithRegiste
|
|
|
5409
5437
|
}
|
|
5410
5438
|
}
|
|
5411
5439
|
//////////////////////////////////////////////////
|
|
5440
|
+
export class GroundingDinoPreTrainedModel extends PreTrainedModel { }
|
|
5441
|
+
export class GroundingDinoForObjectDetection extends GroundingDinoPreTrainedModel { }
|
|
5412
5442
|
|
|
5413
5443
|
//////////////////////////////////////////////////
|
|
5414
5444
|
export class YolosPreTrainedModel extends PreTrainedModel { }
|
|
@@ -6107,6 +6137,9 @@ export class WavLMForAudioFrameClassification extends WavLMPreTrainedModel {
|
|
|
6107
6137
|
}
|
|
6108
6138
|
}
|
|
6109
6139
|
|
|
6140
|
+
export class StyleTextToSpeech2PreTrainedModel extends PreTrainedModel { }
|
|
6141
|
+
export class StyleTextToSpeech2Model extends StyleTextToSpeech2PreTrainedModel { }
|
|
6142
|
+
|
|
6110
6143
|
//////////////////////////////////////////////////
|
|
6111
6144
|
// SpeechT5 models
|
|
6112
6145
|
/**
|
|
@@ -6201,10 +6234,12 @@ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
|
|
|
6201
6234
|
|
|
6202
6235
|
const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);
|
|
6203
6236
|
|
|
6237
|
+
// @ts-expect-error TS2339
|
|
6204
6238
|
const r = encoder_outputs.dims[1] / this.config.reduction_factor;
|
|
6205
6239
|
const maxlen = Math.floor(r * maxlenratio);
|
|
6206
6240
|
const minlen = Math.floor(r * minlenratio);
|
|
6207
6241
|
|
|
6242
|
+
// @ts-expect-error TS2339
|
|
6208
6243
|
const num_mel_bins = this.config.num_mel_bins;
|
|
6209
6244
|
|
|
6210
6245
|
let spectrogramParts = [];
|
|
@@ -6569,11 +6604,13 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
|
|
|
6569
6604
|
*/
|
|
6570
6605
|
_apply_and_filter_by_delay_pattern_mask(outputs) {
|
|
6571
6606
|
const [bs_x_codebooks, seqLength] = outputs.dims;
|
|
6607
|
+
// @ts-expect-error TS2339
|
|
6572
6608
|
const num_codebooks = this.config.decoder.num_codebooks;
|
|
6573
6609
|
const upperBound = (seqLength - num_codebooks);
|
|
6574
6610
|
|
|
6575
6611
|
let newDataSize = 0;
|
|
6576
6612
|
for (let i = 0; i < outputs.size; ++i) {
|
|
6613
|
+
// @ts-expect-error TS2339
|
|
6577
6614
|
if (outputs.data[i] === this.config.decoder.pad_token_id) {
|
|
6578
6615
|
continue;
|
|
6579
6616
|
}
|
|
@@ -6603,7 +6640,9 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
|
|
|
6603
6640
|
let clonedInputIds = structuredClone(input_ids);
|
|
6604
6641
|
for (let i = 0; i < clonedInputIds.length; ++i) {
|
|
6605
6642
|
for (let j = 0; j < clonedInputIds[i].length; ++j) {
|
|
6643
|
+
// @ts-expect-error TS2339
|
|
6606
6644
|
if ((i % this.config.decoder.num_codebooks) >= j) {
|
|
6645
|
+
// @ts-expect-error TS2339
|
|
6607
6646
|
clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
|
|
6608
6647
|
}
|
|
6609
6648
|
}
|
|
@@ -6760,6 +6799,9 @@ export class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
|
|
|
6760
6799
|
'past_key_values',
|
|
6761
6800
|
];
|
|
6762
6801
|
|
|
6802
|
+
/**
|
|
6803
|
+
* @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
|
|
6804
|
+
*/
|
|
6763
6805
|
constructor(...args) {
|
|
6764
6806
|
super(...args);
|
|
6765
6807
|
|
|
@@ -7061,6 +7103,8 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
|
|
|
7061
7103
|
|
|
7062
7104
|
['maskformer', ['MaskFormerModel', MaskFormerModel]],
|
|
7063
7105
|
['mgp-str', ['MgpstrForSceneTextRecognition', MgpstrForSceneTextRecognition]],
|
|
7106
|
+
|
|
7107
|
+
['style_text_to_speech_2', ['StyleTextToSpeech2Model', StyleTextToSpeech2Model]],
|
|
7064
7108
|
]);
|
|
7065
7109
|
|
|
7066
7110
|
const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
|
|
@@ -7305,6 +7349,7 @@ const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([
|
|
|
7305
7349
|
const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([
|
|
7306
7350
|
['owlvit', ['OwlViTForObjectDetection', OwlViTForObjectDetection]],
|
|
7307
7351
|
['owlv2', ['Owlv2ForObjectDetection', Owlv2ForObjectDetection]],
|
|
7352
|
+
['grounding-dino', ['GroundingDinoForObjectDetection', GroundingDinoForObjectDetection]],
|
|
7308
7353
|
]);
|
|
7309
7354
|
|
|
7310
7355
|
const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
|
|
@@ -7728,10 +7773,17 @@ export class SequenceClassifierOutput extends ModelOutput {
|
|
|
7728
7773
|
/**
|
|
7729
7774
|
* @param {Object} output The output of the model.
|
|
7730
7775
|
* @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
|
|
7776
|
+
* @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
|
|
7777
|
+
* Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
|
7731
7778
|
*/
|
|
7732
|
-
constructor({ logits }) {
|
|
7779
|
+
constructor({ logits, ...attentions }) {
|
|
7733
7780
|
super();
|
|
7734
7781
|
this.logits = logits;
|
|
7782
|
+
const attentions_list = Object.values(attentions);
|
|
7783
|
+
if (attentions_list.length > 0) {
|
|
7784
|
+
// Only set attentions if they are not empty
|
|
7785
|
+
this.attentions = attentions_list;
|
|
7786
|
+
}
|
|
7735
7787
|
}
|
|
7736
7788
|
}
|
|
7737
7789
|
|
package/src/ops/registry.js
CHANGED
|
@@ -36,6 +36,16 @@ export class TensorOpRegistry {
|
|
|
36
36
|
// executionProviders: ['webgpu'],
|
|
37
37
|
};
|
|
38
38
|
|
|
39
|
+
static get nearest_interpolate_4d() {
|
|
40
|
+
if (!this._nearest_interpolate_4d) {
|
|
41
|
+
this._nearest_interpolate_4d = wrap(
|
|
42
|
+
[8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
|
|
43
|
+
this.session_options,
|
|
44
|
+
'y',
|
|
45
|
+
);
|
|
46
|
+
}
|
|
47
|
+
return this._nearest_interpolate_4d;
|
|
48
|
+
}
|
|
39
49
|
static get bilinear_interpolate_4d() {
|
|
40
50
|
if (!this._bilinear_interpolate_4d) {
|
|
41
51
|
this._bilinear_interpolate_4d = wrap(
|
package/src/pipelines.js
CHANGED
|
@@ -64,12 +64,13 @@ import {
|
|
|
64
64
|
round,
|
|
65
65
|
} from './utils/maths.js';
|
|
66
66
|
import {
|
|
67
|
-
read_audio
|
|
67
|
+
read_audio,
|
|
68
|
+
RawAudio
|
|
68
69
|
} from './utils/audio.js';
|
|
69
70
|
import {
|
|
70
71
|
Tensor,
|
|
71
72
|
mean_pooling,
|
|
72
|
-
|
|
73
|
+
interpolate_4d,
|
|
73
74
|
quantize_embeddings,
|
|
74
75
|
topk,
|
|
75
76
|
} from './utils/tensor.js';
|
|
@@ -294,6 +295,7 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi
|
|
|
294
295
|
|
|
295
296
|
// TODO: Use softmax tensor function
|
|
296
297
|
const function_to_apply =
|
|
298
|
+
// @ts-expect-error TS2339
|
|
297
299
|
this.model.config.problem_type === 'multi_label_classification'
|
|
298
300
|
? batch => batch.sigmoid()
|
|
299
301
|
: batch => new Tensor(
|
|
@@ -302,6 +304,7 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi
|
|
|
302
304
|
batch.dims,
|
|
303
305
|
); // single_label_classification (default)
|
|
304
306
|
|
|
307
|
+
// @ts-expect-error TS2339
|
|
305
308
|
const id2label = this.model.config.id2label;
|
|
306
309
|
|
|
307
310
|
const toReturn = [];
|
|
@@ -404,6 +407,7 @@ export class TokenClassificationPipeline extends (/** @type {new (options: TextP
|
|
|
404
407
|
const outputs = await this.model(model_inputs)
|
|
405
408
|
|
|
406
409
|
const logits = outputs.logits;
|
|
410
|
+
// @ts-expect-error TS2339
|
|
407
411
|
const id2label = this.model.config.id2label;
|
|
408
412
|
|
|
409
413
|
const toReturn = [];
|
|
@@ -743,11 +747,14 @@ export class Text2TextGenerationPipeline extends (/** @type {new (options: TextP
|
|
|
743
747
|
|
|
744
748
|
|
|
745
749
|
// Add global prefix, if present
|
|
750
|
+
// @ts-expect-error TS2339
|
|
746
751
|
if (this.model.config.prefix) {
|
|
752
|
+
// @ts-expect-error TS2339
|
|
747
753
|
texts = texts.map(x => this.model.config.prefix + x)
|
|
748
754
|
}
|
|
749
755
|
|
|
750
756
|
// Handle task specific params:
|
|
757
|
+
// @ts-expect-error TS2339
|
|
751
758
|
const task_specific_params = this.model.config.task_specific_params
|
|
752
759
|
if (task_specific_params && task_specific_params[this.task]) {
|
|
753
760
|
// Add prefixes, if present
|
|
@@ -1486,6 +1493,7 @@ export class AudioClassificationPipeline extends (/** @type {new (options: Audio
|
|
|
1486
1493
|
const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
|
|
1487
1494
|
const preparedAudios = await prepareAudios(audio, sampling_rate);
|
|
1488
1495
|
|
|
1496
|
+
// @ts-expect-error TS2339
|
|
1489
1497
|
const id2label = this.model.config.id2label;
|
|
1490
1498
|
|
|
1491
1499
|
const toReturn = [];
|
|
@@ -1796,6 +1804,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
|
|
|
1796
1804
|
audio = [/** @type {AudioInput} */ (audio)];
|
|
1797
1805
|
}
|
|
1798
1806
|
|
|
1807
|
+
// @ts-expect-error TS2339
|
|
1799
1808
|
const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
|
|
1800
1809
|
const hop_length = this.processor.feature_extractor.config.hop_length;
|
|
1801
1810
|
|
|
@@ -1861,7 +1870,9 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
|
|
|
1861
1870
|
|
|
1862
1871
|
// TODO: Right now we only get top beam
|
|
1863
1872
|
if (return_timestamps === 'word') {
|
|
1873
|
+
// @ts-expect-error TS2339
|
|
1864
1874
|
chunk.tokens = data.sequences.tolist()[0];
|
|
1875
|
+
// @ts-expect-error TS2339
|
|
1865
1876
|
chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
|
|
1866
1877
|
(/** @type {number} */ x) => round(x, 2)
|
|
1867
1878
|
);
|
|
@@ -1906,7 +1917,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
|
|
|
1906
1917
|
const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
|
|
1907
1918
|
const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
|
|
1908
1919
|
|
|
1909
|
-
const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
|
|
1920
|
+
const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
|
|
1910
1921
|
toReturn.push({ text });
|
|
1911
1922
|
}
|
|
1912
1923
|
return single ? toReturn[0] : toReturn;
|
|
@@ -2055,6 +2066,7 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image
|
|
|
2055
2066
|
const { pixel_values } = await this.processor(preparedImages);
|
|
2056
2067
|
const output = await this.model({ pixel_values });
|
|
2057
2068
|
|
|
2069
|
+
// @ts-expect-error TS2339
|
|
2058
2070
|
const id2label = this.model.config.id2label;
|
|
2059
2071
|
|
|
2060
2072
|
/** @type {ImageClassificationOutput[]} */
|
|
@@ -2169,6 +2181,7 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi
|
|
|
2169
2181
|
}
|
|
2170
2182
|
}
|
|
2171
2183
|
|
|
2184
|
+
// @ts-expect-error TS2339
|
|
2172
2185
|
const id2label = this.model.config.id2label;
|
|
2173
2186
|
|
|
2174
2187
|
/** @type {ImageSegmentationPipelineOutput[]} */
|
|
@@ -2395,6 +2408,7 @@ export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipe
|
|
|
2395
2408
|
const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);
|
|
2396
2409
|
|
|
2397
2410
|
// Add labels
|
|
2411
|
+
// @ts-expect-error TS2339
|
|
2398
2412
|
const id2label = this.model.config.id2label;
|
|
2399
2413
|
|
|
2400
2414
|
// Format output
|
|
@@ -2539,13 +2553,35 @@ export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: T
|
|
|
2539
2553
|
// Run model with both text and pixel inputs
|
|
2540
2554
|
const output = await this.model({ ...text_inputs, pixel_values });
|
|
2541
2555
|
|
|
2542
|
-
|
|
2543
|
-
|
|
2544
|
-
|
|
2545
|
-
|
|
2546
|
-
|
|
2547
|
-
|
|
2548
|
-
|
|
2556
|
+
let result;
|
|
2557
|
+
if('post_process_grounded_object_detection' in this.processor) {
|
|
2558
|
+
// @ts-ignore
|
|
2559
|
+
const processed = this.processor.post_process_grounded_object_detection(
|
|
2560
|
+
output,
|
|
2561
|
+
text_inputs.input_ids,
|
|
2562
|
+
{
|
|
2563
|
+
// TODO: support separate threshold values
|
|
2564
|
+
box_threshold: threshold,
|
|
2565
|
+
text_threshold: threshold,
|
|
2566
|
+
target_sizes: imageSize,
|
|
2567
|
+
},
|
|
2568
|
+
)[0];
|
|
2569
|
+
result = processed.boxes.map((box, i) => ({
|
|
2570
|
+
score: processed.scores[i],
|
|
2571
|
+
label: processed.labels[i],
|
|
2572
|
+
box: get_bounding_box(box, !percentage),
|
|
2573
|
+
}))
|
|
2574
|
+
} else {
|
|
2575
|
+
// @ts-ignore
|
|
2576
|
+
const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSize, true)[0];
|
|
2577
|
+
result = processed.boxes.map((box, i) => ({
|
|
2578
|
+
score: processed.scores[i],
|
|
2579
|
+
label: candidate_labels[processed.classes[i]],
|
|
2580
|
+
box: get_bounding_box(box, !percentage),
|
|
2581
|
+
}))
|
|
2582
|
+
}
|
|
2583
|
+
result.sort((a, b) => b.score - a.score);
|
|
2584
|
+
|
|
2549
2585
|
if (top_k !== null) {
|
|
2550
2586
|
result = result.slice(0, top_k);
|
|
2551
2587
|
}
|
|
@@ -2614,6 +2650,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options:
|
|
|
2614
2650
|
// Run model
|
|
2615
2651
|
const output = await this.model.generate({
|
|
2616
2652
|
inputs: pixel_values,
|
|
2653
|
+
// @ts-expect-error TS2339
|
|
2617
2654
|
max_length: this.model.config.decoder.max_position_embeddings,
|
|
2618
2655
|
decoder_input_ids,
|
|
2619
2656
|
...generate_kwargs,
|
|
@@ -2664,7 +2701,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options:
|
|
|
2664
2701
|
* const synthesizer = await pipeline('text-to-speech', 'Xenova/speecht5_tts', { quantized: false });
|
|
2665
2702
|
* const speaker_embeddings = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin';
|
|
2666
2703
|
* const out = await synthesizer('Hello, my dog is cute', { speaker_embeddings });
|
|
2667
|
-
* // {
|
|
2704
|
+
* // RawAudio {
|
|
2668
2705
|
* // audio: Float32Array(26112) [-0.00005657337896991521, 0.00020583874720614403, ...],
|
|
2669
2706
|
* // sampling_rate: 16000
|
|
2670
2707
|
* // }
|
|
@@ -2684,7 +2721,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options:
|
|
|
2684
2721
|
* ```javascript
|
|
2685
2722
|
* const synthesizer = await pipeline('text-to-speech', 'Xenova/mms-tts-fra');
|
|
2686
2723
|
* const out = await synthesizer('Bonjour');
|
|
2687
|
-
* // {
|
|
2724
|
+
* // RawAudio {
|
|
2688
2725
|
* // audio: Float32Array(23808) [-0.00037693005288019776, 0.0003325853613205254, ...],
|
|
2689
2726
|
* // sampling_rate: 16000
|
|
2690
2727
|
* // }
|
|
@@ -2729,11 +2766,12 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi
|
|
|
2729
2766
|
// Generate waveform
|
|
2730
2767
|
const { waveform } = await this.model(inputs);
|
|
2731
2768
|
|
|
2769
|
+
// @ts-expect-error TS2339
|
|
2732
2770
|
const sampling_rate = this.model.config.sampling_rate;
|
|
2733
|
-
return
|
|
2734
|
-
|
|
2771
|
+
return new RawAudio(
|
|
2772
|
+
waveform.data,
|
|
2735
2773
|
sampling_rate,
|
|
2736
|
-
|
|
2774
|
+
)
|
|
2737
2775
|
}
|
|
2738
2776
|
|
|
2739
2777
|
async _call_text_to_spectrogram(text_inputs, { speaker_embeddings }) {
|
|
@@ -2773,10 +2811,10 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi
|
|
|
2773
2811
|
const { waveform } = await this.model.generate_speech(input_ids, speaker_embeddings, { vocoder: this.vocoder });
|
|
2774
2812
|
|
|
2775
2813
|
const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
|
|
2776
|
-
return
|
|
2777
|
-
|
|
2814
|
+
return new RawAudio(
|
|
2815
|
+
waveform.data,
|
|
2778
2816
|
sampling_rate,
|
|
2779
|
-
|
|
2817
|
+
)
|
|
2780
2818
|
}
|
|
2781
2819
|
}
|
|
2782
2820
|
|
|
@@ -2886,11 +2924,23 @@ export class DepthEstimationPipeline extends (/** @type {new (options: ImagePipe
|
|
|
2886
2924
|
|
|
2887
2925
|
const toReturn = [];
|
|
2888
2926
|
for (let i = 0; i < preparedImages.length; ++i) {
|
|
2889
|
-
const
|
|
2890
|
-
const
|
|
2927
|
+
const batch = predicted_depth[i];
|
|
2928
|
+
const [height, width] = batch.dims.slice(-2);
|
|
2929
|
+
const [new_width, new_height] = preparedImages[i].size;
|
|
2930
|
+
|
|
2931
|
+
// Interpolate to original size
|
|
2932
|
+
const prediction = (await interpolate_4d(batch.view(1, 1, height, width), {
|
|
2933
|
+
size: [new_height, new_width],
|
|
2934
|
+
mode: 'bilinear',
|
|
2935
|
+
})).view(new_height, new_width);
|
|
2936
|
+
|
|
2937
|
+
const minval = /** @type {number} */(prediction.min().item());
|
|
2938
|
+
const maxval = /** @type {number} */(prediction.max().item());
|
|
2939
|
+
const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
|
|
2940
|
+
const depth = RawImage.fromTensor(formatted);
|
|
2891
2941
|
toReturn.push({
|
|
2892
|
-
predicted_depth:
|
|
2893
|
-
depth
|
|
2942
|
+
predicted_depth: prediction,
|
|
2943
|
+
depth,
|
|
2894
2944
|
});
|
|
2895
2945
|
}
|
|
2896
2946
|
|
|
@@ -3368,4 +3418,4 @@ async function loadItems(mapping, model, pretrainedOptions) {
|
|
|
3368
3418
|
}
|
|
3369
3419
|
|
|
3370
3420
|
return result;
|
|
3371
|
-
}
|
|
3421
|
+
}
|
package/src/tokenizers.js
CHANGED
|
@@ -47,10 +47,8 @@ import {
|
|
|
47
47
|
import { Template } from '@huggingface/jinja';
|
|
48
48
|
|
|
49
49
|
import {
|
|
50
|
-
WHISPER_LANGUAGE_MAPPING
|
|
51
|
-
whisper_language_to_code,
|
|
50
|
+
WHISPER_LANGUAGE_MAPPING
|
|
52
51
|
} from './models/whisper/common_whisper.js';
|
|
53
|
-
import { GITHUB_ISSUE_URL } from './utils/constants.js';
|
|
54
52
|
|
|
55
53
|
/**
|
|
56
54
|
* @typedef {Object} TokenizerProperties Additional tokenizer-specific properties.
|
|
@@ -535,7 +533,7 @@ class Unigram extends TokenizerModel {
|
|
|
535
533
|
* Create a new Unigram tokenizer model.
|
|
536
534
|
* @param {Object} config The configuration object for the Unigram model.
|
|
537
535
|
* @param {number} config.unk_id The ID of the unknown token
|
|
538
|
-
* @param {
|
|
536
|
+
* @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
|
|
539
537
|
* @param {Object} moreConfig Additional configuration object for the Unigram model.
|
|
540
538
|
*/
|
|
541
539
|
constructor(config, moreConfig) {
|
|
@@ -543,11 +541,10 @@ class Unigram extends TokenizerModel {
|
|
|
543
541
|
|
|
544
542
|
const vocabSize = config.vocab.length;
|
|
545
543
|
this.vocab = new Array(vocabSize);
|
|
544
|
+
/** @type {number[]} */
|
|
546
545
|
this.scores = new Array(vocabSize);
|
|
547
546
|
for (let i = 0; i < vocabSize; ++i) {
|
|
548
|
-
|
|
549
|
-
this.vocab[i] = piece[0];
|
|
550
|
-
this.scores[i] = piece[1];
|
|
547
|
+
[this.vocab[i], this.scores[i]] = config.vocab[i];
|
|
551
548
|
}
|
|
552
549
|
|
|
553
550
|
this.unk_token_id = config.unk_id;
|